
    g                        d Z ddlZddlmZ ddlmZmZmZ ddlZddl	Zddlm
Z
 ddlmZmZmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZ ddlmZmZmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&  e$j'        e(          Z)dZ*dZ+da,d Z-d Z.d Z/d Z0 G d dej1        j2                  Z3 G d dej1        j2                  Z4 G d de
j5                  Z6 G d de
j5                  Z7 G d de
j5                  Z8 G d d e
j5                  Z9 G d! d"e
j5                  Z: G d# d$e
j5                  Z; G d% d&e
j5                  Z< G d' d(e
j5                  Z= G d) d*e
j5                  Z> G d+ d,e
j5                  Z? G d- d.e
j5                  Z@ G d/ d0e          ZAd1ZBd2ZC e d3eB           G d4 d5eA                      ZD e d6eB           G d7 d8eA                      ZE G d9 d:e
j5                  ZF e d;eB           G d< d=eA                      ZG e d>eB           G d? d@eA                      ZH e dAeB           G dB dCeA                      ZI e dDeB           G dE dFeA                      ZJdS )GzPyTorch YOSO model.    N)Path)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardis_ninja_availableis_torch_cuda_availablelogging   )
YosoConfigzuw-madison/yoso-4096r   c                  V    ddl m}  d } |g d          } | d|d           dd lad S )Nr   )loadc                     t          t                                                    j        j        j        dz  dz  fd| D             S )Nkernelsyosoc                     g | ]}|z  S  r%   ).0file
src_folders     b/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/yoso/modeling_yoso.py
<listcomp>z:load_cuda_kernels.<locals>.append_root.<locals>.<listcomp>?   s    444d
T!444    )r   __file__resolveparent)filesr(   s    @r)   append_rootz&load_cuda_kernels.<locals>.append_root=   sH    (^^++--4;BYNQWW
4444e4444r+   )zfast_lsh_cumulation_torch.cppzfast_lsh_cumulation.cuzfast_lsh_cumulation_cuda.cufast_lsh_cumulationT)verbose)torch.utils.cpp_extensionr    r1   lsh_cumulation)r    r0   	src_filess      r)   load_cuda_kernelsr6   9   se    ......5 5 5 vvvwwID		48888000000r+   c                    t          | t                    rFg }| D ]?}|                                s|                                }|                    |           @|S |                                 s|                                 } | S N)
isinstancelistis_contiguous
contiguousappendinput_tensorsouttensors      r)   to_contiguousrB   H   s    -&& 
# 	 	F'')) -**,,JJv
**,, 	7)4466Mr+   c                     t          | t                    r>g }| D ]7}|                    t          j                            |dd                     8|S t          j                            | dd          S )N   )pdim)r9   r:   r=   r   
functional	normalizer>   s      r)   rI   rI   V   sy    -&& C# 	E 	EFJJr}..v.CCDDDD
}&&}r&BBBr+   c                    t          |                                           dk    rt          d          t          |                                          dk    rt          d          t          j        |                     d          |                     d          ||z  | j                  }dt          j        || j                  z  }t          j        | |                              |                     d          |                     d          ||          }t          j        ||                              |                    d          |                    d          ||          }|dk    	                                }|dk    	                                }	t          j
        ||z  d	          }
t          j
        |	|z  d	          }
|
	                                |
	                                fS )
Nr   zQuery has incorrect size.zKey has incorrect size.r   rD   devicer   rE   rG   )lensize
ValueErrortorchrandnrL   arangematmulreshapeintsum)querykeynum_hashhash_lenrmat	raise_powquery_projectionkey_projectionquery_binary
key_binary
query_hashs              r)   hashingrc   `   s   
5::<<A4555
388::!2333;uzz!}}ejjmmX5HQVQ]^^^DU\(5<@@@@I|E40088A

STW_aijj\#t,,44SXXa[[#((1++xYabbN$q(--//L 1$))++J<)3<<<J:	1r:::J>>Z^^----r+   c                   :    e Zd Zed             Zed             ZdS )YosoCumulationc           
      b   |d         }dt          j        t          j        ||                    dd                              t          j        z  z
  |z  }||d d d d d f         z  |d d d d d f         z  }t          j        ||          }	|                     ||||||           || _        |	S )Nhash_code_lenr   rE   )rQ   acosrT   	transposemathpisave_for_backwardconfig)
ctx
query_maskkey_maskrX   rY   valuern   rg   expectationcumulation_values
             r)   forwardzYosoCumulation.forwardt   s    /5:el5#--B:O:O&P&PQQTXT[[[`mm!Jqqq!!!Tz$::Xaaaqqqj=QQ <U;;j(KUSSS
r+   c                    t          |          }| j        \  }}}}}}| j        }|d         }	t          j        ||                    dd                    |z  }
t          j        |
|	dz  |z            }t          j        |
                    dd          |	dz  |z            }t          j        |                    dd          |          }d d |||d fS )Nrg   rE   rh   rD   )rB   saved_tensorsrn   rQ   rT   rj   )ro   gradrp   rq   rs   rX   rY   rr   rn   rg   weighted_exp
grad_querygrad_key
grad_values                 r)   backwardzYosoCumulation.backward   s    T""?B?P<
Hk5#u/|D%//"b*A*ABB[P\,1Bc0IJJ
< 6 6r2 > >QRARV[@[\\\+"7"7B"?"?FF
T:xTAAr+   N__name__
__module____qualname__staticmethodru   r}   r%   r+   r)   re   re   s   sM        
  
  \
  B B \B B Br+   re   c                   :    e Zd Zed             Zed             ZdS )YosoLSHCumulationc           
      Z   |                     d          |                     d          k    rt          d          |                     d          |                     d          k    rt          d          |                     d          |                     d          k    rt          d          |                     d          |                     d          k    rt          d          |                     d          |                     d          k    rt          d          |                     d          |                     d          k    rt          d	          t          |||||g          \  }}}}}|j        }|d
         }|d         }	t	          d|	z            }
|d         r%t
                              ||||||	|d          \  }}nt          ||||	          \  }}t
                              ||||||
|d          }|                     |||||||           || _	        |S )Nr   z6Query mask and Key mask differ in sizes in dimension 0z3Query mask and Query differ in sizes in dimension 0z1Query mask and Key differ in sizes in dimension 0z8Query mask and Value mask differ in sizes in dimension 0r   z,Key and Value differ in sizes in dimension 1rD   z,Query and Key differ in sizes in dimension 2rZ   rg   use_fast_hash)
rO   rP   rB   is_cudarV   r4   	fast_hashrc   rm   rn   )ro   rp   rq   rX   rY   rr   rn   use_cudarZ   rg   hashtable_capacityquery_hash_codekey_hash_codert   s                 r)   ru   zYosoLSHCumulation.forward   s'   ??1q!1!111UVVV??1A..RSSS??1!,,PQQQ??1A..WXXX88A;;%**Q--''KLLL::a==CHHQKK''KLLL2?XW\^ach@i2j2j/
HeS%%*%/ M!122/" 	Z-;-E-EE8S(M8UV. .*O]] .5UC=-Y-Y*O])88=%I[]egh
 
 	j(O]TY[^`efff
r+   c                    t          |          }| j        \  }}}}}}}| j        }	|j        }
|	d         }t	          d|z            }|	d         rut
                              |||||||
d          }t
                              |||||||dz  |z  ||
d
  
        }t
                              |||||||dz  |z  ||
d
  
        }ndt          j        t          j	        ||
                    dd                              t          j        z  z
  |z  }||d d d d d f         z  |d d d d d f         z  }t          j	        ||
                    dd                    |z  }t          j	        ||dz  |z            }t          j	        |
                    dd          |dz  |z            }t          j	        |
                    dd          |          }d d |||d fS )Nrg   rD   lsh_backwardr      rE   rh   )rB   rw   rn   r   rV   r4   lsh_weighted_cumulationrQ   ri   rT   rj   rk   rl   )ro   rx   rp   rq   r   r   rX   rY   rr   rn   r   rg   r   r|   rz   r{   rs   ry   s                     r)   r}   zYosoLSHCumulation.backward   s    T""RURcO
Ho}eS%</ M!122.! "	K'66-_dL^`hjk J (??"c)" J &=="e+" HH uz%,ucmmBPR>S>S*T*TUUX\X___dqqK%
111aaa:(>>!!!TSTSTST*AUUK <eoob".E.EFFTLl]Q5F#4MNNJ|L$:$:2r$B$B]UVEVZ_D_``Hk&;&;B&C&CTJJJT:xTAAr+   Nr~   r%   r+   r)   r   r      sN        #  #  \# J .B .B \.B .B .Br+   r   c                   *     e Zd ZdZ fdZddZ xZS )YosoEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                     t                                                       t          j        |j        |j        |j                  | _        t          j        |j        dz   |j                  | _	        t          j        |j
        |j                  | _        t          j        |j        |j                  | _        t          j        |j                  | _        |                     dt%          j        |j                                      d          dz   d           t+          |dd	          | _        |                     d
t%          j        | j                                        t$          j        | j        j                  d           d S )N)padding_idxrD   epsposition_ids)r   rE   F)
persistentposition_embedding_typeabsolutetoken_type_idsdtyperL   )super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferrQ   rS   expandgetattrr   zerosr   rO   longrL   selfrn   	__class__s     r)   r   zYosoEmbeddings.__init__   sb   !|F,=v?Q_e_rsss#%<0NQR0RTZTf#g#g %'\&2H&J\%]%]" f&8f>STTTz&"<== 	EL)GHHOOPWXX[\\in 	 	
 	
 	
 (/v7PR\']']$K)..00
4K\Kcddd 	 	
 	
 	
 	
 	
r+   Nc                    ||                                 }n|                                 d d         }|d         }|| j        d d d |f         }|mt          | d          r2| j        d d d |f         }|                    |d         |          }|}n+t          j        |t
          j        | j        j                  }|| 	                    |          }| 
                    |          }	||	z   }
| j        dk    r|                     |          }|
|z  }
|                     |
          }
|                     |
          }
|
S )NrE   r   r   r   r   r   )rO   r   hasattrr   r   rQ   r   r   rL   r   r   r   r   r   r   )r   	input_idsr   r   inputs_embedsinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   s               r)   ru   zYosoEmbeddings.forward  sb    #..**KK',,..ss3K ^
,QQQ^<L
 !t-.. m*.*=aaa*n*M'3J3Q3QR]^_R`bl3m3m0!A!&[
SWSdSk!l!l!l  00;;M $ : :> J J"%::
':55"&":":<"H"H--J^^J//
\\*--
r+   )NNNNr   r   r   __doc__r   ru   __classcell__r   s   @r)   r   r      sR        QQ
 
 
 
 
,               r+   r   c                   .     e Zd Zd fd	Zd ZddZ xZS )YosoSelfAttentionNc                    t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          t          d u}t                      rTt                      rF|sD	 t                       n4# t          $ r'}t                              d|            Y d }~nd }~ww xY w|j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t!          j        |j        | j                  | _        t!          j        |j        | j                  | _        t!          j        |j        | j                  | _        t!          j        |j                  | _        ||n|j        | _        |j        | _        |j        | _        |j        d u| _        |j        | _        |j        | _        |j        | _        | j        | j        | j        | j        d| _         |j        At!          j!        |j        |j        |j        df|j        d	z  dfd
|j                  | _"        d S d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zGCould not load the custom kernel for multi-scale deformable attention: )rg   r   rZ   r   r   rD   F)in_channelsout_channelskernel_sizepaddingbiasgroups)#r   r   r   num_attention_headsr   rP   r4   r   r   r6   	ExceptionloggerwarningrV   attention_head_sizeall_head_sizer   LinearrX   rY   rr   r   attention_probs_dropout_probr   r   use_expectationrg   conv_windowuse_convr   rZ   r   
lsh_configConv2dconv)r   rn   r   kernel_loadeder   s        r)   r   zYosoSelfAttention.__init__)  s    ::a??PVXhHiHi?8F$6 8 8 48 8 8   'd2"$$ 	n);)=)= 	nm 	nn!#### n n nlijllmmmmmmmmn $*#= #&v'9F<V'V#W#W !58PPYv143EFF
9V/1CDDYv143EFF
z&"EFF'>'J##PVPn 	$  &5#1*$6#1"/ "/!/ -	
 
 )	"6#7#/3+q0!41  DIII *)s   B 
C&CCc                     |                                 d d         | j        | j        fz   } |j        | }|                    dddd          S )NrE   r   rD   r   r   )rO   r   r   viewpermute)r   layernew_layer_shapes      r)   transpose_for_scoresz&YosoSelfAttention.transpose_for_scores\  sN    **,,ss+t/GIa.bb
O,}}Q1a(((r+   Fc                 |   |                      |          }|                     |                     |                    }|                     |                     |                    }|                     |          }| j        r&|                     ||d d d d d d f         z            }|                                \  }	}
}}|                    |	|
z  ||          }|                    |	|
z  ||          }|                    |	|
z  ||          }d|dz  z   }|                    d          	                    |
d                              |	|
z  |          
                                }d}| j        s||k     r|	|
z  |||z
  f}t          j        |t          j        ||j                  gd          }t          j        |t          j        ||j                  gd          }t          j        |t          j        ||j                  gd          }| j        s| j        rt#          ||g          \  }}| j        r%t$                              |||||| j                  }n$t*                              |||||| j                  }| j        s||k     r|d d d d d |f         }t#          |          }|                    |	|
||          }| j        r||z  }|                    dd	dd
                                          }|                                d d         | j        fz   } |j        | }|r||fn|f}|S )N      ?g     @r   rM       rK   rE   r   rD   r   rh   )rX   r   rY   rr   r   r   rO   rU   	unsqueezerepeat_interleaverV   r   rQ   catr   rL   trainingrI   re   applyr   r   r   r<   r   r   )r   hidden_statesattention_maskoutput_attentionsmixed_query_layer	key_layervalue_layerquery_layerconv_value_layer
batch_size	num_headsseq_lenhead_dimgpu_warp_sizepad_sizecontext_layernew_context_layer_shapeoutputss                     r)   ru   zYosoSelfAttention.forwarda  s    JJ}55--dhh}.E.EFF	//

=0I0IJJ//0ABB= 	Y#yy~aaaqqqRVFV7W)WXX3>3C3C3E3E0
Iw!))*y*@'8TT%%j9&<gxPP	!))*y*@'8TT~77$$Q''ya00WZ)+W55SUU	 	 $ 	(]*B*B!I-w8PPH)K1CDDD   K 	K1ABBB   I  )K1CDDD   K  	I4= 	I%.Y/G%H%H"K 	*00YUYUd MM .33YUYUd M $ 	;(]*B*B)!!!QQQ		/:M!-00%--j)WhWW= 	.--M%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S**,CD4E[=-00MK[r+   r8   NF)r   r   r   r   r   ru   r   r   s   @r)   r   r   (  si        1 1 1 1 1 1f) ) )
Q Q Q Q Q Q Q Qr+   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )YosoSelfOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j	                  | _
        d S Nr   )r   r   r   r   r   denser   r   r   r   r   r   s     r)   r   zYosoSelfOutput.__init__  sf    Yv163EFF
f&8f>STTTz&"<==r+   r   input_tensorreturnc                     |                      |          }|                     |          }|                     ||z             }|S r8   r  r   r   r   r   r  s      r)   ru   zYosoSelfOutput.forward  @    

=11]33}|'CDDr+   r   r   r   r   rQ   Tensorru   r   r   s   @r)   r   r     i        > > > > >U\  RWR^        r+   r   c                   .     e Zd Zd fd	Zd ZddZ xZS )YosoAttentionNc                     t                                                       t          ||          | _        t	          |          | _        t                      | _        d S )N)r   )r   r   r   r   r   outputsetpruned_heads)r   rn   r   r   s      r)   r   zYosoAttention.__init__  sO    %fF]^^^	$V,,EEr+   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   rM   )rN   r   r   r   r   r  r   rX   rY   rr   r  r  r   union)r   headsindexs      r)   prune_headszYosoAttention.prune_heads  s    u::??F7490$)2OQUQb
 
u
 -TY_eDD	*49=%@@	,TY_eDD	.t{/@%QOOO )-	(EE

(R	%"&)"?$)B_"_	 -33E::r+   Fc                     |                      |||          }|                     |d         |          }|f|dd          z   }|S )Nr   r   )r   r  )r   r   r   r   self_outputsattention_outputr   s          r)   ru   zYosoAttention.forward  sK    yy@QRR;;|AFF#%QRR(88r+   r8   r   )r   r   r   r   r  ru   r   r   s   @r)   r  r    s`        " " " " " "; ; ;$       r+   r  c                   B     e Zd Z fdZdej        dej        fdZ xZS )YosoIntermediatec                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r8   )r   r   r   r   r   intermediate_sizer  r9   
hidden_actstrr   intermediate_act_fnr   s     r)   r   zYosoIntermediate.__init__  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r+   r   r  c                 Z    |                      |          }|                     |          }|S r8   )r  r   r   r   s     r)   ru   zYosoIntermediate.forward  s,    

=1100??r+   r	  r   s   @r)   r  r    s^        9 9 9 9 9U\ el        r+   r  c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )
YosoOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j	        |j
                  | _        d S r  )r   r   r   r   r  r   r  r   r   r   r   r   r   s     r)   r   zYosoOutput.__init__  sf    Yv79KLL
f&8f>STTTz&"<==r+   r   r  r  c                     |                      |          }|                     |          }|                     ||z             }|S r8   r  r  s      r)   ru   zYosoOutput.forward  r  r+   r	  r   s   @r)   r$  r$    r  r+   r$  c                   ,     e Zd Z fdZddZd Z xZS )	YosoLayerc                     t                                                       |j        | _        d| _        t	          |          | _        |j        | _        t          |          | _        t          |          | _
        d S Nr   )r   r   chunk_size_feed_forwardseq_len_dimr  	attentionadd_cross_attentionr  intermediater$  r  r   s     r)   r   zYosoLayer.__init__  si    '-'E$&v..#)#= ,V44 ((r+   NFc                     |                      |||          }|d         }|dd          }t          | j        | j        | j        |          }|f|z   }|S )N)r   r   r   )r-  r   feed_forward_chunkr+  r,  )r   r   r   r   self_attention_outputsr  r   layer_outputs           r)   ru   zYosoLayer.forward  si    !%~ar!s!s1!4(,0#T%A4CSUe
 
  /G+r+   c                 \    |                      |          }|                     ||          }|S r8   )r/  r  )r   r  intermediate_outputr3  s       r)   r1  zYosoLayer.feed_forward_chunk  s2    "//0@AA{{#68HIIr+   r   )r   r   r   r   ru   r1  r   r   s   @r)   r(  r(    s[        ) ) ) ) )         r+   r(  c                   0     e Zd Z fdZ	 	 	 	 	 ddZ xZS )YosoEncoderc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S r%   )r(  )r&   _rn   s     r)   r*   z(YosoEncoder.__init__.<locals>.<listcomp>$  s!    #_#_#_!If$5$5#_#_#_r+   F)	r   r   rn   r   
ModuleListrangenum_hidden_layersr   gradient_checkpointingr   s    `r)   r   zYosoEncoder.__init__!  s`    ]#_#_#_#_uVE]?^?^#_#_#_``
&+###r+   NFTc                 l   |rdnd }|rdnd }t          | j                  D ]\\  }	}
|r||fz   }| j        r%| j        r|                     |
j        |||          }n |
|||          }|d         }|r||d         fz   }]|r||fz   }|st          d |||fD                       S t          |||          S )Nr%   r   r   c              3      K   | ]}||V  	d S r8   r%   )r&   vs     r)   	<genexpr>z&YosoEncoder.forward.<locals>.<genexpr>I  s(      mmq_`_l_l_l_l_lmmr+   )last_hidden_stater   
attentions)	enumerater   r>  r   _gradient_checkpointing_func__call__tupler   )r   r   r   	head_maskr   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsilayer_modulelayer_outputss               r)   ru   zYosoEncoder.forward'  s7    #7@BBD$5?bb4(44 	P 	POA|# I$58H$H!* _t} _ $ A A )!"%	! ! !-]NL] ^ ^)!,M  P&9]1=M<O&O# 	E 1]4D D 	nmm]4EGZ$[mmmmmm1++*
 
 
 	
r+   )NNFFT)r   r   r   r   ru   r   r   s   @r)   r7  r7     s]        , , , , , "'
 '
 '
 '
 '
 '
 '
 '
r+   r7  c                   B     e Zd Z fdZdej        dej        fdZ xZS )YosoPredictionHeadTransformc                 V   t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _
        n|j        | _
        t          j        |j        |j                  | _        d S r  )r   r   r   r   r   r  r9   r  r  r   transform_act_fnr   r   r   s     r)   r   z$YosoPredictionHeadTransform.__init__S  s    Yv163EFF
f'-- 	6$*6+<$=D!!$*$5D!f&8f>STTTr+   r   r  c                     |                      |          }|                     |          }|                     |          }|S r8   )r  rT  r   r"  s     r)   ru   z#YosoPredictionHeadTransform.forward\  s=    

=11--m<<}55r+   r	  r   s   @r)   rR  rR  R  sc        U U U U UU\ el        r+   rR  c                   *     e Zd Z fdZd Zd Z xZS )YosoLMPredictionHeadc                 >   t                                                       t          |          | _        t	          j        |j        |j        d          | _        t	          j	        t          j        |j                            | _        | j        | j        _        d S )NF)r   )r   r   rR  	transformr   r   r   r   decoder	ParameterrQ   r   r   r   s     r)   r   zYosoLMPredictionHead.__init__e  sz    4V<< y!3V5FUSSSLV->!?!?@@	 !Ir+   c                 (    | j         | j        _         d S r8   )r   rZ  r   s    r)   _tie_weightsz!YosoLMPredictionHead._tie_weightsr  s     Ir+   c                 Z    |                      |          }|                     |          }|S r8   )rY  rZ  r"  s     r)   ru   zYosoLMPredictionHead.forwardu  s*    }55]33r+   )r   r   r   r   r^  ru   r   r   s   @r)   rW  rW  d  sV        & & & & && & &      r+   rW  c                   B     e Zd Z fdZdej        dej        fdZ xZS )YosoOnlyMLMHeadc                 p    t                                                       t          |          | _        d S r8   )r   r   rW  predictionsr   s     r)   r   zYosoOnlyMLMHead.__init__}  s/    /77r+   sequence_outputr  c                 0    |                      |          }|S r8   )rc  )r   rd  prediction_scoress      r)   ru   zYosoOnlyMLMHead.forward  s     ,,_==  r+   r	  r   s   @r)   ra  ra  |  s^        8 8 8 8 8!u| ! ! ! ! ! ! ! ! !r+   ra  c                   $    e Zd ZdZeZdZdZd ZdS )YosoPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    r#   Tc                    t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j        	                                 dS dS t          |t          j
                  r_|j        j                            d| j        j                   |j        +|j        j        |j                 	                                 dS dS t          |t          j                  r?|j        j        	                                 |j        j                            d           dS dS )zInitialize the weightsg        )meanstdNr   )r9   r   r   weightdatanormal_rn   initializer_ranger   zero_r   r   r   fill_)r   modules     r)   _init_weightsz!YosoPreTrainedModel._init_weights  s)   fbi(( 	* M&&CT[5R&SSS{& &&((((( '&-- 	*M&&CT[5R&SSS!-"6#56<<>>>>> .--- 	*K""$$$M$$S)))))	* 	*r+   N)	r   r   r   r   r   config_classbase_model_prefixsupports_gradient_checkpointingrs  r%   r+   r)   rh  rh    s@         
 L&*#* * * * *r+   rh  aG  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`YosoConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a5
  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z^The bare YOSO Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd Z fdZd Zd Zd Z ee	                    d                     e
eee          	 	 	 	 	 	 	 	 	 ddeej                 d	eej                 d
eej                 deej                 deej                 deej                 dee         dee         dee         deeef         fd                        Z xZS )	YosoModelc                     t                                          |           || _        t          |          | _        t          |          | _        |                                  d S r8   )r   r   rn   r   r   r7  encoder	post_initr   s     r)   r   zYosoModel.__init__  sX       (00"6** 	r+   c                     | j         j        S r8   r   r   r]  s    r)   get_input_embeddingszYosoModel.get_input_embeddings  s    ..r+   c                     || j         _        d S r8   r}  )r   rr   s     r)   set_input_embeddingszYosoModel.set_input_embeddings  s    */'''r+   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrz  r   r-  r  )r   heads_to_pruner   r  s       r)   _prune_headszYosoModel._prune_heads  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr+   batch_size, sequence_length
checkpointoutput_typert  Nr   r   r   r   rI  r   r   rJ  rK  r  c
                    ||n| j         j        }||n| j         j        }|	|	n| j         j        }	||t	          d          |+|                     ||           |                                }
n.||                                d d         }
nt	          d          |
\  }}||j        n|j        }|t          j	        ||f|          }|gt          | j        d          r1| j        j        d d d |f         }|                    ||          }|}n!t          j        |
t          j        |          }|                     || j         j                  }|                     ||||          }|                     ||||||	          }|d	         }|	s|f|d
d          z   S t'          ||j        |j        |j                  S )NzDYou cannot specify both input_ids and inputs_embeds at the same timerE   z5You have to specify either input_ids or inputs_embedsrK   r   r   )r   r   r   r   )r   rI  r   rJ  rK  r   r   )rC  r   rD  cross_attentions)rn   r   rJ  use_return_dictrP   %warn_if_padding_and_no_attention_maskrO   rL   rQ   onesr   r   r   r   r   r   get_head_maskr=  rz  r   r   rD  r  )r   r   r   r   r   rI  r   r   rJ  rK  r   r   r   rL   r   r   embedding_outputencoder_outputsrd  s                      r)   ru   zYosoModel.forward  s4   $ 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] ]%>cddd"66y.QQQ#..**KK&',,..ss3KKTUUU!,
J%.%:!!@T!"Z*j)A6RRRN!t(899 [*./*HKZK*X'3J3Q3QR\^h3i3i0!A!&[
SY!Z!Z!Z &&y$+2OPP	??%)'	 + 
 
 ,,)/!5# ' 
 
 *!, 	<#%(;;;1-)7&1,=	
 
 
 	
r+   )	NNNNNNNNN)r   r   r   r   r~  r  r  r   YOSO_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   rQ   r
  boolr   r   ru   r   r   s   @r)   rx  rx    s       
    / / /0 0 0C C C +*+@+G+GHe+f+fgg&6$   -11515/3,004,0/3&*I
 I
EL)I
 !.I
 !.	I

 u|,I
 EL)I
  -I
 $D>I
 'tnI
 d^I
 
u88	9I
 I
 I
  hgI
 I
 I
 I
 I
r+   rx  z2YOSO Model with a `language modeling` head on top.c                       e Zd ZddgZ fdZd Zd Z ee	                    d                     e
eee          	 	 	 	 	 	 	 	 	 	 dd	eej                 d
eej                 deej                 deej                 deej                 deej                 deej                 dee         dee         dee         deeef         fd                        Z xZS )YosoForMaskedLMzcls.predictions.decoder.weightzcls.predictions.decoder.biasc                     t                                          |           t          |          | _        t	          |          | _        |                                  d S r8   )r   r   rx  r#   ra  clsr{  r   s     r)   r   zYosoForMaskedLM.__init__Q  sQ       f%%	"6** 	r+   c                 $    | j         j        j        S r8   )r  rc  rZ  r]  s    r)   get_output_embeddingsz%YosoForMaskedLM.get_output_embeddingsZ  s    x#++r+   c                 T    || j         j        _        |j        | j         j        _        d S r8   )r  rc  rZ  r   )r   new_embeddingss     r)   set_output_embeddingsz%YosoForMaskedLM.set_output_embeddings]  s%    '5$$2$7!!!r+   r  r  Nr   r   r   r   rI  r   labelsr   rJ  rK  r  c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }d}|Kt	                      } ||                    d| j         j                  |                    d                    }|
s|f|dd         z   }||f|z   n|S t          |||j        |j	                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        Nr   r   r   rI  r   r   rJ  rK  r   rE   r   losslogitsr   rD  )
rn   r  r#   r  r	   r   r   r   r   rD  )r   r   r   r   r   rI  r   r  r   rJ  rK  r   rd  rf  masked_lm_lossloss_fctr  s                    r)   ru   zYosoForMaskedLM.forwarda  s   2 &1%<kk$+B]))))%'/!5#  

 

 "!* HH_55'))H%X&7&<&<RAW&X&XZ`ZeZefhZiZijjN 	Z')GABBK7F3A3M^%..SYY$!/)	
 
 
 	
r+   
NNNNNNNNNN)r   r   r   _tied_weights_keysr   r  r  r   r  r  r   r  r   r  r   rQ   r
  r  r   r   ru   r   r   s   @r)   r  r  M  s       :<Z[    , , ,8 8 8 +*+@+G+GHe+f+fgg&"$   -11515/3,004)-,0/3&*2
 2
EL)2
 !.2
 !.	2

 u|,2
 EL)2
  -2
 &2
 $D>2
 'tn2
 d^2
 
un$	%2
 2
 2
  hg2
 2
 2
 2
 2
r+   r  c                   (     e Zd ZdZ fdZd Z xZS )YosoClassificationHeadz-Head for sentence-level classification tasks.c                 "   t                                                       t          j        |j        |j                  | _        t          j        |j                  | _        t          j        |j        |j	                  | _
        || _        d S r8   )r   r   r   r   r   r  r   r   r   
num_labelsout_projrn   r   s     r)   r   zYosoClassificationHead.__init__  sj    Yv163EFF
z&"<==	&"4f6GHHr+   c                 
   |d d dd d f         }|                      |          }|                     |          }t          | j        j                 |          }|                      |          }|                     |          }|S )Nr   )r   r  r   rn   r  r  )r   featureskwargsxs       r)   ru   zYosoClassificationHead.forward  st    QQQ111WLLOOJJqMM4;)*1--LLOOMM!r+   r   r   s   @r)   r  r    sM        77          r+   r  zYOSO Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks.c                       e Zd Z fdZ ee                    d                     eee	e
          	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 dee         dee         dee         deee	f         fd                        Z xZS )YosoForSequenceClassificationc                     t                                          |           |j        | _        t          |          | _        t          |          | _        |                                  d S r8   )r   r   r  rx  r#   r  
classifierr{  r   s     r)   r   z&YosoForSequenceClassification.__init__  s[        +f%%	088 	r+   r  r  Nr   r   r   r   rI  r   r  r   rJ  rK  r  c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }d}|Z| j         j        f| j        dk    rd| j         _        nN| j        dk    r7|j        t          j        k    s|j        t          j	        k    rd| j         _        nd| j         _        | j         j        dk    rWt                      }| j        dk    r1 ||                                |                                          }n |||          }n| j         j        dk    rGt                      } ||                    d| j                  |                    d                    }n*| j         j        dk    rt                      } |||          }|
s|f|dd         z   }||f|z   n|S t          |||j        |j        	          S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   
regressionsingle_label_classificationmulti_label_classificationrE   r  )rn   r  r#   r  problem_typer  r   rQ   r   rV   r
   squeezer	   r   r   r   r   rD  )r   r   r   r   r   rI  r   r  r   rJ  rK  r   rd  r  r  r  r  s                    r)   ru   z%YosoForSequenceClassification.forward  s   2 &1%<kk$+B]))))%'/!5#  

 

 "!*11{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FY,F)-)9TGf$$vE'!/)	
 
 
 	
r+   r  )r   r   r   r   r   r  r  r   r  r   r  r   rQ   r
  r  r   r   ru   r   r   s   @r)   r  r    s            +*+@+G+GHe+f+fgg&,$   -11515/3,004)-,0/3&*C
 C
EL)C
 !.C
 !.	C

 u|,C
 EL)C
  -C
 &C
 $D>C
 'tnC
 d^C
 
u..	/C
 C
 C
  hgC
 C
 C
 C
 C
r+   r  zYOSO Model with a multiple choice classification head on top (a linear layer on top of
    the pooled output and a softmax) e.g. for RocStories/SWAG tasks.c                       e Zd Z fdZ ee                    d                     eee	e
          	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 dee         dee         dee         deee	f         fd                        Z xZS )YosoForMultipleChoicec                     t                                          |           t          |          | _        t	          j        |j        |j                  | _        t	          j        |j        d          | _        | 	                                 d S r*  )
r   r   rx  r#   r   r   r   pre_classifierr  r{  r   s     r)   r   zYosoForMultipleChoice.__init__  sr       f%%	 i(:F<NOO)F$6:: 	r+   z(batch_size, num_choices, sequence_lengthr  Nr   r   r   r   rI  r   r  r   rJ  rK  r  c                 
   |
|
n| j         j        }
||j        d         n|j        d         }|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|=|                    d|                    d          |                    d                    nd}|                     ||||||||	|
	  	        }|d         }|dddf         }|                     |          } t          j                    |          }| 	                    |          }|                    d|          }d}|t                      } |||          }|
s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )aJ  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   rE   rh   r  r   r  )rn   r  shaper   rO   r#   r  r   ReLUr  r	   r   r   rD  )r   r   r   r   r   rI  r   r  r   rJ  rK  num_choicesr   hidden_statepooled_outputr  reshaped_logitsr  r  r  s                       r)   ru   zYosoForMultipleChoice.forward  sS   2 &1%<kk$+B],5,Aioa((}GZ[\G]>G>SINN2y~~b'9'9:::Y]	M[Mg,,R1D1DR1H1HIIImqM[Mg,,R1D1DR1H1HIIImqGSG_|((\->->r-B-BCCCei ( r=#5#5b#9#9=;M;Mb;Q;QRRR 	 ))))%'/!5#  

 

 qz$QQQT*++M::!		-00// ++b+66'))H8OV44D 	F%''!""+5F)-)9TGf$$vE("!/)	
 
 
 	
r+   r  )r   r   r   r   r   r  r  r   r  r   r  r   rQ   r
  r  r   r   ru   r   r   s   @r)   r  r    s            +*+@+G+GHr+s+stt&-$   -11515/3,004)-,0/3&*B
 B
EL)B
 !.B
 !.	B

 u|,B
 EL)B
  -B
 &B
 $D>B
 'tnB
 d^B
 
u//	0B
 B
 B
  utB
 B
 B
 B
 B
r+   r  zYOSO Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.c                       e Zd Z fdZ ee                    d                     eee	e
          	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 dee         dee         dee         deee	f         fd                        Z xZS )YosoForTokenClassificationc                 6   t                                          |           |j        | _        t          |          | _        t          j        |j                  | _        t          j	        |j
        |j                  | _        |                                  d S r8   )r   r   r  rx  r#   r   r   r   r   r   r   r  r{  r   s     r)   r   z#YosoForTokenClassification.__init__m  sy        +f%%	z&"<==)F$68IJJ 	r+   r  r  Nr   r   r   r   rI  r   r  r   rJ  rK  r  c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }d}|t                      }||                    d          dk    }|                    d| j                  }t          j	        ||                    d          t          j
        |j                                      |                    } |||          }n8 ||                    d| j                  |                    d                    }|
s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   rE   r   r  )rn   r  r#   r   r  r	   r   r  rQ   whererA   ignore_indextype_asr   r   rD  )r   r   r   r   r   rI  r   r  r   rJ  rK  r   rd  r  r  r  active_lossactive_logitsactive_labelsr  s                       r)   ru   z"YosoForTokenClassification.forwardx  s   . &1%<kk$+B]))))%'/!5#  

 

 "!*,,7711'))H),11"55: &B @ @ %R%,x?T2U2U2]2]^d2e2e! !  x}==xB @ @&++b//RR 	FY,F)-)9TGf$$vE$!/)	
 
 
 	
r+   r  )r   r   r   r   r   r  r  r   r  r   r  r   rQ   r
  r  r   r   ru   r   r   s   @r)   r  r  g  sv       	 	 	 	 	 +*+@+G+GHe+f+fgg&)$   -11515/3,004)-,0/3&*;
 ;
EL);
 !.;
 !.	;

 u|,;
 EL);
  -;
 &;
 $D>;
 'tn;
 d^;
 
u++	,;
 ;
 ;
  hg;
 ;
 ;
 ;
 ;
r+   r  zYOSO Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).c                       e Zd Z fdZ ee                    d                     eee	e
          	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 deej                 dee         dee         dee         deee	f         fd                        Z xZS )YosoForQuestionAnsweringc                    t                                          |           d|_        |j        | _        t          |          | _        t          j        |j        |j                  | _        | 	                                 d S )NrD   )
r   r   r  rx  r#   r   r   r   
qa_outputsr{  r   s     r)   r   z!YosoForQuestionAnswering.__init__  sm        +f%%	)F$68IJJ 	r+   r  r  Nr   r   r   r   rI  r   start_positionsend_positionsr   rJ  rK  r  c                 h   ||n| j         j        }|                     |||||||	|
|	  	        }|d         }|                     |          }|                    dd          \  }}|                    d          }|                    d          }d}||t          |                                          dk    r|                    d          }t          |                                          dk    r|                    d          }|                    d          }|                    d|          }|                    d|          }t          |          } |||          } |||          }||z   dz  }|s||f|dd         z   }||f|z   n|S t          ||||j        |j        	          S )
a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        Nr  r   r   rE   rM   )r  rD   )r  start_logits
end_logitsr   rD  )rn   r  r#   r  splitr  rN   rO   clampr	   r   r   rD  )r   r   r   r   r   rI  r   r  r  r   rJ  rK  r   rd  r  r  r  
total_lossignored_indexr  
start_lossend_lossr  s                          r)   ru   z YosoForQuestionAnswering.forward  s   < &1%<kk$+B]))))%'/!5#  

 

 "!*11#)<<r<#:#: j#++B//''++

&=+D?''))**Q.."1"9"9""="==%%''((1,, - 5 5b 9 9(--a00M-33A}EEO)//=AAM']CCCH!,@@Jx
M::H$x/14J 	R"J/'!""+=F/9/EZMF**6Q+%!!/)
 
 
 	
r+   )NNNNNNNNNNN)r   r   r   r   r   r  r  r   r  r   r  r   rQ   r
  r  r   r   ru   r   r   s   @r)   r  r    s       
 
 
 
 
 +*+@+G+GHe+f+fgg&0$   -11515/3,0042604,0/3&*H
 H
EL)H
 !.H
 !.	H

 u|,H
 EL)H
  -H
 "%,/H
  -H
 $D>H
 'tnH
 d^H
 
u22	3H
 H
 H
  hgH
 H
 H
 H
 H
r+   r  )Kr   rk   pathlibr   typingr   r   r   rQ   torch.utils.checkpointr   torch.nnr   r	   r
   activationsr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   r   r   configuration_yosor   
get_loggerr   r   r  r  r4   r6   rB   rI   rc   autogradFunctionre   r   Moduler   r   r   r  r  r$  r(  r7  rR  rW  ra  rh  YOSO_START_DOCSTRINGr  rx  r  r  r  r  r  r  r%   r+   r)   <module>r     s            ) ) ) ) ) ) ) ) ) )            A A A A A A A A A A ! ! ! ! ! !                . - - - - - l l l l l l l l l l                + * * * * * 
	H	%	%,  1 1 1  C C C. . .&B B B B BU^, B B B>VB VB VB VB VB/ VB VB VBt9 9 9 9 9RY 9 9 9xJ J J J J	 J J J\    RY       BI   B    ry               	   :.
 .
 .
 .
 .
") .
 .
 .
d    ")   $    29   0! ! ! ! !bi ! ! !* * * * */ * * *6	 / d d h
 h
 h
 h
 h
# h
 h
	 h
V NPdeeK
 K
 K
 K
 K
) K
 K
 feK
\    RY   * / 
S
 S
 S
 S
 S
$7 S
 S
 
S
l H 
S
 S
 S
 S
 S
/ S
 S
 
S
l P 
M
 M
 M
 M
 M
!4 M
 M
 
M
` h 
[
 [
 [
 [
 [
2 [
 [
 
[
 [
 [
r+   