
    g3                     ~   d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZmZmZ ddlmZmZmZmZ ddlm Z   ej!        e"          Z#dZ$dZ% G d dej&                  Z' G d dej&                  Z( G d dej&                  Z)de(iZ* G d dej&                  Z+ G d dej&                  Z, G d dej&                  Z- G d dej&                  Z. G d  d!ej&                  Z/ G d" d#e          Z0d$Z1d%Z2 ed&e1           G d' d(e0                      Z3 G d) d*ej&                  Z4 G d+ d,ej&                  Z5 ed-e1           G d. d/e0                      Z6e G d0 d1e                      Z7 ed2e1           G d3 d4e0                      Z8dS )5zPyTorch Splinter model.    N)	dataclass)ListOptionalTupleUnion)nn)CrossEntropyLoss   )ACT2FN))BaseModelOutputWithPastAndCrossAttentionsModelOutputQuestionAnsweringModelOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )SplinterConfigztau/splinter-baser   c                        e Zd ZdZ fdZ	 	 	 	 	 ddeej                 deej                 deej                 deej                 d	ee	         d
e
fdZ xZS )SplinterEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 d   t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j        |j
        |j                  | _        t          j        |j        |j                  | _        t          j        |j                  | _        |                     dt%          j        |j                                      d          d           t+          |dd          | _        d S )	N)padding_idxepsposition_ids)r   F)
persistentposition_embedding_typeabsolute)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandgetattrr"   selfconfig	__class__s     j/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/splinter/modeling_splinter.pyr%   zSplinterEmbeddings.__init__+   s   !|F,=v?Q_e_rsss#%<0NPVPb#c#c %'\&2H&J\%]%]" f&8f>STTTz&"<== 	EL)GHHOOPWXXej 	 	
 	
 	
 (/v7PR\']']$$$    Nr   	input_idstoken_type_idsr   inputs_embedspast_key_values_lengthreturnc                    ||                                 }n|                                 d d         }|d         }|| j        d d |||z   f         }|+t          j        |t          j        | j        j                  }||                     |          }|                     |          }||z   }	| j        dk    r| 	                    |          }
|	|
z  }	| 
                    |	          }	|                     |	          }	|	S )Nr    r   dtypedevicer#   )sizer   r5   zeroslongrG   r*   r.   r"   r,   r/   r3   )r:   r?   r@   r   rA   rB   input_shape
seq_lengthr.   
embeddingsr,   s              r=   forwardzSplinterEmbeddings.forward<   s     #..**KK',,..ss3K ^
,QQQ0FVlIl0l-lmL!"[EJtO`OghhhN  00;;M $ : :> J J"%::
':55"&":":<"H"H--J^^J//
\\*--
r>   )NNNNr   )__name__
__module____qualname____doc__r%   r   r5   
LongTensorFloatTensorintr   rN   __classcell__r<   s   @r=   r   r   (   s        QQ^ ^ ^ ^ ^& 1559375901 E,- !!12 u/0	
   12 !) 
       r>   r   c                   ,    e Zd Zd fd	Zdej        dej        fdZ	 	 	 	 	 	 ddej        deej                 d	eej                 d
eej                 deej                 dee	e	ej                                   dee
         de	ej                 fdZ xZS )SplinterSelfAttentionNc                 D   t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          |j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t          j
        |j        | j                  | _        t          j
        |j        | j                  | _        t          j
        |j        | j                  | _        t          j        |j                  | _        |pt#          |dd          | _        | j        dk    s| j        d	k    r6|j        | _        t          j        d
|j        z  dz
  | j                  | _        |j        | _        d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r"   r#   relative_keyrelative_key_query   r   )r$   r%   r(   num_attention_headshasattr
ValueErrorrU   attention_head_sizeall_head_sizer   Linearquerykeyvaluer1   attention_probs_dropout_probr3   r8   r"   r+   r&   distance_embedding
is_decoderr:   r;   r"   r<   s      r=   r%   zSplinterSelfAttention.__init__`   s    ::a??PVXhHiHi?8F$6 8 8 48 8 8  
 $*#= #&v'9F<V'V#W#W !58PPYv143EFF
9V/1CDDYv143EFF
z&"EFF'> (
'-zC
 C
$ '>99T=Y]q=q=q+1+ID(&(l1v7U3UXY3Y[_[s&t&tD# +r>   xrC   c                     |                                 d d         | j        | j        fz   }|                    |          }|                    dddd          S )Nr    r   r_   r   r
   )rH   r`   rc   viewpermute)r:   rm   new_x_shapes      r=   transpose_for_scoresz*SplinterSelfAttention.transpose_for_scoresz   sP    ffhhssmt'?AY&ZZFF;yyAq!$$$r>   Fhidden_statesattention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsc                 ^   |                      |          }|d u}	|	r||d         }
|d         }|}n4|	rS|                     |                     |                    }
|                     |                     |                    }|}n||                     |                     |                    }
|                     |                     |                    }t	          j        |d         |
gd          }
t	          j        |d         |gd          }nP|                     |                     |                    }
|                     |                     |                    }|                     |          }|d u}| j        r|
|f}t	          j        ||
                    dd                    }| j	        dk    s| j	        dk    rt|j
        d         |
j
        d         }}|r>t	          j        |dz
  t          j        |j        	                              dd          }n:t	          j        |t          j        |j        	                              dd          }t	          j        |t          j        |j        	                              dd          }||z
  }|                     || j        z   dz
            }|                    |j        
          }| j	        dk    rt	          j        d||          }||z   }n?| j	        dk    r4t	          j        d||          }t	          j        d|
|          }||z   |z   }|t+          j        | j                  z  }|||z   }t0          j                            |d          }|                     |          }|||z  }t	          j        ||          }|                    dddd                                          }|                                d d         | j        fz   }|                    |          }|r||fn|f}| j        r||fz   }|S )Nr   r   r_   dimr    r]   r^   rE   )rF   zbhld,lrd->bhlrzbhrd,lrd->bhlrr
   ) rf   rr   rg   rh   r5   catrk   matmul	transposer"   shapetensorrJ   rG   ro   r6   rj   r+   torF   einsummathsqrtrc   r   
functionalsoftmaxr3   rp   
contiguousrH   rd   )r:   rs   rt   ru   rv   rw   rx   ry   mixed_query_layeris_cross_attention	key_layervalue_layerquery_layer	use_cacheattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                               r=   rN   zSplinterSelfAttention.forward   sZ    !JJ}55
 3$> 	O."<&q)I(+K3NN 	O11$((;P2Q2QRRI33DJJ?T4U4UVVK3NN'11$((=2I2IJJI33DJJ}4M4MNNK	>!#4i"@aHHHI)^A%6$D!LLLKK11$((=2I2IJJI33DJJ}4M4MNNK//0ABB"$.	? 	6 (5N !<Y5H5HR5P5PQQ'>99T=Y]q=q=q'2'8';Y_Q=O*L w!&j1nEJWdWk!l!l!l!q!q" " "'l%*UbUi!j!j!j!o!oprtu!v!v"\*EJ}OcdddiijkmoppN%6H#'#:#:8dFb;bef;f#g#g #7#:#:AR#:#S#S +~==+0<8H+Wk+l+l(#36N#N  -1EEE16>NP[]q1r1r./4|<LiYm/n/n,#36T#TWs#s +di8P.Q.QQ%/.@ -//0@b/II ,,77  -	9O_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S%**+BCC6G]=/22mM]? 	2 11Gr>   NNNNNNF)rO   rP   rQ   r%   r5   Tensorrr   r   rT   r   boolrN   rV   rW   s   @r=   rY   rY   _   s.       , , , , , ,4%el %u| % % % % 7;15=A>BDH,1c c|c !!23c E-.	c
  ((9:c !)): ;c !uU->'?!@Ac $D>c 
u|	c c c c c c c cr>   rY   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )SplinterSelfOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j	                  | _
        d S Nr   )r$   r%   r   re   r(   denser/   r0   r1   r2   r3   r9   s     r=   r%   zSplinterSelfOutput.__init__   sf    Yv163EFF
f&8f>STTTz&"<==r>   rs   input_tensorrC   c                     |                      |          }|                     |          }|                     ||z             }|S r   r   r3   r/   r:   rs   r   s      r=   rN   zSplinterSelfOutput.forward   @    

=11]33}|'CDDr>   rO   rP   rQ   r%   r5   r   rN   rV   rW   s   @r=   r   r      i        > > > > >U\  RWR^        r>   r   eagerc                       e Zd Zd fd	Zd Z	 	 	 	 	 	 ddej        deej                 deej                 deej                 d	eej                 d
ee	e	ej                                   dee
         de	ej                 fdZ xZS )SplinterAttentionNc                     t                                                       t          |j                 ||          | _        t          |          | _        t                      | _        d S )Nr"   )	r$   r%   SPLINTER_SELF_ATTENTION_CLASSES_attn_implementationr:   r   outputsetpruned_headsrl   s      r=   r%   zSplinterAttention.__init__   s`    3F4OP,C
 
 
	 )00EEr>   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   r{   )lenr   r:   r`   rc   r   r   rf   rg   rh   r   r   rd   union)r:   headsindexs      r=   prune_headszSplinterAttention.prune_heads  s    u::??F7490$)2OQUQb
 
u
 -TY_eDD	*49=%@@	,TY_eDD	.t{/@%QOOO )-	(EE

(R	%"&)"?$)B_"_	 -33E::r>   Frs   rt   ru   rv   rw   rx   ry   rC   c           	          |                      |||||||          }|                     |d         |          }	|	f|dd          z   }
|
S )Nr   r   )r:   r   )r:   rs   rt   ru   rv   rw   rx   ry   self_outputsattention_outputr   s              r=   rN   zSplinterAttention.forward  sa     yy!"
 
  ;;|AFF#%QRR(88r>   r   r   )rO   rP   rQ   r%   r   r5   r   r   rT   r   r   rN   rV   rW   s   @r=   r   r      s       " " " " " "; ; ;* 7;15=A>BDH,1 | !!23 E-.	
  ((9: !)): ; !uU->'?!@A $D> 
u|	       r>   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )SplinterIntermediatec                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r   )r$   r%   r   re   r(   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr9   s     r=   r%   zSplinterIntermediate.__init__/  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r>   rs   rC   c                 Z    |                      |          }|                     |          }|S r   )r   r   )r:   rs   s     r=   rN   zSplinterIntermediate.forward7  s,    

=1100??r>   r   rW   s   @r=   r   r   .  s^        9 9 9 9 9U\ el        r>   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )SplinterOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j	        |j
                  | _        d S r   )r$   r%   r   re   r   r(   r   r/   r0   r1   r2   r3   r9   s     r=   r%   zSplinterOutput.__init__?  sf    Yv79KLL
f&8f>STTTz&"<==r>   rs   r   rC   c                     |                      |          }|                     |          }|                     ||z             }|S r   r   r   s      r=   rN   zSplinterOutput.forwardE  r   r>   r   rW   s   @r=   r   r   >  r   r>   r   c                       e Zd Z fdZ	 	 	 	 	 	 ddej        deej                 deej                 deej                 deej                 d	eeeej                                   d
ee	         deej                 fdZ
d Z xZS )SplinterLayerc                    t                                                       |j        | _        d| _        t	          |          | _        |j        | _        |j        | _        | j        r/| j        st          |  d          t	          |d          | _	        t          |          | _        t          |          | _        d S )Nr   z> should be used as a decoder model if cross attention is addedr#   r   )r$   r%   chunk_size_feed_forwardseq_len_dimr   	attentionrk   add_cross_attentionrb   crossattentionr   intermediater   r   r9   s     r=   r%   zSplinterLayer.__init__N  s    '-'E$*622 +#)#= # 	`? j D!h!h!hiii"3FT^"_"_"_D088$V,,r>   NFrs   rt   ru   rv   rw   rx   ry   rC   c           	         |
|d d         nd }|                      |||||          }	|	d         }
| j        r|	dd         }|	d         }n
|	dd          }d }| j        rp|nt          | d          st          d|  d          |
|d	d          nd }|                     |
||||||          }|d         }
||dd         z   }|d         }||z   }t          | j        | j        | j        |
          }|f|z   }| j        r||fz   }|S )
Nr_   )ry   rx   r   r   r    r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r}   )	r   rk   ra   rb   r   r   feed_forward_chunkr   r   )r:   rs   rt   ru   rv   rw   rx   ry   self_attn_past_key_valueself_attention_outputsr   r   present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputslayer_outputs                    r=   rN   zSplinterLayer.forward\  s    :H9S>"1"#5#5Y] !%/3 "0 "
 "
 2!4 ? 	1,QrT2G 6r :,QRR0G'+$? 	Q4@4!122  Dd D D D   @N?Yrss(;(;_c%&*&9&9 %&)!' '#  7q9 7" ==G ,C2+F( 14P P0#T%A4CSUe
 
  /G+ ? 	5!2 44Gr>   c                 \    |                      |          }|                     ||          }|S r   )r   r   )r:   r   intermediate_outputr   s       r=   r   z SplinterLayer.feed_forward_chunk  s2    "//0@AA{{#68HIIr>   r   )rO   rP   rQ   r%   r5   r   r   rT   r   r   rN   r   rV   rW   s   @r=   r   r   M  s       - - - - -" 7;15=A>BDH,1? ?|? !!23? E-.	?
  ((9:? !)): ;? !uU->'?!@A? $D>? 
u|	? ? ? ?B      r>   r   c                   L    e Zd Z fdZ	 	 	 	 	 	 	 	 	 ddej        deej                 deej                 deej                 d	eej                 d
eeeej                                   dee	         dee	         dee	         dee	         de
eej                 ef         fdZ xZS )SplinterEncoderc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S  )r   ).0_r;   s     r=   
<listcomp>z,SplinterEncoder.__init__.<locals>.<listcomp>  s!    #c#c#caM&$9$9#c#c#cr>   F)	r$   r%   r;   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr9   s    `r=   r%   zSplinterEncoder.__init__  s`    ]#c#c#c#c5IaCbCb#c#c#cdd
&+###r>   NFTrs   rt   ru   rv   rw   past_key_valuesr   ry   output_hidden_statesreturn_dictrC   c                    |	rdnd }|rdnd }|r| j         j        rdnd }| j        r%| j        r|rt                              d           d}|rdnd }t          | j                  D ]\  }}|	r||fz   }|||         nd }|||         nd }| j        r)| j        r"|                     |j	        |||||||          }n ||||||||          }|d         }|r||d         fz  }|r$||d         fz   }| j         j        r||d         fz   }|	r||fz   }|
st          d |||||fD                       S t          |||||	          S )
Nr   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r    r   r_   c              3      K   | ]}||V  	d S r   r   )r   vs     r=   	<genexpr>z*SplinterEncoder.forward.<locals>.<genexpr>  s4       
 
 =  !===
 
r>   last_hidden_stater   rs   
attentionscross_attentions)r;   r   r   trainingloggerwarning_once	enumerater   _gradient_checkpointing_func__call__tupler   )r:   rs   rt   ru   rv   rw   r   r   ry   r   r   all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_maskrx   layer_outputss                       r=   rN   zSplinterEncoder.forward  sB    #7@BBD$5?bb4%6d4;;Zdrr`d& 	"4= 	" "##p   "	#,6RR$(44 #	V #	VOA|# I$58H$H!.7.CillO3B3N_Q//TXN* t}  $ A A )!"#)*"%	! 	! !-!"#)*"%! ! *!,M ;"}R'8&::"  V&9]1=M<O&O#;2 V+?=QRCSBU+U( 	E 1]4D D 	 
 
 "&%'(
 
 
 
 
 
 9+.+*1
 
 
 	
r>   )	NNNNNNFFT)rO   rP   rQ   r%   r5   r   r   rT   r   r   r   r   rN   rV   rW   s   @r=   r   r     sD       , , , , , 7;15=A>BEI$(,1/4&*S
 S
|S
 !!23S
 E-.	S

  ((9:S
 !)): ;S
 "%e.?(@"ABS
 D>S
 $D>S
 'tnS
 d^S
 
uU\"$MM	NS
 S
 S
 S
 S
 S
 S
 S
r>   r   c                   $    e Zd ZdZeZdZdZd ZdS )SplinterPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    splinterTc                    t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j        	                                 dS dS t          |t          j
                  r_|j        j                            d| j        j                   |j        +|j        j        |j                 	                                 dS dS t          |t          j                  r?|j        j        	                                 |j        j                            d           dS dS )zInitialize the weightsg        )meanstdNg      ?)r   r   re   weightdatanormal_r;   initializer_rangebiaszero_r&   r   r/   fill_)r:   modules     r=   _init_weightsz%SplinterPreTrainedModel._init_weights  s)   fbi(( 	* M&&CT[5R&SSS{& &&((((( '&-- 	*M&&CT[5R&SSS!-"6#56<<>>>>> .--- 	*K""$$$M$$S)))))	* 	*r>   N)	rO   rP   rQ   rR   r   config_classbase_model_prefixsupports_gradient_checkpointingr  r   r>   r=   r
  r
    s@         
 "L"&*#* * * * *r>   r
  aK  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`SplinterConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a/
  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `{0}`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `{0}`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `{0}`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zbThe bare Splinter Model transformer outputting raw hidden-states without any specific head on top.c            !           e Zd ZdZ fdZd Zd Zd Z ee	
                    d                     eeee          	 	 	 	 	 	 	 	 	 	 	 	 	 dd	eej                 d
eej                 deej                 deej                 deej                 deej                 deej                 deej                 deeej                          dee         dee         dee         dee         deeef         fd                        Z xZS )SplinterModela*  
    The model is an encoder (with only self-attention) following the architecture described in [Attention is all you
    need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
    Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
    c                     t                                          |           || _        t          |          | _        t          |          | _        |                                  d S r   )r$   r%   r;   r   rM   r   encoder	post_initr9   s     r=   r%   zSplinterModel.__init__e  sX       ,V44&v.. 	r>   c                     | j         j        S r   rM   r*   )r:   s    r=   get_input_embeddingsz"SplinterModel.get_input_embeddingso  s    ..r>   c                     || j         _        d S r   r!  )r:   rh   s     r=   set_input_embeddingsz"SplinterModel.set_input_embeddingsr  s    */'''r>   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r   r   r   )r:   heads_to_pruner   r   s       r=   _prune_headszSplinterModel._prune_headsu  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr>   batch_size, sequence_length
checkpointoutput_typer  Nr?   rt   r@   r   ru   rA   rv   rw   r   r   ry   r   r   rC   c                 r   ||n| j         j        }||n| j         j        }||n| j         j        }| j         j        r|
|
n| j         j        }
nd}
||t          d          |+|                     ||           |                                }n.||                                dd         }nt          d          |\  }}||j	        n|j	        }|	|	d         d         j
        d         nd}|t          j        |||z   f|          }|!t          j        |t          j        |	          }|                     ||          }| j         j        rL|J|                                \  }}}||f}|t          j        ||          }|                     |          }nd}|                     || j         j                  }|                     |||||
          }|                     ||||||	|
|||
  
        }|d         }|s|f|dd         z   S t+          ||j        |j        |j        |j                  S )a  
        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        NFzDYou cannot specify both input_ids and inputs_embeds at the same timer    z5You have to specify either input_ids or inputs_embedsr   r_   )rG   rE   )r?   r   r@   rA   rB   )	rt   ru   rv   rw   r   r   ry   r   r   r   r   )r;   ry   r   use_return_dictrk   r   rb   %warn_if_padding_and_no_attention_maskrH   rG   r   r5   onesrI   rJ   get_extended_attention_maskinvert_attention_maskget_head_maskr   rM   r  r   r   rs   r   r   )r:   r?   rt   r@   r   ru   rA   rv   rw   r   r   ry   r   r   rK   
batch_sizerL   rG   rB   extended_attention_maskencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskembedding_outputencoder_outputssequence_outputs                               r=   rN   zSplinterModel.forward}  s   R 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B];! 	%.%:		@UIII ]%>cddd"66y.QQQ#..**KK&',,..ss3KKTUUU!,
J%.%:!!@T DSC^!3A!6!<Q!?!?de!"Z*jCY6Y)ZdjkkkN!"[EJvVVVN 150P0PQ_al0m0m ;! 	3&;&G=R=W=W=Y=Y: 7$68O#P %-).4HQW)X)X)X&.2.H.HI_.`.`++.2+ &&y$+2OPP	??%)'#9 + 
 
 ,,2"7#B+/!5# ' 
 
 *!, 	<#%(;;;8-+;)7&1,=
 
 
 	
r>   )NNNNNNNNNNNNN)rO   rP   rQ   rR   r%   r"  r$  r(  r   SPLINTER_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   r5   r   r   rT   r   r   r   rN   rV   rW   s   @r=   r  r  Z  s       
     / / /0 0 0C C C +*+D+K+KLi+j+jkk&=$   -11515/3,0048<9==A$(,0/3&*w
 w
EL)w
 !.w
 !.	w

 u|,w
 EL)w
  -w
  (5w
 !) 6w
 "$u'8"9:w
 D>w
 $D>w
 'tnw
 d^w
 
u??	@w
 w
 w
  lkw
 w
 w
 w
 w
r>   r  c                   D     e Zd Zd fd	Zdej        dej        fdZ xZS )SplinterFullyConnectedLayergeluc                    t                                                       || _        || _        t	          j        | j        | j                  | _        t          |         | _        t	          j	        | j                  | _	        d S r   )
r$   r%   	input_dim
output_dimr   re   r   r   act_fnr/   )r:   rE  rF  r   r<   s       r=   r%   z$SplinterFullyConnectedLayer.__init__  sa    "$Yt~t??
Z(do66r>   inputsrC   c                     |                      |          }|                     |          }|                     |          }|S r   )r   rG  r/   )r:   rH  rs   s      r=   rN   z#SplinterFullyConnectedLayer.forward  s;    

6**M22}55r>   )rC  r   rW   s   @r=   rB  rB    sc        7 7 7 7 7 7el u|        r>   rB  c                   (     e Zd ZdZ fdZd Z xZS )QuestionAwareSpanSelectionHeadzf
    Implementation of Question-Aware Span Selection (QASS) head, described in Splinter's paper:

    c                    t                                                       t          |j        |j                  | _        t          |j        |j                  | _        t          |j        |j                  | _        t          |j        |j                  | _        t          j	        |j        |j        d          | _
        t          j	        |j        |j        d          | _        d S )NF)r  )r$   r%   rB  r(   query_start_transformquery_end_transformstart_transformend_transformr   re   start_classifierend_classifierr9   s     r=   r%   z'QuestionAwareSpanSelectionHead.__init__  s    %@ASU[Ug%h%h"#>v?QSYSe#f#f :6;MvOabb89KVM_`` "	&*<f>PW\ ] ] ] i(:F<NUZ[[[r>   c                 h   |                                 \  }}}|                    d                              dd|          }t          j        |d|          }|                     |          }|                     |          }|                     |          }	|                     |          }
| 	                    |          }|	
                    ddd          }	t          j        ||	          }|                     |          }|

                    ddd          }
t          j        ||
          }||fS )Nr    r   )r|   r   r   r_   )rH   	unsqueezerepeatr5   gatherrM  rN  rO  rP  rQ  rp   r   rR  )r:   rH  	positionsr   r|   r   gathered_repsquery_start_repsquery_end_reps
start_repsend_repsrs   start_logits
end_logitss                 r=   rN   z&QuestionAwareSpanSelectionHead.forward   s   KKMM	1c##B''..q!S99V%@@@55mDD11-@@))&11
%%f----.>??''1a00
|M:>>++N;;##Aq!,,\-::
Z''r>   )rO   rP   rQ   rR   r%   rN   rV   rW   s   @r=   rK  rK    sV         
	\ 	\ 	\ 	\ 	\( ( ( ( ( ( (r>   rK  z
    Splinter Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd Z fdZ ee                    d                     eee	e
          	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 deej                 dee         dee         dee         deej                 deee	f         fd                        Z xZS )SplinterForQuestionAnsweringc                     t                                          |           t          |          | _        t	          |          | _        |j        | _        |                                  d S r   r$   r%   r  r  rK  splinter_qassquestion_token_idr  r9   s     r=   r%   z%SplinterForQuestionAnswering.__init__=  ]       %f--;FCC!'!9 	r>   r)  r*  Nr?   rt   r@   r   ru   rA   start_positionsend_positionsry   r   r   question_positionsrC   c                    ||n| j         j        }d}||At          j        t          j        || j                                                  d          }n?t          j        |                    d          t          j	        |j
        |j                  }|                    d          }d}|                     |||||||	|
|	  	        }|d         }|                     ||          \  }}|r*|                    d	          |                    d	          }}|N|d	|z
  t          j        |j                  j        z  z   }|d	|z
  t          j        |j                  j        z  z   }d}||t'          |                                          d	k    r|                    d          }t'          |                                          d	k    r|                    d          }|                    d	          }|                    d|           |                    d|           t+          |
          } |||          } |||          }||z   dz  }|s||f|d	d         z   }||f|z   n|S t-          ||||j        |j                  S )a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        question_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            The positions of all question tokens. If given, start_logits and end_logits will be of shape `(batch_size,
            num_questions, sequence_length)`. If None, the first question token in each sequence in the batch will be
            the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size,
            sequence_length)`.
        NFr    r{   r   )rF   layoutrG   Trt   r@   r   ru   rA   ry   r   r   r   ignore_indexr_   lossr]  r^  rs   r   )r;   r.  r5   argmaxeqrd  rU   rI   rH   rJ   rj  rG   rT  r  rc  squeezefinforF   minr   clamp_r	   r   rs   r   )r:   r?   rt   r@   r   ru   rA   rf  rg  ry   r   r   rh  question_positions_were_none"question_position_for_each_exampler   r<  r]  r^  
total_lossignored_indexloss_fct
start_lossend_lossr   s                            r=   rN   z$SplinterForQuestionAnswering.forwardG  s   H &1%<kk$+B]',$%$5:\Xi)?@@EEGGR6 6 622 6;[!&&q))MDXanau6 6 62 "D!M!Mb!Q!Q+/(--))%'/!5#   

 

 "!*#'#5#5oGY#Z#Z j' 	V'3';';A'>'>
@R@RST@U@U*L%'1~+=\M_A`A`Ad*ddL#q>'9U[IY=Z=Z=^&^^J
&=+D?''))**Q.."1"9"9""="==%%''((1,, - 5 5b 9 9(--a00M""1m444  M222']CCCH!,@@Jx
M::H$x/14J 	R"J/'!""+=F/9/EZMF**6Q+%!!/)
 
 
 	
r>   NNNNNNNNNNNN)rO   rP   rQ   r%   r   r=  r>  r   r?  r   r@  r   r5   r   rS   r   r   r   rN   rV   rW   s   @r=   r`  r`  5  s            +*+D+K+KLi+j+jkk&0$   -11515/3,0046:48,0/3&*9=^
 ^
EL)^
 !.^
 !.	^

 u|,^
 EL)^
  -^
 "%"23^
   01^
 $D>^
 'tn^
 d^^
 %U%56^
 
u22	3^
 ^
 ^
  lk^
 ^
 ^
 ^
 ^
r>   r`  c                       e Zd ZU dZdZeej                 ed<   dZ	ej        ed<   dZ
ej        ed<   dZeeej                          ed<   dZeeej                          ed<   dS )SplinterForPreTrainingOutputa  
    Class for outputs of Splinter as a span selection model.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when start and end positions are provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
            Span-start scores (before SoftMax).
        end_logits (`torch.FloatTensor` of shape `(batch_size, num_questions, sequence_length)`):
            Span-end scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nro  r]  r^  rs   r   )rO   rP   rQ   rR   ro  r   r5   rT   __annotations__r]  r^  rs   r   r   r   r>   r=   r  r    s          . )-D(5$
%,,,&*L%#***$(J!(((8<M8E%"345<<<59Ju01299999r>   r  z
    Splinter Model for the recurring span selection task as done during the pretraining. The difference to the QA task
    is that we do not have a question, but multiple question tokens that replace the occurrences of recurring spans
    instead.
    c                       e Zd Z fdZ ee                    d                    	 	 	 	 	 	 	 	 	 	 	 	 ddeej	                 deej	                 deej	                 deej	                 deej	                 d	eej	                 d
eej
                 deej
                 dee         dee         dee         deej
                 deeef         fd            Zdej	        dej	        fdZ xZS )SplinterForPreTrainingc                     t                                          |           t          |          | _        t	          |          | _        |j        | _        |                                  d S r   rb  r9   s     r=   r%   zSplinterForPreTraining.__init__  re  r>   z*batch_size, num_questions, sequence_lengthNr?   rt   r@   r   ru   rA   rf  rg  ry   r   r   rh  rC   c                    ||n| j         j        }|||t          d          ||t          d          ||                     |          }|                     |||||||	|
|	  	        }|d         }|                                \  }}}|                     ||          \  }}|                    d          }|x|                    d                              |||          }|d|z
  t          j
        |j                  j        z  z   }|d|z
  t          j
        |j                  j        z  z   }d}|||                    dt          d|dz
                       |                    dt          d|dz
                       t          | j         j                  } ||                    ||z  |          |                    ||z                      } ||                    ||z  |          |                    ||z                      }||z   dz  }|s||f|dd         z   }||f|z   n|S t%          ||||j        |j        	          S )
a  
        start_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        question_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*):
            The positions of all question tokens. If given, start_logits and end_logits will be of shape `(batch_size,
            num_questions, sequence_length)`. If None, the first question token in each sequence in the batch will be
            the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size,
            sequence_length)`.
        NzCquestion_positions must be specified in order to calculate the lossz>question_positions must be specified when input_embeds is usedrk  r   r   rl  r_   rn  )r;   r.  	TypeError_prepare_question_positionsr  rH   rc  rT  r7   r5   rs  rF   rt  ru  maxr	   r)   ro   r  rs   r   )r:   r?   rt   r@   r   ru   rA   rf  rg  ry   r   r   rh  r   r<  r4  sequence_lengthr|   r]  r^  num_questions attention_mask_for_each_questionrx  rz  r{  r|  r   s                              r=   rN   zSplinterForPreTraining.forward  s   B &1%<kk$+B]%/*E-Jcabbb'I,=\]]]'!%!A!A)!L!L--))%'/!5#   

 

 "!*+:+?+?+A+A(
OS#'#5#5oGY#Z#Z j*//22%/=/G/G/J/J/Q/QM?0 0, (1/O+OSXS^_k_qSrSrSv*vvL#q+K'Ku{[e[kOlOlOp&ppJ
&=+D""1c!_q-@&A&ABBB  C?Q+>$?$?@@@ (T[5MNNNH!!!*}"<oNN$$Z-%?@@ J  x
] :OLL"":#=>> H %x/14J 	R"J/'!""+=F/9/EZMF**6Q+%!!/)
 
 
 	
r>   c                 r   t          j        || j        j        k              \  }}t          j        |          }t          j        |                    d          |                                f| j        j        t           j	        |j
                  }t          j        d |D                       }||||f<   |S )Nr   rE   c                 6    g | ]}t          j        |          S r   )r5   r6   )r   ns     r=   r   zFSplinterForPreTraining._prepare_question_positions.<locals>.<listcomp>Q  s     AAAa%,q//AAAr>   )r5   wherer;   rd  bincountfullrH   r  r)   rJ   rG   r~   )r:   r?   rowsflat_positionsr  rW  colss          r=   r  z2SplinterForPreTraining._prepare_question_positionsH  s    ${98U+UVVnt,,J^^A 1 1 3 34K$*#	
 
 
	 yAA=AAABB .	$*r>   r}  )rO   rP   rQ   r%   r   r=  r>  r   r5   r   rS   r   r   r   r  rN   r  rV   rW   s   @r=   r  r    s            +*!(()UVV 
 -11515/3,0046:48,0/3&*9=b
 b
EL)b
 !.b
 !.	b

 u|,b
 EL)b
  -b
 "%"23b
   01b
 $D>b
 'tnb
 d^b
 %U%56b
 
u22	3b
 b
 b
 b
HU\ el        r>   r  )9rR   r   dataclassesr   typingr   r   r   r   r5   torch.utils.checkpointr   torch.nnr	   activationsr   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   configuration_splinterr   
get_loggerrO   r   r?  r@  Moduler   rY   r   r   r   r   r   r   r   r
  SPLINTER_START_DOCSTRINGr=  r  rB  rK  r`  r  r  r   r>   r=   <module>r     s      ! ! ! ! ! ! / / / / / / / / / / / /            % % % % % % ! ! ! ! ! ! t t t t t t t t t t - - - - - - l l l l l l l l l l u u u u u u u u u u u u 2 2 2 2 2 2 
	H	%	%) "3 3 3 3 3 3 3 3nC C C C CBI C C CN        "# 0 0 0 0 0	 0 0 0h    29        RY   S S S S SBI S S SnZ
 Z
 Z
 Z
 Z
bi Z
 Z
 Z
z* * * * *o * * *8	 / d h \
 \
 \
 \
 \
+ \
 \
	 \
~    ")   $#( #( #( #( #(RY #( #( #(L   o
 o
 o
 o
 o
#: o
 o
 o
d : : : : :; : : :> 
  } } } } }4 } } } } }r>   