
"""PyTorch LED model."""

import math
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    Seq2SeqQuestionAnsweringModelOutput,
    Seq2SeqSequenceClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_end_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_led import LEDConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "allenai/led-base-16384"
_CONFIG_FOR_DOC = "LEDConfig"


def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.
    """
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
    shifted_input_ids[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("config.pad_token_id has to be defined.")
    # replace possible -100 values in labels by `pad_token_id`
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    return shifted_input_ids


def _prepare_4d_attention_mask_inverted(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len

    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
    inverted_mask = 1.0 - expanded_mask
    expanded_attention_mask = inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)

    # make sure that global_attn_mask is positive
    expanded_attention_mask = expanded_attention_mask * inverted_mask

    return expanded_attention_mask

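# Illustrative usage sketch (added for clarity; not part of the original module, tensor values are made up):
#
#   labels = torch.tensor([[45, 78, -100, -100]])
#   decoder_input_ids = shift_tokens_right(labels, pad_token_id=1, decoder_start_token_id=2)
#   # -> tensor([[ 2, 45, 78,  1]])  the sequence is shifted right, the start token is prepended,
#   #    and any remaining -100 (ignored-label marker) is replaced by pad_token_id.
#
#   mask = torch.tensor([[1, 1, 0]])
#   _prepare_4d_attention_mask_inverted(mask, torch.float32).shape
#   # -> torch.Size([1, 1, 3, 3]); padded positions hold the most negative representable float.
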
class LEDLearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        super().__init__(num_embeddings, embedding_dim)

    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
        """`input_ids_shape` is expected to be [bsz x seqlen]."""
        bsz, seq_len = input_ids_shape[:2]
        positions = torch.arange(
            past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
        )
        return super().forward(positions)

class LEDEncoderSelfAttention(nn.Module):
    def __init__(self, config, layer_id):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )
        self.num_heads = config.num_attention_heads
        self.head_dim = int(config.hidden_size / config.num_attention_heads)
        self.embed_dim = config.hidden_size

        self.query = nn.Linear(config.hidden_size, self.embed_dim)
        self.key = nn.Linear(config.hidden_size, self.embed_dim)
        self.value = nn.Linear(config.hidden_size, self.embed_dim)

        # separate projection layers for tokens with global attention
        self.query_global = nn.Linear(config.hidden_size, self.embed_dim)
        self.key_global = nn.Linear(config.hidden_size, self.embed_dim)
        self.value_global = nn.Linear(config.hidden_size, self.embed_dim)

        self.dropout = config.attention_probs_dropout_prob

        self.layer_id = layer_id
        attention_window = config.attention_window[self.layer_id]
        assert (
            attention_window % 2 == 0
        ), f"`attention_window` for layer {self.layer_id} has to be an even value. Given {attention_window}"
        assert (
            attention_window > 0
        ), f"`attention_window` for layer {self.layer_id} has to be positive. Given {attention_window}"

        self.one_sided_attn_window_size = attention_window // 2
        self.config = config

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        layer_head_mask=None,
        is_index_masked=None,
        is_index_global_attn=None,
        is_global_attn=None,
        output_attentions=False,
    ):
        """
        [`LEDEncoderSelfAttention`] expects *len(hidden_states)* to be multiple of *attention_window*. Padding to
        *attention_window* happens in [`LEDEncoderModel.forward`] to avoid redoing the padding on each layer.

        The *attention_mask* is changed in [`LEDEncoderModel.forward`] from 0, 1, 2 to:

            - -10000: no attention
            - 0: local attention
            - +10000: global attention
        r   r   z&hidden_states should have embed_dim = z
, but has N)r1   rL   z$local_attn_probs should be of size (z, z), but is of size )query_vectorskey_vectorsmax_num_global_attn_indicesis_index_global_attn_nonzero"is_local_index_global_attn_nonzero%is_local_index_no_global_attn_nonzeror#   dimrz   r.   /Head mask for a single layer should be of size 	, but is         ptraining)value_vectors
attn_probsru   rv   rw   zUnexpected size)hidden_statesru   layer_head_maskrw   rv   rx   is_index_masked)$	transposerf   rg   rh   r1   rd   mathsqrtrc   viewrb    _sliding_chunks_query_key_matmulrp   type_asr4   r6   r7   r.   r8   new_oneslist_get_global_attn_indices"_concat_with_global_key_attn_probscatr   
functionalsoftmaxfloat32rm   r   (_compute_attn_output_with_global_indices'_sliding_chunks_matmul_attn_probs_valuereshape
contiguous'_compute_global_attn_output_from_hiddenlen)rG   r   attention_maskr   r   is_index_global_attnis_global_attnoutput_attentionsrs   rt   r   rR   
batch_sizerd   attn_scores#remove_from_windowed_attention_mask
float_maskdiagonal_maskru   rv   rw   rx   global_key_attn_scoresr   attn_outputglobal_attn_outputglobal_attn_probsnonzero_global_attn_outputoutputss                                r*   rQ   zLEDEncoderSelfAttention.forward   s   ( &//155 

=11hh}--

=11)6););)=)=&Y'''YDNYYiYY ('' 	4=111%**7JPTP]^^hhijlmnn!&&w
DNDMZZddefhijj;;;(G
 

 0>/BAAAqqq$PTDT.U+ 9@@OO[[/]=P1Q1Q1U
 

 ==Z__%6%677TEd
 

 	}$K$$&&''N+a/!3	,
 
 
 
`: ` ` ` `DN ` `/!3a7` `KVK[K[K]K]` `
 
 
  	' --.BCC+,25 &*%L%L+',G-I3U6[ &M & &"  )%;[$IrRRRK ']**Ru} + 
 

 &"'')).   u$.ARuu]l]q]q]s]suu   )--aB::ZGJ &z?111aaatCS3TVYZZ
''44
  ]**:PTP]*^^
%**7JPTP]^^hhijlmnn  	GG+%,G-I3U H  KK FFM4+J K !!j'4>4=%YYYY[lYYY!++Aq1199':yYYddff  	9484`4`+,G /3U-I6[ / 5a 5 51 1 *<215qqq:\]^:__*&
 ?Y>]>]6q9::B? ?K4TTrT:; 89J34((A..0 	%
}$G2@dEVdw+---]ddr,   c                     t           j                            | |          }  | j        g |                                 dd         |                     d          |                     d          R  } | S )z)pads rows and then flips rows and columnsNr#   )r   r   padr   r1   )hidden_states_paddedpaddings     r*    _pad_and_transpose_last_two_dimsz8LEDEncoderSelfAttention._pad_and_transpose_last_two_dims8  s      "}00 ' 
  
  938  
!&&(("- 
/C/H/H/L/L 
NbNgNghjNkNk 
  
  
 $#r,   c                 2   |                                  \  }}}}t          j                            | d|dz   f          } |                     ||d          } | ddddd| f         } |                     |||||z             } | ddddddddf         } | S )aY  
    @staticmethod
    def _pad_and_diagonalize(chunked_hidden_states):
        """
        shift every row 1 step right, converting columns into diagonals.

        Example:

        ```python
        chunked_hidden_states: [
            0.4983,
            2.6918,
            -0.0071,
            1.0492,
            -1.8348,
            0.7672,
            0.2986,
            0.0285,
            -0.7584,
            0.4206,
            -0.0405,
            0.1599,
            2.0514,
            -1.1600,
            0.5372,
            0.2629,
        ]
        window_overlap = num_rows = 4
        ```

                     (pad & diagonalize) => [ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
                       0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000 0.0000, 0.0000, -0.7584, 0.4206,
                       -0.0405, 0.1599, 0.0000 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
        """
        total_num_heads, num_chunks, window_overlap, hidden_dim = chunked_hidden_states.size()
        chunked_hidden_states = nn.functional.pad(
            chunked_hidden_states, (0, window_overlap + 1)
        )  # total_num_heads x num_chunks x window_overlap x (hidden_dim + window_overlap + 1)
        chunked_hidden_states = chunked_hidden_states.view(
            total_num_heads, num_chunks, -1
        )  # total_num_heads x num_chunks x window_overlap * window_overlap + window_overlap
        chunked_hidden_states = chunked_hidden_states[
            :, :, :-window_overlap
        ]  # total_num_heads x num_chunks x window_overlap * window_overlap
        chunked_hidden_states = chunked_hidden_states.view(
            total_num_heads, num_chunks, window_overlap, window_overlap + hidden_dim
        )
        chunked_hidden_states = chunked_hidden_states[:, :, :, :-1]
        return chunked_hidden_states
  |d<   t	          |                                           }|d         dz  |d<   |                     ||          S |                     d          t          j        |                     d          |d          dz
  |dz  |                     d          g}t          j        || j                  }t          |d                   D ],}| dd||z  ||z  d|z  z   ddf         |dd|ddddf<   -|S )	zBconvert into overlapping chunks. Chunk size = 2w, overlap size = wr   r   rL   truncrounding_moder1   striderM   N)
r   r1   r6   divr   r   
as_stridedemptyrM   range)r   r   r   
chunk_sizechunk_strideoverlapping_chunkschunks          r*   _chunkzLEDEncoderSelfAttention._chunkt  s     	R)..""1%%	-,,Q//.12DU\]]]"""1%%	 M m002233J&qMA-1JqM 4 4 6 677L*1o2LO ++L+QQQ q!!Im((++^7SSSVWWQq!!	

 #[M<PQQQ:a=)) 	 	E1>5>)EN,BQEW,WWYZYZYZZ2qqq%AAA~.. "!r,   returnc                 >   |                      ||dz                                                                 dg          }|d d d d d d f         }|                    d          }| d d d |d d d |dz   f         }|                    |                                          }t          j        |t          d                                         |	                                |          | d d d |d d d |dz   f<   | d d | d d d |dz    d f         }|                    |                                          }t          j        |t          d                                         |	                                |          | d d | d d d |dz    d f<   d S )Nr   r   )dims)r   r   inf)
r   trilflipr2   r1   r6   	full_likefloatwherer5   )input_tensoraffected_seq_lenbeginning_mask_2dbeginning_maskending_maskbeginning_inputending_inputs          r*   _mask_invalid_locationsz/LEDEncoderSelfAttention._mask_invalid_locations  s   (112BDTWXDXYY^^``eelmkneoo*4D!!!+;<$))v)66&qqq*;+;*;QQQ@VBRUVBV@V'VW'../C/C/E/EFFHMeEll]I
 I

%##%%
7
7 	QQQ)))111.D0@10D.DDE $AAA(8'8'9'9111@PST@T>U>W>W$WX!(():):)<)<==LQO5<<-M
 M

%  ""L
1
1 	QQQ))**AAA1AA1E/F/H/HHIIIr,   rf   rg   r   c           	         |                                 \  }}}}||dz  z  dk    sJ d|dz   d|             |                                 |                                 k    sJ t          j        ||d          dz
  }|                    dd                              ||z  ||          }|                    dd                              ||z  ||          }|                     ||t          | j        dd	                    }|                     ||t          | j        dd	                    }t          j        d
||f          }	| 	                    |	d          }	|	
                    ||z  |dz   ||dz  dz   f          }
|	ddddd|d|dz   f         |
dddddd|df<   |	ddd|dd|dz   f         |
ddddd|df<   |	dddd|dz    d|dz   df         |
ddddddd|f<   |	dddd|dz
  d|z
  df         |
dddd|d|f<   |
                    |||d|z  dz                                 dd          }
|                     |
|           |
S )a  
        Matrix multiplication of query and key tensors using with a sliding window attention pattern. This
        implementation splits the input into overlapping chunks of size 2w (e.g. 512 for pretrained LEDEncoder) with an
        overlap of size window_overlap
        rL   r   z&Sequence length should be multiple of z. Given r   r   r   r   Fzbcxd,bcyd->bcxy)r   r   r   r   )r   Nr#   )r1   r6   r   r   r   r   getattrrq   einsumr   r$   r   r   )rG   rf   rg   r   r   rR   rb   rc   chunks_count!diagonal_chunked_attention_scoresdiagonal_attention_scoress              r*   r   z8LEDEncoderSelfAttention._sliding_chunks_query_key_matmul  sH    49::<<0
GY~)*a///YNQ4FYYPWYY 0//zz||sxxzz))))y.PPPSTT 1%%--j9.DgxXXmmAq!!))*y*@'8TTE>74;W\3]3]^^kk#~wt{MSX/Y/YZZ -2L9JUTWL,Y,Y) -1,Q,Q-| -R -
 -
) %F$O$O)#\A%5~~XYGY\]G]^%
 %
! AbAAqqq/>/#7^a%7#77A
!!!!SbS!!!^__"<= @aAAr>??$8nq&8$88@
!!!!RNOO";< @aAAqqqNQ&'",nq.@.B.BB@
!!!!QRRO^O";< OpAAq&NQ&&N(:(<(<<O
!!!!Q.(8!N:J"JK
 %>$B$B	7A,>,B%
 %

)Aq// 	" 	$$%>OOO((r,   r   rh   c                    |                                 \  }}}}||dz  z  dk    sJ |                                 dd         |                                 dd         k    sJ |                     d          d|z  dz   k    sJ t          j        ||d          dz
  }|                    dd                              ||z  t          j        ||d          |d|z  dz             }	|                    dd                              ||z  ||          }t
          j                            |dd||fd	          }
||z  |dz   d|z  |f}|
                                }|d         ||d         z  |d         |d         f}|
	                    ||
          }| 
                    |	          }	t          j        d|	|f          }|                    ||||                              dd          S )z
        Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors. Returned tensor will be of the
        same shape as `attn_probs`
        rL   r   Nr   r   r   r   r#   rh   r   zbcwd,bcdh->bcwh)r1   r6   r   r   r   r   r   r   r   r   r   r   r   )rG   r   rh   r   r   rR   rb   rc   r   chunked_attn_probspadded_valuechunked_value_sizechunked_value_stridechunked_valuecontexts                  r*   r   z?LEDEncoderSelfAttention._sliding_chunks_matmul_attn_probs_value  s)    49::<<0
GY.1,-2222  !$

RaR(88888q!!Q%7!%;;;;;y.PPPSTT (11!Q77??"Ig~WEEE"	
 
 1%%--j9.DgxXX }((A~~0V^`(aa )94lQ6FNHZ\de+2244 #1!44 # #	 
 %//5GPd/ee!667IJJ,03E}2UVV||J	7HEEOOPQSTUUUr,   c                 x   |                                                      d          }|                                }|                     d          }t	          j        || j                  |                    d          k     }|                    d          }|dk                        d          }||||fS )z<compute global attn indices required throughout forward passr   ry   T)as_tupler   r#   r   )rO   summaxnonzeror6   rN   rM   	unsqueeze)r   num_global_attn_indicesru   rv   is_local_index_global_attnrw   rx   s          r*   r   z0LEDEncoderSelfAttention._get_global_attn_indices  s     #7";";"="="A"Aa"A"H"H '>&A&A&C&C# (<'C'CT'C'R'R$ &+\'0D0K&
 &
 &
#--"-55&6"
 .H-O-OY]-O-^-^* 2Lq1P0Y0Ycg0Y0h0h-'(.1	
 	
r,   c                 j   |j         d         }|                    ||| j        | j                  }||         ||<   t	          j        d||f          }	|	                    dd          }	t	          j        |	j                  j	        |	|d         |d         d d d d f<   |	                    dd          }	|	S )Nr   zblhd,bshd->blhsr   r   )
r%   r$   rb   rc   r6   r   r   r7   r.   r8   )
rG   rt   rs   ru   rv   rw   rx   r   key_vectors_only_globalattn_probs_from_global_keys
             r*   r   z:LEDEncoderSelfAttention._concat_with_global_key_attn_probs8  s     !&q)
 #."7"73T^T]#
 #
 GRRnFo BC &+\2CmUlEm%n%n" &@%I%I!Q%O%O" K2899= 	#1!46[\]6^`a`a`acdcdcdd	
 &@%I%I!Q%O%O"))r,   c                 N   |j         d         }|                    dd|          }|                    ||| j        | j                  }||         ||<   t          j        |                    dd                                          |                    dd                                                                        dd          }	|                    d||	                    d          |z
            
                                }
|                     |
|| j                  }|	|z   S )Nr   r#   r   rL   )r%   narrowr$   rb   rc   r6   matmulr   r&   r1   r   r   rp   )rG   r   r   ru   rv   rw   r   attn_probs_only_globalvalue_vectors_only_globalattn_output_only_globalattn_probs_without_globalattn_output_without_globals               r*   r   z@LEDEncoderSelfAttention._compute_attn_output_with_global_indicesV  s6     %a(
 ",!2!22q:U!V!V$1$;$;3T^T]%
 %
! IVVrHs!"DE
 #(,",,Q2288::<U<_<_`acd<e<e<k<k<m<m#
 #

)Aq// 	 
 %/$5$5+Z__R-@-@C^-^%
 %

*,, 	"
 &*%Q%Q%}d6U&
 &
" ')CCCr,   c                 b	   |j         d d         \  }}	|                    ||	| j                  }
||d d d                  |
|d d d         <   |                     |
          }|                     |          }|                     |          }|t          j        | j                  z  }|	                                
                    ||	| j        z  | j                                      dd          }|	                                
                    d|	| j        z  | j                                      dd          }|	                                
                    d|	| j        z  | j                                      dd          }t          j        ||                    dd                    }t          |                                          |	| j        z  ||gk    s.J d|	| j        z  ||f d|                                 d            |
                    |	| j        ||          }|                    dd          }t          j        |j                  j        ||d         |d         d d d d f<   |                    dd          }|                    |d d d d d d f         t          j        |j                  j                  }|
                    |	| j        z  ||          }t*          j                            |dt          j                  }||                                | j        fk    s(J d	| j        f d|                                             |
                    dddd          |
                    |	| j        ||          z  }|
                    |	| j        z  ||          }t*          j                            |                    |          | j        | j        
          }t          j        ||          }t          |                                          |	| j        z  || j        gk    s3J d|	| j        z  || j        f d|                                 d            |
                    |	| j        ||          }|
                    |	| j        || j                  }||fS )NrL   r#   r   r   z7global_attn_scores have the wrong size. Size should be r}   .r{   r|   r   z=global_attn_output tensor has the wrong size. Size should be )r%   r$   rd   ri   rj   rk   r   r   rc   r   r   rb   r   r6   bmmr   r1   r7   r.   r8   r4   r   r   r   r   rm   r   r   )rG   r   ru   r   rw   rv   rx   r   rR   r   global_attn_hidden_states global_query_vectors_only_globalglobal_key_vectorsglobal_value_vectorsglobal_attn_scoresglobal_attn_probs_floatr   r   s                     r*   r   z?LEDEncoderSelfAttention._compute_global_attn_output_from_hiddenz  sW    ,1"1"5 %2$;$;<WYceies$t$t!N[(2.O
!"DTTrT"JK
 ,0+<+<=V+W+W(!__];;#00?? 	)DIdm,D,DD( -7799T-zDN/JDMZZYq!__ 	) ))++00Z$.5PRVR_``jjklnopp 	 !++--222zDN7RTXTabbllmnpqrr 	
 #Y'GI[IeIefgijIkIkll&++--..''3
 
 
 

-dn,.I7S- -"''))- - -
 
 
 044ZQlnuvv 099!Q?? K*0115 	1!46[\]6^`a`a`acdcdcdd	
 099!Q??/;;AAAtT111,-K*0115
 

 044Z$.5PRmovww #%-"7"7Bem #8 #
 #

 &"'')).   u$.ARuu]l]q]q]s]suu   '6&:&:1b!Q&G&GJaJfJfDN,GK K '# '>&B&BT^+-H'' '# M11#++,>??4<Z^Zg 2 
 

 #Y'8:NOO&++--..''M3
 
 
 

-dn,.I4=Y- -"''))- - -
 
 
 .22:t~Ojlstt/44(CT]
 
 "#444r,   NNNNNF)F)rT   rU   rV   rF   rQ   staticmethodr   r   r5   r   r6   Tensorr   rX   r   r   r   r   r   r   rZ   r[   s   @r*   r]   r]   l   s       ! ! ! ! !L !fe fe fe feP $ $ \$ .% .% \.%` &" &"4 &" &" &" \&"P 25< 2 2 2 \2@)el @) @)gj @) @) @) @)D*V,*V/4|*VMP*V *V *V *VX 
 
 \
8* * *<"D "D "DHj5 j5 j5 j5 j5 j5 j5r,   r]   c                       e Zd Z fdZ	 	 	 	 	 	 ddej        deej                 deej                 deej                 deej                 d	ee         d
edeej        eej                 eeej                          f         fdZ	 xZ
S )LEDEncoderAttentionc                     t                                                       t          ||          | _        t	          j        |j        |j                  | _        d S )N)rn   )rE   rF   r]   longformer_self_attnr   re   d_modeloutputrG   rq   rn   rH   s      r*   rF   zLEDEncoderAttention.__init__  sI    $;FX$V$V$V!i??r,   NFr   r   r   r   r   r   r   r   c           	          |                      |||||||          }|                     |d                   }	|	f|dd         z   }
|
S )#Input shape: Batch x Time x Channelr   r   r   r   r   r   r   r   r   N)r	  r  )rG   r   r   r   r   r   r   r   self_outputsr   r   s              r*   rQ   zLEDEncoderAttention.forward  sd     00')++!5)/ 1 
 
 kk,q/22.<#33r,   r  )rT   rU   rV   rF   r6   r  r   r5   r   rQ   rZ   r[   s   @r*   r  r    s        @ @ @ @ @ 2626267;)-"' | !. "%,/	
 "%,/ 'u|4 !   
u|Xel3XeEL>Q5RR	S       r,   r  c                   P    e Zd ZdZ	 	 	 ddedededed	ef
 fd
Zdej	        dedefdZ
	 	 	 	 	 ddej	        deej	                 deeej	                          deej	                 deej	                 dedeej	        eej	                 eeej	                          f         fdZ xZS )LEDDecoderAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr~   FTrd   rb   rm   
is_decoderbiasc                    t                                                       || _        || _        || _        ||z  | _        | j        |z  | j        k    rt          d| j         d| d          | j        dz  | _        || _        t          j
        |||          | _        t          j
        |||          | _        t          j
        |||          | _        t          j
        |||          | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩r  )rE   rF   rd   rb   rm   rc   r'   scalingr  r   re   k_projv_projq_projout_proj)rG   rd   rb   rm   r  r  rH   s         r*   rF   zLEDDecoderAttention.__init__  s    	""!Y.=9$66"dn " "" " "   }d*$i	94@@@i	94@@@i	94@@@	)YTBBBr,   tensorrR   r9   c                     |                     ||| j        | j                                      dd                                          S )Nr   rL   )r   rb   rc   r   r   )rG   r  rR   r9   s       r*   _shapezLEDDecoderAttention._shape&  s<    {{3GGQQRSUVWWbbdddr,   Nr   key_value_statespast_key_valuer   r   r   r   c                  
   |du}|                                 \  }}	}
|                     |          | j        z  }|r||d         }|d         }n>|rU|                     |                     |          d|          }|                     |                     |          d|          }n||                     |                     |          d|          }|                     |                     |          d|          }t          j        |d         |gd          }t          j        |d         |gd          }nT|                     |                     |          d|          }|                     |                     |          d|          }| j        r||f}|| j	        z  d| j
        f} |                     ||	|          j        | } |j        | } |j        | }|                     d          }t          j        ||                    dd                    }|                                 || j	        z  |	|fk    r2t          d|| j	        z  |	|f d|                                            ||                                 |d|	|fk    r+t          d	|d|	|f d|                                            |                    || j	        |	|          |z   }|                    || j	        z  |	|          }t          j                            |d          }||                                 | j	        fk    r-t          d
| j	        f d|                                            |                    dddd          |                    || j	        |	|          z  }|                    || j	        z  |	|          }|r=|                    || j	        |	|          }|                    || j	        z  |	|          }nd}t          j                            || j        | j                  }t          j        ||          }|                                 || j	        z  |	| j
        fk    r5t          d|| j	        |	| j
        f d|                                            |                    || j	        |	| j
                                      dd                              ||	|
          }|                     |          }|||fS )r  Nr   r   r#   rL   ry   z$Attention weights should be of size r}   z!Attention mask should be of size r|   r   z `attn_output` should be of size )r1   r  r  r  r  r  r6   r   r  rb   rc   r   r   r   r'   r   r   r   rm   r   r   r  )rG   r   r  r   r   r   r   is_cross_attentionr9   r/   rd   query_states
key_statesvalue_states
proj_shaper:   attn_weightsattn_weights_reshapedr   r   s                       r*   rQ   zLEDDecoderAttention.forward)  sU    .T9"/"4"4"6"6Wi {{=11DL@ 	L."<'*J)!,LL 	LT[[1A%B%BBLLJ;;t{{3C'D'Db#NNLL'T[[%?%?SIIJ;;t{{='A'A2sKKLN1$5z#BJJJJ 9nQ&7%FANNNLL T[[%?%?SIIJ;;t{{='A'A2sKKL? 	8 ),7NDN*B>
Ct{{<#>>CZP$Z_j1
(|(*5//!$$yz/C/CAq/I/IJJ3#7'"JJJ*dn8LgW^7_ * * %%''* *  
 %""$$a'(BBB ta'8Rtt]k]p]p]r]rtt   (,,S$.'7SSVddL',,S4>-A7GTTL},,\r,BB&##%%$.)::: 1t~FW 1 1',,..1 1   +//2q!<<|?P?PQTVZVdfmov?w?wwL',,S4>-A7GTTL 	)
 %1$5$5c4>7T[$\$\!055cDN6JGU\]]LL$(!]**<4<RVR_*``
i
L99#"6!OOO)CRVR_3` ) )$$&&) )   S$.'4=IIYq!__WS'9-- 	 mmK001>AAr,   )r~   FT)NNNNF)rT   rU   rV   rW   rX   r   r5   rF   r6   r  r  r   r   rQ   rZ   r[   s   @r*   r  r  	  s       GG  C CC C 	C
 C C C C C C C4eU\ eC ec e e e e 488<1526"'lB lB|lB #5<0lB !u|!45	lB
 !.lB "%,/lB  lB 
u|Xel3XeEL>Q5RR	SlB lB lB lB lB lB lB lBr,   r  c                   d     e Zd Zdedef fdZ	 	 	 	 d
dej        dej        dej        fd	Z xZ	S )LEDEncoderLayerrq   rn   c                    t                                                       |j        | _        t	          ||          | _        t          j        | j                  | _        |j	        | _	        t          |j                 | _        |j        | _        t          j        | j        |j                  | _        t          j        |j        | j                  | _        t          j        | j                  | _        d S rD   )rE   rF   r
  rd   r  	self_attnr   	LayerNormself_attn_layer_normrm   r   activation_functionactivation_fnactivation_dropoutre   encoder_ffn_dimfc1fc2final_layer_normr  s      r*   rF   zLEDEncoderLayer.__init__  s    ,VX>>$&L$@$@!~#F$>?"(";9T^V-CDD9V3T^DD "T^ < <r,   NFr   r   r   c           	         |}|                      |||||||          }	|	d         }t          j                            || j        | j                  }||z   }|                     |          }|}|                     |                     |                    }t          j                            || j        | j                  }| 	                    |          }t          j                            || j        | j                  }||z   }| 
                    |          }|j        t          j        k    rt          j        |                                          s&t          j        |                                          r9t          j        |j                  j        dz
  }
t          j        ||
 |
          }|f|	dd         z   S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
            attention_mask (`torch.FloatTensor`): attention mask of size
                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                *(encoder_attention_heads,)*.
        r  r   r   i  )r8   r   r   N)r,  r   r   rm   r   r.  r0  r3  r1  r4  r5  r.   r6   float16isinfanyisnanr7   r   clamp)rG   r   r   r   r   r   r   r   residualattn_outputsclamp_values              r*   rQ   zLEDEncoderLayer.forward  s   $ !~~')++!5)/ & 
 
 %Q--mt|VZVc-dd =011-@@ **488M+B+BCC--mt?Vaean-oo//--mt|VZVc-dd =0--m<<%-//K&&**,, 005M0J0J0N0N0P0P 0  +m&9::>EK!KK<[YYYM,qrr"222r,   )NNNF)
rT   rU   rV   r   rX   rF   r6   r  rQ   rZ   r[   s   @r*   r*  r*    s        
=y 
=C 
= 
= 
= 
= 
= 
=" !.3 .3|.3 .3 	.3 .3 .3 .3 .3 .3 .3 .3r,   r*  c                       e Zd Zdef fdZ	 	 	 	 	 	 	 	 ddej        deej                 deej                 d	eej                 d
eej                 deej                 deeej                          dee	         dee	         fdZ
 xZS )LEDDecoderLayerrq   c                    t                                                       |j        | _        t	          | j        |j        |j        d          | _        |j        | _        t          |j
                 | _        |j        | _        t          j        | j                  | _        t	          | j        |j        |j        d          | _        t          j        | j                  | _        t          j        | j        |j                  | _        t          j        |j        | j                  | _        t          j        | j                  | _        d S )NT)rd   rb   rm   r  )rm   r  )rE   rF   r
  rd   r  decoder_attention_headsattention_dropoutr,  rm   r   r/  r0  r1  r   r-  r.  encoder_attnencoder_attn_layer_normre   decoder_ffn_dimr3  r4  r5  rG   rq   rH   s     r*   rF   zLEDDecoderLayer.__init__  s   ,n4,	
 
 
 ~#F$>?"(";$&L$@$@!/N*,	
 
 
 (*|DN'C'C$9T^V-CDD9V3T^DD "T^ < <r,   NFTr   r   encoder_hidden_statesencoder_attention_maskr   cross_attn_layer_head_maskr   r   	use_cachec
                 x   |}
|
|dd         nd}|                      |||||          \  }}}t          j                            || j        | j                  }|
|z   }|                     |          }d}d}|z|}
|
|dd         nd}|                     ||||||          \  }}}t          j                            || j        | j                  }|
|z   }|                     |          }||z   }|}
|                     | 	                    |                    }t          j                            || j
        | j                  }|                     |          }t          j                            || j        | j                  }|
|z   }|                     |          }|f}|r|||fz  }|	r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
            attention_mask (`torch.FloatTensor`): attention mask of size
                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape *(batch, seq_len, embed_dim)*
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                *(decoder_attention_heads,)*.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for encoder attention heads in a given layer of
                size *(decoder_attention_heads,)*.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`): Whether the base model outputs attentions.
                This requires the attentions tensor to be reshaped in this function.
        NrL   )r   r   r   r   r   r   r   )r   r  r   r   r   r   )r,  r   r   rm   r   r.  rD  rE  r0  r3  r1  r4  r5  )rG   r   r   rH  rI  r   rJ  r   r   rK  r<  self_attn_past_key_valueself_attn_weightspresent_key_valuecross_attn_present_key_valuecross_attn_weightscross_attn_past_key_valuer   s                     r*   rQ   zLEDDecoderLayer.forward  s,   : ! :H9S>"1"#5#5Y] >Bnn'3)+/ ?M ?
 ?
;(*; --mt|VZVc-dd =011-@@ (,$! ,$H @N?Yrss(;(;_c%NRN_N_+!65 :8"3 O` O OKM-/K M11-4<Z^Zg1hhM$}4M 88GGM !24P P !**488M+B+BCC--mt?Vaean-oo//--mt|VZVc-dd =0--m<< " 	?)+=>>G 	,)++Gr,   )NNNNNNFT)rT   rU   rV   r   rF   r6   r  r   r   r5   rQ   rZ   r[   s   @r*   r@  r@    s       =y = = = = = =: 268<9=26=A8<,1$(V V|V !.V  (5	V
 !) 6V "%,/V %-U\$:V !u|!45V $D>V D>V V V V V V V Vr,   r@  c                   J     e Zd ZdZdedededef fdZdej        fdZ	 xZ
S )	LEDClassificationHeadz-Head for sentence-level classification tasks.	input_dim	inner_dimnum_classespooler_dropoutc                     t                                                       t          j        ||          | _        t          j        |          | _        t          j        ||          | _        d S )N)r   )rE   rF   r   re   denseDropoutrm   r  )rG   rU  rV  rW  rX  rH   s        r*   rF   zLEDClassificationHead.__init__M  sY     	Yy)44
zN333	)[99r,   r   c                     |                      |          }|                     |          }t          j        |          }|                      |          }|                     |          }|S rD   )rm   rZ  r6   tanhr  )rG   r   s     r*   rQ   zLEDClassificationHead.forwardY  s[    ]33

=11
=11]33m44r,   )rT   rU   rV   rW   rX   r   rF   r6   r  rQ   rZ   r[   s   @r*   rT  rT  J  s        77
:
: 
: 	
:
 
: 
: 
: 
: 
: 
:U\        r,   rT  c                   6    e Zd ZeZdZdZd Zed             Z	dS )LEDPreTrainedModelledTc                    | j         j        }t          |t          j                  rJ|j        j                            d|           |j         |j        j        	                                 d S d S t          |t          j
                  rS|j        j                            d|           |j        -|j        j        |j                 	                                 d S d S d S )Nr~   )meanstd)rq   init_std
isinstancer   re   rP   datanormal_r  zero_	Embeddingpadding_idx)rG   modulerc  s      r*   _init_weightsz LEDPreTrainedModel._init_weightsg  s    k"fbi(( 	?M&&CS&999{& &&((((( '&-- 	?M&&CS&999!-"6#56<<>>>>>	? 	?--r,   c                     | j         j        }t          j        g ddddd|gg| j                  }|                    |          |d}|S )N)r      
      rL   r         rL   r   )r   r   )rq   r    r6   r  rM   ne)rG   	pad_tokenr   dummy_inputss       r*   ru  zLEDPreTrainedModel.dummy_inputsr  sa    K,	L"2"2"2Q2q)4L!MVZVabbb	'll955"
 
 r,   N)
rT   rU   rV   r   config_classbase_model_prefixsupports_gradient_checkpointingrl  propertyru   r,   r*   r_  r_  b  sO        L&*#	? 	? 	?   X  r,   r_  c                       e Zd ZU dZej        ed<   dZee	ej        df                  ed<   dZ
ee	ej        df                  ed<   dZee	ej        df                  ed<   dS )LEDEncoderBaseModelOutputa  
@dataclass
class LEDEncoderBaseModelOutput(ModelOutput):
    """
    Base class for LEDEncoder's outputs, with potential hidden states, local and global attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
            attention_window + 1)`, where `x` is the number of tokens with global attention mask.

            Local attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads. Those are the attention weights from every token in the sequence to every token with
            global attention (first `x` values) and to every token in the attention window (remaining `attention_window
            + 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the
            remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a
            token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding
            (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens.
            If the attention window contains a token with global attention, the attention weight at the corresponding
            index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
            attention, the attention weights to all other tokens in `attentions` is set to 0, the values should be
            accessed from `global_attentions`.
        global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
            where `x` is the number of tokens with global attention mask.

            Global attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads. Those are the attention weights from every token with global attention to every token
            in the sequence.
    """

    last_hidden_state: torch.FloatTensor
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    global_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None

<   dZe	eej        df                  ed<   dZe	eej        df                  ed<   dS )LEDSeq2SeqModelOutputaC  
@dataclass
class LEDSeq2SeqModelOutput(ModelOutput):
    """
    Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
    decoding.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_heads, sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
            where `x` is the number of tokens with global attention mask.

            Global attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads. Those are the attention weights from every token with global attention to every token
            in the sequence.
    """

    last_hidden_state: torch.FloatTensor = None
    past_key_values: Optional[List[torch.FloatTensor]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    encoder_global_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None

<   dZeeej        df                  ed<   dZeeej        df                  ed<   dZeeej        df                  ed<   dS )LEDSeq2SeqLMOutputa  
    Base class for sequence-to-sequence language models outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_heads, sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
            where `x` is the number of tokens with global attention mask.

            Global attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads. Those are the attention weights from every token with global attention to every token
            in the sequence.
    Nlosslogitsr  .r  r  r  r  rH  r  r  rT   rU   rV   rW   r  r   r6   r  r  r  r  r   r  r   r  r  r  rH  r  r  rz  r,   r*   r  r    R        3 3j )-D(5$
%,,, $FE$$$9=OXd5#456===EI8E%*;S*@$ABIIIBFu'8#'=!>?FFF@DhuU%6%;<=DDD=Ax(9:AAAEI8E%*;S*@$ABIIIBFu'8#'=!>?FFFIMxe.?.D(EFMMMMMr,   r  c                      e Zd ZU dZdZeej                 ed<   dZ	ej        ed<   dZ
eeej                          ed<   dZeeej        df                  ed<   dZeeej        df                  ed<   dZeeej        df                  ed	<   dZeej                 ed
<   dZeeej        df                  ed<   dZeeej        df                  ed<   dZeeej        df                  ed<   dS )"LEDSeq2SeqSequenceClassifierOutputa
  
    Base class for outputs of sequence-to-sequence sentence classification models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `label` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_heads, sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
            where `x` is the number of tokens with global attention mask.

            Global attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads. Those are the attention weights from every token with global attention to every token
            in the sequence.
    Nr  r  r  .r  r  r  r  rH  r  r  r  rz  r,   r*   r  r  0  r  r,   r  c                      e Zd ZU dZdZeej                 ed<   dZ	ej        ed<   dZ
ej        ed<   dZeeej                          ed<   dZeeej        df                  ed<   dZeeej        df                  ed	<   dZeeej        df                  ed
<   dZeej                 ed<   dZeeej        df                  ed<   dZeeej        df                  ed<   dZeeej        df                  ed<   dS )&LEDSeq2SeqQuestionAnsweringModelOutputa|  
    Base class for outputs of sequence-to-sequence question answering models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Span-start scores (before SoftMax).
        end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Span-end scores (before SoftMax).
        past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_heads, sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        encoder_global_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`,
            where `x` is the number of tokens with global attention mask.

            Global attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads. Those are the attention weights from every token with global attention to every token
            in the sequence.
    Nr  start_logits
end_logitsr  .r  r  r  r  rH  r  r  )rT   rU   rV   rW   r  r   r6   r  r  r  r  r  r   r  r   r  r  r  rH  r  r  rz  r,   r*   r  r  s  se        5 5n )-D(5$
%,,,&*L%#***$(J!(((9=OXd5#456===EI8E%*;S*@$ABIIIBFu'8#'=!>?FFF@DhuU%6%;<=DDD=Ax(9:AAAEI8E%*;S*@$ABIIIBFu'8#'=!>?FFFIMxe.?.D(EFMMMMMr,   r  a)  
    This model inherits from [`PreTrainedModel`]. See the superclass documentation for the generic methods the library
    implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads, etc.).

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for general usage and behavior.

    Parameters:
        config ([`LEDConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a
  
    Summarization example:

    ```python
    >>> import torch
    >>> from transformers import AutoTokenizer, LEDForConditionalGeneration

    >>> model = LEDForConditionalGeneration.from_pretrained("allenai/led-large-16384-arxiv")
    >>> tokenizer = AutoTokenizer.from_pretrained("allenai/led-large-16384-arxiv")

    >>> ARTICLE_TO_SUMMARIZE = '''Transformers (Vaswani et al., 2017) have achieved state-of-the-art
    ...     results in a wide range of natural language tasks including generative language modeling
    ...     (Dai et al., 2019; Radford et al., 2019) and discriminative ... language understanding (Devlin et al., 2019).
    ...     This success is partly due to the self-attention component which enables the network to capture contextual
    ...     information from the entire sequence. While powerful, the memory and computational requirements of
    ...     self-attention grow quadratically with sequence length, making it infeasible (or very expensive) to
    ...     process long sequences. To address this limitation, we present Longformer, a modified Transformer
    ...     architecture with a self-attention operation that scales linearly with the sequence length, making it
    ...     versatile for processing long documents (Fig 1). This is an advantage for natural language tasks such as
    ...     long document classification, question answering (QA), and coreference resolution, where existing approaches
    ...     partition or shorten the long context into smaller sequences that fall within the typical 512 token limit
    ...     of BERT-style pretrained models. Such partitioning could potentially result in loss of important
    ...     cross-partition information, and to mitigate this problem, existing methods often rely on complex
    ...     architectures to address such interactions. On the other hand, our proposed Longformer is able to build
    ...     contextual representations of the entire context using multiple layers of attention, reducing the need for
    ...     task-specific architectures.'''
    >>> inputs = tokenizer.encode(ARTICLE_TO_SUMMARIZE, return_tensors="pt")

    >>> # Global attention on the first token (cf. Beltagy et al. 2020)
    >>> global_attention_mask = torch.zeros_like(inputs)
    >>> global_attention_mask[:, 0] = 1

    >>> # Generate Summary
    >>> summary_ids = model.generate(inputs, global_attention_mask=global_attention_mask, num_beams=3, max_length=32)
    >>> print(tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
    ```
a  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`LEDTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)

            LED uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_led._prepare_decoder_inputs`] and modify
            to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the
            default strategy.
        global_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to decide the attention given to each token, local attention or global attention for the encoder.
            Tokens with global attention attend to all other tokens, and all other tokens attend to them. This is
            important for task-specific finetuning because it makes the model more flexible at representing the task.
            For example, for classification, the <s> token should be given global attention. For QA, all question
            tokens should also have global attention; a short usage sketch follows this argument list. Please refer to
            the [Longformer paper](https://arxiv.org/abs/2004.05150) for more details. Mask values selected in `[0, 1]`:

            - 0 for local attention (a sliding window attention),
            - 1 for global attention (tokens that attend to all other tokens, and all other tokens attend to them).
        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
            Tuple consisting of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`).
            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden-states
            at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
            representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
            input (see `past_key_values`). This is useful if you want more control over how to convert
            `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.

            If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
            of `inputs_embeds`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
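
    A short usage sketch for the arguments above (the checkpoint and the input text are illustrative assumptions;
    any LED checkpoint can be used in the same way):

    ```python
    >>> import torch
    >>> from transformers import AutoTokenizer, LEDModel

    >>> tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
    >>> model = LEDModel.from_pretrained("allenai/led-base-16384")

    >>> inputs = tokenizer("Replace this with a (possibly very long) document.", return_tensors="pt")

    >>> # local (sliding window) attention everywhere, global attention on the first token
    >>> global_attention_mask = torch.zeros_like(inputs["input_ids"])
    >>> global_attention_mask[:, 0] = 1

    >>> # `decoder_input_ids` can be omitted, in which case they are created by shifting `input_ids` to the right
    >>> outputs = model(
    ...     input_ids=inputs["input_ids"],
    ...     attention_mask=inputs["attention_mask"],
    ...     global_attention_mask=global_attention_mask,
    ... )
    >>> last_hidden_state = outputs.last_hidden_state
    ```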
c                        e Zd ZdZddedeej                 f fdZde	j
        de	j
        fdZd	e	j
        de	j
        d
e	j
        defdZ	 	 	 	 	 	 	 	 ddZ xZS )
LEDEncoderz
    Transformer encoder consisting of *config.encoder_layers* self-attention layers. Each layer is a
    [`LEDEncoderLayer`].

    Args:
        config: LEDConfig
        embed_tokens (nn.Embedding): output embedding
    Nrq   embed_tokensc                    t                                                     j        | _        j        | _        j        }j        | _        j        | _	        t          j        t                    rMj        dz  dk    rt          d          j        dk    rt          d          j        gj        z  _        nIt          j                  j        k    r,t          dj         dt          j                             ||| _        n%t#          j        j        || j                  | _        t)          | j	        |          | _        t#          j        fdt/          j                  D                       | _        t#          j        |          | _        d| _        |                                  d S )	NrL   r   z1`config.attention_window` has to be an even valuez,`config.attention_window` has to be positivezQ`len(config.attention_window)` should equal `config.num_hidden_layers`. Expected z, given c                 0    g | ]}t          |          S rz  )r*  ).0irq   s     r*   
<listcomp>z'LEDEncoder.__init__.<locals>.<listcomp>~  s#    $f$f$fA_VQ%?%?$f$f$fr,   F)rE   rF   rm   encoder_layerdrop	layerdropr
  r    rj  max_encoder_position_embeddingsmax_source_positionsre  ro   rX   r'   num_hidden_layersr   r  r   ri  
vocab_sizer@   embed_positions
ModuleListr   encoder_layerslayersr-  layernorm_embeddinggradient_checkpointing	post_init)rG   rq   r  rd   rH   s    `  r*   rF   zLEDEncoder.__init__^  s      ~1N	!.$*$J!f-s33 	&*a// !TUUU&!++ !OPPP'-'>&?&BZ&ZF##6*++v/GGG a & 8a aBEfF]B^B^a a  
 # ,D "V->	4K[ \ \D<% 
  
 m$f$f$f$fvOdIeIe$f$f$fgg#%<	#:#: &+#r,   r   global_attention_maskc                 &    |	||dz   z  }n|dz   }|S )Nr   rz  )rG   r   r  s      r*   _merge_to_attention_maskz#LEDEncoder._merge_to_attention_mask  s.     %+/Dq/HINN 3Q6Nr,   r   inputs_embedsr    c                    t          | j        j        t                    r| j        j        nt	          | j        j                  }|dz  dk    rt          d|           ||j        n|j        }|dd         \  }}|||z  z
  |z  }	|	dk    rt                              d| d||	z    d|            |$t          j
                            |d|	f|          }|[|                    ||	f| j        j        t          j        	          }
|                     |
          }t          j        ||gd
          }t          j
                            |d|	fd          }|	|||fS )zbA helper function to pad tokens and mask to work with implementation of Longformer self-attention.rL   r   z2`attention_window` should be an even value. Given Nz(Input ids are automatically padded from z to z0 to be a multiple of `config.attention_window`: r   )r.   r   ry   F)re  rq   ro   rX   r   r'   r%   loggerwarning_oncer   r   r   new_fullr    r6   rO   r  r   )rG   r   r   r  r    ro   input_shaper   rR   padding_leninput_ids_paddinginputs_embeds_paddings               r*   _pad_to_window_sizezLEDEncoder._pad_to_window_size  s    $+6<<3DK((T[122 	 a1$$dRbddeee)2)>iooMDW)"1"o
G''4D*DDHXX??A7 A AR]H] A A.>A A   $M--i![9IQ]-^^	($1$:$:-K,* %; % %!
 )-(9(9:K(L(L% %	=:O*PVX Y Y Y]..K 0 /  N I~}DDr,   c	                    ||n| j         j        }||n| j         j        }||n| j         j        }||t	          d          ||t	          d          ||                     |          }|@t          j        |                                dd         |j	        t          j
                  }||                     ||          }|                     |||| j         j                  \  }}}|1|                                }	|                    d|	d                   }n||                                dd         }	|#t          ||j                  ddddddf         }|dk     }
|dk    }|                                                                                                }|                     |	          }||z   }|                     |          }t,          j                            || j        | j                  }|rd	nd}|rd	nd}|r|rd	nd}|p|                                d         t5          | j                  k    r@t	          d
t5          | j                   d|                                d          d          t9          | j                  D ]\  }}|r||fz   }t          j        g           }| j        r|| j        k     rd}n^| j        r3| j        r,|                      |j!        |||||         nd|
|||          }n ||||||         nd|
|||          }|d         }|rB||d         "                    dd          fz   }|r ||d         "                    dd          fz   }|r||fz   }dk    rI|ddd f         }|rtG          fd|D                       }|rtG          fd|D                       }|stG          d ||||fD                       S tI          ||||          S )aO  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            global_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to decide the attention given to each token, local attention or global attention for the encoder.
                Tokens with global attention attend to all other tokens, and all other tokens attend to them. This is
                important for task-specific finetuning because it makes the model more flexible at representing the
                task. For example, for classification, the <s> token should be given global attention. For QA, all
                question tokens should also have global attention. Please refer to the [Longformer
                paper](https://arxiv.org/abs/2004.05150) for more details. Mask values selected in `[0, 1]`:

                - 0 for local attention (a sliding window attention),
                - 1 for global attention (tokens that attend to all other tokens, and all other tokens attend to them).
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
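
        A minimal sketch of calling the encoder on its own (the checkpoint and the input text are illustrative
        assumptions; inputs are automatically padded to a multiple of `config.attention_window` internally):

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, LEDModel

        >>> tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
        >>> model = LEDModel.from_pretrained("allenai/led-base-16384")
        >>> encoder = model.get_encoder()

        >>> inputs = tokenizer("A long document ...", return_tensors="pt")
        >>> global_attention_mask = torch.zeros_like(inputs["input_ids"])
        >>> global_attention_mask[:, 0] = 1

        >>> encoder_outputs = encoder(
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     global_attention_mask=global_attention_mask,
        ... )
        >>> sequence_output = encoder_outputs.last_hidden_state
        ```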
        NzDYou cannot specify both input_ids and inputs_embeds at the same timez5You have to specify either input_ids or inputs_embedsr#   )rM   r.   )r   r   r  r    r   r   rz  z&The head_mask should be specified for  layers, but it is for r   )NNN)r   r   r   r   r   r   r   rL   r   c                 .    g | ]}|d d d  f         S rD   rz  r  stater  s     r*   r  z&LEDEncoder.forward.<locals>.<listcomp>e  s-    '\'\'\Eaaa;,.>(?'\'\'\r,   c                 :    g | ]}|d d d d d  d d f         S rD   rz  r  s     r*   r  z&LEDEncoder.forward.<locals>.<listcomp>h  s9    'b'b'b%aaaMk\M111.D(E'b'b'br,   c              3      K   | ]}||V  	d S rD   rz  r  vs     r*   	<genexpr>z%LEDEncoder.forward.<locals>.<genexpr>k  s1        efererererer r,   r}  r   r~  r  )%rq   r   output_hidden_statesuse_return_dictr'   r  r6   onesr1   rM   rO   r  r  r    r   r>   r.   flattenr9  itemr  r  r   r   rm   r   r   r  	enumeraterandr  r  _gradient_checkpointing_func__call__r   tupler|  )rG   r   r   r  	head_maskr  r   r  return_dictr  r   r   r   	embed_posr   encoder_statesall_attentionsall_global_attentionsidxencoder_layerdropout_probabilitylayer_outputsr  s                         @r*   rQ   zLEDEncoder.forward  sD   p 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]  ]%>cddd=#8TUUU  --i88M !"Z(:(:(<(<SbS(A-J^fkfpqqqN !,!::>K`aaN AE@X@X)'1	 AY A
 A
=Y  #..**K!r;r?;;II&',,..ss3K %@Q^QdeefgfgfgijlmopopopfpqN )1,-1-5577;;==BBDD((55	%	100??--mt|VZVc-dd3=0:d'8 V^ VRV  ~~"c$+&6&666 /S=M=M / /!((+/ / /   #,DK"8"8 &	h &	hC# C!/=2B!B"'*R..} 1"5"F"F 2. 4= $($E$E%.%&*3*?	#T',&)	% 	%MM %2M%'5;D;P3VZ(7-A'5*;% % %M !.a 0  h!/=3C3M3MaQR3S3S2U!U! h,A]STEUE_E_`acdEeEeDg,g) 	?+}.>>N ??)!!!]{l]*:;M# ^!&'\'\'\'\^'\'\'\!]!]  d!&'b'b'b'bSa'b'b'b!c!c 	  )>>K`a      )+(%3	
 
 
 	
r,   rD   )NNNNNNNN)rT   rU   rV   rW   r   r   r   ri  rF   r6   r  r  rX   r  rQ   rZ   r[   s   @r*   r  r  T  s	        % %y %8N % % % % % %N
u| 
\a\h 
 
 
 
)E<)E )E |	)E
 )E )E )E )EZ "!w
 w
 w
 w
 w
 w
 w
 w
r,   r  c                   f     e Zd ZdZddedeej                 f fdZ	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ	 xZ
S )	
LEDDecoderz
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`LEDDecoderLayer`].

    Args:
        config: LEDConfig
        embed_tokens (nn.Embedding): output embedding
    Nrq   r  c                 2   t                                                     j        | _        j        | _        j        | _        j        | _        ||| _	        n*t          j        j        j        | j                  | _	        t          | j        j                  | _        t          j        fdt#          j                  D                       | _        t          j        j                  | _        d| _        |                                  d S )Nc                 .    g | ]}t                    S rz  )r@  )r  _rq   s     r*   r  z'LEDDecoder.__init__.<locals>.<listcomp>  s!    $c$c$c_V%<%<$c$c$cr,   F)rE   rF   rm   decoder_layerdropr  r    rj  max_decoder_position_embeddingsmax_target_positionsr  r   ri  r  r
  r@   r  r  r   decoder_layersr  r-  r  r  r  )rG   rq   r  rH   s    ` r*   rF   zLEDDecoder.__init__  s       ~1!.$*$J!# ,D "V->PTP` a aD<%N 
  
 m$c$c$c$ceFLaFbFb$c$c$cdd#%<#?#? &+#r,   c                    ||n| j         j        }||n| j         j        }|
|
n| j         j        }
||n| j         j        }||	t          d          |1|                                }|                    d|d                   }n.|	|	                                dd         }nt          d          ||d         d         j        d         nd}|	| 	                    |          }	d}|d         dk    rt          ||	j        |	j        |          }|"| |t          ||	j        |d         	          z   }||t          ||	j        |d         	          }|                     ||          }|	|z   }|                     |          }t           j                            || j        | j        
          }| j        r%| j        r|
rt*                              d           d}
|rdnd}|rdnd}|rdnd}|
rdnd}t/          ||gddg          D ]z\  }}|s|                                d         t1          | j                  k    rCt          d| dt1          | j                   d|                                d          d          {t5          | j                  D ]\  }}|r||fz  }| j        r t7          j        g           }|| j        k     r4|||         nd}| j        r?| j        r8|                     |j        |||||||         nd|||         ndd||

  
        }n( ||||||||         nd|||         nd|||
	  	        }|d         }|
r|||rdnd         fz  }|r||d         fz  }||d         fz  }|r||fz  }|
r|nd}|stA          d |||||fD                       S tC          |||||          S )az  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            global_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to decide the attention given to each token, local attention or global attention. Tokens with
                global attention attend to all other tokens, and all other tokens attend to them. This is important
                for task-specific finetuning because it makes the model more flexible at representing the task. For
                example, for classification, the <s> token should be given global attention. For QA, all question
                tokens should also have global attention. Please refer to the [Longformer
                paper](https://arxiv.org/abs/2004.05150) for more details. Mask values selected in `[0, 1]`:

                - 0 for local attention (a sliding window attention),
                - 1 for global attention (tokens that attend to all other tokens, and all other tokens attend to them).
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding token indices of encoder `input_ids`. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
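
        A minimal sketch of running the decoder on top of pre-computed encoder states (the checkpoint, the input
        text and the single decoder start token are illustrative assumptions):

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, LEDModel

        >>> tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
        >>> model = LEDModel.from_pretrained("allenai/led-base-16384")

        >>> inputs = tokenizer("A long document ...", return_tensors="pt")
        >>> encoder_outputs = model.get_encoder()(
        ...     input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
        ... )

        >>> decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]])
        >>> decoder_outputs = model.get_decoder()(
        ...     input_ids=decoder_input_ids,
        ...     encoder_hidden_states=encoder_outputs.last_hidden_state,
        ...     encoder_attention_mask=inputs["attention_mask"],
        ... )
        >>> hidden_states = decoder_outputs.last_hidden_state  # (batch_size, 1, hidden_size)
        ```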
        NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer#   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsr   rL   r   )rJ   )r/   r   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Frz  r  cross_attn_head_maskzThe `z` should be specified for r  r   )r   rH  rI  r   rJ  r   r   rK  r   c              3      K   | ]}||V  	d S rD   rz  r  s     r*   r  z%LEDDecoder.forward.<locals>.<genexpr>p  s0        =  === r,   )r}  r  r   r~  r  )"rq   r   r  rK  r  r'   r1   r   r%   r  r   r.   rM   r>   r  r  r   r   rm   r   r  r  r  zipr   r  r  r6   r  r  r  r  r  r   )rG   r   r   r  rH  rI  r  r  r  r  rK  r   r  r  r  rJ   combined_attention_maskrS   r   all_hidden_statesall_self_attnsall_cross_attentionsnext_decoder_cache	attn_mask	mask_namer  decoder_layerr  r   r  
next_caches                                  r*   rQ   zLEDDecoder.forward  si   t 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B]  ]%>sttt"#..**K!r;r?;;II&',,..ss3KKdeee DSC^!3A!6!<Q!?!?de  --i88M #'r?Q&F]0-2F_u' ' '# %*A*M&=@c 3[_A A A '#
 !,1G1S%H&(;[QS_& & &"
 ((6LMM	%	100??--mt|VZVc-dd& 	"4= 	" "##p   "	 #7@BBD0:d%6@rrD#,6RR$ %(4H(IKYoKp$q$q 	 	 Iy$>>##A&#dk*:*:::$3	 3 3SEUEU 3 3%NN,,Q/3 3 3   #,DK"8"8 .	< .	<C# 6!m%55!} &+jnn#&775D5P_S11VZN* t}  $ A A!*!+)*&/&;IcNN1E1Q(--W[%! ! !.!#:*?+A7@7LYs^^RV5I5U,S11[_#1&7'! ! ! *!,M V"}:K5RQQQR'S&UU"  <=#3"55$q)9(;;$   	2-!11+4>''$
 	  '5FXlm     
 9+&+%1
 
 
 	
r,   rD   )NNNNNNNNNNNNN)rT   rU   rV   rW   r   r   r   ri  rF   rQ   rZ   r[   s   @r*   r  r  v  s          y 8N      2 ""#!!e
 e
 e
 e
 e
 e
 e
 e
r,   r  zQThe bare LED Model outputting raw hidden-states without any specific head on top.c            '           e Zd ZddgZdef fdZd Zd Zd Zd Z	 e
e           eeee	          	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 deej                 deeeej                                   deej                 deeeej                                   deej                 deej                 dee         dee         dee         dee         deeej                 ef         f"d                        Z xZS )LEDModeldecoder.embed_tokens.weightencoder.embed_tokens.weightrq   c                 6   t                                          |           |j        |j        }}t	          j        ||j        |          | _        t          || j                  | _	        t          || j                  | _        |                                  d S rD   )rE   rF   r    r  r   ri  r
  sharedr  encoderr  decoderr  )rG   rq   rj  r  rH   s       r*   rF   zLEDModel.__init__  s       "("5v7HZl:v~{KK!&$+66!&$+66 	r,   c                     | j         S rD   )r  rG   s    r*   get_input_embeddingszLEDModel.get_input_embeddings  s
    {r,   c                 X    || _         | j         | j        _        | j         | j        _        d S rD   )r  r  r  r  )rG   rh   s     r*   set_input_embeddingszLEDModel.set_input_embeddings  s'    $(K!$(K!!!r,   c                     | j         S rD   )r  r  s    r*   get_encoderzLEDModel.get_encoder  
    |r,   c                     | j         S rD   )r  r  s    r*   get_decoderzLEDModel.get_decoder  r  r,   
checkpointoutput_typerv  Nr   r   decoder_input_idsdecoder_attention_maskr  decoder_head_maskr  encoder_outputsr  r  r  decoder_inputs_embedsrK  r   r  r  r   c                    ||n| j         j        }||n| j         j        }||n| j         j        }||n| j         j        }|'|%t          || j         j        | j         j                  }||                     |||	|||||          }n|rt          |t                    sjt          |d         t          |          dk    r|d         nd t          |          dk    r|d         nd t          |          dk    r|d         nd           }|                     |||d         ||||
|||||          }|s||z   S t          |j        |j        |j        |j        |j        |j        |j        |j        |j        	  	        S )	N)r   r   r  r  r  r   r  r  r   r   rL   r   r  )r   r   rH  rI  r  r  r  r  rK  r   r  r  )	r}  r  r  r  r  r  rH  r  r  )rq   r   r  rK  r  r+   r    r!   r  re  r|  r   r  r  r}  r  r   r~  r  r  )rG   r   r   r  r  r  r  r  r  r  r  r  r  rK  r   r  r  decoder_outputss                     r*   rQ   zLEDModel.forward  s   2 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B]
 $)>)F 24;3T[5W! ! ""ll#-&;#+"3%9' + 	 	OO  	O=V!W!W 	7"1!"4474H4H14L4Loa00RV14_1E1E1I1I?1--t8;O8L8Lq8P8P/!"4"4VZ	  O ,,'1"1!"4#1'!5+//!5# ' 
 
  	5"_44$-?+;"1"?.9,=&5&G"1"?.9&5&G

 

 

 
	
r,   NNNNNNNNNNNNNNNN)rT   rU   rV   _tied_weights_keysr   rF   r  r  r  r  r   LED_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   r6   
LongTensorr  r   r  r5   r   r  rQ   rZ   r[   s   @r*   r  r  ~  so       
 89VW
y 
 
 
 
 
 
  0 0 0
     +*+?@@&&$   15158<=A,0487;EI=AEI59=A$(,0/3&*#S
 S
E,-S
 !.S
 $E$45	S

 !))9 :S
 EL)S
 $EL1S
 'u|4S
 "%e.?(@"ABS
  ((9:S
 "%e.?(@"ABS
   12S
  ((9:S
 D>S
 $D>S
  'tn!S
" d^#S
$ 
uU\"$99	:%S
 S
 S
  A@S
 S
 S
 S
 S
r,   r  zKThe LED Model with a language modeling head. Can be used for summarization.c            *       <    e Zd ZdZdgZg dZdef fdZd Zd Z	d%d	e
d
ee
         dej        f fdZd	e
ddfdZd Zd Z ee           eee           ee          	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d&deej                 deej                 deej                 deej                 deej                 deej                 deej                 deeeej                                   deej                 deeeej                                   deej                 deej                 deej                 dee         dee         d ee         d!ee         deeej                 e f         f$d"                                    Z!dej        fd#Z"e#d$             Z$ xZ%S )'LEDForConditionalGenerationr`  final_logits_bias)r  r  zlm_head.weightrq   c                 l   t                                          |           t          |          | _        |                     dt          j        d| j        j        j        f                     t          j
        |j        | j        j        j        d          | _        |                                  d S )Nr  r   Fr  )rE   rF   r  r`  register_bufferr6   zerosr  rA   r   re   r
  lm_headr  rG  s     r*   rF   z$LEDForConditionalGeneration.__init__	  s       F##0%+q$(/B`>a2b2bcccy1OV[\\\ 	r,   c                 4    | j                                         S rD   )r`  r  r  s    r*   r  z'LEDForConditionalGeneration.get_encoder	      x##%%%r,   c                 4    | j                                         S rD   )r`  r  r  s    r*   r  z'LEDForConditionalGeneration.get_decoder	  r  r,   Nnew_num_tokenspad_to_multiple_ofr   c                     t                                          ||          }|                     |j        j        d                    |S )Nr   )rE   resize_token_embeddings_resize_final_logits_biasrP   r%   )rG   r  r  new_embeddingsrH   s       r*   r  z3LEDForConditionalGeneration.resize_token_embeddings	  sB    88I[\\&&~'<'B1'EFFFr,   c                    | j         j        d         }||k    r| j         d d d |f         }nBt          j        d||z
  f| j         j                  }t          j        | j         |gd          }|                     d|           d S )Nr#   r   r   ry   r  )r  r%   r6   r  rM   r   r  )rG   r  old_num_tokensnew_bias
extra_biass        r*   r  z5LEDForConditionalGeneration._resize_final_logits_bias	  s    /5b9^++-aaa..@AHHa.)H%IRVRhRopppJy$"8*!E1MMMH0(;;;;;r,   c                     | j         S rD   r  r  s    r*   get_output_embeddingsz1LEDForConditionalGeneration.get_output_embeddings 	  r  r,   c                     || _         d S rD   r%  )rG   r  s     r*   set_output_embeddingsz1LEDForConditionalGeneration.set_output_embeddings#	  s    %r,   )r  rv  r   r   r  r  r  r  r  r  r  r  r  r  labelsrK  r   r  r  c                    ||n| j         j        }|G|rt                              d           d}|'|%t	          || j         j        | j         j                  }|                     ||||||	||||
||||||          }|                     |d                   | j	        z   }d}|Kt                      } ||                    d| j         j                  |                    d                    }|s|f|dd         z   }||f|z   n|S t          |||j        |j        |j        |j        |j        |j        |j        |j        
  
        S )	a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Conditional generation example:

        ```python
        >>> from transformers import AutoTokenizer, LEDForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
        >>> TXT = "My friends are <mask> but they eat too many carbs."

        >>> model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")
        >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]

        >>> prediction = model.generate(input_ids)[0]
        >>> print(tokenizer.decode(prediction, skip_special_tokens=True))
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if use_cache:
                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
            use_cache = False
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        outputs = self.led(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            global_attention_mask=global_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return LEDSeq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
            encoder_global_attentions=outputs.encoder_global_attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            # cached cross_attention states don't have to be reordered -> they are always the same
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
                + layer_past[2:],
            )
        return reordered_past
< < < < < <  & & & +*+?@@?YYY.// 15158<=A,0487;EI=AEI59=A-1$(,0/3&*%]
 ]
E,-]
 !.]
 $E$45	]

 !))9 :]
 EL)]
 $EL1]
 'u|4]
 "%e.?(@"AB]
  ((9:]
 "%e.?(@"AB]
   12]
  ((9:]
 )*]
 D>]
  $D>!]
" 'tn#]
$ d^%]
& 
uU\"$66	7']
 ]
 ]
 0/ ZY A@]
~hEL h h h h   \    r,   r  z
    LED model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    """,
    LED_START_DOCSTRING,
)
class LEDForSequenceClassification(LEDPreTrainedModel):
    _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]

    def __init__(self, config: LEDConfig, **kwargs):
        warnings.warn(
            "The `transformers.LEDForSequenceClassification` class is deprecated and will be removed in version 5"
            " of Transformers. No actual method was provided in the original paper on how to perform"
            " sequence classification.",
            FutureWarning,
        )
        super().__init__(config, **kwargs)
        self.led = LEDModel(config)
        self.classification_head = LEDClassificationHead(
            config.d_model,
            config.d_model,
            config.num_labels,
            config.classifier_dropout,
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(LED_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        global_attention_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], Seq2SeqSequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
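
        A minimal usage sketch (not part of the original docstring; the base checkpoint has no fine-tuned
        classification head, so the prediction is only meaningful after fine-tuning, and the class itself warns
        that it is deprecated):

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, LEDForSequenceClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
        >>> model = LEDForSequenceClassification.from_pretrained("allenai/led-base-16384", num_labels=2)

        >>> inputs = tokenizer("A long document to classify.", return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits
        >>> predicted_class_id = int(logits.argmax(dim=-1))
        ```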
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        if input_ids is None and inputs_embeds is not None:
            raise NotImplementedError(
                f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
            )

        outputs = self.led(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            global_attention_mask=global_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]  # last hidden state

        eos_mask = input_ids.eq(self.config.eos_token_id).to(hidden_states.device)

        if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
            raise ValueError("All examples must have the same number of <eos> tokens.")
        sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[
            :, -1, :
        ]
        logits = self.classification_head(sentence_representation)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.config.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.config.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return LEDSeq2SeqSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
            encoder_global_attentions=outputs.encoder_global_attentions,
        )


@add_start_docstrings(
    """
    LED Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer
    on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    LED_START_DOCSTRING,
)
class LEDForQuestionAnswering(LEDPreTrainedModel):
    _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]

    def __init__(self, config):
        super().__init__(config)

        config.num_labels = 2
        self.num_labels = config.num_labels

        self.led = LEDModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(LED_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        global_attention_mask: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], Seq2SeqQuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
            are not taken into account for computing the loss.
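
        A minimal usage sketch (not part of the original docstring; the base checkpoint has a randomly initialized
        QA head, so the extracted span is meaningless without fine-tuning):

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, LEDForQuestionAnswering

        >>> tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
        >>> model = LEDForQuestionAnswering.from_pretrained("allenai/led-base-16384")

        >>> question = "Who introduced LED?"
        >>> context = "The Longformer Encoder-Decoder (LED) was introduced by Beltagy, Peters and Cohan."
        >>> inputs = tokenizer(question, context, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> start = int(outputs.start_logits.argmax())
        >>> end = int(outputs.end_logits.argmax())
        >>> answer = tokenizer.decode(inputs["input_ids"][0, start : end + 1])
        ```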
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if start_positions is not None and end_positions is not None:
            use_cache = False

        outputs = self.led(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            global_attention_mask=global_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (
                start_logits,
                end_logits,
            ) + outputs[1:]
            return ((total_loss,) + output) if total_loss is not None else output

        return LEDSeq2SeqQuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
            encoder_global_attentions=outputs.encoder_global_attentions,
        )