
"""PyTorch LongT5 model."""

import copy
import math
import warnings
from typing import Any, List, Optional, Tuple, Union

import torch
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache, StaticCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    DUMMY_INPUTS,
    DUMMY_MASK,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_torch_fx_proxy,
    is_torchdynamo_compiling,
    logging,
    replace_return_docstrings,
)
from .configuration_longt5 import LongT5Config


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "LongT5Config"
_CHECKPOINT_FOR_DOC = "google/long-t5-local-base"


def _pad_to_multiple(x: torch.Tensor, block_len: int, dim: int, pad_value: int = 0) -> torch.Tensor:
    """Pad a tensor so that a sequence length will be a multiple of `block_len`"""
    pad_len = -x.shape[dim] % block_len
    # Handle the case of an empty input sequence: just allocate the padded shape.
    if not all(x.shape):
        new_shape = list(x.shape)
        new_shape[dim] += pad_len
        return torch.zeros(new_shape, dtype=x.dtype)

    pad = [(0, 0)] * x.ndim
    pad[dim] = (0, pad_len)
    pad = sum(pad[::-1], ())
    x = nn.functional.pad(x, pad=pad, mode="constant", value=pad_value)
    return x
def _split_into_blocks(x: torch.Tensor, block_len: int, dim: int) -> torch.Tensor:
    """Split an input tensor into blocks of a given `block_len` along the given `dim`. If the dimension length
    is not a multiple of `block_len`, it will be padded first with selected `pad_value`.
    """
    # pad tensor to multiple of block_len
    if x.shape[dim] % block_len != 0:
        x = _pad_to_multiple(x, block_len, dim, pad_value=0)
    num_blocks = x.shape[dim] // block_len
    output_shape = x.shape[:dim] + (num_blocks, block_len) + x.shape[(dim + 1) :]
    # If 0 is in output_shape, we cannot apply reshape because of incompatibility with ONNX conversion
    if 0 in output_shape:
        return torch.empty(output_shape, dtype=x.dtype, device=x.device)
    return x.reshape(output_shape)
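# Illustrative shapes for the block-splitting helper above (values assumed, comment only): the sequence
# dimension is padded to a multiple of `block_len` and then folded into (num_blocks, block_len).
#
#   >>> x = torch.zeros(2, 10, 512)
#   >>> _split_into_blocks(x, block_len=4, dim=1).shape
#   torch.Size([2, 3, 4, 512])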
def _concatenate_3_blocks(x: torch.Tensor, block_dim: int, sequence_dim: int, pad_value: int = 0) -> torch.Tensor:
    """Concatenate three consecutive blocks for each input block for local attention.

    For more information, see: https://arxiv.org/pdf/2112.07916.pdf.
    """
    num_blocks = x.shape[block_dim]

    pad = [(0, 0)] * x.ndim
    pad[block_dim] = (1, 1)
    pad = sum(pad[::-1], ())
    # [batch_size, num_blocks, block_len] -> [batch_size, num_blocks + 2, block_len]
    x = nn.functional.pad(x, pad=pad, mode="constant", value=pad_value)

    blocks_list: List[torch.Tensor] = []
    for i in range(3):
        # We use indexing approach here:
        # https://numpy.org/doc/stable/user/basics.indexing.html#dealing-with-variable-numbers-of-indices-within-programs
        indices = [slice(0, None)] * x.ndim
        indices[block_dim] = slice(i, i + num_blocks)
        indices = tuple(indices)
        blocks_list.append(x[indices])
    # [batch_size, num_blocks, 3 * block_len, ...]
    return torch.cat(blocks_list, dim=sequence_dim)


def _make_3block_relative_position_ids(block_len: int) -> torch.Tensor:
    """Makes 3-blocked relative position ids for local attention."""
    position_ids = torch.arange(3 * block_len, dtype=torch.int32)
    center_position_ids = position_ids[block_len:-block_len]
    # [block_len, 3 * block_len]
    relative_position_ids = position_ids.unsqueeze(0) - center_position_ids.unsqueeze(1)
    return relative_position_ids


def _mask_local_attention_mask(local_attention_mask: torch.Tensor, block_len: int) -> torch.Tensor:
    """Mask local attention mask to enforce that tokens are not allowed to attend tokens farther than `local_radius`."""
    relative_position_ids = _make_3block_relative_position_ids(block_len)
    locality_mask = torch.abs(relative_position_ids) < block_len
    locality_mask = locality_mask[None, None, :, :]
    locality_mask = locality_mask.to(local_attention_mask.device)
    return torch.logical_and(local_attention_mask, locality_mask)


def _get_local_attention_mask(attention_mask: torch.Tensor, block_len: int, device: torch.device) -> torch.Tensor:
    """Prepare attention mask to be applied for a local attention."""
    # [batch_size, num_blocks, block_len]
    _blocked_attention_mask = _split_into_blocks(attention_mask, block_len, dim=1)
    # [batch_size, num_blocks, 3 * block_len]
    _3blocked_attention_mask = _concatenate_3_blocks(_blocked_attention_mask, block_dim=1, sequence_dim=2)

    _blocked_attention_mask = _blocked_attention_mask.unsqueeze(-1)
    _3blocked_attention_mask = _3blocked_attention_mask.unsqueeze(-2)

    # [batch_size, num_blocks, block_len, 3 * block_len]
    local_attention_mask = torch.logical_and(_blocked_attention_mask, _3blocked_attention_mask)
    local_attention_mask = _mask_local_attention_mask(local_attention_mask, block_len)
    # [batch_size, 1, num_blocks, block_len, 3 * block_len]
    return local_attention_mask.unsqueeze(1).to(device)
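# Illustrative shapes for the local-attention mask pipeline above (values assumed, comment only),
# with a batch of 1, a sequence of 10 tokens and block_len = 4 (i.e. local_radius = 3):
#
#   >>> mask = torch.ones(1, 10)
#   >>> _split_into_blocks(mask, 4, dim=1).shape
#   torch.Size([1, 3, 4])
#   >>> _concatenate_3_blocks(_split_into_blocks(mask, 4, dim=1), block_dim=1, sequence_dim=2).shape
#   torch.Size([1, 3, 12])
#   >>> _get_local_attention_mask(mask, 4, mask.device).shape
#   torch.Size([1, 1, 3, 4, 12])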
def _make_global_fixed_block_ids(
    attention_mask: torch.Tensor, global_block_size: int
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Obtain the "fixed block" global id corresponding to each input token.

    This implementation is a simplified version of the original Flaxformer implementation adopted from:
    https://github.com/google/flaxformer/blob/main/flaxformer/architectures/longt5/long_attention.py.

    In our scenario, as we use this strategy only for a decoder, orphan tokens, i.e. those tokens which do not make for
    the whole fixed block, are assigned to the preceding block.

    Padding tokens from the original sequence are represented by -1.
    """
    batch_size, seq_len = attention_mask.shape[:2]

    def handle_orphan_tokens(block_ids: torch.Tensor) -> torch.Tensor:
        block_ends = (torch.arange(seq_len) % global_block_size) == global_block_size - 1
        block_ends = block_ends.to(block_ids.device)
        true_block_ends = torch.logical_and(block_ends, block_ids >= 0)
        full_blocks = true_block_ends.sum(-1).unsqueeze(-1).type(block_ids.dtype) - 1
        block_ids = torch.where(block_ids < full_blocks, block_ids, full_blocks)
        return block_ids

    fixed_block_mask = torch.ones_like(attention_mask, device=attention_mask.device) / global_block_size
    fixed_block_mask = torch.cumsum(fixed_block_mask, axis=1) - fixed_block_mask
    mask = torch.where(attention_mask != 0.0, 1.0, -1000.0).type(attention_mask.dtype)
    global_block_ids = torch.floor(mask + fixed_block_mask - 1.0).type(attention_mask.dtype)
    _global_block_ids_lower_bound = torch.tensor(-1, dtype=global_block_ids.dtype, device=global_block_ids.device)
    global_block_ids = torch.where(
        global_block_ids > _global_block_ids_lower_bound, global_block_ids, _global_block_ids_lower_bound
    )
    # set padding tokens to -1
    global_block_ids = (global_block_ids * attention_mask) + (attention_mask - 1)
    # [batch_size, seq_len]
    global_block_ids = handle_orphan_tokens(global_block_ids)
    # [batch_size]
    num_globals = seq_len // global_block_size
    # [batch_size, seq_len // global_block_size]
    if num_globals > 0:
        _sequence_block_ids_max = global_block_ids.max(dim=-1).values.repeat(num_globals, 1).transpose(0, 1)
    else:
        _sequence_block_ids_max = torch.zeros(
            batch_size, 0, dtype=global_block_ids.dtype, device=global_block_ids.device
        )
    global_segment_ids = torch.cumsum(torch.ones(batch_size, num_globals), dim=-1) - 1
    global_segment_ids = global_segment_ids.to(attention_mask.device)
    global_segment_ids = torch.where(global_segment_ids <= _sequence_block_ids_max, 1, 0)
    return global_block_ids.type(torch.int), global_segment_ids.type(torch.int)


def _make_side_relative_position_ids(attention_mask: torch.Tensor, global_block_size: int) -> torch.Tensor:
    """Create the relative position tensor for local -> global attention."""
    block_ids, global_segment_ids = _make_global_fixed_block_ids(attention_mask, global_block_size)
    global_seq_len = global_segment_ids.shape[-1]
    global_positions = torch.arange(global_seq_len, device=block_ids.device)
    side_relative_position = global_positions - block_ids[..., None]
    return side_relative_position.type(torch.int64)


def _create_global_aggregates(
    hidden_states: torch.Tensor, block_ids: torch.Tensor, global_seq_len: int
) -> torch.Tensor:
    """Compute individual block aggregates by summing over individual blocks."""
    # (batch..., seq_len, global_seq_len)
    block_ids = block_ids.where(
        block_ids >= 0, torch.tensor(global_seq_len, dtype=block_ids.dtype, device=block_ids.device)
    )
    one_hot_block_ids = nn.functional.one_hot(block_ids.type(torch.int64), global_seq_len + 1)[:, :, :-1]
    return torch.einsum("...nd,...ng->...gd", hidden_states, one_hot_block_ids.type(hidden_states.dtype))
class LongT5LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a layernorm module in the LongT5 style. No bias and no subtraction of mean.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # LongT5 uses a layer norm which only scales and doesn't shift (Root Mean Square Layer Normalization,
        # https://arxiv.org/abs/1910.07467): the variance is computed without the mean and there is no bias.
        # The accumulation for half-precision inputs is done in fp32.
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # convert into half-precision if necessary
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)

        return self.weight * hidden_states


try:
    from apex.normalization import FusedRMSNorm

    LongT5LayerNorm = FusedRMSNorm  # noqa

    logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of LongT5LayerNorm")
except ImportError:
    # using the normal LongT5LayerNorm
    pass
except Exception:
    logger.warning("discovered apex but it failed to load, falling back to LongT5LayerNorm")
    pass

ALL_LAYERNORM_LAYERS.append(LongT5LayerNorm)


class LongT5DenseActDense(nn.Module):
    def __init__(self, config: LongT5Config):
        super().__init__()
        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        hidden_states = self.wi(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.dropout(hidden_states)
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            hidden_states = hidden_states.to(self.wo.weight.dtype)
        hidden_states = self.wo(hidden_states)
        return hidden_states


class LongT5DenseGatedActDense(nn.Module):
    def __init__(self, config: LongT5Config):
        super().__init__()
        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        hidden_gelu = self.act(self.wi_0(hidden_states))
        hidden_linear = self.wi_1(hidden_states)
        hidden_states = hidden_gelu * hidden_linear
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.wo(hidden_states)
        return hidden_states


class LongT5LayerFF(nn.Module):
    def __init__(self, config: LongT5Config):
        super().__init__()
        if config.is_gated_act:
            self.DenseReluDense = LongT5DenseGatedActDense(config)
        else:
            self.DenseReluDense = LongT5DenseActDense(config)

        self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        forwarded_states = self.layer_norm(hidden_states)
        forwarded_states = self.DenseReluDense(forwarded_states)
        hidden_states = hidden_states + self.dropout(forwarded_states)
        return hidden_states


class LongT5Attention(nn.Module):
    def __init__(
        self,
        config: LongT5Config,
        has_relative_attention_bias=False,
        layer_idx: Optional[int] = None,
    ):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.relative_attention_max_distance = config.relative_attention_max_distance
        self.d_model = config.d_model
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_heads
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim
        self.layer_idx = layer_idx
        if layer_idx is None and self.is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended "
                "and will lead to errors during the forward call, if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        # Mesh TensorFlow initialization to avoid scaling before softmax
        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)

        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()
        self.gradient_checkpointing = False

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
        )
        # Prune linear layers
        self.q = prune_linear_layer(self.q, index)
        self.k = prune_linear_layer(self.k, index)
        self.v = prune_linear_layer(self.v, index)
        self.o = prune_linear_layer(self.o, index, dim=1)
        # Update hyper params
        self.n_heads = self.n_heads - len(heads)
        self.inner_dim = self.key_value_proj_dim * self.n_heads
        self.pruned_heads = self.pruned_heads.union(heads)
    @staticmethod
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on.

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0
        if bidirectional:
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        relative_position_if_large = max_exact + (
            torch.log(relative_position.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
        return relative_buckets

    def compute_bias(self, query_length, key_length, device=None, cache_position=None):
        """Compute binned relative position bias"""
        if device is None:
            device = self.relative_attention_bias.weight.device
        if cache_position is None:
            context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
        else:
            context_position = cache_position[:, None].to(device)
        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
        relative_position = memory_position - context_position  # shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(
            relative_position,  # shape (query_length, key_length)
            bidirectional=(not self.is_decoder),
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.relative_attention_max_distance,
        )
        values = self.relative_attention_bias(relative_position_bucket)  # shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
        return values
    def forward(
        self,
        hidden_states,
        mask=None,
        key_value_states=None,
        position_bias=None,
        past_key_value=None,
        layer_head_mask=None,
        query_length=None,
        use_cache=False,
        output_attentions=False,
        cache_position=None,
    ):
        """
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        """
        # Input is (batch_size, seq_length, dim)
        # Mask is (batch_size, 1, 1, key_length) or (batch_size, 1, seq_length, key_length) (causal decoder)
        batch_size, seq_length = hidden_states.shape[:2]

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        query_states = self.q(hidden_states)
        query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

        if past_key_value is not None:
            is_updated = past_key_value.is_updated.get(self.layer_idx)
            if is_cross_attention:
                # after the first generated id, we can subsequently re-use all key/value_states from cache
                curr_past_key_value = past_key_value.cross_attention_cache
            else:
                curr_past_key_value = past_key_value.self_attention_cache

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse k, v, cross-attentions
            key_states = curr_past_key_value.key_cache[self.layer_idx]
            value_states = curr_past_key_value.value_cache[self.layer_idx]
        else:
            key_states = self.k(current_states)
            value_states = self.v(current_states)
            key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
            value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

            if past_key_value is not None:
                # save all key/value_states to cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        # compute attention scores
        scores = torch.matmul(query_states, key_states.transpose(3, 2))

        if position_bias is None:
            key_length = key_states.shape[-2]
            # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past)
            real_seq_length = query_length if query_length is not None else cache_position[-1] + 1
            if not self.has_relative_attention_bias:
                position_bias = torch.zeros(
                    (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype
                )
                if self.gradient_checkpointing and self.training:
                    position_bias.requires_grad = True
            else:
                position_bias = self.compute_bias(
                    real_seq_length, key_length, device=scores.device, cache_position=cache_position
                )
                position_bias = position_bias[:, :, -seq_length:, :]

            if mask is not None:
                causal_mask = mask[:, :, :, : key_states.shape[-2]]
                position_bias = position_bias + causal_mask

        if self.pruned_heads:
            mask = torch.ones(position_bias.shape[1])
            mask[list(self.pruned_heads)] = 0
            position_bias_masked = position_bias[:, mask.bool()]
        else:
            position_bias_masked = position_bias

        scores += position_bias_masked

        # (batch_size, n_heads, seq_length, key_length)
        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # Mask heads if we want to
        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask

        attn_output = torch.matmul(attn_weights, value_states)

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(batch_size, -1, self.inner_dim)
        attn_output = self.o(attn_output)

        outputs = (attn_output, past_key_value, position_bias)

        if output_attentions:
            outputs = outputs + (attn_weights,)
        return outputs


class LongT5LocalAttention(nn.Module):
    def __init__(self, config: LongT5Config, has_relative_attention_bias: bool = False) -> None:
        super().__init__()
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.relative_attention_max_distance = config.relative_attention_max_distance
        self.d_model = config.d_model
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_heads
        self.local_radius = config.local_radius
        self.block_len = self.local_radius + 1
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim

        # Mesh TensorFlow initialization to avoid scaling before softmax
        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)

        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()
        self.gradient_checkpointing = False

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
        )
        # Prune linear layers
        self.q = prune_linear_layer(self.q, index)
        self.k = prune_linear_layer(self.k, index)
        self.v = prune_linear_layer(self.v, index)
        self.o = prune_linear_layer(self.o, index, dim=1)
        # Update hyper params
        self.n_heads = self.n_heads - len(heads)
        self.inner_dim = self.key_value_proj_dim * self.n_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    @staticmethod
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """Translate relative position to a bucket number for relative attention (see
        LongT5Attention._relative_position_bucket for the detailed description)."""
        relative_buckets = 0
        if bidirectional:
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        relative_position_if_large = max_exact + (
            torch.log(relative_position.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
        return relative_buckets

    def compute_bias(self, block_length: int):
        """Compute binned relative position bias"""
        target_device = (
            self.relative_attention_bias.weight.device
            if self.relative_attention_bias.weight.device.type != "meta"
            else None
        )
        memory_position = torch.arange(3 * block_length, dtype=torch.long, device=target_device)
        context_position = memory_position[block_length:-block_length]

        # (block_length, 3 * block_length)
        relative_position = memory_position[None, :] - context_position[:, None]
        relative_position_bucket = self._relative_position_bucket(
            relative_position,  # (block_length, 3 * block_length)
            bidirectional=(not self.is_decoder),
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.relative_attention_max_distance,
        )
        # (block_length, 3 * block_length, num_heads)
        values = self.relative_attention_bias(relative_position_bucket)
        # (1, 1, num_heads, block_length, 3 * block_length)
        values = values.permute([2, 0, 1]).unsqueeze(0).unsqueeze(0)
        return values

    def forward(
        self,
        hidden_states,
        mask=None,
        position_bias=None,
        layer_head_mask=None,
        output_attentions=False,
    ):
        batch_size, seq_length = hidden_states.shape[:2]

        def shape(states):
            """projection"""
            return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim)

        def unshape(states):
            """reshape"""
            return states.contiguous().view(batch_size, -1, self.inner_dim)

        # get query/key/value states -> (batch_size, seq_length, n_heads, dim_per_head)
        query_states = shape(self.q(hidden_states))
        key_states = shape(self.k(hidden_states))
        value_states = shape(self.v(hidden_states))

        # Split into blocks -> (batch_size, num_blocks, block_len, n_heads, dim_per_head)
        query_states = _split_into_blocks(query_states, self.block_len, dim=1)
        key_states = _split_into_blocks(key_states, self.block_len, dim=1)
        value_states = _split_into_blocks(value_states, self.block_len, dim=1)

        # Concatenate 3 blocks for keys and values -> (batch_size, num_blocks, 3 * block_len, n_heads, dim_per_head)
        key_states = _concatenate_3_blocks(key_states, block_dim=1, sequence_dim=2)
        value_states = _concatenate_3_blocks(value_states, block_dim=1, sequence_dim=2)

        # Compute scores -> (batch_size, num_blocks, n_heads, block_len, 3 * block_len)
        scores = torch.einsum("...qhd,...khd->...hqk", query_states, key_states)

        if position_bias is None:
            # position_bias shape: (1, 1, n_heads, block_len, 3 * block_len)
            if not self.has_relative_attention_bias:
                position_bias = torch.zeros(
                    (1, 1, self.n_heads, self.block_len, 3 * self.block_len), device=scores.device, dtype=scores.dtype
                )
                if self.gradient_checkpointing and self.training:
                    position_bias.requires_grad = True
            else:
                position_bias = self.compute_bias(self.block_len)

            if mask is not None:
                # Replace masked positions with -1e10 (approximately negative infinity)
                mask = torch.where(mask > 0, 0.0, -1e10)
                # We need to adjust position bias shape to be sum with mask
                position_bias = position_bias + mask.transpose(1, 2)

        scores += position_bias
        # (batch_size, num_blocks, n_heads, block_len, 3 * block_len)
        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # Mask heads if we want to
        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask
        attn_weights = attn_weights.type(value_states.dtype)
        attn_output = unshape(torch.einsum("...hqk,...khd->...qhd", attn_weights, value_states))
        attn_output = attn_output[:, :seq_length, :]
        attn_output = self.o(attn_output)

        present_key_value_state = None
        outputs = (attn_output,) + (present_key_value_state,) + (position_bias,)

        if output_attentions:
            outputs = outputs + (attn_weights,)
        return outputs


class LongT5TransientGlobalAttention(nn.Module):
    def __init__(self, config: LongT5Config, has_relative_attention_bias: bool = False) -> None:
        super().__init__()
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.relative_attention_max_distance = config.relative_attention_max_distance
        self.d_model = config.d_model
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_heads
        self.local_radius = config.local_radius
        self.block_len = self.local_radius + 1
        self.global_block_size = config.global_block_size
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim

        # Mesh TensorFlow initialization to avoid scaling before softmax
        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)

        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()
        self.gradient_checkpointing = False

        # Relative attention bias & layer norm for global attention
        if self.has_relative_attention_bias:
            self.global_relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.global_input_layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
        )
        # Prune linear layers
        self.q = prune_linear_layer(self.q, index)
        self.k = prune_linear_layer(self.k, index)
        self.v = prune_linear_layer(self.v, index)
        self.o = prune_linear_layer(self.o, index, dim=1)
        # Update hyper params
        self.n_heads = self.n_heads - len(heads)
        self.inner_dim = self.key_value_proj_dim * self.n_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    @staticmethod
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """Translate relative position to a bucket number for relative attention (see
        LongT5Attention._relative_position_bucket for the detailed description)."""
        relative_buckets = 0
        if bidirectional:
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        relative_position_if_large = max_exact + (
            torch.log(relative_position.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
        return relative_buckets

    def compute_bias(self, block_length: int):
        """Compute binned relative position bias"""
        target_device = (
            self.relative_attention_bias.weight.device
            if self.relative_attention_bias.weight.device.type != "meta"
            else None
        )
        memory_position = torch.arange(3 * block_length, dtype=torch.long, device=target_device)
        context_position = memory_position[block_length:-block_length]

        # (block_length, 3 * block_length)
        relative_position = memory_position[None, :] - context_position[:, None]
        relative_position_bucket = self._relative_position_bucket(
            relative_position,  # (block_length, 3 * block_length)
            bidirectional=(not self.is_decoder),
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.relative_attention_max_distance,
        )
        # (block_length, 3 * block_length, num_heads)
        values = self.relative_attention_bias(relative_position_bucket)
        # (1, 1, num_heads, block_length, 3 * block_length)
        values = values.permute([2, 0, 1]).unsqueeze(0).unsqueeze(0)
        return values

    def compute_side_bias(self, mask: torch.Tensor, global_segment_ids: torch.Tensor) -> torch.Tensor:
        # (batch_size, 1, seq_len, global_seq_len)
        side_attention_mask = torch.eq(mask[..., None], global_segment_ids[:, None, :])[:, None, ...]
        attention_side_bias = torch.where(side_attention_mask > 0, 0.0, -1e10)
        # (batch_size, seq_len, global_seq_len)
        side_relative_position = _make_side_relative_position_ids(mask, self.global_block_size)
        side_relative_position_bucket = self._relative_position_bucket(
            side_relative_position,
            bidirectional=(not self.is_decoder),
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.relative_attention_max_distance,
        )
        # (batch_size, seq_len, global_seq_len, num_heads)
        side_bias = self.global_relative_attention_bias(side_relative_position_bucket)

        # (batch_size, num_heads, seq_len, global_seq_len)
        side_bias = side_bias.permute([0, 3, 1, 2])
        attention_side_bias = attention_side_bias + side_bias
        return attention_side_bias

    def forward(
        self,
        hidden_states,
        mask=None,
        position_bias=None,
        layer_head_mask=None,
        output_attentions=False,
    ):
        batch_size, seq_length = hidden_states.shape[:2]

        def shape(states):
            """projection"""
            return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim)

        def unshape(states):
            """reshape"""
            return states.contiguous().view(batch_size, -1, self.inner_dim)

        # Prepare components for transient-global attention
        # Obtain block_ids and global_segment_ids
        # global_seq_len := seq_len // self.global_block_size
        # shapes: (batch_size, seq_len) & (batch_size, global_seq_len)
        block_ids, global_segment_ids = _make_global_fixed_block_ids(
            mask if mask is not None else torch.ones(hidden_states.shape[:-1]),
            self.global_block_size,
        )
        # Create global inputs
        _global_seq_len = global_segment_ids.shape[-1]
        global_inputs = _create_global_aggregates(hidden_states, block_ids, _global_seq_len)
        global_inputs = self.global_input_layer_norm(global_inputs)

        # get query/key/value states -> (batch_size, seq_length, n_heads, dim_per_head)
        query_states = shape(self.q(hidden_states))
        key_states = shape(self.k(hidden_states))
        value_states = shape(self.v(hidden_states))
        # Get global/side key/value states -> (batch_size, global_seq_len, n_heads, dim_per_head)
        side_key_states = shape(self.k(global_inputs))
        side_value_states = shape(self.v(global_inputs))

        # Split into blocks -> (batch_size, num_blocks, block_len, n_heads, dim_per_head)
        query_states = _split_into_blocks(query_states, self.block_len, dim=1)
        key_states = _split_into_blocks(key_states, self.block_len, dim=1)
        value_states = _split_into_blocks(value_states, self.block_len, dim=1)

        # Concatenate 3 blocks for keys and values -> (batch_size, num_blocks, 3 * block_len, n_heads, dim_per_head)
        key_states = _concatenate_3_blocks(key_states, block_dim=1, sequence_dim=2)
        value_states = _concatenate_3_blocks(value_states, block_dim=1, sequence_dim=2)

        # Tile side inputs across local key/value blocks
        # New shape: (batch_size, num_blocks, global_seq_len, n_heads, dim_per_head)
        reps = [1] * (side_key_states.ndim + 1)
        reps[1] = key_states.shape[1]
        side_key_states = side_key_states.unsqueeze(1).repeat(reps)
        side_value_states = side_value_states.unsqueeze(1).repeat(reps)

        # Concatenate "local" and "side"/"global" key/value states to allow each token to attend global aggregated ones
        # New shape: (batch_size, num_blocks, 3 * block_len + global_seq_len, n_heads, dim_per_head)
        key_states = torch.cat([key_states, side_key_states], dim=2)
        value_states = torch.cat([value_states, side_value_states], dim=2)

        # Compute scores -> (batch_size, num_blocks, n_heads, block_len, 3 * block_len + global_seq_len)
        scores = torch.einsum("...qhd,...khd->...hqk", query_states, key_states)

        if mask is not None:
            # We need to adjust position bias shape to be sum with mask
            local_attention_mask = _get_local_attention_mask(mask, self.block_len, hidden_states.device)
            # Replace masked positions with -1e10 (approximately negative infinity)
            local_attention_mask = torch.where(local_attention_mask > 0, 0.0, -1e10)
        else:
            local_attention_mask = None

        if position_bias is None:
            # position_bias shape: (1, 1, n_heads, block_len, 3 * block_len)
            if not self.has_relative_attention_bias:
                position_bias = torch.zeros(
                    (1, 1, self.n_heads, self.block_len, 3 * self.block_len),
                    device=scores.device,
                    dtype=scores.dtype,
                )
                if self.gradient_checkpointing and self.training:
                    position_bias.requires_grad = True
            else:
                position_bias = self.compute_bias(self.block_len)

            if local_attention_mask is not None:
                # (batch_size, 1, n_heads, block_len, 3 * block_len)
                position_bias = position_bias + local_attention_mask.transpose(1, 2)
            position_bias = position_bias.type(scores.dtype)

            # Calculate global/side bias - shape: (batch_size, num_heads, seq_len, global_seq_len)
            if mask is None:
                mask = torch.ones(batch_size, seq_length)
            # (batch_size, num_heads, seq_len, global_seq_len)
            side_position_bias = self.compute_side_bias(mask, global_segment_ids)
            # (batch_size, num_blocks, num_heads, block_len, global_seq_len)
            side_position_bias = _split_into_blocks(side_position_bias, self.block_len, dim=-2).transpose(1, 2)
            side_position_bias = side_position_bias.type(scores.dtype).to(scores.device)
            # (batch_size, num_blocks, num_heads, block_len, 3 * block_len + global_seq_len)
            position_bias = torch.cat([position_bias, side_position_bias], dim=-1)

        scores += position_bias
        # (batch_size, num_blocks, n_heads, block_len, 3 * block_len + global_seq_len)
        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # Mask heads if we want to
        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask
        attn_weights = attn_weights.type(value_states.dtype)
        attn_output = unshape(torch.einsum("...hqk,...khd->...qhd", attn_weights, value_states))
        attn_output = attn_output[:, :seq_length, :]
        attn_output = self.o(attn_output)

        present_key_value_state = None
        outputs = (attn_output,) + (present_key_value_state,) + (position_bias,)

        if output_attentions:
            outputs = outputs + (attn_weights,)
        return outputs


class LongT5LayerSelfAttention(nn.Module):
    def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None):
        super().__init__()
        self.SelfAttention = LongT5Attention(
            config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx
        )
        self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        cache_position=None,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.SelfAttention(
            normed_hidden_states,
            mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states = hidden_states + self.dropout(attention_output[0])
        outputs = (hidden_states,) + attention_output[1:]  # add attentions if we output them
        return outputs


class LongT5LayerLocalSelfAttention(nn.Module):
    """Local self attention used in encoder"""

    def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None):
        super().__init__()
        self.LocalSelfAttention = LongT5LocalAttention(config, has_relative_attention_bias=has_relative_attention_bias)
        self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        output_attentions=False,
        **kwargs: Any,  # to accept past_key_value and use_cache kwargs
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.LocalSelfAttention(
            normed_hidden_states,
            mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = hidden_states + self.dropout(attention_output[0])
        outputs = (hidden_states,) + attention_output[1:]  # add attentions if we output them
        return outputs


class LongT5LayerTransientGlobalSelfAttention(nn.Module):
    """Transient-Global self attention used in encoder"""

    def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None):
        super().__init__()
        self.TransientGlobalSelfAttention = LongT5TransientGlobalAttention(
            config, has_relative_attention_bias=has_relative_attention_bias
        )
        self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        output_attentions=False,
        **kwargs: Any,  # to accept past_key_value and use_cache kwargs
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.TransientGlobalSelfAttention(
            normed_hidden_states,
            mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = hidden_states + self.dropout(attention_output[0])
        outputs = (hidden_states,) + attention_output[1:]  # add attentions if we output them
        return outputs


class LongT5LayerCrossAttention(nn.Module):
    def __init__(self, config, layer_idx: Optional[int] = None):
        super().__init__()
        self.EncDecAttention = LongT5Attention(config, has_relative_attention_bias=False, layer_idx=layer_idx)
        self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        key_value_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        query_length=None,
        output_attentions=False,
        cache_position=None,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.EncDecAttention(
            normed_hidden_states,
            mask=attention_mask,
            key_value_states=key_value_states,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            query_length=query_length,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        layer_output = hidden_states + self.dropout(attention_output[0])
        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
        return outputs


class LongT5Block(nn.Module):
    def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None):
        super().__init__()
        self.is_decoder = config.is_decoder
        if config.is_decoder:
            attention_layer = LongT5LayerSelfAttention
        elif config.encoder_attention_type == "local":
            attention_layer = LongT5LayerLocalSelfAttention
        elif config.encoder_attention_type == "transient-global":
            attention_layer = LongT5LayerTransientGlobalSelfAttention
        else:
            raise ValueError(
                "For encoder attention mechanism, either `local` or `transient-global` attention type is expected, "
                f"but got {config.encoder_attention_type}."
            )
        self.layer = nn.ModuleList()
        self.layer.append(
            attention_layer(config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx)
        )
        if self.is_decoder:
            self.layer.append(LongT5LayerCrossAttention(config, layer_idx=layer_idx))

        self.layer.append(LongT5LayerFF(config))

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        encoder_decoder_position_bias=None,
        layer_head_mask=None,
        cross_attn_layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        return_dict=True,
        cache_position=None,
    ):
        self_attention_outputs = self.layer[0](
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states, past_key_value = self_attention_outputs[:2]
        attention_outputs = self_attention_outputs[2:]  # Keep self-attention outputs and relative position weights

        # clamp inf values to enable fp16 inference - check https://github.com/huggingface/transformers/pull/19229/
        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        do_cross_attention = self.is_decoder and encoder_hidden_states is not None
        if do_cross_attention:
            cross_attention_outputs = self.layer[1](
                hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                position_bias=encoder_decoder_position_bias,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_value,
                query_length=cache_position[-1] + 1,
                use_cache=use_cache,
                output_attentions=output_attentions,
            )
            hidden_states, past_key_value = cross_attention_outputs[:2]

            # clamp inf values to enable fp16 inference - check https://github.com/huggingface/transformers/pull/19229/
            if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

            # Keep cross-attention outputs and relative position weights
            attention_outputs = attention_outputs + cross_attention_outputs[2:]

        # Apply Feed Forward layer
        hidden_states = self.layer[-1](hidden_states)

        # clamp inf values to enable fp16 inference - check https://github.com/huggingface/transformers/pull/19229/
        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)

        if use_cache:
            outputs = outputs + (past_key_value,) + attention_outputs
        else:
            outputs = outputs + attention_outputs

        # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights),
        # (cross-attention position bias), (cross-attention weights)
        return outputs
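# Illustrative configuration sketch (comment only; values assumed). The encoder attention mechanism is
# selected through `encoder_attention_type`, which LongT5Block maps to LongT5LayerLocalSelfAttention
# ("local") or LongT5LayerTransientGlobalSelfAttention ("transient-global"):
#
#   >>> config = LongT5Config(
#   ...     encoder_attention_type="transient-global",
#   ...     local_radius=127,        # block_len becomes local_radius + 1
#   ...     global_block_size=16,
#   ... )
#   >>> encoder_block = LongT5Block(config, has_relative_attention_bias=True, layer_idx=0)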
class LongT5PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = LongT5Config
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _no_split_modules = ["LongT5Block"]
    _supports_cache_class = True
    _supports_static_cache = False

    @property
    def dummy_inputs(self):
        input_ids = torch.tensor(DUMMY_INPUTS)
        input_mask = torch.tensor(DUMMY_MASK)
        dummy_inputs = {
            "decoder_input_ids": input_ids,
            "input_ids": input_ids,
            "decoder_attention_mask": input_mask,
        }
        return dummy_inputs

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor  # Used for testing weights initialization
        if isinstance(module, LongT5LayerNorm):
            module.weight.data.fill_(factor * 1.0)
        elif isinstance(module, (LongT5Model, LongT5ForConditionalGeneration, LongT5EncoderModel)):
            # Mesh TensorFlow embeddings initialization
            module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
            if hasattr(module, "lm_head") and not self.config.tie_word_embeddings:
                module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0)
        elif isinstance(module, LongT5DenseActDense):
            # Mesh TensorFlow FF initialization
            module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi, "bias") and module.wi.bias is not None:
                module.wi.bias.data.zero_()
            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                module.wo.bias.data.zero_()
        elif isinstance(module, LongT5DenseGatedActDense):
            module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None:
                module.wi_0.bias.data.zero_()
            module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None:
                module.wi_1.bias.data.zero_()
            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                module.wo.bias.data.zero_()
        elif isinstance(module, (LongT5Attention, LongT5LocalAttention, LongT5TransientGlobalAttention)):
            # Mesh TensorFlow attention initialization to avoid scaling before softmax
            d_model = self.config.d_model
            key_value_proj_dim = self.config.d_kv
            n_heads = self.config.num_heads
            module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5))
            module.k.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
            module.v.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
            module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5))
            if module.has_relative_attention_bias:
                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))
                if isinstance(module, LongT5TransientGlobalAttention):
                    module.global_relative_attention_bias.weight.data.normal_(
                        mean=0.0, std=factor * ((d_model) ** -0.5)
                    )

    def _shift_right(self, input_ids):
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id

        if decoder_start_token_id is None:
            raise ValueError(
                "self.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the "
                "pad_token_id. See LongT5 docs for more information."
            )

        # shift inputs to the right
        if is_torch_fx_proxy(input_ids):
            # Item assignment is not supported natively for proxies.
            shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
            shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
        else:
            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
            shifted_input_ids[..., 0] = decoder_start_token_id

        if pad_token_id is None:
            raise ValueError("self.model.config.pad_token_id has to be defined.")
        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

        return shifted_input_ids


class LongT5Stack(LongT5PreTrainedModel):
    def __init__(self, config, embed_tokens=None):
        super().__init__(config)

        self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model)
        if embed_tokens is not None:
            self.embed_tokens.weight = embed_tokens.weight
        self.is_decoder = config.is_decoder

        self.local_radius = config.local_radius
        self.block_len = self.local_radius + 1

        self.block = nn.ModuleList(
            [
                LongT5Block(config, has_relative_attention_bias=bool(i == 0), layer_idx=i)
                for i in range(config.num_layers)
            ]
        )
        self.final_layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, new_embeddings):
        self.embed_tokens = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        inputs_embeds=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        cache_position=None,
    ):
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(
                f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
            )
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        if inputs_embeds is None:
            assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
            inputs_embeds = self.embed_tokens(input_ids)

        batch_size, seq_length = input_shape

        # initialize past_key_values
        return_legacy_cache = False
        return_self_attention_cache = False
        if self.is_decoder and (use_cache or past_key_values is not None):
            if isinstance(past_key_values, Cache) and not isinstance(past_key_values, EncoderDecoderCache):
                return_self_attention_cache = True
                past_key_values = EncoderDecoderCache(past_key_values, DynamicCache())
            elif not isinstance(past_key_values, EncoderDecoderCache):
                return_legacy_cache = True
                logger.warning_once(
                    "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. "
                    "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                    "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
                )
                past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
            elif past_key_values is None:
                past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
        elif not self.is_decoder:
            # do not pass cache object down the line for encoder stack
            # it messes indexing later in decoder-stack because cache object is modified in-place
            past_key_values = None

        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
            )

        if attention_mask is None and not is_torchdynamo_compiling():
            # required mask seq length can be calculated via length of past cache
            mask_seq_length = past_key_values_length + seq_length
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        if self.is_decoder:
            causal_mask = self._update_causal_mask(
                attention_mask,
                inputs_embeds,
                cache_position,
                past_key_values.self_attention_cache if past_key_values is not None else None,
                output_attentions,
            )
        elif self.config.encoder_attention_type == "local":
            causal_mask = _get_local_attention_mask(attention_mask, self.block_len, inputs_embeds.device)
        else:  # we need to use both local attention mask and standard extended mask for transient-global attention
            causal_mask = attention_mask

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and self.is_decoder) else None
        position_bias = None
        encoder_decoder_position_bias = None

        hidden_states = self.dropout(inputs_embeds)

        for i, layer_module in enumerate(self.block):
            layer_head_mask = head_mask[i]
            cross_attn_layer_head_mask = cross_attn_head_mask[i]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.forward,
                    hidden_states,
                    causal_mask,
                    position_bias,
                    encoder_hidden_states,
                    encoder_extended_attention_mask,
                    encoder_decoder_position_bias,
                    layer_head_mask,
                    cross_attn_layer_head_mask,
                    None,  # past_key_value is always None with gradient checkpointing
                    use_cache,
                    output_attentions,
                    return_dict,
                    cache_position,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_bias=position_bias,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_extended_attention_mask,
                    encoder_decoder_position_bias=encoder_decoder_position_bias,
                    layer_head_mask=layer_head_mask,
                    cross_attn_layer_head_mask=cross_attn_layer_head_mask,
                    past_key_value=past_key_values,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                    return_dict=return_dict,
                    cache_position=cache_position,
                )

            # layer_outputs is a tuple with:
            # hidden-states, key-value-states, (self-attention position bias), (self-attention weights),
            # (cross-attention position bias), (cross-attention weights)
            if use_cache is False:
                layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:]

            hidden_states, next_decoder_cache = layer_outputs[:2]

            # We share the position biases between the layers - the first layer stores them
            position_bias = layer_outputs[2]
            if self.is_decoder and encoder_hidden_states is not None:
                encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[3],)
                if self.is_decoder:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[5],)

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        # Add last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if return_self_attention_cache:
            next_cache = past_key_values.self_attention_cache
        if return_legacy_cache:
            next_cache = next_cache.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_cache,
                    all_hidden_states,
                    all_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
 
 =  !===
 
r@   )last_hidden_statepast_key_valuesr   
attentionscross_attentions)*r   r?  r@  output_hidden_statesuse_return_dictr   r  sizer-  r   r,  r   r   r  r   r   r   r   from_legacy_cacheget_seq_lengthr7   rW   rC   r   r   _update_causal_maskr1  r  rl   r%   invert_attention_maskget_head_maskr  r   	enumerater  _gradient_checkpointing_funcr   r  to_legacy_cacherO   r   ))r   r  re   r  r  r  	head_maskcross_attn_head_maskr  r?  r@  r  r  r#  err_msg_prefixinput_shaper   rA  return_legacy_cachereturn_self_attention_cachepast_key_values_lengthmask_seq_lengthrJ  encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsr<  r  r   rS   layer_moduler>  r  layer_outputsnext_decoder_cache
next_caches)                                            r>   r   zLongT5Stack.forwarde  s     "+!6IIDK<Q	1B1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] ]%>+/?BZZNw>wwwww   "#..**K!r;r?;;II&',,..ss3KK+/?BZZNu>uuXfuuuvvv& 	"4= 	" "##p   "	 $002p000 --i88M!,
J $&+#? 	#	 	#_-H/511 V*_Vi:j:j V.2+"5o|~~"V"V1DEE 	V&*###`  
 #6"G"X"X ("5lnnlnn"U"U 	# #OETE`!?!?!A!A!Afg!"\&(>(KTaTh  N !*B*D*D!4zAO"Z
OML`aaaN? 	)228G8S44Y]! KK [/7::3NDNTaThiiKK(K ? 	34@=R=W=W=Y=Y: 7$68O#P %-).4HQ^Qe)f)f)f&.2.H.HI_.`.`++.2+ &&y$+2HII	#112FH^__"6@BBD0:d&7VDOVrrRV(,%]33(44 :	V :	VOA|'lO)=a)@&# I$58H$H!*  t}   $ A A (!!)31#.%"! !" !-!#."/*?+J2O$3/I#2'&7 +#1! ! !$ E!! -bqb 1G ;mABB>O O0=bqb0A-M-
 *!,M ]#8#D0=CT>[aaZ[0\-  V!/=3C2E!E? V+?=QRCSBU+U(--m<<]33   	E 1]4D D+4>''$
& 	>(=J 	;(88::J 	 
 
 "%"(
 
 
 
 
 
 9+&+%1
 
 
 	
r@   re   input_tensorr#  r  r@  c           
         | j         j        dk    r
|d|v r|S d S ||                                nd}t          |t                    }| j         j        dk    r#|s!|st          j        |||| j                  rd S |j        |j	        }	}|j
        d         }
|r|                                }n/t          |t          j                  r|j
        d         n||
z   dz   }|                     ||
|||	||j
        d                   }| j         j        dk    rB|@|j	        j        d	k    r0|s.t          j        |          j        }t          j        ||          }|S )
Nflash_attention_2rz   r   sdpa)r  r"  is_trainingr"   r-   )sequence_lengthtarget_lengthr+   rC   r#  r   cuda)r   _attn_implementationr  r   r   r   _ignore_causal_mask_sdpar,  r+   rC   r4   get_max_cache_shaper7   r|   5_prepare_4d_causal_attention_mask_with_cache_positionrq   r  r  _unmask_unattended)r   re   r0  r#  r  r@  past_seen_tokensusing_static_cacher+   rC   r5  r6  rJ  	min_dtypes                 r>   r  zLongT5Stack._update_causal_mask:  s    ;+/BBB)c^.C.C%%4
 @O?Z?99;;;`a'EE ;+v55>P5Yj5%>*'7 M	    t$*L,?v&,Q/ 	+??AAMM nel;;<$R((%7!;  PP+')#)!, Q 
 
 K,66*%*f44% 5 E**.I0CKQZ[[Kr@   r5  r6  r+   rC   r   c                    | |                                  dk    r| }n+t          j        |          j        }	t          j        ||f|	||          }|dk    rt          j        |d          }|t          j        ||          |                    dd          k    z  }|ddddddf                             |ddd          }| |	                                }| j
        d         }
|ddddddd|
f         | ddddddf         z   }|dk    }|ddddddd|
f                             ||	          |ddddddd|
f<   |S )	a  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            device (`torch.device`):
                The device to place the 4D attention mask on.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`int`):
                Batch size.
        Nr	  )
fill_valuer+   rC   r"   )diagonalrx   r-   r   )r&   r7   r  r  r  triurW   rE   expandr  r4   masked_fill)re   r5  r6  r+   rC   r#  r   r  rJ  r?  mask_lengthpadding_masks               r>   r;  zALongT5Stack._prepare_4d_causal_attention_mask_with_cache_position{  s   D %.*<*<*>*>!*C*C(KKE**.I* -0Ye\b  K !###jqAAA5<fEEEH^H^_acdHeHeeeK%dD!!!QQQ&67>>z1bRTUUK))//11,226*111aaaL[L+@ANSTSTSTVZ\`bcbcbcScDdd+q05@AAAqqq,;,AV5W5c5c )6 6AAAqqq!!!\k\12 r@   r   )NNNNNNNNNNNNN)r   r   r   r   r   r  r   r7   r|   r   r7  r  rQ  r   r+   rC   r;  r   r   s   @r>   r  r  C  sR            4! ! !+ + +
 "#!!R
 R
 R
 R
j?? l? 	?
 ?  ? ? ? ?B 555 5 {	5
 5 5 5 5 5 \5 5 5 5 5r@   r  aQ  

    The LongT5 model was proposed in [LongT5: Efficient Text-To-Text Transformer for Long
    Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo
    Ni, Yun-Hsuan Sung and Yinfei Yang. It's an encoder-decoder transformer pre-trained in a text-to-text denoising
    generative setting. The LongT5 model is an extension of the T5 model, and it enables using one of two different
    efficient attention mechanisms - (1) Local attention, or (2) Transient-Global attention (see the configuration
    sketch at the end of this docstring).

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads,
    etc.).

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
    and behavior.

    Parameters:
        config ([`LongT5Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
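
    Example (a minimal, illustrative sketch of selecting the attention mechanism through the configuration; the
    `encoder_attention_type`, `local_radius` and `global_block_size` fields and the values shown are assumed to be the
    standard [`LongT5Config`] defaults, since they are not restated in this docstring):

    ```python
    >>> from transformers import LongT5Config, LongT5ForConditionalGeneration

    >>> # (1) Local attention
    >>> config = LongT5Config(encoder_attention_type="local", local_radius=127)
    >>> model = LongT5ForConditionalGeneration(config)

    >>> # (2) Transient-Global attention
    >>> config = LongT5Config(encoder_attention_type="transient-global", global_block_size=16)
    >>> model = LongT5ForConditionalGeneration(config)
    ```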
a  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)

            To learn more about how to prepare `input_ids` for pretraining, take a look at [LONGT5
            Training](./longt5#training).
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To learn more about how to prepare `decoder_input_ids` for pretraining, take a look at [LONGT5
            Training](./longt5#training). A short sketch of a full forward pass using these arguments is given at the
            end of this docstring.
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the encoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
            Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, `optional`: *attentions*)
            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden states at
            the output of the last layer of the encoder. Used in the cross-attention of the decoder.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
            representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
            input (see `past_key_values`). This is useful if you want more control over how to convert
            `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.

            If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
            of `inputs_embeds`.

        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
            cache in the correct position and to infer the complete sequence length.
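
    Example (a minimal sketch of a training-style forward pass using the arguments documented above; the checkpoint
    name matches the one used in the usage examples of this file, and the input strings are purely illustrative):

    ```python
    >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

    >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
    >>> model = LongT5ForConditionalGeneration.from_pretrained("google/long-t5-local-base")

    >>> inputs = tokenizer(100 * "A long input sentence. ", return_tensors="pt")
    >>> labels = tokenizer("A short target sentence.", return_tensors="pt").input_ids

    >>> # When `labels` are passed and `decoder_input_ids` are not, the decoder inputs are built internally by
    >>> # shifting the labels to the right, starting from the configured decoder start token.
    >>> outputs = model(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, labels=labels)
    >>> loss, logits = outputs.loss, outputs.logits
    ```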
a  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            To learn more about how to prepare `input_ids` for pretraining, take a look at [LONGT5
            Training](./longt5#training).
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix (see the `inputs_embeds` sketch at the end of this docstring).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
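
    Example (a minimal sketch of the `inputs_embeds` path described above; looking the embeddings up through
    `get_input_embeddings()` simply mirrors what the model would do with `input_ids` and is shown only for
    illustration):

    ```python
    >>> from transformers import AutoTokenizer, LongT5EncoderModel

    >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
    >>> model = LongT5EncoderModel.from_pretrained("google/long-t5-local-base")

    >>> input_ids = tokenizer("Studies have shown that owning a dog is good for you", return_tensors="pt").input_ids
    >>> inputs_embeds = model.get_input_embeddings()(input_ids)

    >>> outputs = model(inputs_embeds=inputs_embeds)
    >>> last_hidden_state = outputs.last_hidden_state
    ```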
a_  
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
num_heads)`.
z`The bare LONGT5 Model transformer outputting raw hidden-states without any specific head on top.c            '           e Zd ZdgZddgZdef fdZd Zd Zd Z	d	 Z
d
 Zd Z ee           eee          	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d deej                 deej                 deej                 deej                 deej                 deej                 deej                 deeeej                                   deeeej                                   deej                 deej                 dee         dee         dee         dee         deej                 deeej                 ef         f"d                        Z xZS )!r  Fdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightencoder.embed_tokens.weightdecoder.embed_tokens.weightr   c                    t                                          |           t          j        |j        |j                  | _        t          j        |          }d|_	        d|_
        d|_        t          || j                  | _        t          j        |          }d|_	        d|_        |j        |_        t          || j                  | _        |                                  d S )NFT)r   r   r   r   r  r   r  copydeepcopyr   r?  is_encoder_decoderr  encodernum_decoder_layersr  decoderr  r   r   encoder_configdecoder_configr   s       r>   r   zLongT5Model.__init__f  s       l6#4fnEEv..$)!#( ,1)">4;??v..$(!,1)$*$=!">4;?? 	r@   c                     | j         S r   r  r  s    r>   r   z LongT5Model.get_input_embeddingsy  
    {r@   c                 |    || _         | j                            |           | j                            |           d S r   r  rP  r  rR  r  s     r>   r  z LongT5Model.set_input_embeddings|  ;    $)).999)).99999r@   c                     | j         j        rL|                     | j        j        | j                   |                     | j        j        | j                   d S d S r   r   r  _tie_or_clone_weightsrP  r  r  rR  r  s    r>   _tie_weightszLongT5Model._tie_weights  \    ;* 	O&&t|'@$+NNN&&t|'@$+NNNNN	O 	Or@   c                     | j         S r   rP  r  s    r>   get_encoderzLongT5Model.get_encoder  
    |r@   c                     | j         S r   rR  r  s    r>   get_decoderzLongT5Model.get_decoder  rd  r@   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        NitemsrP  r  	attentionr  r   heads_to_pruner  r  s       r>   _prune_headszLongT5Model._prune_heads  U    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr@   output_typer  Nr  re   r  r  r  decoder_head_maskr  encoder_outputsr  r  decoder_inputs_embedsr?  r@  r  r  r#  r(   c                    ||n| j         j        }||n| j         j        }|=|;| j         j        | j         j        k    r!t          j        t          t                     |}|| 	                    |||
||||          }ne|rct          |t                    sNt          |d         t          |          dk    r|d         ndt          |          dk    r|d         nd          }|d         }|                     ||||	|||||||||          }|s||z   S t          |j        |j        |j        |j        |j        |j        |j        |j                  S )	a%  
        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LongT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
        >>> model = LongT5Model.from_pretrained("google/long-t5-local-base")

        >>> # Let's try a very long encoder input.
        >>> input_ids = tokenizer(
        ...     100 * "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1

        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr  re   r  r  r@  r  r  r   r"   rg   r  r   r  r  re   r  r  r  r  r  r  r?  r@  r  r  r#  )r  r  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater  encoder_attentions)r   r?  r  r  rQ  warningswarn#_LongT5Model__HEAD_MASK_WARNING_MSGFutureWarningrP  r   r   r   rR  r   r  r  r   r  r  )r   r  re   r  r  r  rs  r  rt  r  r  ru  r?  r@  r  r  r#  r   decoder_outputss                      r>   r   zLongT5Model.forward  s   V "+!6IIDK<Q	%0%<kk$+B]  %6%>{%)GGG5}EEE$-! ""ll#-+#"3%9' +  OO  	O_!M!M 	-"1!"4474H4H14L4Loa00RV14_1E1E1I1I?1--t  O (* ,,'1/+"/#1'!5/!5#) ' 
 
   	5"_44!-?+;"1"?.9,=&5&G"1"?.9	
 	
 	
 		
r@   )NNNNNNNNNNNNNNNN)r   r   r   "_keys_to_ignore_on_load_unexpected_tied_weights_keysr#   r   r   r  r_  rc  rg  ro  r   LONGT5_INPUTS_DOCSTRINGr!   r   _CONFIG_FOR_DOCr   r7   
LongTensorFloatTensor
BoolTensorr|   r   r7  r   r   r   r   s   @r>   r  r  \  s        	R*& 89VW|      &  : : :
O O O
    C C C +*+BCC+=O\\\ 156:8<=A159=7;EIEI048<$(,0/3&*59#c
 c
E,-c
 !!23c
 $E$45	c

 !))9 :c
 E-.c
 $E$56c
 'u|4c
 "%e.?(@"ABc
 "%e.?(@"ABc
  -c
  (5c
 D>c
 $D>c
 'tnc
  d^!c
" !!12#c
$ 
uU&');;	<%c
 c
 c
 ]\ DCc
 c
 c
 c
 c
r@   r  z4LONGT5 Model with a `language modeling` head on top.c            )           e Zd ZdgZg dZdef fdZd Zd Zd Z	d Z
d	 Zd
 Zd Z ee           eee          	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d#deej                 deej                 deej                 deej                 deej                 deej                 deej                 deeeej                                   deeeej                                   deej                 deej                 deej                 dee         dee         dee         dee         deej                 deeej                 ef         f$d                         Zdej        fd!Zd" Z xZS )$r  rI  )rJ  rK  zlm_head.weightr   c                 4   t                                          |           |j        | _        t	          j        |j        |j                  | _        t          j	        |          }d|_
        d|_        d|_        t          || j                  | _        t          j	        |          }d|_
        d|_        |j        |_        t          || j                  | _        t	          j        |j        |j        d          | _        |                                  d S )NFTr   )r   r   r   	model_dimr   r   r  r  rM  rN  r   r?  rO  r  rP  rQ  r  rR  r   r  r  rS  s       r>   r   z'LongT5ForConditionalGeneration.__init__  s       l6#4fnEEv..$)!#( ,1)">4;??v..$(!,1)$*$=!">4;??y1BOOO 	r@   c                     | j         S r   rW  r  s    r>   r   z3LongT5ForConditionalGeneration.get_input_embeddings  rX  r@   c                 |    || _         | j                            |           | j                            |           d S r   rZ  r  s     r>   r  z3LongT5ForConditionalGeneration.set_input_embeddings  r[  r@   c                     | j         j        rL|                     | j        j        | j                   |                     | j        j        | j                   d S d S r   r]  r  s    r>   r_  z+LongT5ForConditionalGeneration._tie_weights"  r`  r@   c                     || _         d S r   r  r  s     r>   set_output_embeddingsz4LongT5ForConditionalGeneration.set_output_embeddings'  s    %r@   c                     | j         S r   r  r  s    r>   get_output_embeddingsz4LongT5ForConditionalGeneration.get_output_embeddings*  rd  r@   c                     | j         S r   rb  r  s    r>   rc  z*LongT5ForConditionalGeneration.get_encoder-  rd  r@   c                     | j         S r   rf  r  s    r>   rg  z*LongT5ForConditionalGeneration.get_decoder0  rd  r@   rq  Nr  re   r  r  r  rs  r  rt  r  r  ru  labelsr?  r@  r  r  r#  r(   c                 V   ||n| j         j        }||n| j         j        }|=|;| j         j        | j         j        k    r!t          j        t          t                     |}|| 	                    |||
||||          }ne|rct          |t                    sNt          |d         t          |          dk    r|d         ndt          |          dk    r|d         nd          }|d         }||||                     |          }|                     ||||	|||||||||          }|d         }| j         j        r|| j        dz  z  }|                     |          }d}|pt%          d	
          }|                    |j                  } ||                    d|                    d                    |                    d                    }|s|f|dd         z   |z   }||f|z   n|S t/          |||j        |j        |j        |j        |j        |j        |j        	  	        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")
        >>> model = LongT5ForConditionalGeneration.from_pretrained(
        ...     "Stancld/longt5-tglobal-large-16384-pubmed-3k_steps"
        ... )

        >>> # Let's try a very long input.
        >>> inputs = tokenizer(100 * "studies have shown that owning a dog is good for you ", return_tensors="pt")
        >>> input_ids = inputs.input_ids

        >>> outputs = model.generate(input_ids)
        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
        abstractthe aim of this article is to provide an overview of the literature on the role of dog
        ```Nrw  r   r"   rg   rx  ry  r  r  )ignore_indexr-   )	losslogitsr  rz  r{  r  r|  r  r}  )r   r?  r  r  rQ  r~  r  6_LongT5ForConditionalGeneration__HEAD_MASK_WARNING_MSGr  rP  r   r   r   r  rR  r  r  r  r	   ra   rC   r-  r  r   r  r   r  r  r  )r   r  re   r  r  r  rs  r  rt  r  r  ru  r  r?  r@  r  r  r#  r   r  sequence_output	lm_logitsr  loss_fctoutputs                            r>   r   z&LongT5ForConditionalGeneration.forward3  s   ` "+!6IIDK<Q	%0%<kk$+B]  %6%>{%)GGG5}EEE$-! ""ll#-+#"3%9' +  OO  	O_!M!M 	-"1!"4474H4H14L4Loa00RV14_1E1E1I1I?1--t  O (*"3";@U@] $ 1 1& 9 9 ,,'1/+"/#1'!5/!5#) ' 
 
  *!,;* 	G .1EFOLL11	'T:::HYYy/00F8INN2y~~b/A/ABBFKKPROOTTD  	F\OABB$77/IF)-)9TGf$$vE+;"1"?.9,=&5&G"1"?.9

 

 

 
	
r@   c                 ,    |                      |          S r   )r  )r   r  s     r>   %prepare_decoder_input_ids_from_labelszDLongT5ForConditionalGeneration.prepare_decoder_input_ids_from_labels  s      (((r@   c           	      N   |t                               d           |S d}|D ]}d}|D ]4}||                    d|                    |j                            fz   }5|d         j        |d         j        k    sJ t          |          t          |          k    sJ ||fz   }|S )NzHYou might want to consider setting `use_cache=True` to speed up decodingr.   r   )r   warningindex_selectra   rC   r4   r   )r   r  beam_idxreordered_decoder_pastlayer_past_statesreordered_layer_past_stateslayer_past_states          r>   _reorder_cachez-LongT5ForConditionalGeneration._reorder_cache  s     "NNefff""!#!0 	] 	] +-'$5   .I$11!X[[AQAX5Y5YZZM /++ /q17;LQ;O;UUUUU233s;L7M7MMMMM%;?Z>\%\""%%r@   )NNNNNNNNNNNNNNNNN) r   r   r   r  r  r#   r   r   r  r_  r  r  rc  rg  r   r  r!   r   r  r   r7   r  r  r  r|   r   r7  r   r   r  r  r   r   s   @r>   r  r    s        	R*& jii|      .  : : :
O O O
& & &       +*+BCC?YYY 156:8<=A159=7;@D@D59=A-1$(,0/3&*59%@
 @
E,-@
 !!23@
 $E$45	@

 !))9 :@
 E-.@
 $E$56@
 'u|4@
 "%el(;"<=@
 "%el(;"<=@
   12@
  ((9:@
 )*@
 D>@
 $D>@
  'tn!@
" d^#@
$ !!12%@
& 
uU&'8	9'@
 @
 @
 ZY DC@
D)EL ) ) ) )& & & & & & &r@   r  zjThe bare LONGT5 Model transformer outputting encoder's raw hidden-states without any specific head on top.c                   p    e Zd ZdgZdgZdef fdZd Zd Zd Z	d Z
d	 Z ee           eee
          	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 dee         dee         dee         deeej                 ef         fd                        Z xZS )r  rJ  rR  r   c                 2   t                                          |           t          j        |j        |j                  | _        t          j        |          }d|_	        d|_
        t          || j                  | _        |                                  d S )NF)r   r   r   r   r  r   r  rM  rN  r?  rO  r  rP  r  )r   r   rT  r   s      r>   r   zLongT5EncoderModel.__init__  s}       l6#4fnEEv..#( ,1)">4;?? 	r@   c                     | j         S r   rW  r  s    r>   r   z'LongT5EncoderModel.get_input_embeddings  rX  r@   c                 H    || _         | j                            |           d S r   )r  rP  r  r  s     r>   r  z'LongT5EncoderModel.set_input_embeddings  s%    $)).99999r@   c                 l    | j         j        r'|                     | j        j        | j                   d S d S r   )r   r  r^  rP  r  r  r  s    r>   r_  zLongT5EncoderModel._tie_weights  s?    ;* 	O&&t|'@$+NNNNN	O 	Or@   c                     | j         S r   rb  r  s    r>   rc  zLongT5EncoderModel.get_encoder  rd  r@   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS ri  rj  rm  s       r>   ro  zLongT5EncoderModel._prune_heads  rp  r@   rq  Nr  re   r  r  r@  r  r  r(   c           	      ^    ||n| j         j        }|                     |||||||          }|S )a\  
        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
        >>> model = LongT5EncoderModel.from_pretrained("google/long-t5-local-base")
        >>> input_ids = tokenizer(
        ...     100 * "Studies have been shown that owning a dog is good for you ", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nrw  )r   r  rP  )	r   r  re   r  r  r@  r  r  rt  s	            r>   r   zLongT5EncoderModel.forward  sL    8 &1%<kk$+B],,)'/!5# ' 
 
 r@   )NNNNNNN)r   r   r   r  r  r#   r   r   r  r_  rc  ro  r   LONGT5_ENCODER_INPUTS_DOCSTRINGr!   r   r  r   r7   r  r  r7  r   r   r   r   r   s   @r>   r  r    s       
 88*4&
| 
 
 
 
 
 
  : : :O O O  C C C +*+JKK?YYY 156:1559,0/3&*& &E,-& !!23& E-.	&
   12& $D>& 'tn& d^& 
uU&'8	9& & & ZY LK& & & & &r@   r  )r   )]r  rM  r  r~  typingr   r   r   r   r   r7   r   torch.nnr	   activationsr   cache_utilsr   r   r   r   
generationr   modeling_attn_mask_utilsr   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   r   r   r    r!   configuration_longt5r#   
get_loggerr   r   r  _CHECKPOINT_FOR_DOCr|   r   r?   rH   rU   r]   rd   rC   rl   r   r   r   Moduler   apex.normalizationr   infoImportError	Exceptionr  rP   r   r   r   r   rS  rv  r  r  r  r  r  r  r  LONGT5_START_DOCSTRINGr  r  __HEAD_MASK_WARNING_MSGr  r  r  r.   r@   r>   <module>r     s>        4 4 4 4 4 4 4 4 4 4 4 4 4 4        % % % % % % ! ! ! ! ! ! P P P P P P P P P P P P ) ) ) ) ) ) > > > > > >            . - - - - - g g g g g g g g g g	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 / . . . . . 
	H	%	% 1 
   3 3 W\Wc     #%, #3 #S #U\ # # # #4 4U\ 4c 4 4Y\ 4ejeq 4 4 4 42!# !%, ! ! ! !BU\ Bc BV[Vb B B B B8el 8s 8TYT` 8ejeq 8 8 8 8 .PL.P58.P
5<%&.P .P .P .Pb4U\ 4VY 4^c^j 4 4 4 4	j<	j,1L	jJM	j
\	j 	j 	j 	j+ + + + +bi + + +2	//////"O
KKeffff 	 	 	D 	 	 	
NN[\\\D	   O , , ,    ")   ,    ry   &    BI   &a a a a abi a a aH} } } } }29 } } }@D D D D DRY D D DP! ! ! ! !ry ! ! !H    BI   >    bi   D# # # # #	 # # #La a a a a") a a aHc! c! c! c! c!O c! c! c!Lo o o o o' o o od ,` D$# N  f Y
 Y
 Y
 Y
 Y
' Y
 Y
	 Y
x PRhiiS& S& S& S& S&%:O S& S& jiS&l p N N N N N. N N	 N N Ns   =F F>"F>=F>