"""LongT5 model configuration"""

from typing import Mapping

from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxSeq2SeqConfigWithPast
from ...utils import logging


logger = logging.get_logger(__name__)


class LongT5Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LongT5Model`] or a [`FlaxLongT5Model`]. It is
    used to instantiate a LongT5 model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the LongT5
    [google/long-t5-local-base](https://huggingface.co/google/long-t5-local-base) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
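
    Example (a minimal usage sketch; it assumes the PyTorch `LongT5Model` class referenced above is available):

    ```python
    >>> from transformers import LongT5Config, LongT5Model

    >>> # Initializing a configuration in the google/long-t5-local-base style
    >>> configuration = LongT5Config()

    >>> # The same configuration, but with transient-global encoder attention instead of the default local attention
    >>> tglobal_configuration = LongT5Config(encoder_attention_type="transient-global", global_block_size=16)

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = LongT5Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```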

    Arguments:
        vocab_size (`int`, *optional*, defaults to 32128):
            Vocabulary size of the LongT5 model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`LongT5Model`].
        d_model (`int`, *optional*, defaults to 512):
            Size of the encoder layers and the pooler layer.
        d_kv (`int`, *optional*, defaults to 64):
            Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model //
            num_heads`.
        d_ff (`int`, *optional*, defaults to 2048):
            Size of the intermediate feed forward layer in each `LongT5Block`.
        num_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer encoder.
        num_decoder_layers (`int`, *optional*):
            Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set.
        num_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        local_radius (`int`, *optional*, defaults to 127):
            Number of tokens to the left/right for each token to locally self-attend in a local attention mechanism.
        global_block_size (`int`, *optional*, defaults to 16):
            Length of blocks an input sequence is divided into for a global token representation. Used only for
            `encoder_attention_type = "transient-global"`.
        relative_attention_num_buckets (`int`, *optional*, defaults to 32):
            The number of buckets to use for each attention layer.
        relative_attention_max_distance (`int`, *optional*, defaults to 128):
            The maximum distance of the longer sequences for the bucket separation.
        dropout_rate (`float`, *optional*, defaults to 0.1):
            The ratio for all dropout layers.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
        feed_forward_proj (`string`, *optional*, defaults to `"relu"`):
            Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`. LongT5v1.1 uses the
            `"gated-gelu"` feed forward projection. Original LongT5 implementation uses `"gated-gelu"`.
        encoder_attention_type (`string`, *optional*, defaults to `"local"`):
            Type of encoder attention to be used. Should be one of `"local"` or `"transient-global"`, which are
            supported by LongT5 implementation.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
    """

    model_type = "longt5"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "hidden_size": "d_model",
        "num_attention_heads": "num_heads",
        "num_hidden_layers": "num_layers",
        "head_dim": "d_kv",
    }

    def __init__(
        self,
        vocab_size=32128,
        d_model=512,
        d_kv=64,
        d_ff=2048,
        num_layers=6,
        num_decoder_layers=None,
        num_heads=8,
        local_radius=127,
        global_block_size=16,
        relative_attention_num_buckets=32,
        relative_attention_max_distance=128,
        dropout_rate=0.1,
        layer_norm_epsilon=1e-6,
        initializer_factor=1.0,
        feed_forward_proj="relu",
        is_encoder_decoder=True,
        encoder_attention_type="local",
        use_cache=True,
        pad_token_id=0,
        eos_token_id=1,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.d_kv = d_kv
        self.d_ff = d_ff
        self.num_layers = num_layers
        # default = symmetry between encoder and decoder layers
        self.num_decoder_layers = num_decoder_layers if num_decoder_layers is not None else self.num_layers
        self.num_heads = num_heads
        self.local_radius = local_radius
        self.global_block_size = global_block_size
        self.relative_attention_num_buckets = relative_attention_num_buckets
        self.relative_attention_max_distance = relative_attention_max_distance
        self.dropout_rate = dropout_rate
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_factor = initializer_factor
        self.feed_forward_proj = feed_forward_proj
        self.encoder_attention_type = encoder_attention_type
        self.use_cache = use_cache

        act_info = self.feed_forward_proj.split("-")
        self.dense_act_fn = act_info[-1]
        self.is_gated_act = act_info[0] == "gated"

        if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2:
            raise ValueError(
                f"`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer. "
                "Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. "
                "'gated-gelu' or 'relu'"
            )

        # for backwards compatibility
        if feed_forward_proj == "gated-gelu":
            self.dense_act_fn = "gelu_new"

        super().__init__(
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            **kwargs,
        )


class LongT5OnnxConfig(OnnxSeq2SeqConfigWithPast):
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        common_inputs = {
            "input_ids": {0: "batch", 1: "encoder_sequence"},
            "attention_mask": {0: "batch", 1: "encoder_sequence"},
        }
        if self.use_past:
            common_inputs["attention_mask"][1] = "past_encoder_sequence + sequence"
            common_inputs["decoder_input_ids"] = {0: "batch"}
            common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
        else:
            common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
            common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}

        if self.use_past:
            self.fill_with_past_key_values_(common_inputs, direction="inputs")

        return common_inputs

    @property
    def default_onnx_opset(self) -> int:
        return 13