§
    ٌ§g<  م                   َ–   — d Z ddlmZ ddlmZ  ej        e¦  «        Z G d„ de¦  «        Z G d„ de¦  «        Z	 G d„ d	e¦  «        Z
d
S )zIdefics model configurationé   )عPretrainedConfig)عloggingc                   َH   ‡ — e Zd ZdZdZddiZ	 	 	 	 	 	 	 	 	 	 	 	 dˆ fd„	Zˆ xZS )عIdeficsVisionConfiga?
  
    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Idefics-9B.

    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer. (elsewhere referred to as `hidden_size`)
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        intermediate_size (`int`, *optional*, defaults to 5120):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        image_num_channels (`int`, *optional*, defaults to `3`):
            Number of image channels.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1.0, used internally for initialization
            testing).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    عideficsعhidden_sizeع	embed_dimé   éà   é   é   é    é   r   عgeluçٌhمˆµّن>ç        ç{®Gلz”?ç      ً?c                 َâ   •— || _         || _        || _        || _        || _        || _        || _        |	| _        |
| _        || _	        || _
        || _         t          ¦   «         j        di |¤ژ d S ©N© )r	   ع
image_sizeعintermediate_sizeع
patch_sizeعnum_hidden_layersعnum_attention_headsعnum_channelsعlayer_norm_epsعattention_dropoutعinitializer_rangeعinitializer_factorع
hidden_actعsuperع__init__)عselfr	   r   r   r   r   r   r   r"   r   r   r    r!   عkwargsع	__class__s                 €ْm/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/idefics/configuration_idefics.pyr$   zIdeficsVisionConfig.__init__L   s„   ّ€ ً  #ˆŒط$ˆŒط!2ˆشط$ˆŒط!2ˆشط#6ˆش ط(ˆشط,ˆشط!2ˆشط!2ˆشط"4ˆشط$ˆŒàچ‰Œشذ"ذ"ک6ذ"ذ"ذ"ذ"ذ"َ    )r
   r   r   r   r   r   r   r   r   r   r   r   )ع__name__ع
__module__ع__qualname__ع__doc__ع
model_typeعattribute_mapr$   ع__classcell__©r'   s   @r(   r   r      sپ   ّ€ € € € € ً'ً 'ًR €Jàگ{ً€Mً طططططططططططً#ً #ً #ً #ً #ً #ً #ً #ً #ً #r)   r   c                   َ4   ‡ — e Zd ZdZdZ	 	 	 	 	 	 d	ˆ fd„	Zˆ xZS )
عIdeficsPerceiverConfigaإ  
    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Idefics-9B.

    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        use_resampler (`bool`, *optional*, defaults to `False`):
            Whether or not to use the resampler
        resampler_n_latents (`int`, *optional*, defaults to ):
            Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
        resampler_depth (`int`, *optional*, defaults to 6):
            Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
        resampler_n_heads (`int`, *optional*, defaults to 16):
            Number of heads in each Transformer block (for multi-headed self-attention).
        resampler_head_dim (`int`, *optional*, defaults to 96):
            Dimensionality of each head projection in the Transformer block.
        qk_layer_norms_perceiver (`bool`, *optional*, defaults to `False`):
            Whether or not to use qk layer norms in perceiver
    r   Fé@   é   r   é`   c                 َژ   •— || _         || _        || _        || _        || _        || _         t          ¦   «         j        di |¤ژ d S r   )عuse_resamplerعresampler_n_latentsعresampler_depthعresampler_n_headsعresampler_head_dimعqk_layer_norms_perceiverr#   r$   )	r%   r8   r9   r:   r;   r<   r=   r&   r'   s	           €r(   r$   zIdeficsPerceiverConfig.__init__ˆ   sX   ّ€ ً +ˆشط#6ˆش ط.ˆشط!2ˆشط"4ˆشط(@ˆش%àچ‰Œشذ"ذ"ک6ذ"ذ"ذ"ذ"ذ"r)   )Fr4   r5   r   r6   F)r*   r+   r,   r-   r.   r$   r0   r1   s   @r(   r3   r3   l   sb   ّ€ € € € € ًً ً2 €Jً ططططط!&ً#ً #ً #ً #ً #ً #ً #ً #ً #ً #r)   r3   c                   َd   ‡ — e Zd ZdZdZdZddddddd	d
ddd	ddddddddddg ddg dddfˆ fd„	Zˆ xZS )عIdeficsConfiga  
    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Idefics-9B.

    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        additional_vocab_size (`int`, *optional*, defaults to 0):
            Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens
            are always trainable whereas regular vocab tokens can be frozen or not.
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Idefics model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`~IdeficsModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        alpha_initializer (`str`, *optional*, defaults to `"zeros"`):
            Initialization type for the alphas.
        alphas_initializer_range (`float`, *optional*, defaults to 0.0):
            The standard deviation of the truncated_normal_initializer for initializing the alphas in the Gated Cross
            Attention.
        alpha_type (`str`, *optional*, defaults to `"float"`):
            Whether the gating alphas should be vectors or single floats.
        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0)
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1)
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2)
            End of stream token id.
        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        cross_layer_interval (`int`, *optional*, default to 1)
            Interval for cross attention (from text to image) layers.
        qk_layer_norms (`bool`, *optional*, defaults to `False`): Whether to add layer norm after q and k
        freeze_text_layers (`bool`, *optional*, defaults to `True`): Whether to freeze text layers
        freeze_text_module_exceptions (`bool`, *optional*, defaults to `[]`):
            Exceptions to freezing text layers when `freeze_text_layers` is `True`
        freeze_lm_head (`bool`, *optional*, defaults to `False`): Whether to freeze lm head
        freeze_vision_layers (`bool`, *optional*, defaults to `True`):  Whether to freeze vision layers
        freeze_vision_module_exceptions (`bool`, *optional*, defaults to `[]`):
            Exceptions to freezing vision layers when `freeze_vision_layers` is `True`
        use_resampler (`bool`, *optional*, defaults to `False`): Whether to use the Resampler
        vision_config (`IdeficsVisionConfig`,  *optional*): Custom vision config or dict
        perceiver_config (`IdeficsPerceiverConfig`,  *optional*): Custom perceiver config or dict

    Example:

    ```python
    >>> from transformers import IdeficsModel, IdeficsConfig

    >>> # Initializing a Idefics idefics-9b style configuration
    >>> configuration = IdeficsConfig()

    >>> # Initializing a model from the idefics-9b style configuration
    >>> model = IdeficsModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```r   Fi }  é    i   i +  r   r   عsilur   عzerosعfloatgچيµ ÷ئ°>Té   é   Nc                 َـ  •— || _         || _        || _        || _        || _        || _        || _        || _        |	| _        |
| _	        || _
        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        |€t-          ¦   «         | _        nCt1          |t2          ¦  «        rt-          di |¤ژ| _        nt1          |t,          ¦  «        r|| _        |€t5          ¦   «         | _        nCt1          |t2          ¦  «        rt5          di |¤ژ| _        nt1          |t4          ¦  «        r|| _         t9          ¦   «         j        d||||dœ|¤ژ d S )N)عpad_token_idعbos_token_idعeos_token_idعtie_word_embeddingsr   )ع
vocab_sizeعadditional_vocab_sizer   r   r   r   عdropoutr"   r    عalpha_initializerعalphas_initializer_rangeع
alpha_typeعrms_norm_epsع	use_cacheعcross_layer_intervalعqk_layer_normsعfreeze_vision_layersعfreeze_text_layersعfreeze_text_module_exceptionsعfreeze_vision_module_exceptionsعfreeze_lm_headr8   r3   عperceiver_configع
isinstanceعdictr   عvision_configr#   r$   )r%   rK   rL   r   r   r   r   rM   r"   r    rN   rO   rP   rQ   rR   rG   rH   rI   rJ   rS   rT   rV   rW   rY   rU   rX   r8   r]   rZ   r&   r'   s                                 €r(   r$   zIdeficsConfig.__init__ً   s³  ّ€ ً@ %ˆŒط%:ˆش"ط&ˆشط!2ˆشط!2ˆشط#6ˆش طˆŒط$ˆŒط!2ˆشط!2ˆشط(@ˆش%ط$ˆŒط(ˆشط"ˆŒà$8ˆش!ط,ˆشط$8ˆش!à"4ˆشط-Jˆش*ط/Nˆش,ط,ˆشà*ˆشàذ#ف$:ر$<ش$<ˆDش!ذ!فذ(­$ر/ش/ً 	5ف$:ذ$Nذ$Nذ=Mذ$Nذ$NˆDش!ذ!فذ(ص*@رAشAً 	5ط$4ˆDش!àذ ف!4ر!6ش!6ˆDشذفک¥tر,ش,ً 	/ف!4ذ!Eذ!E°}ذ!Eذ!EˆDشذفکص':ر;ش;ً 	/ط!.ˆDشàچ‰Œشً 	
ط%ط%ط%ط 3ً		
ً 	
ً
 ً	
ً 	
ً 	
ً 	
ً 	
r)   )r*   r+   r,   r-   r.   عis_compositionr$   r0   r1   s   @r(   r?   r?   œ   s¶   ّ€ € € € € ًNً Nً` €Jط€Nً ططططططططط!ط!$ططططططط!طططط&(طط!ط(*طططً;N
ً N
ً N
ً N
ً N
ً N
ً N
ً N
ً N
ً N
r)   r?   N)r-   عconfiguration_utilsr   عutilsr   ع
get_loggerr*   عloggerr   r3   r?   r   r)   r(   ْ<module>rc      sن   ًً( "ذ !à 3ذ 3ذ 3ذ 3ذ 3ذ 3ط ذ ذ ذ ذ ذ ً 
ˆش	کHر	%ش	%€ًL#ً L#ً L#ً L#ً L#ذ*ٌ L#ô L#ً L#ً^-#ً -#ً -#ً -#ً -#ذ-ٌ -#ô -#ً -#ً`b
ً b
ً b
ً b
ً b
ذ$ٌ b
ô b
ً b
ً b
ً b
r)   