
    g<                         d Z ddlmZ ddlmZ  ej        e          Z G d de          Z G d de          Z	 G d d	e          Z
d
S )zIdefics model configuration   )PretrainedConfig)loggingc                   H     e Zd ZdZdZddiZ	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )IdeficsVisionConfiga?
  
    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Idefics-9B.

    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer. (elsewhere referred to as `hidden_size`)
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        intermediate_size (`int`, *optional*, defaults to 5120):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        image_num_channels (`int`, *optional*, defaults to `3`):
            Number of image channels.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1.0, used internally for initialization
            testing).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    ideficshidden_size	embed_dim                   r   geluh㈵>        {Gz?      ?c                     || _         || _        || _        || _        || _        || _        || _        |	| _        |
| _        || _	        || _
        || _         t                      j        di | d S N )r	   
image_sizeintermediate_size
patch_sizenum_hidden_layersnum_attention_headsnum_channelslayer_norm_epsattention_dropoutinitializer_rangeinitializer_factor
hidden_actsuper__init__)selfr	   r   r   r   r   r   r   r"   r   r   r    r!   kwargs	__class__s                 m/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/idefics/configuration_idefics.pyr$   zIdeficsVisionConfig.__init__L   s      #$!2$!2#6 (,!2!2"4$""6"""""    )r
   r   r   r   r   r   r   r   r   r   r   r   )__name__
__module____qualname____doc__
model_typeattribute_mapr$   __classcell__r'   s   @r(   r   r      s        ' 'R J{M # # # # # # # # # #r)   r   c                   4     e Zd ZdZdZ	 	 	 	 	 	 d	 fd	Z xZS )
IdeficsPerceiverConfiga  
    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Idefics-9B.

    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        use_resampler (`bool`, *optional*, defaults to `False`):
            Whether or not to use the resampler
        resampler_n_latents (`int`, *optional*, defaults to ):
            Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
        resampler_depth (`int`, *optional*, defaults to 6):
            Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
        resampler_n_heads (`int`, *optional*, defaults to 16):
            Number of heads in each Transformer block (for multi-headed self-attention).
        resampler_head_dim (`int`, *optional*, defaults to 96):
            Dimensionality of each head projection in the Transformer block.
        qk_layer_norms_perceiver (`bool`, *optional*, defaults to `False`):
            Whether or not to use qk layer norms in perceiver
    r   F@      r   `   c                     || _         || _        || _        || _        || _        || _         t                      j        di | d S r   )use_resamplerresampler_n_latentsresampler_depthresampler_n_headsresampler_head_dimqk_layer_norms_perceiverr#   r$   )	r%   r8   r9   r:   r;   r<   r=   r&   r'   s	           r(   r$   zIdeficsPerceiverConfig.__init__   sX     +#6 .!2"4(@%""6"""""r)   )Fr4   r5   r   r6   F)r*   r+   r,   r-   r.   r$   r0   r1   s   @r(   r3   r3   l   sb         2 J !&# # # # # # # # # #r)   r3   c                   d     e Zd ZdZdZdZddddddd	d
ddd	ddddddddddg ddg dddf fd	Z xZS )IdeficsConfiga  
    This is the configuration class to store the configuration of a [`IdeficsModel`]. It is used to instantiate an
    Idefics model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Idefics-9B.

    e.g. [HuggingFaceM4/idefics-9b](https://huggingface.co/HuggingFaceM4/idefics-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        additional_vocab_size (`int`, *optional*, defaults to 0):
            Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens
            are always trainable whereas regular vocab tokens can be frozen or not.
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Idefics model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`~IdeficsModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        alpha_initializer (`str`, *optional*, defaults to `"zeros"`):
            Initialization type for the alphas.
        alphas_initializer_range (`float`, *optional*, defaults to 0.0):
            The standard deviation of the truncated_normal_initializer for initializing the alphas in the Gated Cross
            Attention.
        alpha_type (`str`, *optional*, defaults to `"float"`):
            Whether the gating alphas should be vectors or single floats.
        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0)
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1)
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2)
            End of stream token id.
        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        cross_layer_interval (`int`, *optional*, default to 1)
            Interval for cross attention (from text to image) layers.
        qk_layer_norms (`bool`, *optional*, defaults to `False`): Whether to add layer norm after q and k
        freeze_text_layers (`bool`, *optional*, defaults to `True`): Whether to freeze text layers
        freeze_text_module_exceptions (`bool`, *optional*, defaults to `[]`):
            Exceptions to freezing text layers when `freeze_text_layers` is `True`
        freeze_lm_head (`bool`, *optional*, defaults to `False`): Whether to freeze lm head
        freeze_vision_layers (`bool`, *optional*, defaults to `True`):  Whether to freeze vision layers
        freeze_vision_module_exceptions (`bool`, *optional*, defaults to `[]`):
            Exceptions to freezing vision layers when `freeze_vision_layers` is `True`
        use_resampler (`bool`, *optional*, defaults to `False`): Whether to use the Resampler
        vision_config (`IdeficsVisionConfig`,  *optional*): Custom vision config or dict
        perceiver_config (`IdeficsPerceiverConfig`,  *optional*): Custom perceiver config or dict

    Example:

    ```python
    >>> from transformers import IdeficsModel, IdeficsConfig

    >>> # Initializing a Idefics idefics-9b style configuration
    >>> configuration = IdeficsConfig()

    >>> # Initializing a model from the idefics-9b style configuration
    >>> model = IdeficsModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```r   Fi }      i   i +  r   r   silur   zerosfloatgư>T      Nc                    || _         || _        || _        || _        || _        || _        || _        || _        |	| _        |
| _	        || _
        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        |t-                      | _        nCt1          |t2                    rt-          di || _        nt1          |t,                    r|| _        |t5                      | _        nCt1          |t2                    rt5          di || _        nt1          |t4                    r|| _         t9                      j        d||||d| d S )N)pad_token_idbos_token_ideos_token_idtie_word_embeddingsr   )
vocab_sizeadditional_vocab_sizer   r   r   r   dropoutr"   r    alpha_initializeralphas_initializer_range
alpha_typerms_norm_eps	use_cachecross_layer_intervalqk_layer_normsfreeze_vision_layersfreeze_text_layersfreeze_text_module_exceptionsfreeze_vision_module_exceptionsfreeze_lm_headr8   r3   perceiver_config
isinstancedictr   vision_configr#   r$   )r%   rK   rL   r   r   r   r   rM   r"   r    rN   rO   rP   rQ   rR   rG   rH   rI   rJ   rS   rT   rV   rW   rY   rU   rX   r8   r]   rZ   r&   r'   s                                 r(   r$   zIdeficsConfig.__init__   s   @ %%:"&!2!2#6 $!2!2(@%$("$8!,$8!"4-J*/N,,*#$:$<$<D!!($// 	5$:$N$N=M$N$ND!!(*@AA 	5$4D! !4!6!6Dt,, 	/!4!E!E}!E!ED':;; 	/!.D 	
%%% 3		
 	

 	
 	
 	
 	
 	
r)   )r*   r+   r,   r-   r.   is_compositionr$   r0   r1   s   @r(   r?   r?      s        N N` JN !!$!&(!(*;N
 N
 N
 N
 N
 N
 N
 N
 N
 N
r)   r?   N)r-   configuration_utilsr   utilsr   
get_loggerr*   loggerr   r3   r?   r   r)   r(   <module>rc      s   ( " ! 3 3 3 3 3 3       
	H	%	%L# L# L# L# L#* L# L# L#^-# -# -# -# -#- -# -# -#`b
 b
 b
 b
 b
$ b
 b
 b
 b
 b
r)   