
    g+                         d Z ddlmZmZ ddlmZ ddlmZ  ej        e	          Z
 G d de          Z G d d	e          Z G d
 de          ZdS )zDBRX model configuration    )AnyOptional   )PretrainedConfig)loggingc                   r     e Zd ZdZ	 	 	 	 ddedee         ded	ed
ef
 fdZe	de
d
eddfd            Z xZS )DbrxAttentionConfiga_  Configuration class for Dbrx Attention.

    [`DbrxAttention`] class. It is used to instantiate attention layers
    according to the specified arguments, defining the layers architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        attn_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for the attention layers.
        clip_qkv (`float`, *optional*):
            If set, clip the queries, keys, and values in the attention layer to this value.
        kv_n_heads (`int`, *optional*, defaults to 1): For grouped_query_attention only, allow user to specify number of kv heads.
        rope_theta (`float`, *optional*, defaults to 10000.0): The base frequency for rope.
            N        @
attn_pdropclip_qkv
kv_n_heads
rope_thetakwargsc                      t                      j        di | || _        || _        || _        || _        dD ]}||v r|                    |           t          |          dk    rt          d|          d S )N
model_typer   Found unknown kwargs= )	super__init__r   r   r   r   poplen
ValueError)selfr   r   r   r   r   k	__class__s          g/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/dbrx/configuration_dbrx.pyr   zDbrxAttentionConfig.__init__,   s     	""6"""$ $$ 	 	AF{{

1v;;!7f77888     pretrained_model_name_or_pathreturnr   c                 T   |                      |            | j        |fi |\  }}|                    d          dk    r|d         }d|v rPt          | d          r@|d         | j        k    r/t
                              d|d          d| j         dz               | j        |fi |S )Nr   dbrxattn_configYou are using a model of type   to instantiate a model of type N. This is not supported for all configurations of models and can yield errors._set_token_in_kwargsget_config_dictgethasattrr   loggerwarning	from_dictclsr!   r   config_dicts       r   from_pretrainedz#DbrxAttentionConfig.from_pretrained@   s      (((1c12OZZSYZZV??<((F22%m4K;&&73+E+E&+VbJcgjguJuJuNNl\1Jlll^ssst  
 s}[33F333r    )r
   Nr   r   )__name__
__module____qualname____doc__floatr   intr   r   classmethodstrr4   __classcell__r   s   @r   r	   r	      s         &  $(#9 99 5/9 	9
 9 9 9 9 9 9 9( 4C 43 4Se 4 4 4 [4 4 4 4 4r    r	   c                        e Zd ZdZ	 	 	 	 	 	 	 dded	ed
ededee         dedee         def fdZ	e
dededdfd            Z xZS )DbrxFFNConfiga|  Configuration class for Dbrx FFN.

    [`DbrxFFN`] class. It is used to instantiate feedforward layers according to
    the specified arguments, defining the layers architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        ffn_act_fn (`dict`, *optional*, defaults to `None`): A dict specifying activation function for the FFN.
            The dict should have a key 'name' with the value being the name of the activation function along with
            any additional keyword arguments. If `None`, then set to `{"name": "silu"}`.
        ffn_hidden_size (`int`, *optional*, defaults to 3584): The hidden size of the feedforward network.
        moe_num_experts (`int`, *optional*, defaults to 4): The number of experts in the mixture of experts layer.
        moe_top_k (`int`, *optional*, defaults to 1): The number of experts to use in the mixture of experts layer.
        moe_jitter_eps (`float`, *optional*, defaults to `None`): If not `None`, the jitter epsilon for the mixture of experts layer.
        moe_loss_weight (`float`, *optional*, defaults to 0.01): The loss weight for the mixture of experts layer.
        moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights.
    N      r   {Gz?      ?
ffn_act_fnffn_hidden_sizemoe_num_experts	moe_top_kmoe_jitter_epsmoe_loss_weightmoe_normalize_expert_weightsr   c                 <   t                                                       |ddi}|| _        || _        || _        || _        || _        || _        || _        dD ]}	|	|v r|	                    |	           t          |          dk    rt          d|          d S )Nnamesilur   r   r   )r   r   rE   rF   rG   rH   rI   rJ   rK   r   r   r   )r   rE   rF   rG   rH   rI   rJ   rK   r   r   r   s             r   r   zDbrxFFNConfig.__init__g   s     	 &)J$..",.,H) 	 	AF{{

1v;;!7f77888 r    r!   r"   r   c                 T   |                      |            | j        |fi |\  }}|                    d          dk    r|d         }d|v rPt          | d          r@|d         | j        k    r/t
                              d|d          d| j         dz               | j        |fi |S )Nr   r$   
ffn_configr&   r'   r(   r)   r1   s       r   r4   zDbrxFFNConfig.from_pretrained   s      (((1c12OZZSYZZV??<((F22%l3K;&&73+E+E&+VbJcgjguJuJuNNl\1Jlll^ssst  
 s}[33F333r    )NrA   rB   r   NrC   rD   )r5   r6   r7   r8   dictr:   r   r9   r   r   r;   r<   r4   r=   r>   s   @r   r@   r@   R   s         ,  # *.!%8;9 99 9 	9
 9 !9 9 '/uo9 9 9 9 9 9 98 4C 43 4Se 4 4 4 [4 4 4 4 4r    r@   c                        e Zd ZdZdZdddddZ	 	 	 	 	 	 	 	 	 	 	 	 ddedededededededee	         dee
         dedededef fdZ xZS )
DbrxConfiga.  

    This is the configuration class to store the configuration of a [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
    specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a different configuration to that of the [databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        d_model (`int`, *optional*, defaults to 2048):
            Dimensionality of the embeddings and hidden states.
        n_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        n_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        max_seq_len (`int`, *optional*, defaults to 2048):
            The maximum sequence length of the model.
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`DbrxModel`].
        resid_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability applied to the attention output before combining with residual.
        emb_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for the embedding layer.
        attn_config (`dict`, *optional*):
            A dictionary used to configure the model's attention module.
        ffn_config (`dict`, *optional*):
            A dictionary used to configure the model's FFN module.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss. See [here]() for more details.


    Example:
    ```python
    >>> from transformers import DbrxConfig, DbrxModel

    >>> # Initializing a Dbrx configuration
    >>> configuration = DbrxConfig(n_layers=2, d_model=256, n_heads=8, vocab_size=128)

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = DbrxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    r$   n_headsd_modeln_layersmax_seq_len)num_attention_headshidden_sizenum_hidden_layersmax_position_embeddings          }  r
   NT{Gz?F
vocab_sizeresid_pdrop	emb_pdropr%   rP   	use_cacheinitializer_rangeoutput_router_logitsr   c                 J   |t                      | _        n.t          |t                    rt          di || _        n|| _        |	t	                      | _        n.t          |	t                    rt	          di |	| _        n|	| _        || _        || _        || _        || _	        || _
        || _        || _        |
| _        || _        || _        | j        j        | _        |                    dd          }|rt'          d           t)                      j        dd|i| d S )Ntie_word_embeddingsFz5tie_word_embeddings is not supported for DBRX models.r   )r	   r%   
isinstancerQ   r@   rP   rU   rT   rV   rW   ra   rb   rc   rd   re   rf   r   num_key_value_headsr   r   r   r   )r   rU   rT   rV   rW   ra   rb   rc   r%   rP   rd   re   rf   r   rh   r   s                  r   r   zDbrxConfig.__init__   s@     244DT** 	+2AA[AAD*D+ooDOO
D)) 	)+99j99DOO(DO &$&""!2$8!#'#3#> $jj)>FF 	VTUUUKK-@KFKKKKKr    )r\   r]   r^   r\   r_   r
   r
   NNTr`   F)r5   r6   r7   r8   r   attribute_mapr:   r9   r   r	   r@   boolr   r   r=   r>   s   @r   rS   rS      s5       4 4l J( '#0	 M  59.2#'%*.L .L.L .L 	.L
 .L .L .L .L 12.L ]+.L .L !.L #.L .L .L .L .L .L .L .L .L .L .Lr    rS   N)r8   typingr   r   configuration_utilsr   utilsr   
get_loggerr5   r.   r	   r@   rS   r   r    r   <module>rq      s                     3 3 3 3 3 3       
	H	%	%54 54 54 54 54* 54 54 54p@4 @4 @4 @4 @4$ @4 @4 @4FmL mL mL mL mL! mL mL mL mL mLr    