
import math
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union

import sentencepiece as spm
import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, StaticCache
from ...configuration_utils import PretrainedConfig
from ...modeling_flash_attention_utils import _flash_attention_forward
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
from ..llama.modeling_llama import (
    LlamaDecoderLayer,
    LlamaFlashAttention2,
    LlamaForCausalLM,
    LlamaForSequenceClassification,
    LlamaForTokenClassification,
    LlamaModel,
    apply_rotary_pos_emb,
    repeat_kv,
)
from ..llama.tokenization_llama import LlamaTokenizer


if TYPE_CHECKING:
    from ...tokenization_utils_base import TextInput

VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}

SPIECE_UNDERLINE = "▁"

_CHECKPOINT_FOR_DOC = "google/gemma-7b"

logger = logging.get_logger(__name__)


class GemmaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`GemmaModel`]. It is used to instantiate a Gemma
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma-7B.
    e.g. [google/gemma-7b](https://huggingface.co/google/gemma-7b)
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Gemma model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`GemmaModel`]
        hidden_size (`int`, *optional*, defaults to 3072):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 24576):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 28):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 16):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The legacy activation function. It is overwritten by the `hidden_activation`.
        hidden_activation (`str` or `function`, *optional*):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
    ```python
    >>> from transformers import GemmaModel, GemmaConfig
    >>> # Initializing a Gemma gemma-7b style configuration
    >>> configuration = GemmaConfig()
    >>> # Initializing a model from the gemma-7b style configuration
    >>> model = GemmaModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
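
    >>> # A smaller, illustrative configuration (these values are arbitrary and do not match any released checkpoint)
    >>> small_configuration = GemmaConfig(
    ...     hidden_size=512, num_hidden_layers=4, num_attention_heads=8, num_key_value_heads=8, head_dim=64
    ... )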
    ```"""

    model_type = "gemma"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=3072,
        intermediate_size=24576,
        num_hidden_layers=28,
        num_attention_heads=16,
        num_key_value_heads=16,
        head_dim=256,
        hidden_act="gelu_pytorch_tanh",
        hidden_activation=None,
        max_position_embeddings=8192,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        eos_token_id=1,
        bos_token_id=2,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.hidden_activation = hidden_activation
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


class GemmaTokenizer(LlamaTokenizer, PreTrainedTokenizer):
    """
    Construct a Gemma tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
    no padding token in the original model.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<bos>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<eos>"`):
            The end of sequence token.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<pad>"`):
            A special token used to make arrays of tokens the same size for batching purposes. Will then be ignored by
            attention mechanisms or loss computation.
        sp_model_kwargs (`Dict[str, Any]`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether or not to add a `bos_token` at the start of sequences.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not to clean up spaces after decoding; cleanup consists of removing potential artifacts like
            extra spaces.
        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
            Whether or not the default system prompt for Gemma should be used.
        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to add spaces between special tokens.
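
    Example (illustrative only -- `tokenizer.model` below stands for a local SentencePiece vocabulary file and is
    not shipped with this module):

    ```python
    >>> from transformers import GemmaTokenizer

    >>> tokenizer = GemmaTokenizer("tokenizer.model")
    >>> tokenizer.tokenize("Hello world")  # doctest: +SKIP
    ```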
    """

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<bos>",
        eos_token="<eos>",
        pad_token="<pad>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        use_default_system_prompt=False,
        spaces_between_special_tokens=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token

        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_default_system_prompt = use_default_system_prompt
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        PreTrainedTokenizer.__init__(
            self,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            use_default_system_prompt=use_default_system_prompt,
            spaces_between_special_tokens=spaces_between_special_tokens,
            **kwargs,
        )

    def get_spm_processor(self):
        raise AttributeError("Not needed for Gemma")

    def unk_token_length(self):
        raise AttributeError("Not needed for Gemma")

    def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
        """
        Args:
            text: TextInput
        Simply calls PreTrainedTokenizer's method
        """
        return PreTrainedTokenizer.tokenize(self, text, **kwargs)

    def _tokenize(self, text, **kwargs):
        """
        Args:
            text: TextInput
        Returns a tokenized string. The Gemma tokenizer never adds a prefix space.
        """
        return self.sp_model.encode(text, out_type=str)

    def _decode(
        self,
        token_ids: List[int],
        skip_special_tokens: bool = False,
        spaces_between_special_tokens: bool = False,
        **kwargs,
    ) -> str:
        sub_texts = []
        current_sub_text = []
        for ids in token_ids:
            if skip_special_tokens and ids in self.all_special_ids:
                continue
            if ids in self._added_tokens_decoder:
                if current_sub_text:
                    sub_texts.append(self.sp_model.decode(current_sub_text))
                sub_texts.append(self._added_tokens_decoder[ids].content)
                current_sub_text = []
            else:
                current_sub_text.append(ids)
        if current_sub_text:
            sub_texts.append(self.sp_model.decode(current_sub_text))

        if spaces_between_special_tokens:
            sub_texts = " ".join(sub_texts)
        else:
            sub_texts = "".join(sub_texts)

        return sub_texts.replace(SPIECE_UNDERLINE, " ")

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        current_sub_tokens = []
        out_string = ""
        for token in tokens:
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self._added_tokens_encoder:
                out_string += self.sp_model.decode(current_sub_tokens) + token
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string


class GemmaRMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.zeros(dim))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float())
        output = output * (1.0 + self.weight.float())
        return output.type_as(x)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.eps}"


ALL_LAYERNORM_LAYERS.append(GemmaRMSNorm)
class GemmaRotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim))
        self.register_buffer("inv_freq", tensor=inv_freq, persistent=False)

    @torch.no_grad()
    def forward(self, x, position_ids, seq_len=None):
        self.inv_freq.to(x.device)
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


class GemmaLinearScalingRotaryEmbedding(GemmaRotaryEmbedding):
    """GemmaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def forward(self, x, position_ids):
        position_ids = position_ids.float() / self.scaling_factor
        cos, sin = super().forward(x, position_ids)
        return cos, sin


class GemmaDynamicNTKScalingRotaryEmbedding(GemmaRotaryEmbedding):
    """GemmaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def forward(self, x, position_ids):
        seq_len = torch.max(position_ids) + 1
        if seq_len > self.max_position_embeddings:
            base = self.base * (
                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
            ) ** (self.dim / (self.dim - 2))
            inv_freq = 1.0 / (
                base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(x.device) / self.dim)
            )
            self.register_buffer("inv_freq", inv_freq, persistent=False)

        cos, sin = super().forward(x, position_ids)
        return cos, sin


class GemmaMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        if config.hidden_activation is None:
            logger.warning_once(
                "`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.\n"
                "Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use\n"
                "`config.hidden_activation` if you want to override this behaviour.\n"
                "See https://github.com/huggingface/transformers/pull/29402 for more details."
            )
            config.hidden_activation = "gelu_pytorch_tanh"
        hidden_activation = config.hidden_activation
        self.act_fn = ACT2FN[hidden_activation]

    def forward(self, x):
        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))


class GemmaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: GemmaConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = config.head_dim
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True
        self.scaling = 1 / math.sqrt(config.head_dim)

        if self.hidden_size % self.num_heads != 0:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
        self.rotary_emb = GemmaRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=self.max_position_embeddings,
            base=self.rope_theta,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling

        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.view(bsz, q_len, -1)
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
class GemmaSdpaAttention(GemmaAttention):
    """
    Gemma attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `GemmaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    """

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if output_attentions:
            logger.warning_once(
                "GemmaModel is using GemmaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does "
                "not support `output_attentions=True`. Falling back to the manual attention implementation, but "
                "specifying the manual implementation will be required from Transformers version v5.0.0 onwards. "
                'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        causal_mask = attention_mask
        if attention_mask is not None:
            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]

        # SDPA with a custom attn_mask can be buggy with non-contiguous inputs on CUDA devices
        if query_states.device.type == "cuda" and causal_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        is_causal = True if causal_mask is None and q_len > 1 else False

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, -1)

        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value
class GemmaFlashAttention2(LlamaFlashAttention2, GemmaAttention):
    """
    Gemma flash attention module. This module inherits from `GemmaAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if isinstance(past_key_value, StaticCache):
            raise ValueError(
                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
                "make sure to use `sdpa` in the mean time, and open an issue at "
                "https://github.com/huggingface/transformers"
            )

        output_attentions = False

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.attention_dropout if self.training else 0.0

        # If the inputs were silently upcast to float32 (e.g. by layer norms kept in float32),
        # cast them back to the expected compute dtype.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to the fact"
                f" you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        attn_output = _flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            sliding_window=getattr(self, "sliding_window", None),
            is_causal=self.is_causal,
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
        )

        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


GEMMA_ATTENTION_CLASSES = {
    "eager": GemmaAttention,
    "flash_attention_2": GemmaFlashAttention2,
    "sdpa": GemmaSdpaAttention,
}


class GemmaDecoderLayer(LlamaDecoderLayer):
    def __init__(self, config: GemmaConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        self.self_attn = GEMMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
        self.mlp = GemmaMLP(config)
        self.input_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
                into the model
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


class GemmaModel(LlamaModel):
    def __init__(self, config: GemmaConfig):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [GemmaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        # kept for BC (non `Cache` `past_key_values` inputs)
        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            return_legacy_cache = True
            if past_key_values is None:
                past_key_values = DynamicCache()
            else:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                logger.warning_once(
                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
                )

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        # embed positions
        hidden_states = inputs_embeds

        # normalized
        # Gemma downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5
        # See https://github.com/huggingface/transformers/pull/29402
        normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
        hidden_states = hidden_states * normalizer

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if return_legacy_cache:
            next_cache = next_cache.to_legacy_cache()

        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


class GemmaForCausalLM(LlamaForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        self.model = GemmaModel(config)
        self.post_init()

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        num_logits_to_keep: int = 0,
        **loss_kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        ```python
        >>> from transformers import AutoTokenizer, GemmaForCausalLM

        >>> model = GemmaForCausalLM.from_pretrained("google/gemma-7b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        # Only compute the necessary logits
        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class GemmaForSequenceClassification(LlamaForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        self.model = GemmaModel(config)
        self.post_init()


class GemmaForTokenClassification(LlamaForTokenClassification):
    def __init__(self, config):
        super().__init__(config)
        self.model = GemmaModel(config)
        self.post_init()


__all__ = [
    "GemmaConfig",
    "GemmaTokenizer",
    "GemmaModel",
    "GemmaForCausalLM",
    "GemmaForSequenceClassification",
    "GemmaForTokenClassification",
]