
from typing import Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.utils.checkpoint

from ...activations import ACT2FN
from ...cache_utils import Cache, HybridCache
from ...configuration_utils import PretrainedConfig
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...utils import (
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal,
    is_flash_attn_greater_or_equal_2_10,
    logging,
)
from ..gemma.modeling_gemma import (
    GemmaAttention,
    GemmaDecoderLayer,
    GemmaForCausalLM,
    GemmaForSequenceClassification,
    GemmaForTokenClassification,
    GemmaModel,
    GemmaPreTrainedModel,
    GemmaRMSNorm,
    apply_rotary_pos_emb,
    repeat_kv,
)


if is_flash_attn_2_available():
    from ...modeling_flash_attention_utils import _flash_attention_forward

_CHECKPOINT_FOR_DOC = "google/gemma2-7b"

logger = logging.get_logger(__name__)


class Gemma2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Gemma2Model`]. It is used to instantiate a Gemma2
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma2-7B.
    e.g. [google/gemma2-7b](https://huggingface.co/google/gemma2-7b)
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Gemma2 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Gemma2Model`]
        hidden_size (`int`, *optional*, defaults to 3072):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 24576):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 28):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 16):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by mean-pooling all the original heads within that group; a short sketch of this conversion is shown after
            this argument list. For more details, check out [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 224): scaling factor used on the attention scores; the
            queries are scaled by `query_pre_attn_scalar**-0.5` instead of the usual `1/sqrt(head_dim)`.
        sliding_window (`int`, *optional*, defaults to 4096): in Gemma2, every other layer uses sliding window attention. This is the
            size of the sliding window.
        final_logit_softcapping (`float`, *optional*, defaults to 30.0): scaling factor when applying tanh soft-capping
            on the final logits (see the soft-capping sketch after this argument list).
        attn_logit_softcapping (`float`, *optional*, defaults to 50.0): scaling factor when applying tanh soft-capping
            on the attention scores.
        cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.

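    The grouped-query-attention conversion described for `num_key_value_heads` can be pictured with a small,
    illustrative sketch (hypothetical tensors, not library code): each group of query heads shares one key/value head
    obtained by mean-pooling the original heads of that group.

    ```python
    >>> import torch
    >>> num_attention_heads, num_key_value_heads, head_dim = 16, 8, 4
    >>> mha_keys = torch.randn(num_attention_heads, head_dim)  # one key head per query head
    >>> group = num_attention_heads // num_key_value_heads
    >>> # mean-pool each group of original key heads into a single shared key head
    >>> gqa_keys = mha_keys.reshape(num_key_value_heads, group, head_dim).mean(dim=1)
    >>> gqa_keys.shape
    torch.Size([8, 4])
    ```
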
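    Both soft-capping arguments bound their scores with a tanh squash rather than a hard clip: values are divided by
    the cap, passed through `tanh`, and rescaled, so they always stay inside `(-cap, cap)`. A minimal, illustrative
    sketch of the operation these two arguments control (not the exact library code path):

    ```python
    >>> import torch
    >>> def soft_cap(scores: torch.Tensor, cap: float) -> torch.Tensor:
    ...     # squashes `scores` smoothly into the open interval (-cap, cap)
    ...     return cap * torch.tanh(scores / cap)
    >>> attn_scores = soft_cap(torch.randn(2, 2) * 100.0, 50.0)   # attn_logit_softcapping
    >>> logits = soft_cap(torch.randn(2, 256000), 30.0)           # final_logit_softcapping
    ```
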
    ```python
    >>> from transformers import Gemma2Model, Gemma2Config
    >>> # Initializing a Gemma2 gemma2-7b style configuration
    >>> configuration = Gemma2Config()
    >>> # Initializing a model from the gemma2-7b style configuration
    >>> model = Gemma2Model(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
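    >>> # Purely illustrative override of a few of the arguments documented above (hypothetical values)
    >>> custom_configuration = Gemma2Config(num_hidden_layers=2, sliding_window=1024, attn_logit_softcapping=50.0)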
    ```"""

    model_type = "gemma2"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=3072,
        intermediate_size=24576,
        num_hidden_layers=28,
        num_attention_heads=16,
        num_key_value_heads=16,
        head_dim=256,
        hidden_activation="gelu_pytorch_tanh",
        max_position_embeddings=8192,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        eos_token_id=1,
        bos_token_id=2,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        attention_bias=False,
        attention_dropout=0.0,
        query_pre_attn_scalar=224,
        sliding_window=4096,
        final_logit_softcapping=30.0,
        attn_logit_softcapping=50.0,
        cache_implementation="hybrid",
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.num_key_value_heads = num_key_value_heads
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.hidden_activation = hidden_activation
        self.query_pre_attn_scalar = query_pre_attn_scalar
        self.sliding_window = sliding_window
        self.final_logit_softcapping = final_logit_softcapping
        self.attn_logit_softcapping = attn_logit_softcapping
        self.cache_implementation = cache_implementation


class Gemma2RMSNorm(GemmaRMSNorm):
    pass


class Gemma2MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_activation]

    def forward(self, x):
        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))


class Gemma2Attention(GemmaAttention):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
        super().__init__(config, layer_idx)
        self.scaling = config.query_pre_attn_scalar**-0.5
        # every other layer uses sliding-window attention
        self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "sliding_window": self.sliding_window,
                "cache_position": cache_position,
            }
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling

        if self.config.attn_logit_softcapping is not None:
            # tanh soft-capping keeps the attention scores inside (-cap, cap)
            attn_weights = attn_weights / self.config.attn_logit_softcapping
            attn_weights = torch.tanh(attn_weights)
            attn_weights = attn_weights * self.config.attn_logit_softcapping
        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, -1)
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


class Gemma2FlashAttention2(Gemma2Attention):
    """
    Gemma2 flash attention module. This module inherits from `Gemma2Attention` as the weights of the module stay
    untouched. The only required change is in the forward pass, where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        output_attentions = False

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "sliding_window": self.sliding_window,
                "cache_position": cache_position,
            }
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        if attention_mask is not None:
            seq_len = attention_mask.shape[1]
            key_states = key_states[:, :, :seq_len]
            value_states = value_states[:, :, :seq_len]

        # Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.attention_dropout if self.training else 0.0

        # In PEFT, the layer norms are usually cast to float32 for training stability, silently upcasting the hidden
        # states. Cast the inputs back to the expected dtype so the flash kernel does not receive float32 tensors.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to the fact"
                f" you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        attn_output = _flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            dropout=dropout_rate,
            softmax_scale=self.scaling,
            is_causal=self.is_causal,
            sliding_window=self.sliding_window,
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
            softcap=self.config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None,
        )

        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


class Gemma2SdpaAttention(Gemma2Attention):
    """
    Gemma2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `Gemma2Attention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
    the SDPA API.
    """

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if output_attentions:
            logger.warning_once(
                "Gemma2Model is using Gemma2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does"
                " not support `output_attentions=True`. Falling back to the manual attention implementation, but"
                " specifying the manual implementation will be required from Transformers version v5.0.0 onwards."
                ' This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "sliding_window": self.sliding_window,
                "cache_position": cache_position,
            }
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        causal_mask = attention_mask
        if attention_mask is not None:
            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]

        # SDPA with the memory-efficient backend requires contiguous inputs when a custom attn_mask is used on CUDA
        if query_states.device.type == "cuda" and causal_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # Dispatch to SDPA's flash/efficient kernels through `is_causal` rather than an inline conditional, which
        # would prevent torch.compile's dynamic shapes from working.
        is_causal = True if causal_mask is None and q_len > 1 else False

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
            scale=self.scaling,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, -1)
        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value


class Gemma2DecoderLayer(GemmaDecoderLayer):
    def __init__(self, config: Gemma2Config, layer_idx: int):
        super().__init__(config, layer_idx)
        self.config = config
        self.is_sliding = not bool(layer_idx % 2)
        self.mlp = Gemma2MLP(config)
        self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.sliding_window = config.sliding_window

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        if self.is_sliding and attention_mask is not None:  # efficient SDPA and no padding
            # Flash-attn uses a 2D mask, the other paths a 4D one
            if self.config._attn_implementation == "flash_attention_2":
                attention_mask = attention_mask[:, -self.sliding_window :]
            else:
                min_dtype = torch.finfo(hidden_states.dtype).min
                sliding_window_mask = torch.tril(
                    torch.ones_like(attention_mask, dtype=torch.bool), diagonal=-self.sliding_window
                )
                attention_mask = torch.where(sliding_window_mask, min_dtype, attention_mask)
                if attention_mask.shape[-1] <= 1:  # when decoding
                    attention_mask = attention_mask[:, :, :, -self.sliding_window :]

        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
        )
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.pre_feedforward_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = self.post_feedforward_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


class Gemma2PreTrainedModel(GemmaPreTrainedModel):
    _supports_quantized_cache = False

    @classmethod
    def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False):
        """
        Overloads `PreTrainedModel._check_and_enable_sdpa` so as to DISABLE torch SDPA by default on Gemma2 models.
        SDPA reduces the model performance on Gemma2 because of the logits softcapping.
        """
        config = super()._check_and_enable_sdpa(config, hard_check_only=hard_check_only)

        # if using the default path -> swap sdpa by eager
        if not hard_check_only and config._attn_implementation == "sdpa":
            config._attn_implementation = "eager"

        return config


class Gemma2Model(GemmaModel):
    def __init__(self, config: Gemma2Config):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[HybridCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None and not self.training:
            batch_size, seq_len, _ = inputs_embeds.shape
            past_key_values = HybridCache(
                self.config,
                batch_size=batch_size,
                max_cache_len=seq_len,
                device=self.device,
                dtype=inputs_embeds.dtype,
            )

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        # embed positions
        hidden_states = inputs_embeds

        # normalized
        # Gemma2 downcasts the below to float16, causing sqrt(3072)=55.426 to become 55.5
        # See https://github.com/huggingface/transformers/pull/29402
        normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
        hidden_states = hidden_states * normalizer

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = past_key_values if use_cache else None

        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: HybridCache,
        output_attentions: bool,
    ):
        # Flash Attention currently doesn't support static cache but Gemma2 works only with static-shaped caches,
        # so the attention mask is passed through as-is on that path.
        if self.config._attn_implementation == "flash_attention_2":
            return attention_mask

        dtype, device = input_tensor.dtype, input_tensor.device
        sequence_length = input_tensor.shape[1]
        if isinstance(past_key_values, HybridCache):
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = attention_mask.shape[-1] if attention_mask is not None else input_tensor.shape[1]

        # In case a 2D attention mask is provided, a 4D causal mask is generated here.
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            device=device,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )
        return causal_mask


class Gemma2ForCausalLM(GemmaForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        self.model = Gemma2Model(config)
        self.post_init()

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[HybridCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        num_logits_to_keep: int = 0,
        **loss_kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        ```python
        >>> from transformers import AutoTokenizer, Gemma2ForCausalLM

        >>> model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```"""
        if self.training and self.config._attn_implementation != "eager":
            logger.warning_once(
                "It is strongly recommended to train Gemma2 models with the `eager` attention implementation "
                f"instead of `{self.config._attn_implementation}`. Use `eager` with "
                "`AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
            )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        # Only compute the necessary logits
        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
        if self.config.final_logit_softcapping is not None:
            # tanh soft-capping keeps the final logits inside (-cap, cap)
            logits = logits / self.config.final_logit_softcapping
            logits = torch.tanh(logits)
            logits = logits * self.config.final_logit_softcapping

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        num_logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten: has a special cache type, `HybridCache`

        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
        # Exception 1: when passing input_embeds, input_ids may be missing entries
        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
        if past_key_values is not None:
            if inputs_embeds is not None:  # Exception 1
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
                input_ids = input_ids[:, cache_position]
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]
                # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s
                # `mode="reduce-overhead"`, as otherwise the input `position_ids` would have various strides
                # during the decoding.
                position_ids = position_ids.clone(memory_format=torch.contiguous_format)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and cache_position[0] == 0:
            model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
        else:
            # The clone here is for the same reason as for `position_ids`.
            model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}

        if (
            isinstance(past_key_values, HybridCache)
            and attention_mask.ndim == 2
            and not self.config._attn_implementation == "flash_attention_2"
        ):
            if model_inputs["inputs_embeds"] is not None:
                batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
                device = model_inputs["inputs_embeds"].device
            else:
                batch_size, sequence_length = model_inputs["input_ids"].shape
                device = model_inputs["input_ids"].device

            attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position(
                attention_mask,
                sequence_length=sequence_length,
                target_length=past_key_values.get_max_cache_shape(),
                dtype=self.lm_head.weight.dtype,
                device=device,
                cache_position=cache_position,
                batch_size=batch_size,
            )

        if num_logits_to_keep is not None:
            model_inputs["num_logits_to_keep"] = num_logits_to_keep

        model_inputs.update(
            {
                "position_ids": position_ids,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
            }
        )
        return model_inputs


class Gemma2ForSequenceClassification(GemmaForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        self.model = Gemma2Model(config)
        self.post_init()


class Gemma2ForTokenClassification(GemmaForTokenClassification):
    def __init__(self, config):
        super().__init__(config)
        self.model = Gemma2Model(config)
        self.post_init()