
    g                     4   d dl Z d dlmZ d dlZd dlmZ d dlZddlmZ ddl	m
Z
mZmZ ddlmZmZmZ ddlmZmZmZ dd	lmZmZmZ d
dlmZ  ej        e          Z G d de          Z G d de          Z G d de          Z d Z!d(dZ" G d de          Z# G d de#e          Z$ G d de          Z%e#e$e%dZ& G d de          Z' G d de          Z( G d d e(e          Z) G d! d"e
          Z* G d# d$e          Z+ G d% d&e          Z,g d'Z-dS ))    N)Optional   )logging   )GemmaForCausalLMGemmaForSequenceClassificationGemmaForTokenClassification)GraniteAttentionGraniteFlashAttention2GraniteSdpaAttention)LlamaDecoderLayer
LlamaModelLlamaPreTrainedModel)Phi3MLPPhi3RMSNormPhi3RotaryEmbedding   )	GlmConfigc                       e Zd ZdS )
GlmRMSNormN__name__
__module____qualname__     _/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/glm/modular_glm.pyr   r   2           Dr   r   c                       e Zd ZdS )GlmRotaryEmbeddingNr   r   r   r   r    r    6   r   r   r    c                       e Zd ZdS )GlmMLPNr   r   r   r   r"   r"   :   r   r   r"   c                     | ddddf         }| ddddf         }t          j        | |fd                              d          S )	z*Rotates half the hidden dims of the input..r   Nr   r   dim)torchstackflatten)xx1x2s      r   rotate_halfr.   >   sQ    	
319B	
319B;Ryb)))11"555r   c                    |                     |          }|                     |          }|dd|j        d         dz  f                             dd          }|dd|j        d         dz  f                             dd          }| dd| j        d         dz  f         | d| j        d         dz  df         }} |dd|j        d         dz  f         |d|j        d         dz  df         }}| |z  t          |           |z  z   }||z  t          |          |z  z   }	t	          j        ||gd          }t	          j        |	|gd          }	||	fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .Nr$   r   r%   )	unsqueezeshaperepeat_interleaver.   r(   cat)
qkcossinposition_idsunsqueeze_dimq_passk_passq_embedk_embeds
             r   apply_rotary_pos_embr>   E   s   ( --
&
&C
--
&
&C c'SYr]a'''
(
:
:1"
:
E
EC
c'SYr]a'''
(
:
:1"
:
E
EC #))))*Ac172;!3C3E3E.E,FvA#))))*Ac172;!3C3E3E.E,FvA 3w;q>>C/0G3w;q>>C/0G i&)r222Gi&)r222GGr   c                   6     e Zd Zddedee         f fdZ xZS )GlmAttentionNconfig	layer_idxc                     t                                          ||           t          j        | j        | j        d          | _        dt          j        | j                  z  | _	        d S )NF)biasr   )
super__init__nnLinearhidden_sizeo_projmathsqrthead_dimscalingselfrA   rB   	__class__s      r   rF   zGlmAttention.__init__o   sW    +++i 0$2BOOO49T]333r   Nr   r   r   r   r   intrF   __classcell__rQ   s   @r   r@   r@   n   sT        4 4y 4Xc] 4 4 4 4 4 4 4 4 4 4r   r@   c                       e Zd ZdS )GlmFlashAttention2Nr   r   r   r   rX   rX   u   r   r   rX   c                       e Zd ZdS )GlmSdpaAttentionNr   r   r   r   rZ   rZ   y   r   r   rZ   )eagerflash_attention_2sdpac                   6     e Zd Zddedee         f fdZ xZS )GlmDecoderLayerNrA   rB   c                     t                                                       t          |          | _        t	          |j        |j                  | _        t	          |j        |j                  | _        d S )Neps)	rE   rF   r"   mlpr   rI   rms_norm_epsinput_layernormpost_attention_layernormrO   s      r   rF   zGlmDecoderLayer.__init__   sb    &>>)&*<&BUVVV(263E6K^(_(_(_%%%r   rR   rS   rV   s   @r   r_   r_      sa        ` `y `Xc] ` ` ` ` ` ` ` ` ` `r   r_   c                       e Zd ZdS )GlmPreTrainedModelNr   r   r   r   rh   rh      r   r   rh   c                   $     e Zd Zdef fdZ xZS )GlmModelrA   c                    t                                                     t          j        fdt	          j                  D                       | _        t          j        j	                  | _
        t          j        dz  j        j                  | _        d| _        |                                  d S )Nc                 0    g | ]}t          |          S r   )r_   ).0rB   rA   s     r   
<listcomp>z%GlmModel.__init__.<locals>.<listcomp>   s#    aaaI_VY//aaar   ra   r   )r&   max_position_embeddingsbaseF)rE   rF   rG   
ModuleListrangenum_hidden_layerslayersr   rI   rd   normr    rM   ro   
rope_theta
rotary_embgradient_checkpointing	post_initrP   rA   rQ   s    `r   rF   zGlmModel.__init__   s       maaaavG_A`A`aaa
 
 v1v7JKKK	,1$f>\cict
 
 
 ',# 	r   r   r   r   r   rF   rU   rV   s   @r   rj   rj      sD        y          r   rj   c                   $     e Zd Zdef fdZ xZS )GlmForCausalLMrA   c                     t                                          |           t          |          | _        |                                  d S rR   rE   rF   rj   modelry   rz   s     r   rF   zGlmForCausalLM.__init__   @       f%%
r   r{   rV   s   @r   r}   r}      D        y          r   r}   c                   $     e Zd Zdef fdZ xZS )GlmForSequenceClassificationrA   c                     t                                          |           t          |          | _        |                                  d S rR   r   rz   s     r   rF   z%GlmForSequenceClassification.__init__   r   r   r{   rV   s   @r   r   r      r   r   r   c                   $     e Zd Zdef fdZ xZS )GlmForTokenClassificationrA   c                     t                                          |           t          |          | _        |                                  d S rR   r   rz   s     r   rF   z"GlmForTokenClassification.__init__   r   r   r{   rV   s   @r   r   r      r   r   r   )rh   rj   r}   r   r   )Nr   ).rK   typingr   r(   torch.nnrG   torch.utils.checkpointutilsr   gemma.modeling_gemmar   r   r	   granite.modeling_graniter
   r   r   llama.modeling_llamar   r   r   phi3.modeling_phi3r   r   r   configuration_glmr   
get_loggerr   loggerr   r    r"   r.   r>   r@   rX   rZ   GLM_ATTENTION_CLASSESr_   rh   rj   r}   r   r   __all__r   r   r   <module>r      s                                    
         
         
         
 ) ( ( ( ( ( 
	H	%	%	 	 	 	 	 	 	 		 	 	 	 	, 	 	 		 	 	 	 	W 	 	 	6 6 6& & & &R4 4 4 4 4# 4 4 4	 	 	 	 	'= 	 	 		 	 	 	 	+ 	 	 	
 +  ` ` ` ` `' ` ` `	 	 	 	 	- 	 	 	    !:        %       #A        ;     r   