
    g                     ,   d Z ddlZddlmZmZmZmZ ddlZddlm	c m
Z ddlZddlm	Z	 ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZ ddlmZ  ej        e           Z!dZ"dZ# G d de	j$                  Z% G d de	j$                  Z& G d de	j$                  Z' G d de	j$                  Z( G d de	j$                  Z) G d de	j$                  Z* G d de	j$                  Z+ G d de	j$                  Z, G d  d!e	j$                  Z- G d" d#e	j$                  Z. G d$ d%e	j$                  Z/ G d& d'e          Z0d(Z1d)Z2 ed*e1           G d+ d,e0                      Z3 ed-e1           G d. d/e0e                      Z4dS )0zPyTorch CPMAnt    N)ListOptionalTupleUnion)nn)CrossEntropyLoss   )ACT2FN)GenerationMixin)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )CpmAntConfigzopenbmb/cpm-ant-10br   c                   >     e Zd ZdZdef fdZdej        fdZ xZ	S )CpmAntLayerNormzv
    We use Root Mean Square (RMS) Layer Normalization, please see https://arxiv.org/abs/1910.07467 for details."
    configc                     t                                                       |j        | _        |j        | _        t          j        t          j        |j                            | _	        d S N)
super__init__epshidden_sizedim_normr   	Parametertorchemptyweightselfr   	__class__s     f/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/cpmant/modeling_cpmant.pyr   zCpmAntLayerNorm.__init__-   sN    :*l5;v/A#B#BCC    hidden_statesc                 p   |                     d          | j        k    rt          d          |j        }|                    t
          j                                      d                              dd          }|t          j	        || j
        z             z                      |          | j        z  }|S )f
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
        z'hidden_states.size(-1) != self.dim_norm   T)dimkeepdim)sizer   AssertionErrordtypetor    float32powmeanrsqrtr   r"   )r$   r(   	old_dtypevariances       r&   forwardzCpmAntLayerNorm.forward4   s    
 b!!T]22 !JKKK!'	 ##EM2266q99>>2t>TT&X5H)I)IIMMiXX[_[ffr'   )
__name__
__module____qualname____doc__r   r   r    Tensorr9   __classcell__r%   s   @r&   r   r   (   sr         D| D D D D D D
U\ 
 
 
 
 
 
 
 
r'   r   c                        e Zd Zdef fdZ	 	 	 ddej        dej        dej        dej        d	ee	         d
ee
ej        ej        f                  dee	         fdZ xZS )CpmAntAttentionr   c                    t                                                       |j        | _        |j        | _        |j        | _        t          j        | j        | j        | j        z  d          | _	        t          j        | j        | j        | j        z  d          | _
        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        z  | j        d          | _        t          j                            d          | _        |j        ,t          j                            |j                  | _        d S d | _        d S )NFbiasr+   r-   )p)r   r   r   	dim_modelnum_attention_heads	num_headsdim_headr   Linear	project_q	project_k	project_vattention_outr    Softmaxsoftmax	dropout_pDropoutdropoutr#   s     r&   r   zCpmAntAttention.__init__B   s   +34>4>DM3QX]^^^4>4>DM3QX]^^^4>4>DM3QX]^^^Yt~'Et~\abbbx''B'//' 8++f.>+??DLLLDLLLr'   FNhidden_q	hidden_kvattention_maskposition_biasoutput_attentionspast_key_values	use_cachec           	         |                     d          }|                     d          }	|                     d          }
|                     |          }|                     |          }|                     |          }|                    ||	| j        | j                                      dddd          }|                    ||
| j        | j                                      dddd          }|                    ||
| j        | j                                      dddd          }|Qt          j	        |d         |gd          }t          j	        |d         |gd          }|                     d          }
t          j
        ||                    dd                    t          j        | j                  z  }||z   }t          j        ||                    |d|	|
          t          j        d	          k    t          j        t#          d
          |j        |j                            }|                     |          }t          j        ||                    |d|	|
          t          j        d	          k    t          j        d|j        |j                            }|r|}nd}| j        |                     |          }t          j
        ||          }|                    || j        |	| j                                      dddd          }|                                                    ||	| j        | j        z            }|                     |          }d}|r||f}|||fS )a  
        Args:
            hidden_q (`torch.Tensor`):
                Input of transformer block(self-attention block). It can be the raw embedding of a batch of sequences.
            hidden_kv (`torch.Tensor` of shape `(batch, len_k, dim_model)`)):
                Tensor *key_value* and *query* of shape `(batch, len_k, dim_model)`
            attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Avoid invalid areas to participate in the calculation of self-attention.
            position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Provide positional information to self-attention block.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            past_key_values (`Tuple[torch.Tensor, torch.Tensor]`, *optional*):
                Cached past key and value projection states.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        r   r   r,   r	   NrF   r+   Fz-inf)devicer1   )r/   rM   rN   rO   viewrJ   rK   permuter    catmatmul	transposemathsqrtmasked_filltensorscalar_tensorfloatr_   r1   rR   rU   
contiguousrP   )r$   rV   rW   rX   rY   rZ   r[   r\   
batch_sizelen_qlen_kquerykeyvaluescoreattn_weightss                   r&   r9   zCpmAntAttention.forwardU   s(   8 ]]1%%
a  q!!x((nnY''y))

:udndmLLTTUVXY[\^_``hhz5$.$-HHPPQRTUWXZ[\\

:udndmLLTTUVXY[\^_``&)_Q/52>>>CIq159rBBBEHHRLLE UCMM"b$9$9::TYt}=U=UU%!
Aue<<U@S@SSfel%+VVV
 

 U##!
Aue<<U@S@SS%,ekJJJ
 

  	  LLL<#LL''E UE**

:t~udmLLTTUVXY[\^_``  ""''
E4>DM;YZZ""5)) 	+"ElOlO33r'   )FNN)r:   r;   r<   r   r   r    r>   
BoolTensorr   boolr   r9   r?   r@   s   @r&   rB   rB   A   s         |            2 -2GK$(Q4 Q4,Q4 <Q4 (	Q4
 |Q4 $D>Q4 "%el(B"CDQ4 D>Q4 Q4 Q4 Q4 Q4 Q4 Q4 Q4r'   rB   c                        e Zd Zdef fdZ	 	 	 	 ddej        dej        deej                 dee         d	ee	ej        ej        f                  d
ee         fdZ
 xZS )CpmAntSelfAttentionBlockr   c                 
   t                                                       t          |          | _        t	          |          | _        |j        r+t          j        	                    |j                  | _
        d S d | _
        d S r   )r   r   r   layernorm_before_attentionrB   self_attentionrS   r    r   rT   rU   r#   s     r&   r   z!CpmAntSelfAttentionBlock.__init__   sm    *9&*A*A'-f55 	  8++F,<==DLLLDLLLr'   NFr(   rX   rY   rZ   r[   r\   c           	          |                      |          }|                     |||||||          }|\  }}}	| j        |                     |          }||z   }|||	fS )a  
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`):
                Input of transformer block(self-attention block). It can be the raw embedding of a batch of sequences.
            attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Avoid invalid areas to participate in the calculation of self-attention.
            position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Provide positional information to self-attention block.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*):
                Cached past key and value projection states.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        )ry   rz   rU   )
r$   r(   rX   rY   rZ   r[   r\   outputsrs   current_key_values
             r&   r9   z CpmAntSelfAttentionBlock.forward   s~    2 11-@@%%Wnm=NP_aj
 
 4;00<#ll7++G%/l,===r'   NFNNr:   r;   r<   r   r   r    r>   r   ru   r   r9   r?   r@   s   @r&   rw   rw      s         |             15,1GK$($> $>|$> $>  -	$>
 $D>$> "%el(B"CD$> D>$> $> $> $> $> $> $> $>r'   rw   c                   :     e Zd Zdef fdZdej        fdZ xZS )CpmAntDenseGatedACTr   c                 &   t                                                       t          j        |j        |j        d          | _        t          j        |j        |j        d          | _        t          j        	                                | _
        d S NFrD   )r   r   r   rL   r   dim_ffw_0w_1r    GELUactr#   s     r&   r   zCpmAntDenseGatedACT.__init__   sh    9V/UKKK9V/UKKK8==??r'   r(   c                     |                      |                     |                    }|                     |          }||z  }|S )zTransform an input tensor from one feature space to another via a nonlinear operation

        Args:
            hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
        )r   r   r   )r$   r(   
gate_scores      r&   r9   zCpmAntDenseGatedACT.forward   sB     XXdhh}5566
//"]2r'   	r:   r;   r<   r   r   r    r>   r9   r?   r@   s   @r&   r   r      sa        #| # # # # # #
U\ 
 
 
 
 
 
 
 
r'   r   c                   :     e Zd Zdef fdZdej        fdZ xZS )CpmAntFeedForwardr   c                 ,   t                                                       t          |          | _        |j        *t
          j                            |j                  | _        nd | _        t          j	        |j
        |j        d          | _        d S r   )r   r   r   w_inrS   r    r   rT   rU   rL   r   r   w_outr#   s     r&   r   zCpmAntFeedForward.__init__   su    '//	' 8++F,<==DLLDLYv}f.@uMMM


r'   r(   c                     |                      |          }| j        |                     |          }|                     |          }|S )r*   )r   rU   r   r$   r(   s     r&   r9   zCpmAntFeedForward.forward   sE    
 		-00<# LL77M

=11r'   r   r@   s   @r&   r   r      sh        N| N N N N N NU\        r'   r   c                   :     e Zd Zdef fdZdej        fdZ xZS )CpmAntFFNBlockr   c                 
   t                                                       t          |          | _        t	          |          | _        |j        r+t          j        	                    |j                  | _
        d S d | _
        d S r   )r   r   r   layernorm_before_ffnr   ffnrS   r    r   rT   rU   r#   s     r&   r   zCpmAntFFNBlock.__init__	  sl    $3F$;$;!$V,, 	  8++F,<==DLLLDLLLr'   r(   c                     |                      |          }|                     |          }| j        |                     |          }||z   }|S )z
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`):
                Hidden states before feed forward layer.
        )r   r   rU   )r$   r(   
ln_outputsr|   s       r&   r9   zCpmAntFFNBlock.forward  sQ     ..}==
((:&&<#ll7++G%/r'   r   r@   s   @r&   r   r     sb         |            |       r'   r   c                        e Zd Zdef fdZ	 	 	 	 ddej        dej        deej                 dee         d	ee	ej        ej        f                  d
ee         fdZ
 xZS )CpmAntTransformerBlockr   c                     t                                                       t          |          | _        t	          |          | _        d S r   )r   r   rw   self_attr   r   r#   s     r&   r   zCpmAntTransformerBlock.__init__$  s;    088!&))r'   NFr(   rX   rY   rZ   r[   r\   c                 x    |                      ||||||          }|\  }}}|                     |          }|||fS )a  
        Args:
            hidden_states (`torch.Tensor`):
                Input to the layer of shape `(batch, seq_len, dim_model)`
            attention_mask (`torch.Tensor`):
                Avoid invalid areas to participate in the calculation of shape `(batch, seq_len, seq_len)`
            position_bias (`torch.Tensor`):
                Provides position information to attention mechanism of shape `(num_heads, seq_len, seq_len)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            past_key_values (`Tuple[torch.Tensor, torch.Tensor])`, *optional*):
                Cached past key and value projection states
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        )rX   rY   rZ   r[   r\   )r   r   )	r$   r(   rX   rY   rZ   r[   r\   rs   r}   s	            r&   r9   zCpmAntTransformerBlock.forward)  s]    2 )'/+ & 
 
 :G6|%6//l,===r'   r~   r   r@   s   @r&   r   r   #  s        *| * * * * * * 15,1GK$(&> &>|&> &>  -	&>
 $D>&> "%el(B"CD&> D>&> &> &> &> &> &> &> &>r'   r   c                        e Zd Zdef fdZ	 	 	 	 ddej        dej        dej        dee         dee         d	ee	ej        ej        f                  d
ee         fdZ
 xZS )CpmAntEncoderr   c                     t                                                       j        | _        t	          j        fdt          | j                  D                       | _        t                    | _	        d S )Nc                 .    g | ]}t                    S  )r   ).0ithr   s     r&   
<listcomp>z*CpmAntEncoder.__init__.<locals>.<listcomp>V  s"    $f$f$f%;F%C%C$f$f$fr'   )
r   r   num_hidden_layers
num_layersr   
ModuleListrangelayersr   output_layernormr#   s    `r&   r   zCpmAntEncoder.__init__S  sl     2m$f$f$f$fuUYUdOeOe$f$f$fgg / 7 7r'   Nr(   rX   rY   rZ   output_hidden_statesr[   r\   c           	         |rdnd}|rdnd}	|rdnd}
t          | j                  D ]>\  }}|r||fz  } ||||||r||         nd|          }|\  }}}|r|	|fz  }	||
|fz   }
?|                     |          }|r||fz  }||
||	fS )a%  
        Args:
            hidden_states (`torch.Tensor`):
                Input to the layer of shape `(batch, seq_len, dim_model)`
            attention_mask (`torch.Tensor`):
                Avoid invalid areas to participate in the calculation of shape `(batch, seq_len, seq_len)`
            position_bias (`torch.Tensor`):
                Provides position information to attention mechanism of shape `(num_heads, seq_len, seq_len)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers.
            past_key_values (`Tuple[torch.Tensor, torch.Tensor])`, *optional*):
                Cached past key and value projection states
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        r   N)rZ   r[   r\   )	enumerater   r   )r$   r(   rX   rY   rZ   r   r[   r\   all_hidden_statesall_self_attnscurrent_key_valuesilayerlayer_outputsrs   r}   s                   r&   r9   zCpmAntEncoder.forwardZ  s   8 #7@BBD0:d#,6RR$!$+.. 	O 	OHAu# 6!m%55!!E"36E O 2 24#  M >K:M<):  2</1 ,%7;L:N%N"--m<< 	2-!1102C^SSr'   )NNNNr   r@   s   @r&   r   r   R  s        8| 8 8 8 8 8 8 -1/3GK$(6T 6T|6T 6T |	6T
 $D>6T 'tn6T "%el(B"CD6T D>6T 6T 6T 6T 6T 6T 6T 6Tr'   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )CpmAntIntermediatec                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r   )r   r   r   rL   r   intermediate_sizedense
isinstance
hidden_actstrr
   intermediate_act_fnr#   s     r&   r   zCpmAntIntermediate.__init__  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r'   r(   returnc                 Z    |                      |          }|                     |          }|S r   )r   r   r   s     r&   r9   zCpmAntIntermediate.forward  s,    

=1100??r'   r:   r;   r<   r   r    r>   r9   r?   r@   s   @r&   r   r     s^        9 9 9 9 9U\ el        r'   r   c                   r     e Zd Zdef fdZdej        dej        dej        dej        fdZd ZddZ	 xZ
S )CpmAntSegmentPositionEmbeddingr   c                 4   t                                                       |j        | _        |j        | _        |j        | _        |j        | _	        t          j        t          j        |j        |j        z  |j        z   |j                            | _        d S r   )r   r   rI   rJ   position_bias_num_bucketsnum_bucketsposition_bias_max_distancemax_distancesegment_typesnum_segmentsr   r   r    r!   relative_attention_biasr#   s     r&   r   z'CpmAntSegmentPositionEmbedding.__init__  s    3!;"="0')|K$v';;f>^^* (
 (
$$$r'   key_pos	query_poskey_segmentquery_segmentc           	         t          j                    5  |                    d          }|                    d          }|                    d          }|                    d          |                    d          k    r<t          d|                    d           d|                    d           d          ||                    d          k    s||                    d          k    r)t          d| d|                    d           d          ||                    d          k    r)t          d| d|                    d           d          |                    |d|          }|                    ||d          }|                    |d|          }|                    ||d          }|                     ||          }|| j        z   }|                     t          j	        |t           j
        |j        	          d d d f         t          j	        |t           j
        |j        	          d d d f         z
  | j        | j        
          }	t          j        ||k    |	d d d d d f         |          }d d d            n# 1 swxY w Y   t          j        || j                  }
|
                    dddd                                          }
|
S )Nr   r   z>key_pos.size(0) should be equal to query_pos.size(0), but got z and !z7keylen should be equal to key_segment.size(1), but got z;querylen should be equal to query_segment.size(1), but got r+   r1   r_   )r   r   r	   r,   )r    no_gradr/   r0   szier`   !_segment_relative_position_bucketr   _position_bucketarangeint32r_   r   whereF	embeddingr   ra   rk   )r$   r   r   r   r   batchkeylenquerylenrelative_position_bucketabsolute_position_bucketembedss              r&   r9   z&CpmAntSegmentPositionEmbedding.forward  s8    ]__ %	 %	LLOOE\\!__F ~~a((H||A).."3"333$U\UaUabcUdUdktkykyz{k|k|   ))!,,,,M<N<Nq<Q<Q0Q0Q$qfqq[f[k[klm[n[nqqq   =--a0000$yRZyyanasastuavavyyy   ll5"f55G!uh;;I%**5"f==K)..uhCCM'+'M'Mm]h'i'i$'?$BR'R$ (,'<'<V5;?W?^___`dfgfgfg`gh,xu{C[Cbcccdededegkdklm ,!.	 (= ( ($ (-{-(qqq!!!4(( ($C%	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	 %	P 5t7STT1a++6688s   I)J		JJc                     || j         z  |z   S r   )r   )r$   r   r   s      r&   r   z@CpmAntSegmentPositionEmbedding._segment_relative_position_bucket  s    t00;>>r'          c                 .   d}|dz  }|dk                         t          j                  |z  }t          j        |          }|dz  }||k     }|t          j        |                                |z            t          j        ||z            z  ||z
  z                       t          j                  z   }t          j        |t          j        ||dz
                      }|t          j	        ||                     t          j                  |          z  }|S )Nr   r,   r   )
r2   r    r   abslogrj   re   min	full_liker   )r$   relative_positionr   r   relative_buckets	max_exactis_smallrelative_postion_if_larges           r&   r   z/CpmAntSegmentPositionEmbedding._position_bucket  s   -155ekBB[P!I&7881$	$y0$-I'--//);<<h|i/001Y&( "U[//	%!
 %*I%O5{QGG%
 %
! 	EK2C2F2Fu{2S2SUnooor'   )r   r   )r:   r;   r<   r   r   r    r>   r9   r   r   r?   r@   s   @r&   r   r     s        
| 
 
 
 
 
 
22 <2 \	2
 |2 2 2 2h? ? ?               r'   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )CpmAntOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j	        |j
                  | _        d S )N)r   )r   r   r   rL   r   r   r   	LayerNormlayer_norm_epsrT   hidden_dropout_probrU   r#   s     r&   r   zCpmAntOutput.__init__  sf    Yv79KLL
f&8f>STTTz&"<==r'   r(   input_tensorr   c                     |                      |          }|                     |          }|                     ||z             }|S r   )r   rU   r   )r$   r(   r   s      r&   r9   zCpmAntOutput.forward  s@    

=11]33}|'CDDr'   r   r@   s   @r&   r   r      si        > > > > >U\  RWR^        r'   r   c                        e Zd ZdZeZdZd ZdS )CpmAntPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    cpmantc                 v   t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j        	                                 dS dS t          |t          j
                  r_|j        j                            d| j        j                   |j        +|j        j        |j                 	                                 dS dS t          |t          j                  r?|j        j        	                                 |j        j                            d           dS t          |t                    r!|j        j                            d           dS t          |t                    r-|j        j                            d| j        j                   dS dS )zInitialize the weightsg        )r5   stdNg      ?)r   r   rL   r"   datanormal_r   init_stdrE   zero_	Embeddingpadding_idxr   fill_r   r   r   )r$   modules     r&   _init_weightsz#CpmAntPreTrainedModel._init_weights  s   fbi(( 	\M&&CT[5I&JJJ{& &&((((( '&-- 
	\M&&CT[5I&JJJ!-"6#56<<>>>>> .--- 	\K""$$$M$$S)))))00 	\M$$S))))) >?? 	\*/77SdkFZ7[[[[[	\ 	\r'   N)r:   r;   r<   r=   r   config_classbase_model_prefixr  r   r'   r&   r   r     s?         
  L \ \ \ \ \r'   r   aB  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters
        config ([`~CpmAntConfig`]): Model configuration class with all the parameters of the
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`CPMAntTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zTThe bare CPMAnt Model outputting raw hidden-states without any specific head on top.c                   L    e Zd Zdef fdZd Zd Zd Z ee	           e
eee          	 	 	 	 	 	 ddeej                 d	ee         d
ee         deeeej                                   dee         dee         deeej                 ef         fd                        Z xZS )CpmAntModelr   c                    t                                          |           t          |          | _        t	          j        |j        |j                  | _        t	          j        |j	        |j
        |j        z  z   |j                  | _        t          |          | _        |j        | _        |j	        | _	        |                                  d S r   )r   r   r   encoderr   r   r   r   segment_embedding
vocab_sizeprompt_typesprompt_lengthinput_embeddingr   rY   	post_initr#   s     r&   r   zCpmAntModel.__init__R  s       $V,,!#f.BFDV!W!W!| 3f6J JJFL^ 
  
 <FCC#1 +r'   c                     | j         S r   r  r$   s    r&   get_input_embeddingsz CpmAntModel.get_input_embeddings_  s    ##r'   c                     || _         d S r   r  )r$   
embeddingskwargss      r&   set_input_embeddingsz CpmAntModel.set_input_embeddingsb  s    )r'   c                    |                     d          }|                     d          }|j        }t          j        ||          t          j        ||                              dd          k    }|d d d d d f         |d d d d d f                                         |                    d||          z  z  }	|	|d d d d d f         |d d d d d f         k    z  }	t          j        t          t          || j	        z
                      d d d         |          d d d f         
                    |d          |d d d f         k     }
t          j        t          j        || j	        |                                          |
fd          }
|
                    ||d          |
                    |d|          z  |	z  }	|	S )Nr   r   )r_   r+   rF   )r/   r_   r    r   r`   logical_notrh   listr   r  repeatrb   onesru   )r$   	input_idsspancontextlengthr   seqlenr_   directional_mask_2drX   mask_1ds              r&   _prepare_attention_maskz#CpmAntModel._prepare_attention_maske  s   q!!""!#l6&AAAU\RXagEhEhEhEmEmnprsEtEtt D!!!,AAAqqq$J++--0C0H0HFTZ0[0[[
 (44
+;tAAAqqq$J?O+OP LeFT-?$?@@AA$$B$GPVWWWX\^_^_^_X_`gghmopqqQQQWo 	 )UZt/A&QQQVVXXZabhijjj eVQ77',,uaQW:X:XX[iir'   
checkpointoutput_typer  Nr  rZ   r   r[   r\   return_dictr   c           	         ||n| j         j        }||n| j         j        }||n| j         j        }||n| j         j        }|j        t          j        k    r|                    t          j                  }|j        |j	        }	}t          j
        |dk    dd                              ||	          }
|
dk                        d                              ||	          }t          j        t          j        | j        dz  | j        z   | j        dz  | j        z   ||	                              |                    d          d          |fd          }|                                \  }}t          j        t          j        || j        ||	          |
fd          }
t          j        ||fd||	          }t          j        |||	                              |d          }t          j        ||fd||	          }|cd}t)          d g| j        j        z            }|                                }|                     |          }|                     |
          }||z   }n]|d         d                             d          }|                     |
          }|                     |          |d d dd d d f         z   }|                     ||||          }|                     |||
|
          }|d d |d d d f         }|d d d d |d d d f         }|d d |d d d f         }|                     |||||||          \  }}}}|dk    rh|d d | j        d d d f         }|+d	}|D ]$}||d d d d | j        d | j        d f         fz  }%|}|#d	}|D ]}||d d | j        d d d f         fz  }|}|st)          d
 ||||fD                       S t9          ||||          S )Nr   r,   r   r+   r	   r   rF   r^   r   c              3      K   | ]}||V  	d S r   r   )r   vs     r&   	<genexpr>z&CpmAntModel.forward.<locals>.<genexpr>  s1        efererererer r'   )last_hidden_stater[   r(   
attentions)r   rZ   r   use_return_dictr\   r1   r    r   r2   r_   r   sumrb   r   r  r  r  r/   zerosfulltupler
  r   rk   r  r  r%  rY   r   )r$   r  rZ   r   r[   r\   r)  r  r1   r_   segmentr!  r   
seq_lengthr   positionr  past_lengthr(   segment_statesrX   rY   present_key_valuesr   all_attentionsnew_attentions	attentionnew_hidden_stateshidden_states                                r&   r9   zCpmAntModel.forwardw  s     2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]!*!6IIDK<Q	 ?ek))!U[11I!)9v+i1na3366U66RRQ,##B''**v*FFI&*T_<&*T_<!	  
 &**A.. 
 
 
	 &NN,,z)U[0B%X^___ahiopqqq*eZ0!5PPP<
%GGGNNuVWXXz5*-qfMMM"K#TFT\-D$DEEO!,,..I 00;;M!33G<<N)N:MM)!,Q/44R88K!33G<<N 00;;nQQQPRPSPSUVUVUVY>WWM55iwPVWW**8XwPP';<<(:;%aaaKLL!!!&;<%aaaqqq&89OS|| P
 P
L)+<n !)!!!T-?-A-A111*DEM)!#!/ e eI"yAAAt7I7K7KTM_MaMa1a'b&ddNN!/ ,$&!$5 U UL%,qqq$:L:N:NPQPQPQ7Q*R)TT%%$5! 	  )+=?PR`a      '+.+%	
 
 
 	
r'   )NNNNNN)r:   r;   r<   r   r   r  r  r%  r   CPMANT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   r    r>   ru   r   r   r9   r?   r@   s   @r&   r  r  M  sn       
|      $ $ $* * *  $ +*+BCC&+$   -1,0/3@D$(&*^
 ^
EL)^
 $D>^
 'tn	^

 "%el(;"<=^
 D>^
 d^^
 
uU\"$;;	<^
 ^
 ^
  DC^
 ^
 ^
 ^
 ^
r'   r  zy
    The CPMAnt Model with a language modeling head on top (linear layer with weights tied to the input embeddings).
    c                       e Zd ZdgZdef fdZ ee           ee	e
e          	 	 	 	 	 	 	 	 ddeej                 deeeej        ej        f                           dee         d	ee         d
ee         deej                 dee         deej                 deee
f         fd                        Zd Zd Zd Zd Zd Z xZS )CpmAntForCausalLMzlm_head.weightr   c                    t                                          |           t          |          | _        t	          j        |j        |j        |j        |j	        z  z   d          | _
        |                                  d S r   )r   r   r  r   r   rL   r   r  r  r  lm_headr  r#   s     r&   r   zCpmAntForCausalLM.__init__  sz       !&)) y 1F4G&J^4^ ^ej
 
 
 	r'   r&  Nr  r[   r\   rZ   r   labelsr)  rX   r   c	                    ||n| j         j        }|                     ||||||          }
|r|
j        n|
d         }|                     |          }d}|Tt                      } ||                    d|                    d                    |                    d                    }|s|f|
dd         z   }||f|z   n|S t          |||
j	        |
j
        |
j                  S )u;
  
        Args:
            input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`CPMAntTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers.
            labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                CPMAnt will process attention mask automatically, this parameter is a dummy parameter for
                text-generation pipeline.

        Example:

        Text Generation with CpmAntForCausalLM.
        ```python
        >>> from transformers import CPMAntTokenizer, CpmAntForCausalLM

        >>> texts = "今天天气不错，"
        >>> model = CpmAntForCausalLM.from_pretrained("openbmb/cpm-ant-10b")
        >>> tokenizer = CPMAntTokenizer.from_pretrained("openbmb/cpm-ant-10b")
        >>> input_ids = tokenizer(texts, return_tensors="pt")
        >>> outputs = model.generate(**input_ids)
        >>> output_texts = tokenizer.batch_decode(outputs)
        >>> print(output_texts)
        ['今天天气不错，阳光明媚，我和妈妈一起去超市买东西。\n在超市里，我看到了一个很好玩的玩具，它的名字叫“机器人”。它有一个圆圆的脑袋，两只圆圆的眼睛，还有一个圆圆的']
        ```
        Nr   r+   r   )losslogitsr[   r(   r/  )r   r0  r   r.  rF  r   r`   r/   r   r[   r(   r/  )r$   r  r[   r\   rZ   r   rG  r)  rX   r  model_outputr(   rJ  rI  	loss_funcoutputs                   r&   r9   zCpmAntForCausalLM.forward  s   z &1%<kk$+B]{{(*>QZ\g
 
 ;FZ66<XY?m,,(**I9V[[V[[__==v{{2OOD 	FYabb!11F)-)9TGf$$vE%(8&4#.
 
 
 	
r'   c                     | j         j        S r   r   r  r  s    r&   r  z&CpmAntForCausalLM.get_input_embeddingsH  s    {**r'   c                     || j         _        d S r   rO  )r$   r  s     r&   r  z&CpmAntForCausalLM.set_input_embeddingsK  s    &0###r'   c                     | j         S r   rF  r  s    r&   get_output_embeddingsz'CpmAntForCausalLM.get_output_embeddingsN  s
    |r'   c                     || _         d S r   rR  )r$   new_embeddingss     r&   set_output_embeddingsz'CpmAntForCausalLM.set_output_embeddingsQ  s    %r'   c                 l    d |D             }|D ]$}|d         |         |d<   |d         |         |d<   %|S )Nc                 4    g | ]}|t          |          n|S r   )r  )r   eachs     r&   r   z4CpmAntForCausalLM._reorder_cache.<locals>.<listcomp>U  s'    ```)94:::t```r'   r   r   r   )r$   r[   beam_idxkey_value_layers       r&   _reorder_cachez CpmAntForCausalLM._reorder_cacheT  sV    ``P_```. 	> 	>O!0!3H!=OA!0!3H!=OAr'   )NNNNNNNN)r:   r;   r<   _tied_weights_keysr   r   r   r@  r   rA  r   rB  r   r    r>   r   r   ru   r   r9   r  r  rS  rV  r\  r?   r@   s   @r&   rD  rD    s        ++|       +*+BCC&*$   -1MQ$(,0/3)-&*15O
 O
EL)O
 "$uU\5<-G'H"IJO
 D>	O

 $D>O
 'tnO
 &O
 d^O
 !.O
 
u,,	-O
 O
 O
  DCO
b+ + +1 1 1  & & &      r'   rD  )5r=   re   typingr   r   r   r   r    torch.nn.functionalr   
functionalr   torch.utils.checkpointtorch.nnr   activationsr
   
generationr   modeling_outputsr   r   modeling_utilsr   utilsr   r   r   r   configuration_cpmantr   
get_loggerr:   loggerrA  rB  Moduler   rB   rw   r   r   r   r   r   r   r   r   r   CPMANT_START_DOCSTRINGr@  r  rD  r   r'   r&   <module>rm     s1      / / / / / / / / / / / /                     % % % % % % ! ! ! ! ! ! ) ) ) ) ) ) O O O O O O O O - - - - - - u u u u u u u u u u u u . . . . . . 
	H	%	%+      bi   2e4 e4 e4 e4 e4bi e4 e4 e4P.> .> .> .> .>ry .> .> .>b    ")   (    	   4    RY   6,> ,> ,> ,> ,>RY ,> ,> ,>^>T >T >T >T >TBI >T >T >TD       Y  Y  Y  Y  Y RY Y  Y  Y z    29   \ \ \ \ \O \ \ \8	  0 Z J
 J
 J
 J
 J
' J
 J
	 J
Z  	 u u u u u- u u u u ur'   