
    g"                        d Z ddlmZmZmZ ddlZddlZddlmZ ddlm	Z	 ddl
mZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZmZmZ ddlmZ  ej         e!          Z"dZ#dZ$de%de%dej&        fdZ'dej&        dej&        fdZ(dej&        dej&        dej&        dej&        fdZ) G d dej*                  Z+ G d dej*                  Z, G d  d!ej*                  Z- G d" d#e          Z.d$Z/d%Z0 ed&e/           G d' d(e.                      Z1 ed)e/           G d* d+e.e                      Z2dS ),zPyTorch CodeGen model.    )OptionalTupleUnionN)nn)CrossEntropyLoss   )ACT2FN)CacheDynamicCacheStaticCache)GenerationMixin)AttentionMaskConverter)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )CodeGenConfigzSalesforce/codegen-2B-monor   num_posdimreturnc                    ddt          j        d|dt           j                  |z  z  z  }t          j        dt          j        | t           j                                                  |                                          }t          j        t          j        |          t          j        |          fd          S )	N      ?i'  r      dtypezi , j -> i jr   r   )torcharangeint64einsumfloatcatsincos)r   r   inv_freqsinusoid_inps       h/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/codegen/modeling_codegen.pycreate_sinusoidal_positionsr,   )   s    eQQek J J JS PQRH<WEK0X0X0X0^0^0`0`bjkkqqssL9ei--uy/F/FGQOOOO    xc                     | d d d d d d d d df         }| d d d d d d dd df         }t          j        | |fd          } |                     d          S )Nr   r   r    )r!   stackflatten)r.   x1x2s      r+   rotate_every_twor6   0   ss    	
111aaaCCaC<B	
111aaaADqD=	BbS"I2&&&A99R==r-   tensorr'   r(   c                     t          j        |d d d d d d d f         dd          }t          j        |d d d d d d d f         dd          }| |z  t          |           |z  z   S )Nr   r   )r!   repeat_interleaver6   )r7   r'   r(   s      r+   apply_rotary_pos_embr:   8   sy    

!#aaaD!!!m"4a
;
;C

!#aaaD!!!m"4a
;
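

# Illustrative only: a minimal, hedged sketch of how the three rotary helpers above fit
# together, added for exposition and not part of the original module. The function name
# `_rotary_demo` and the toy shapes are assumptions.
def _rotary_demo() -> torch.Tensor:
    # Sin/cos table for 16 positions of a 4-dim rotary subspace.
    embed_positions = create_sinusoidal_positions(num_pos=16, dim=4)  # (16, 4)
    position_ids = torch.arange(2)[None, :]  # batch of 1, two positions
    sincos = embed_positions[position_ids]  # (1, 2, 4)
    sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)  # each (1, 2, 2)
    # Dummy queries shaped (batch, seq, heads, rotary_dim), as inside CodeGenAttention.
    query = torch.randn(1, 2, 3, 4)
    return apply_rotary_pos_emb(query, sin, cos)  # same shape as `query`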


class CodeGenAttention(nn.Module):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        max_positions = config.max_position_embeddings

        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.embed_dim = config.hidden_size
        self.num_attention_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_attention_heads
        if self.head_dim * self.num_attention_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and"
                f" `num_attention_heads`: {self.num_attention_heads})."
            )
        self.scale_attn = torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype())
        self.qkv_proj = nn.Linear(self.embed_dim, self.embed_dim * 3, bias=False)

        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.rotary_dim = config.rotary_dim
        pos_embd_dim = self.rotary_dim or self.embed_dim
        self.embed_positions = create_sinusoidal_positions(max_positions, pos_embd_dim)

    def _split_heads(self, x, n_head, dim_head, mp_num):
        reshaped = x.reshape(x.shape[:-1] + (n_head // mp_num, dim_head))
        reshaped = reshaped.reshape(x.shape[:-2] + (-1,) + reshaped.shape[-1:])
        return reshaped

    def _merge_heads(self, tensor, num_attention_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into n_ctx
        """
        if len(tensor.shape) == 5:
            tensor = tensor.permute(0, 1, 3, 2, 4).contiguous()
        elif len(tensor.shape) == 4:
            tensor = tensor.permute(0, 2, 1, 3).contiguous()
        else:
            raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")
        new_shape = tensor.size()[:-2] + (num_attention_heads * attn_head_size,)
        return tensor.view(new_shape)

    def _attn(
        self,
        query,
        key,
        value,
        attention_mask=None,
        head_mask=None,
    ):
        # Keep the attention weights computation in fp32 to avoid overflow issues.
        query = query.to(torch.float32)
        key = key.to(torch.float32)

        attn_weights = torch.matmul(query, key.transpose(-1, -2))

        if attention_mask is not None:
            causal_mask = attention_mask[:, :, :, : key.shape[-2]]
            attn_weights += causal_mask

        attn_weights = attn_weights / self.scale_attn
        attn_weights = nn.Softmax(dim=-1)(attn_weights)
        attn_weights = attn_weights.to(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        # Mask heads if we want to.
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor],
        layer_past: Optional[Cache] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[
        Tuple[torch.Tensor, Tuple[torch.Tensor]],
        Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]],
    ]:
        qkv = self.qkv_proj(hidden_states)
        # The checkpoint packs q/v/k interleaved over mp_num legacy model-parallel shards.
        mp_num = 4
        qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1))

        local_dim = self.head_dim * self.num_attention_heads // mp_num
        query, value, key = torch.split(qkv_split, local_dim, dim=-1)
        query = self._split_heads(query, self.num_attention_heads, self.head_dim, mp_num=mp_num)
        key = self._split_heads(key, self.num_attention_heads, self.head_dim, mp_num=mp_num)

        value = self._split_heads(value, self.num_attention_heads, self.head_dim, mp_num=mp_num)
        value = value.permute(0, 2, 1, 3)

        embed_positions = self.embed_positions
        if embed_positions.device != position_ids.device:
            embed_positions = embed_positions.to(position_ids.device)
            self.embed_positions = embed_positions

        sincos = embed_positions[position_ids]
        sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)

        if self.rotary_dim is not None:
            # Rotary embeddings are applied only to the first rotary_dim features.
            k_rot = key[:, :, :, : self.rotary_dim]
            k_pass = key[:, :, :, self.rotary_dim :]

            q_rot = query[:, :, :, : self.rotary_dim]
            q_pass = query[:, :, :, self.rotary_dim :]

            k_rot = apply_rotary_pos_emb(k_rot, sin, cos)
            q_rot = apply_rotary_pos_emb(q_rot, sin, cos)

            key = torch.cat([k_rot, k_pass], dim=-1)
            query = torch.cat([q_rot, q_pass], dim=-1)
        else:
            key = apply_rotary_pos_emb(key, sin, cos)
            query = apply_rotary_pos_emb(query, sin, cos)

        key = key.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)

        if layer_past is not None:
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "partial_rotation_size": self.rotary_dim,
                "cache_position": cache_position,
            }
            key, value = layer_past.update(key.to(hidden_states.dtype), value, self.layer_idx, cache_kwargs)

        # compute self-attention: V x Softmax(QK^T)
        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

        attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim)
        attn_output = self.out_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        outputs = (attn_output, layer_past)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs  # attn_output, layer_past, (attentions)


class CodeGenMLP(nn.Module):
    def __init__(self, intermediate_size, config):  # in MLP: intermediate_size = 4 * embed_dim
        super().__init__()
        embed_dim = config.n_embd

        self.fc_in = nn.Linear(embed_dim, intermediate_size)
        self.fc_out = nn.Linear(intermediate_size, embed_dim)

        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: Optional[torch.FloatTensor]) -> torch.FloatTensor:
        hidden_states = self.fc_in(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.fc_out(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class CodeGenBlock(nn.Module):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.attn = CodeGenAttention(config, layer_idx)
        self.mlp = CodeGenMLP(inner_dim, config)

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor],
        layer_past: Optional[Cache] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs = self.attn(
            hidden_states=hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
        outputs = attn_outputs[1:]

        # Parallel residual: attention and MLP both read the same layer-normed input.
        feed_forward_hidden_states = self.mlp(hidden_states)
        hidden_states = attn_output + feed_forward_hidden_states + residual

        if use_cache:
            outputs = (hidden_states,) + outputs
        else:
            outputs = (hidden_states,) + outputs[1:]

        return outputs  # hidden_states, present, (attentions)


class CodeGenPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = CodeGenConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _no_split_modules = ["CodeGenBlock"]
    _skip_keys_device_placement = "past_key_values"
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear,)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


CODEGEN_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`CodeGenConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

CODEGEN_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_attention_heads,)` or `(n_layer, num_attention_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_dim)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance, see our
            [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
"""


@add_start_docstrings(
    "The bare CodeGen Model transformer outputting raw hidden-states without any specific head on top.",
    CODEGEN_START_DOCSTRING,
)
class CodeGenModel(CodeGenPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.embed_dim = config.n_embd
        self.vocab_size = config.vocab_size
        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([CodeGenBlock(config, layer_idx=i) for i in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
        self.rotary_dim = min(config.rotary_dim, config.n_ctx // config.num_attention_heads)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

    @add_start_docstrings_to_model_forward(CODEGEN_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor]]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)

        # kept for BC (non `Cache` `past_key_values` inputs)
        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            return_legacy_cache = True
            if past_key_values is None:
                past_key_values = DynamicCache()
            else:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                logger.warning_once(
                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
                )

        seq_length = inputs_embeds.shape[1]
        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=inputs_embeds.device)

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head.
        # head_mask has shape n_layer x batch x num_attention_heads x N x N
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        hidden_states = inputs_embeds

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, seq_length)
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)
        output_shape = (-1, seq_length, hidden_states.size(-1))

        next_decoder_cache = None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        for i, block in enumerate(self.h):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                outputs = self._gradient_checkpointing_func(
                    block.__call__,
                    hidden_states,
                    None,
                    causal_mask,
                    position_ids,
                    head_mask[i],
                    use_cache,
                    output_attentions,
                    cache_position,
                )
            else:
                outputs = block(
                    hidden_states=hidden_states,
                    layer_past=past_key_values,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    head_mask=head_mask[i],
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                    cache_position=cache_position,
                )

            hidden_states = outputs[0]
            if use_cache is True:
                next_decoder_cache = outputs[1]

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)

        hidden_states = self.ln_f(hidden_states)
        hidden_states = hidden_states.view(output_shape)

        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if return_legacy_cache:
            next_cache = next_cache.to_legacy_cache()

        if not return_dict:
            return tuple(
                v for v in [hidden_states, next_cache, all_hidden_states, all_self_attentions] if v is not None
            )

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we rely on its `is_causal` argument instead of its `attn_mask` argument, in order
        # to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail to
        # infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output_attentions is True, the sdpa implementation's forward falls back to the eager implementation.
        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention_mask` is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            device=device,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention's memory-efficient path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        device: torch.device,
        cache_position: torch.Tensor,
        batch_size: int,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            device (`torch.device`):
                The device to place the 4D attention mask on.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


@add_start_docstrings(
    """
    The CodeGen Model transformer with a language modeling head on top.
    """,
    CODEGEN_START_DOCSTRING,
)
class CodeGenForCausalLM(CodeGenPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.transformer = CodeGenModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    @add_start_docstrings_to_model_forward(CODEGEN_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=CausalLMOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor]]]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
            are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = transformer_outputs[0]

        # Make sure sampling in fp16 works correctly and compute the loss in fp32.
        lm_logits = self.lm_head(hidden_states).to(torch.float32)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(lm_logits.device)
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

            loss = loss.to(hidden_states.dtype)

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @staticmethod
    def _reorder_cache(
        past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
    ) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        """
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )
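

# Illustrative only: a hedged end-to-end usage sketch, not part of the original module.
# It assumes the `transformers` package is installed with Hub access and that the small
# "Salesforce/codegen-350M-mono" checkpoint fits in memory; the prompt and generation
# settings are arbitrary, and `_generation_demo` is a hypothetical name.
def _generation_demo() -> str:
    from transformers import AutoTokenizer  # deferred import to keep module import light

    tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
    model = CodeGenForCausalLM.from_pretrained("Salesforce/codegen-350M-mono")

    inputs = tokenizer("def hello_world():", return_tensors="pt")
    # `generate` comes from GenerationMixin; the DynamicCache built inside CodeGenModel
    # lets every decoding step reuse the key/value states of the previous tokens.
    output_ids = model.generate(**inputs, max_new_tokens=32, do_sample=False)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)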