
"""PyTorch BioGPT model."""

import math
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
    _prepare_4d_causal_attention_mask,
    _prepare_4d_causal_attention_mask_for_sdpa,
)
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
from .configuration_biogpt import BioGptConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "microsoft/biogpt"
_CONFIG_FOR_DOC = "BioGptConfig"


class BioGptLearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # BioGPT offsets the embedding ids by 2 so that position 0 never collides with the padding row.
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
        """`input_ids_shape` is expected to be [bsz x seqlen]."""
        attention_mask = attention_mask.long()

        # create positions depending on attention_mask
        positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1

        # cut positions if `past_key_values_length` is > 0
        positions = positions[:, past_key_values_length:]

        return super().forward(positions + self.offset)


class BioGptScaledWordEmbedding(nn.Embedding):
    """
    This module overrides nn.Embedding's forward by multiplying with embeddings scale.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.embed_scale = embed_scale

    def forward(self, input_ids: torch.Tensor):
        return super().forward(input_ids) * self.embed_scale


# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->BioGpt
class BioGptAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        config: Optional[BioGptConfig] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling
        # get key, value proj, reusing the cache where possible
        if (
            is_cross_attention
            and past_key_value is not None
            and past_key_value[0].shape[2] == key_value_states.shape[1]
        ):
            # reuse k, v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            # self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            # cache the current key/value states so they can be reused at the next decoding step
            past_key_value = (key_states, value_states)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.reshape(*proj_shape)
        value_states = value_states.reshape(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                    f" {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            # reshape so the returned attention weights keep their gradient
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped, past_key_value


class BioGptSdpaAttention(BioGptAttention):
    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""
        if output_attentions or layer_head_mask is not None:
            logger.warning_once(
                "BioGptModel is using BioGptSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention`"
                " does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual"
                " attention implementation, but specifying the manual implementation will be required from"
                " Transformers version v5.0.0 onwards. This warning can be removed using the argument"
                ' `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states,
                key_value_states=key_value_states,
                past_key_value=past_key_value,
                attention_mask=attention_mask,
                layer_head_mask=layer_head_mask,
                output_attentions=output_attentions,
            )

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states)
        if (
            is_cross_attention
            and past_key_value is not None
            and past_key_value[0].shape[2] == key_value_states.shape[1]
        ):
            # reuse k, v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            # self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            past_key_value = (key_states, value_states)

        query_states = self._shape(query_states, tgt_len, bsz)

        # Only use SDPA's built-in causal masking when no explicit mask is given and there is more
        # than one query token; otherwise the prepared 4D mask is passed through `attn_mask`.
        is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=attention_mask,
            dropout_p=self.dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, None, past_key_value


BIOGPT_ATTENTION_CLASSES = {
    "eager": BioGptAttention,
    "sdpa": BioGptSdpaAttention,
}


class BioGptDecoderLayer(nn.Module):
    def __init__(self, config: BioGptConfig):
        super().__init__()
        self.embed_dim = config.hidden_size

        self.self_attn = BIOGPT_ATTENTION_CLASSES[config._attn_implementation](
            embed_dim=self.embed_dim,
            num_heads=config.num_attention_heads,
            dropout=config.attention_probs_dropout_prob,
            is_decoder=True,
            is_causal=True,
        )
        self.dropout = config.hidden_dropout_prob
        self.activation_fn = ACT2FN[config.hidden_act]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        self.fc1 = nn.Linear(self.embed_dim, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        """
        residual = hidden_states

        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Self Attention
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=self_attn_past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


class BioGptPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = BioGptConfig
    base_model_prefix = "biogpt"
    supports_gradient_checkpointing = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


BIOGPT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`~BioGptConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

BIOGPT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare BioGPT Model transformer outputting raw hidden-states without any specific head on top.",
    BIOGPT_START_DOCSTRING,
)
class BioGptModel(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.config = config
        self.layerdrop = config.layerdrop
        self.dropout = config.hidden_dropout_prob
        self.embed_dim = config.hidden_size
        self.padding_idx = config.pad_token_id
        embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0

        self.embed_tokens = BioGptScaledWordEmbedding(
            config.vocab_size, self.embed_dim, self.padding_idx, embed_scale=embed_scale
        )
        self.embed_positions = BioGptLearnedPositionalEmbedding(config.max_position_embeddings, self.embed_dim)

        self.layers = nn.ModuleList([BioGptDecoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.layer_norm = nn.LayerNorm(self.embed_dim)

        self.gradient_checkpointing = False
        self._use_sdpa = config._attn_implementation == "sdpa"

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPastAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input = input_ids
            input_shape = input.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            input = inputs_embeds[:, :, -1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # past_key_values_length
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input)

        if attention_mask is None:
            attention_mask = torch.ones(
                (inputs_embeds.shape[0], inputs_embeds.shape[1] + past_key_values_length),
                dtype=torch.bool,
                device=inputs_embeds.device,
            )
        elif attention_mask.shape[1] != past_key_values_length + input_shape[1]:
            raise ValueError(
                f"The provided attention mask has length {attention_mask.shape[1]}, but its length should be "
                f"{past_key_values_length + input_shape[1]} (sum of the lengths of current and past inputs)"
            )

        # embed positions
        positions = self.embed_positions(attention_mask, past_key_values_length)

        if self._use_sdpa and not output_attentions and head_mask is None:
            # output_attentions=True and head_mask can not be supported when using SDPA.
            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
                attention_mask, input_shape, inputs_embeds, past_key_values_length
            )
        else:
            # 4d mask is passed through the layers
            attention_mask = _prepare_4d_causal_attention_mask(
                attention_mask, input_shape, inputs_embeds, past_key_values_length
            )

        hidden_states = inputs_embeds + positions
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = None
        next_decoder_cache = () if use_cache else None

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    head_mask[idx] if head_mask is not None else None,
                    None,
                    output_attentions,
                    use_cache,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        hidden_states = self.layer_norm(hidden_states)

        next_cache = next_decoder_cache if use_cache else None

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


@add_start_docstrings(
    """BioGPT Model with a `language modeling` head on top for CLM fine-tuning.""", BIOGPT_START_DOCSTRING
)
class BioGptForCausalLM(BioGptPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["output_projection.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.biogpt = BioGptModel(config)
        self.output_projection = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.output_projection

    def set_output_embeddings(self, new_embeddings):
        self.output_projection = new_embeddings

    @add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=CausalLMOutputWithCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.biogpt(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.output_projection(sequence_output)

        lm_loss = None
        if labels is not None:
            # we are doing next-token prediction; shift prediction scores and input ids by one
            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
            labels = labels[:, 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[1:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past


@add_start_docstrings(
    """
    BioGPT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    BIOGPT_START_DOCSTRING,
)
class BioGptForTokenClassification(BioGptPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.biogpt = BioGptModel(config)
        if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
            classifier_dropout = config.classifier_dropout
        else:
            classifier_dropout = config.hidden_dropout_prob
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.biogpt(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + transformer_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@add_start_docstrings(
    """
    The BioGpt Model transformer with a sequence classification head on top (linear layer).

    [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it is required to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    BIOGPT_START_DOCSTRING,
)
class BioGptForSequenceClassification(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.biogpt = BioGptModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.biogpt(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device)
            else:
                sequence_lengths = -1
                logger.warning_once(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.biogpt.embed_tokens

    def set_input_embeddings(self, value):
        self.biogpt.embed_tokens = value