
    g?                        d Z ddlZddlmZmZmZ ddlZddlZddlmZ ddl	m
Z
mZmZmZ ddl	mZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ ddl m!Z! ddl"m#Z#  e!j$        e%          Z&dZ'dZ(d-dZ) G d dej*                  Z+ G d dej*                  Z, G d dej*                  Z- G d de          Z.dZ/dZ0 ede/           G d d e.                      Z1 ed!e/           G d" d#e.e                      Z2 ed$e/           G d% d&e.                      Z3 ed'e/           G d( d)e.                      Z4 ed*e/           G d+ d,e.                      Z5dS ).zPyTorch MPT model.    N)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLoss	LayerNormMSELoss)
functional   )add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forward)GenerationMixin)!_prepare_4d_causal_attention_mask))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsQuestionAnsweringModelOutput SequenceClassifierOutputWithPastTokenClassifierOutput)PreTrainedModel)logging   )	MptConfigzmosaicml/mpt-7br      c                 j   t          j        d|z
  dt           j        |                              ddd|          }dt	          j        t	          j        |                     z  }t          j        d|dz   t           j        |                                          }|||z  z  }dt          j	        d|          z  }|                    d|dd          }|| k    rAt          j
        |ddddddf         |ddddddf         gd          ddd| df         }||z  }|                    d          S )	a  
    Link to paper: https://arxiv.org/abs/2108.12409 - Alibi tensor is not causal as the original paper mentions, it
    relies on a translation invariance of softmax for quick implementation. This implementation has been copied from
    the alibi implementation of MPT source code that led to slightly different results than the Bloom alibi:
    https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L292
    r   )dtypedevice         ?N.dimr   )torcharangeint32viewmathceillog2int64floatpowconcatsqueeze)	num_headssequence_lengthalibi_bias_maxr   alibinum_heads_power_of_2baseslopess           `/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/mpt/modeling_mpt.pybuild_mpt_alibi_tensorr7   /   sH    L_,au{6RRRWWXY[\^_apqqE	$)I*>*> ? ??</!35;vVVV\\^^D>$889D59Q%%%F[[0!Q77Fy((vaaaAsl3VAAAsssCK5HIqQQQRSRSRSU_V_U_adRdeFNE==    c            
            e Zd ZdZdef fdZ	 	 d
dej        dej        dee	ej                          deej                 fd	Z
 xZS )MptAttentionzyMulti-head self attention.
    Using torch or triton attention implemetation enables user to also use additive bias.
    configc                    t                                                       |j        | _        |j        | _        |j        | _        | j        | j        z  | _        |j        j        | _        | j        )dt          j
        | j        | j        z            z  | _        |j        j        | _        |j        j        | _        t          j        | j        d| j        z  d          | _        t          j        | j        | j        d          | _        d S )Nr   r   Fbias)super__init__hidden_sizen_headsmax_seq_lenmax_seq_lengthhead_dimattn_configsoftmax_scaler'   sqrt
attn_pdropattn_dropout_pclip_qkvr   LinearWqkvout_projselfr;   	__class__s     r6   r@   zMptAttention.__init__K   s    !-~$0(DL8#/=%!"TYt/?$,/N%O%O!OD$0;*3Id.D4D0D5QQQ		$"2D4D5QQQr8   Nhidden_statesposition_biaspast_key_valueattention_maskc                    |j         d d         \  }}|                     |          }| j        r"|                    | j         | j                  }|                    dd          \  }}	}
|                    ||| j        | j                                      dd          }|	                    ||| j        | j                                      dd          }	|
                    ||| j        | j                                      dd          }
|Tt          |          dk    r<t          j        |d         |	gd          }	t          j        |d         |
gd          }
|	|
f}n|	|
f}t          j        ||	                    dd                    | j        z  }||n||d         j         d         z   }|t          |j                   dk    r$t          d	t          |j                              |	j         d         }t          d|                    d          |z
            }t          d|                    d          |z
            }|d d |d |d f         }||z   }|2|                    |t          j        |j                  j                  }t*          j                            |                                d                              |
j                  }t*          j                            || j        | j        
          }t          j        ||
          }|                    dddd                                                              ||d          }|                      |          }|||fS )Nr   )minmaxr   r!   r   r   z6Expecting position_bias shape to be 3 dimensions, got ptraining)!shaperM   rK   clampchunkreshaperB   rE   	transposelenr#   catmatmulrG   
ValueErrorrX   sizemasked_fillfinfor   rW   r   r   softmaxr+   todropoutrJ   r]   permute
contiguousr&   rN   )rP   rR   rS   rT   rU   
batch_size
seq_length	mixed_qkvquery_states
key_statesvalue_statesattention_scoresquery_length
key_lengthposition_bias_query_indexposition_bias_key_indexattn_weightscontext_statesattn_outputs                      r6   forwardzMptAttention.forwardZ   sk    "/!4RaR!8
JIIm,,	= 	O!T]NNNI1:1J1J.j,#++J
DLRVR_``jjklnopp''
Jdm\\ffghjkll
#++J
DLRVR_``jjklnopp%>""a''"Yq(9:'FANNN
$y.*;\)JPQRRR(,7NN(,7N <j6J6J2r6R6RSSVZVhh%3%;zzn]^N_NefgNhAh$=&''1,, !tZ]^k^qZrZr!t!tuuu#)"-J(+A}/A/A!/D/D|/S(T(T%&)!]-?-?-B-BZ-O&P&P#)!!!-F-G-GI`IaIa*abM/-?%/;;NEKXdXjLkLkLopp },,-=-C-C-E-E2,NNQQR^Rdee},,\T=P[_[h,iilLAA'//1a;;FFHHMMjZdfhiimmN33L.88r8   )NN)__name__
__module____qualname____doc__r   r@   r#   Tensorr   r   r}   __classcell__rQ   s   @r6   r:   r:   F   s         Ry R R R R R R& 9=1559 59|59 |59 !u|!45	59
 !.59 59 59 59 59 59 59 59r8   r:   c                   V     e Zd Zdef fdZdej        dej        dej        fdZ xZS )MptMLPr;   c                 (   t                                                       |j        }t          j        |d|z  d          | _        t          j        d          | _        t          j        d|z  |d          | _        |j	        j
        | _        d S )N   Fr=   none)approximate)r?   r@   rA   r   rL   up_projGELUact	down_projrF   rI   hidden_dropoutrP   r;   rA   rQ   s      r6   r@   zMptMLP.__init__   s    (ya+oEJJJ7v...1{?KeLLL$0;r8   rR   residualreturnc                     |                      |                     |                    }|                     |          }t          j        || j        | j                  }||z   }|S )Nr[   )r   r   r   Frl   r   r]   )rP   rR   r   intermediate_outputoutputs        r6   r}   zMptMLP.forward   s^    m!<!<=="nn];;.$2EPTP]^^^("r8   )	r~   r   r   r   r@   r#   r   r}   r   r   s   @r6   r   r      ss        <y < < < < < <U\ U\ el        r8   r   c                        e Zd Zdef fdZ	 	 	 ddej        dej        dej        deeej        ej        f                  d	e	d
e	fdZ
 xZS )MptBlockr;   c                    t                                                       |j        }t          ||j                  | _        d | j        _        |j        | _        t          |          | _
        t          ||j                  | _        d | j        _        t          |          | _        |j        j        | _        t#          j        | j                  | _        d S )Neps)r?   r@   rA   r	   layer_norm_epsilonnorm_1r>   rB   r/   r:   attnnorm_2r   ffnrF   rI   dropout_rater   Dropoutresid_attn_dropoutr   s      r6   r@   zMptBlock.__init__   s    (1JKKK ((	1JKKK&>>".9"$*T->"?"?r8   NFrR   rS   rU   
layer_past	use_cacheoutput_attentionsc                    |                      |          }|}|                     ||||          \  }	}
}|                     |	          |z   }|                     |          }|}|                     ||          }|f}|r||fz  }|r||
fz  }|S )N)rS   rU   rT   )r   r   r   r   r   )rP   rR   rS   rU   r   r   r   layernorm_outputr   attn_outputsrz   rT   r   outputss                 r6   r}   zMptBlock.forward   s      ;;}55  6:YY')%	 6? 6
 6
2lN //==H;;}55 ! *H55) 	)((G 	'&Gr8   )NFF)r~   r   r   r   r@   r#   r   r   r   boolr}   r   r   s   @r6   r   r      s        @y @ @ @ @ @ @2 CG"'( (|( |( 	(
 U5<#=>?( (  ( ( ( ( ( ( ( (r8   r   c                        e Zd ZeZdZdZdgZdgZ fdZ	de
j        fdZedeeej        ej        f                  d	eeej        ej        f                  fd
            Z xZS )MptPreTrainedModeltransformerTr   z
lm_head.*.c                 :     t                      j        |i | d S N)r?   r@   )rP   inputskwargsrQ   s      r6   r@   zMptPreTrainedModel.__init__   s%    &+F+++++r8   modulec                    t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j        	                                 dS dS t          |t          j
                  r_|j        j                            d| j        j                   |j        +|j        j        |j                 	                                 dS dS t          |t                    rF|j        |j        j        	                                 |j        j                            d           dS dS )zInitialize the weights.g        )meanstdNr    )
isinstancer   rL   weightdatanormal_r;   initializer_ranger>   zero_	Embeddingpadding_idxr	   fill_)rP   r   s     r6   _init_weightsz MptPreTrainedModel._init_weights   s/   fbi(( 	* M&&CT[5R&SSS{& &&((((( '&-- 	*M&&CT[5R&SSS!-"6#56<<>>>>> .-	** 	*{& &&(((M$$S)))))	* 	*r8   rT   r   c                 |    | d         d         j         \  }}||z  t          fd| D                       S )zw
        Converts the cache to the format expected by Mpt, i.e. to tuple(tuple([batch_size * num_heads, ...]))
        r   c              3      K   | ]>}|d                                         |d                                        fV  ?dS r   r   N)ra   ).0r   batch_size_times_num_headsrE   rp   s     r6   	<genexpr>z;MptPreTrainedModel._convert_to_mpt_cache.<locals>.<genexpr>  so       
 

  1%%&@(JWW1%%&@*hWW
 
 
 
 
 
r8   )r^   tuple)rT   ro   r/   r   rE   rp   s      @@@r6   _convert_to_mpt_cachez(MptPreTrainedModel._convert_to_mpt_cache  st     7EQ6G6J6P3
Ix%/)%;"  
 
 
 
 
 

 -
 
 
 
 
 	
r8   )r~   r   r   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_keys_to_ignore_on_load_missingr@   r   Moduler   staticmethodr   r#   r   r   r   r   s   @r6   r   r      s        L%&*##'4o#, , , , ,*BI * * * *" 
eEL%,$>?@
	uU\5</0	1
 
 
 \
 
 
 
 
r8   r   a*  

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`MptConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.n_layers`):
            Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
            `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have
            their past given to this model should not be passed as `input_ids` as they have already been computed.

            Each element of `past_key_values` is a tuple (past_key, past_value):
            - past_key: [batch_size * num_heads, head_dim, kv_length]
            - past_value: [batch_size * num_heads, kv_length, head_dim]
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.

            If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
            `past_key_values`).
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
z]The bare Mpt Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd Zdef fdZd ZddZdej        fdZ	 e
e           eeee	          	 	 	 	 	 	 	 	 dd
eej                 deeeej        ej        f         df                  deej                 deej                 dee         dee         dee         dee         deeej        df         ef         fd                        Z xZS )MptModelr;   c                    t                                                     j        | _        j        | _        t          j        j        | j                  | _        t          j	        fdt          j                  D                       | _        t          | j        j                  | _        d | j        _        d| _        |                                  d S )Nc                 .    g | ]}t                    S  )r   )r   _r;   s     r6   
<listcomp>z%MptModel.__init__.<locals>.<listcomp>f  s!    $V$V$V!Xf%5%5$V$V$Vr8   r   F)r?   r@   rA   rB   r/   r   r   
vocab_sizewte
ModuleListrangen_layersblocksr	   r   norm_fr>   gradient_checkpointing	post_initrO   s    `r6   r@   zMptModel.__init__\  s       !- < 143CDD m$V$V$V$VuV_?U?U$V$V$VWW   0f6OPPP&+# 	r8   c                     | j         S r   r   rP   s    r6   get_input_embeddingszMptModel.get_input_embeddingsr  s	    xr8   r   Nc                 &    t          ||||          S r   )r7   )rP   r/   r0   r1   r   s        r6   r7   zMptModel.build_mpt_alibi_tensoru  s    %i.RXYYYr8   new_embeddingsc                     || _         d S r   r   rP   r   s     r6   set_input_embeddingszMptModel.set_input_embeddingsx  s    !r8   
checkpointoutput_typer   	input_idspast_key_values.rU   inputs_embedsr   r   output_hidden_statesreturn_dictr   c	           
          ||n| j         j        }||n| j         j        }||n| j         j        }||n| j         j        }||t          d          ||j        \  }	}
n||j        \  }	}
}nt          d          |%t          d gt          | j	                  z            }|| 
                    |          }|}|rdnd }|rdnd }|rdnd }| j        r%| j        r|rt                              d           d}|
}d}|d         |d         d         j        d         }||z   }|t          j        |	|f|j                  }n|                    |j                  }|                     | j        | j         j        |j                  }t-          ||	|
f||          }|                                }t1          | j	        |          D ]w\  }}|r||fz   }| j        r(| j        r!|                     |j        ||||||          }n |||||||	          }|d         }|d
u r||d         fz   }|r|||rdnd         fz   }x|                     |          }|r||fz   }|st          d ||||fD                       S t9          ||||          S )NzDYou cannot specify both input_ids and inputs_embeds at the same timez5You have to specify either input_ids or inputs_embedsr   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r   r   )r   rU   r   r   rS   Tr   c              3      K   | ]}||V  	d S r   r   )r   vs     r6   r   z#MptModel.forward.<locals>.<genexpr>  s(      wwqijivivivivivwwr8   )last_hidden_stater   rR   
attentions)r;   r   r   r   use_return_dictrf   r^   r   rc   r   r   r   r]   loggerwarning_oncer#   onesr   rk   r7   r/   rC   r   r   zip_gradient_checkpointing_func__call__r   r   )rP   r   r   rU   r   r   r   r   r   ro   rp   r   rR   presentsall_self_attentionsall_hidden_statesseq_length_with_pastpast_key_values_lengthr2   causal_maskblockr   r   s                          r6   r}   zMptModel.forward{  s   " 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B] ]%>cddd"%._"J

&(5(;%J
AATUUU"#TFS-=-=$=>>O  HHY//M%",22$5?bb4"6@BBD& 	"4= 	" "##p   "	  *!"1)%4Q%7%:%@%C"#7:P#P !"Z5I(JS`SghhhNN+..}/CDDN++DNDK<S\i\p+qq7Z4mE[
 
 "&&((!$T[/!B!B 	^ 	^E:# I$58H$H!* t} ;;N!%   %!)#.'&7"'   $AJMD  #wqzm3  ^&9W)EZQQYZ=[<]&]# M22 	E 1]4D D 	xww]H>OQd$ewwwwww8+$+*	
 
 
 	
r8   r   NNNNNNNNN)r~   r   r   r   r@   r   r7   r#   r   r   r   MPT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   
LongTensorr   r   r   r}   r   r   s   @r6   r   r   W  s       
y      ,  Z Z Z Z"5< " " " " +*+?@@&=$   15SW1548$(,0/3&*m
 m
E,-m
 "%elEL.H(I3(N"OPm
 !.	m

   01m
 D>m
 $D>m
 'tnm
 d^m
 
uU\3&')RR	Sm
 m
 m
  A@m
 m
 m
 m
 m
r8   r   z
    The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                   H    e Zd ZdgZdef fdZd Zdej        fdZ	 e
e           eeee          	 	 	 	 	 	 	 	 	 dd	eej                 d
eeeej        ej        f         df                  deej                 deej                 deej                 dee         dee         dee         dee         deeej                 ef         fd                        Zdeeej        ej        f         df         dej        deeej        ej        f         df         fdZ xZS )MptForCausalLMzlm_head.weightr;   c                     t                                          |           t          |          | _        t	          j        |j        |j        d          | _        | 	                                 d S NFr=   )
r?   r@   r   r   r   rL   rA   r   lm_headr   rO   s     r6   r@   zMptForCausalLM.__init__  sa       #F++y!3V5FUSSS 	r8   c                     | j         S r   r  r   s    r6   get_output_embeddingsz$MptForCausalLM.get_output_embeddings  s
    |r8   r   c                     || _         d S r   r  r   s     r6   set_output_embeddingsz$MptForCausalLM.set_output_embeddings  s    %r8   r   Nr   r   .rU   r   labelsr   r   r   r   r   c
           
      n   |	|	n| j         j        }	|                     ||||||||	          }
|
d         }|                     |          }d}||                    |j                  }|dddddf                                         }|dddf                                         }|j        \  }}}t                      } ||	                    ||z  |          |	                    ||z                      }|	s|f|
dd         z   }||f|z   n|S t          |||
j        |
j        |
j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        Nr   rU   r   r   r   r   r   r   .rY   r   losslogitsr   rR   r   )r;   r   r   r  rk   r   rn   r^   r   r&   r   r   rR   r   )rP   r   r   rU   r   r  r   r   r   r   transformer_outputsrR   	lm_logitsr  shift_logitsshift_labelsro   rp   r   loss_fctr   s                        r6   r}   zMptForCausalLM.forward	  s   0 &1%<kk$+B]"..+)'/!5# / 	
 	
 ,A.LL//	YYy/00F$S#2#qqq[1<<>>L!#qrr'?5577L1=1C.J
J'))H8!!*z"9:FFHYHYZdgqZqHrHr D  	F\$7$;;F)-)9TGf$$vE0/?-;*5
 
 
 	
r8   pastbeam_idxc                 \    fd|D             t          fd|D                       }|S )aL  
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.

        Output shares the same memory storage as `past`.
        c                 Z    i | ]'}|D ]"}|j                             |j                   #(S r   )r   rk   )r   r   
past_stater!  s      r6   
<dictcomp>z1MptForCausalLM._reorder_cache.<locals>.<dictcomp>V  sO     
 
 
BLgq
 
YcJx{{:+<==
 
 
 
r8   c              3      K   | ]^}|d                               d |d          j                           |d                              d |d          j                           fV  _dS r   )index_selectr   )r   r   device_to_beam_idxs     r6   r   z0MptForCausalLM._reorder_cache.<locals>.<genexpr>Y  s       
 

  1**1.@AAU.VWW1**1.@AAU.VWW
 
 
 
 
 
r8   )r   )rP   r   r!  reordered_pastr(  s     ` @r6   _reorder_cachezMptForCausalLM._reorder_cacheK  sn    
 
 
 
PT
 
 
  
 
 
 

 #
 
 
 
 
 r8   	NNNNNNNNN)r~   r   r   _tied_weights_keysr   r@   r  r#   r   r  r   r  r   r  r   r	  r   r
  r   r   r   r}   r*  r   r   s   @r6   r  r    s        ++y        &EL & & & & +*+?@@&5$   15SW1504)-$(,0/3&*:
 :
E,-:
 "%elEL.H(I3(N"OP:
 !.	:

  -:
 &:
 D>:
 $D>:
 'tn:
 d^:
 
uU\"$EE	F:
 :
 :
  A@:
x%el :;S@AMRM]	uU\5</0#5	6       r8   r  a  
    The MPT Model transformer with a sequence classification head on top (linear layer).

    [`MptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                       e Zd Zdef fdZ ee           eee	e
          	 	 	 	 	 	 	 	 	 ddeej                 deeeej        ej        f         df                  deej                 d	eej                 d
eej                 dee         dee         dee         dee         deeej                 e	f         fd                        Z xZS )MptForSequenceClassificationr;   c                     t                                          |           |j        | _        t          |          | _        t          j        |j        |j        d          | _        | 	                                 d S r  )
r?   r@   
num_labelsr   r   r   rL   rA   scorer   rO   s     r6   r@   z%MptForSequenceClassification.__init__s  sk        +#F++Yv163D5QQQ
 	r8   r   Nr   r   .rU   r   r  r   r   r   r   r   c
           
         |	|	n| j         j        }	|                     ||||||||	          }
|
d         }|                     |          }||j        d         }n|j        d         }| j         j        |dk    rt          d          | j         j        d}n|rt          j        || j         j                  	                                
                    d          dz
  }||j        d         z  }|                    |j                  }n)d}t                              | j        j         d           |t          j        ||j                  |f         }d}|.| j         j        f| j        dk    rd	| j         _        nN| j        dk    r7|j        t          j        k    s|j        t          j	        k    rd
| j         _        nd| j         _        | j         j        d	k    rWt-                      }| j        dk    r1 ||                                |                                          }nb |||          }nU| j         j        d
k    rt1                      } |||          }n*| j         j        dk    rt3                      } |||          }|	s|f|
dd         z   }||f|z   n|S t5          |||
j        |
j        |
j                  S )  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.rY   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classificationr  )r;   r   r   r1  r^   pad_token_idrf   r#   eqintargmaxrk   r   r   r   rQ   r~   r$   problem_typer0  r   longr
   r.   r   r   r   r   rR   r   )rP   r   r   rU   r   r  r   r   r   r   r  rR   r  ro   sequence_lengthspooled_logitsr  r  r   s                      r6   r}   z$MptForSequenceClassification.forward|  s   0 &1%<kk$+B]"..+)'/!5# / 	
 	
 ,A.M** "+JJ&,Q/J;#+
a\]]];#+!$#(8It{7O#P#P#T#T#V#V#]#]^`#a#ade#e #3iob6I#I #3#6#6v}#E#E  #% ##~. ^ ^ ^  
 u|Jv}MMMO__`{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8M$9$9$;$;V^^=M=MNNDD#8M6::DD)-JJJ+--xv66)-III,..xv66 	F#%(;ABB(??F)-)9TGf$$vE/ /?-;*5
 
 
 	
r8   r+  )r~   r   r   r   r@   r   r  r   r  r   r	  r   r#   r
  r   r   r   r   r}   r   r   s   @r6   r.  r.  c  s        y       +*+?@@&4$   15SW1504)-$(,0/3&*Z
 Z
E,-Z
 "%elEL.H(I3(N"OPZ
 !.	Z

  -Z
 &Z
 D>Z
 $D>Z
 'tnZ
 d^Z
 
uU\"$DD	EZ
 Z
 Z
  A@Z
 Z
 Z
 Z
 Z
r8   r.  z
    MPT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                       e Zd Zdef fdZ ee           eee	e
          	 	 	 	 	 	 	 	 	 ddeej                 deeeej        ej        f         df                  deej                 d	eej                 d
eej                 dee         dee         dee         dee         deeej                 e	f         fd                        Z xZS )MptForTokenClassificationr;   c                    t                                          |           |j        | _        t          |          | _        t          |d          r|j        |j        }n!t          |d          r|j        |j        }nd}t          j	        |          | _
        t          j        |j        |j                  | _        |                                  d S )Nclassifier_dropoutr   g?)r?   r@   r0  r   r   hasattrrB  r   r   r   rl   rL   rA   
classifierr   )rP   r;   rB  rQ   s      r6   r@   z"MptForTokenClassification.__init__  s        +#F++6/00 	%V5N5Z!'!:V-.. 	%63H3T!'!6!$z"455)F$68IJJ 	r8   r   Nr   r   .rU   r   r  r   r   r   r   r   c
           
         |	|	n| j         j        }	|                     ||||||||	          }|d         }|                     |          }|                     |          }d}|p|                    |j                  }|j        \  }}t                      } ||	                    ||z  | j
                  |	                    ||z                      }|	s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )r3  Nr  r   r   )r  r  rR   r   )r;   r   r   rl   rD  rk   r   r^   r   r&   r0  r   rR   r   )rP   r   r   rU   r   r  r   r   r   r   deprecated_argumentsr  rR   r  r  ro   rp   r  r   s                      r6   r}   z!MptForTokenClassification.forward  sL   2 &1%<kk$+B]"..+)'/!5# / 	
 	
 ,A.]33//YYv}--F%+\"J
'))H8J3T_EEv{{S]`jSjGkGk D  	FY!4QRR!88F)-)9TGf$$vE$-;*5	
 
 
 	
r8   r+  )r~   r   r   r   r@   r   r  r   r  r   r	  r   r#   r
  r   r   r   r   r}   r   r   s   @r6   r@  r@    sz       y      " +*+?@@&)$   15SW1504)-$(,0/3&*7
 7
E,-7
 "%elEL.H(I3(N"OP7
 !.	7

  -7
 &7
 D>7
 $D>7
 'tn7
 d^7
 
uU\"$99	:7
 7
 7
  A@7
 7
 7
 7
 7
r8   r@  z
    The MPT Model transformer with a span classification head on top for extractive question-answering tasks like SQuAD
    (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                   F    e Zd Z fdZ ee                    d                    	 	 	 	 	 	 	 	 ddeej	                 deej
                 deej
                 deej	                 deej	                 d	ee         d
ee         dee         deeef         fd            Z xZS )MptForQuestionAnsweringc                     t                                          |           t          |          | _        t	          j        |j        d          | _        |                                  d S )Nr   )	r?   r@   r   r   r   rL   rA   
qa_outputsr   rO   s     r6   r@   z MptForQuestionAnswering.__init__@  sY       #F++)F$6:: 	r8   zbatch_size, sequence_lengthNr   rU   r   start_positionsend_positionsr   r   r   r   c	                    ||n| j         j        }|                     ||||||          }	|	d         }
|                     |
          }|                    dd          \  }}|                    d                                          }|                    d                                          }d}||t          |                                          dk    r|                    d          }t          |                                          dk    r|                    d          }|                    d          }|	                    d|          }|	                    d|          }t          |          } |||          } |||          }||z   dz  }|s||f|	dd         z   }||f|z   n|S t          ||||	j        |	j        	          S )
a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        N)rU   r   r   r   r   r   r   rY   r!   )ignore_indexr   )r  start_logits
end_logitsrR   r   )r;   r   r   rJ  splitr.   rn   rc   rg   r_   r   r   rR   r   )rP   r   rU   r   rK  rL  r   r   r   r   sequence_outputr  rO  rP  
total_lossignored_indexr  
start_lossend_lossr   s                       r6   r}   zMptForQuestionAnswering.forwardH  s   , &1%<kk$+B]"")'/!5# # 
 
 "!*11#)<<r<#:#: j#++B//::<<''++6688

&=+D?''))**Q.."1"9"9""="==%%''((1,, - 5 5b 9 9(--a00M-33A}EEO)//=AAM']CCCH!,@@Jx
M::H$x/14J 	R"J/'!""+=F/9/EZMF**6Q+%!!/)
 
 
 	
r8   r  )r~   r   r   r@   r   r  formatr   r#   r
  FloatTensorr   r   r   r   r}   r   r   s   @r6   rH  rH  8  s=            +*+?+F+FGd+e+eff 156:596:48,0/3&*B
 B
E,-B
 !!23B
   12	B

 "%"23B
   01B
 $D>B
 'tnB
 d^B
 
u22	3B
 B
 B
 gfB
 B
 B
 B
 B
r8   rH  r  )6r   r'   typingr   r   r   r#   torch.utils.checkpointr   torch.nnr   r   r	   r
   r   r   
file_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   configuration_mptr   
get_loggerr~   r   r  r	  r7   r   r:   r   r   r   MPT_START_DOCSTRINGr  r   r  r.  r@  rH  r   r8   r6   <module>re     s      ) ) ) ) ) ) ) ) ) )            L L L L L L L L L L L L $ $ $ $ $ $ q q q q q q q q q q ) ) ) ) ) ) I I I I I I              . - - - - -       ( ( ( ( ( ( 
	H	%	%'    .I9 I9 I9 I9 I929 I9 I9 I9X    RY   *= = = = =ry = = =@,
 ,
 ,
 ,
 ,
 ,
 ,
 ,
^ / d c S
 S
 S
 S
 S
! S
 S
	 S
l   h h h h h' h h hV   j
 j
 j
 j
 j
#5 j
 j
 j
Z   O
 O
 O
 O
 O
 2 O
 O
 O
d   L
 L
 L
 L
 L
0 L
 L
 L
 L
 L
r8   