
    g                       d dl Z d dlZd dlZd dlmZ d dlmZmZmZ d dl	Z	d dl	m
Z
 d dlmZmZmZ ddlmZ ddlmZmZmZmZmZmZmZmZ dd	lmZ dd
lmZmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'  e$j(        e)          Z*dZ+dZ,dZ-dZ.dZ/dZ0dZ1dZ2dZ3dZ4dZ5dZ6dZ7d Z8 G d de
j9                  Z:e
j;        e:dZ< G d d e
j9                  Z= G d! d"e
j9                  Z> G d# d$e
j9                  Z? G d% d&e
j9                  Z@ G d' d(e
j9                  ZA G d) d*e
j9                  ZB G d+ d,e
j9                  ZC G d- d.e
j9                  ZD G d/ d0e
j9                  ZE G d1 d2e
j9                  ZF G d3 d4e
j9                  ZG G d5 d6e
j9                  ZH G d7 d8e
j9                  ZI G d9 d:e
j9                  ZJ G d; d<e
j9                  ZK G d= d>e
j9                  ZL G d? d@e
j9                  ZM G dA dBe
j9                  ZN G dC dDe          ZOe G dE dFe                       ZPdGZQdHZR e"dIeQ           G dJ dKeO                      ZS e"dLeQ           G dM dNeO                      ZT e"dOeQ           G dP dQeO                      ZU G dR dSe
j9                  ZV e"dTeQ           G dU dVeO                      ZW e"dWeQ           G dX dYeO                      ZX e"dZeQ           G d[ d\eO                      ZY e"d]eQ           G d^ d_eO                      ZZ e"d`eQ           G da dbeO                      Z[dS )c    N)	dataclass)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputMultipleChoiceModelOutputNextSentencePredictorOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputadd_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )MobileBertConfigzgoogle/mobilebert-uncasedr   z mrm8488/mobilebert-finetuned-nerzK['I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC']gQ?z#csarron/mobilebert-uncased-squad-v2z'a nice puppet'gףp=
@      zlordtt13/emo-mobilebertz'others'z4.72c           	         	 ddl }ddl}ddl}n)# t          $ r t                              d            w xY wt          j                            |          }t          	                    d|            |j
                            |          }g }g }	|D ]j\  }
}t          	                    d|
 d|            |j
                            ||
          }|                    |
           |	                    |           kt          ||	          D ]\  }
}|
                    dd          }
|
                    d	d
          }
|
                    dd          }
|
                    dd          }
|
                    d          }
t#          d |
D                       r1t          	                    dd                    |
                      | }|
D ]H}|                    d|          r|                    d|          }n|g}|d         dk    s|d         dk    rt)          |d          }n|d         dk    s|d         dk    rt)          |d          }n|d         dk    rt)          |d          }nv|d         dk    rt)          |d          }nY	 t)          ||d                   }nA# t*          $ r4 t          	                    dd                    |
                      Y w xY wt-          |          dk    rt/          |d                   }||         }J|dd         d k    rt)          |d          }n|dk    r|                    |          }	 |j        |j        k    sJ d!|j         d"|j         d#            n/# t4          $ r"}|xj        |j        |j        fz  c_         d}~ww xY wt          	                    d$|
            t9          j        |          |_        | S )%z'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape 	ffn_layerffnFakeLayerNorm	LayerNormextra_output_weightszdense/kernelbert
mobilebert/c              3      K   | ]}|d v V  	dS ))adam_vadam_mAdamWeightDecayOptimizerAdamWeightDecayOptimizer_1global_stepN ).0ns     n/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/mobilebert/modeling_mobilebert.py	<genexpr>z0load_tf_weights_in_mobilebert.<locals>.<genexpr>q   s<       
 
 nn
 
 
 
 
 
    z	Skipping z[A-Za-z]+_\d+z_(\d+)kernelgammaweightoutput_biasbetabiasoutput_weightssquad
classifier   r   i_embeddingszPointer shape z and array shape z mismatchedzInitialize PyTorch weight )renumpy
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variableappendzipreplacesplitanyjoin	fullmatchgetattrAttributeErrorlenint	transposeshapeAssertionErrorargstorch
from_numpydata)modelconfigtf_checkpoint_pathrB   nptftf_path	init_varsnamesarraysnamer[   arraypointerm_namescope_namesnumes                     r4   load_tf_weights_in_mobilebertrq   P   sa   
			   Q	
 	
 	
 	 goo011G
KKBBBCCC''00IEF   eBBB5BBCCC&&w55Te5&)) 1/ 1/e||K//||O[99||2NCC||FL11zz#  
 

 
 
 
 
 	 KK4CHHTNN44555 	' 	'F||,f55 ' hhy&99%h1~))[^w-F-F!'844Q=00KNf4L4L!'622Q#333!'844Q7**!'<88%g{1~>>GG%   KK <CHHTNN < <===H ;1$$+a.))!#,#$$<=((gx00GGxLL''E	,,,XXXXXX -,,, 	 	 	FFw}ek22FF	 	777888'..Ls2    &5J'':K%$K%+N
N-N((N-c                   D     e Zd Zd fd	Zdej        dej        fdZ xZS )NoNormNc                     t                                                       t          j        t	          j        |                    | _        t          j        t	          j        |                    | _        d S N)	super__init__r   	Parameterr^   zerosr<   onesr9   )self	feat_sizeeps	__class__s      r4   rw   zNoNorm.__init__   sS    LY!7!788	l5:i#8#899r6   input_tensorreturnc                 &    || j         z  | j        z   S ru   )r9   r<   )r{   r   s     r4   forwardzNoNorm.forward   s    dk)DI55r6   ru   __name__
__module____qualname__rw   r^   Tensorr   __classcell__r~   s   @r4   rs   rs      sc        : : : : : :
6EL 6U\ 6 6 6 6 6 6 6 6r6   rs   )
layer_normno_normc                        e Zd ZdZ fdZ	 	 	 	 d
deej                 deej                 deej                 deej                 dej	        f
d	Z
 xZS )MobileBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t                                                       |j        | _        |j        | _        |j        | _        t          j        |j        |j        |j                  | _	        t          j        |j
        |j                  | _        t          j        |j        |j                  | _        | j        rdnd}| j        |z  }t          j        ||j                  | _        t!          |j                 |j                  | _        t          j        |j                  | _        |                     dt/          j        |j
                                      d          d           d S )N)padding_idxr   r   position_ids)r   F)
persistent)rv   rw   trigram_inputembedding_sizehidden_sizer   	Embedding
vocab_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddingsLinearembedding_transformationNORM2FNnormalization_typer&   Dropouthidden_dropout_probdropoutregister_bufferr^   arangeexpand)r{   rb   embed_dim_multiplierembedded_input_sizer~   s       r4   rw   zMobileBertEmbeddings.__init__   sF   #1$3!-!|F,=v?Tbhbuvvv#%<0NPVPb#c#c %'\&2H&J\%]%]"$($6=qqA"14HH(*	2EvGY(Z(Z% !:;F<NOOz&"<== 	EL)GHHOOPWXXej 	 	
 	
 	
 	
 	
r6   N	input_idstoken_type_idsr   inputs_embedsr   c           
      6   ||                                 }n|                                 d d         }|d         }|| j        d d d |f         }|+t          j        |t          j        | j        j                  }||                     |          }| j        rut          j        t          j
                            |d d dd f         g dd          |t          j
                            |d d d df         g dd          gd	          }| j        s| j        | j        k    r|                     |          }|                     |          }|                     |          }||z   |z   }	|                     |	          }	|                     |	          }	|	S )
Nr   r   dtypedevice)r   r   r   r   r   r           )value)r   r   r   r   r   r   r@   dim)sizer   r^   ry   longr   r   r   catr   
functionalpadr   r   r   r   r   r&   r   )
r{   r   r   r   r   input_shape
seq_lengthr   r   
embeddingss
             r4   r   zMobileBertEmbeddings.forward   s     #..**KK',,..ss3K ^
,QQQ^<L!"[EJtO`OghhhN  00;;M 	 "IM%%mAAAqrrE&:<N<N<NVY%ZZ!M%%mAAAssF&;=O=O=OWZ%[[
   M  	I!48H!H!H 99-HHM #66|DD $ : :> J J"%88;PP
^^J//
\\*--
r6   )NNNN)r   r   r   __doc__rw   r   r^   
LongTensorFloatTensorr   r   r   r   s   @r4   r   r      s        QQ
 
 
 
 
0 155937590 0E,-0 !!120 u/0	0
   120 
0 0 0 0 0 0 0 0r6   r   c                        e Zd Z fdZd Z	 	 	 ddej        dej        dej        deej                 deej                 d	ee	         d
e
ej                 fdZ xZS )MobileBertSelfAttentionc                    t                                                       |j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t          j        |j        | j                  | _	        t          j        |j        | j                  | _
        t          j        |j        r|j        n|j        | j                  | _        t          j        |j                  | _        d S ru   )rv   rw   num_attention_headsrY   true_hidden_sizeattention_head_sizeall_head_sizer   r   querykeyuse_bottleneck_attentionr   r   r   attention_probs_dropout_probr   r{   rb   r~   s     r4   rw   z MobileBertSelfAttention.__init__   s    #)#= #&v'>A['[#\#\ !58PPYv68JKK
9V4d6HIIY'-'F^F##FL^`d`r
 

 z&"EFFr6   c                     |                                 d d         | j        | j        fz   }|                    |          }|                    dddd          S )Nr   r   r@   r   r   )r   r   r   viewpermute)r{   xnew_x_shapes      r4   transpose_for_scoresz,MobileBertSelfAttention.transpose_for_scores  sP    ffhhssmt'?AY&ZZFF;yyAq!$$$r6   Nquery_tensor
key_tensorvalue_tensorattention_mask	head_maskoutput_attentionsr   c                    |                      |          }|                     |          }|                     |          }	|                     |          }
|                     |          }|                     |	          }t	          j        |
|                    dd                    }|t          j        | j	                  z  }|||z   }t          j                            |d          }|                     |          }|||z  }t	          j        ||          }|                    dddd                                          }|                                d d         | j        fz   }|                    |          }|r||fn|f}|S )Nr   r   r   r@   r   r   )r   r   r   r   r^   matmulrZ   mathsqrtr   r   r   softmaxr   r   
contiguousr   r   r   )r{   r   r   r   r   r   r   mixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                     r4   r   zMobileBertSelfAttention.forward
  s    !JJ|44((:.. JJ|44//0ABB--o>>	//0ABB !<Y5H5HR5P5PQQ+di8P.Q.QQ%/.@-//0@b/II ,,77 -	9O_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S%**+BCC6G]=/22mM]r6   NNN)r   r   r   rw   r   r^   r   r   r   boolr   r   r   r   s   @r4   r   r      s        G G G G G% % % 7;15,0$ $l$ L$ l	$
 !!23$ E-.$ $D>$ 
u|	$ $ $ $ $ $ $ $r6   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )MobileBertSelfOutputc                 L   t                                                       |j        | _        t          j        |j        |j                  | _        t          |j                 |j        |j	                  | _
        | j        s t          j        |j                  | _        d S d S Nr}   )rv   rw   use_bottleneckr   r   r   denser   r   layer_norm_epsr&   r   r   r   r   s     r4   rw   zMobileBertSelfOutput.__init__2  s    $3Yv68OPP
 !:;F<SY_Ynooo" 	B:f&@AADLLL	B 	Br6   hidden_statesresidual_tensorr   c                     |                      |          }| j        s|                     |          }|                     ||z             }|S ru   )r   r   r   r&   r{   r   r   layer_outputss       r4   r   zMobileBertSelfOutput.forward:  sK    

=11" 	8 LL77M}'FGGr6   r   r   s   @r4   r   r   1  sn        B B B B BU\ EL UZUa        r6   r   c                        e Zd Z fdZd Z	 	 	 ddej        dej        dej        dej        deej                 d	eej                 d
ee	         de
ej                 fdZ xZS )MobileBertAttentionc                     t                                                       t          |          | _        t	          |          | _        t                      | _        d S ru   )rv   rw   r   r{   r   outputsetpruned_headsr   s     r4   rw   zMobileBertAttention.__init__C  sI    +F33	*622EEr6   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   r   )rX   r   r{   r   r   r   r   r   r   r   r   r   r   union)r{   headsindexs      r4   prune_headszMobileBertAttention.prune_headsI  s    u::??F7490$)2OQUQb
 
u
 -TY_eDD	*49=%@@	,TY_eDD	.t{/@%QOOO )-	(EE

(R	%"&)"?$)B_"_	 -33E::r6   Nr   r   r   layer_inputr   r   r   r   c                     |                      ||||||          }|                     |d         |          }	|	f|dd          z   }
|
S )Nr   r   )r{   r   )r{   r   r   r   r  r   r   r   self_outputsattention_outputr   s              r4   r   zMobileBertAttention.forward[  s^     yy
 
  ;;|ADD#%QRR(88r6   r   )r   r   r   rw   r  r^   r   r   r   r   r   r   r   r   s   @r4   r   r   B  s        " " " " "; ; ;0 7;15,0 l L l	
 \ !!23 E-. $D> 
u|	       r6   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )MobileBertIntermediatec                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S ru   )rv   rw   r   r   r   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr   s     r4   rw   zMobileBertIntermediate.__init__u  sn    Yv68PQQ
f'-- 	9'-f.?'@D$$$'-'8D$$$r6   r   r   c                 Z    |                      |          }|                     |          }|S ru   )r   r  r{   r   s     r4   r   zMobileBertIntermediate.forward}  s,    

=1100??r6   r   r   s   @r4   r	  r	  t  s^        9 9 9 9 9U\ el        r6   r	  c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )OutputBottleneckc                 "   t                                                       t          j        |j        |j                  | _        t          |j                 |j        |j	                  | _
        t          j        |j                  | _        d S r   )rv   rw   r   r   r   r   r   r   r   r   r&   r   r   r   r   s     r4   rw   zOutputBottleneck.__init__  sm    Yv68JKK
 !:;F<NTZTijjjz&"<==r6   r   r   r   c                     |                      |          }|                     |          }|                     ||z             }|S ru   )r   r   r&   r   s       r4   r   zOutputBottleneck.forward  s@    

=11]33}'FGGr6   r   r   s   @r4   r  r    si        > > > > >U\ EL UZUa        r6   r  c                   ^     e Zd Z fdZdej        dej        dej        dej        fdZ xZS )MobileBertOutputc                 f   t                                                       |j        | _        t          j        |j        |j                  | _        t          |j	                 |j                  | _
        | j        s t          j        |j                  | _        d S t          |          | _        d S ru   )rv   rw   r   r   r   r  r   r   r   r   r&   r   r   r   r  
bottleneckr   s     r4   rw   zMobileBertOutput.__init__  s    $3Yv79PQQ
 !:;F<STT" 	7:f&@AADLLL.v66DOOOr6   intermediate_statesresidual_tensor_1residual_tensor_2r   c                     |                      |          }| j        s.|                     |          }|                     ||z             }n.|                     ||z             }|                     ||          }|S ru   )r   r   r   r&   r  )r{   r  r  r  layer_outputs        r4   r   zMobileBertOutput.forward  s}     zz"566" 	L<<55L>>,9J*JKKLL>>,9J*JKKL??<9JKKLr6   r   r   s   @r4   r  r    su        7 7 7 7 7
#(<
DIL
ejeq
	
 
 
 
 
 
 
 
r6   r  c                   B     e Zd Z fdZdej        dej        fdZ xZS )BottleneckLayerc                     t                                                       t          j        |j        |j                  | _        t          |j                 |j        |j	                  | _
        d S r   )rv   rw   r   r   r   intra_bottleneck_sizer   r   r   r   r&   r   s     r4   rw   zBottleneckLayer.__init__  sY    Yv163OPP
 !:;F<X^d^stttr6   r   r   c                 Z    |                      |          }|                     |          }|S ru   r   r&   )r{   r   r  s      r4   r   zBottleneckLayer.forward  s*    jj//nn[11r6   r   r   s   @r4   r   r     sc        u u u u u
U\ el        r6   r   c                   N     e Zd Z fdZdej        deej                 fdZ xZS )
Bottleneckc                     t                                                       |j        | _        |j        | _        t	          |          | _        | j        rt	          |          | _        d S d S ru   )rv   rw   key_query_shared_bottleneckr   r   input	attentionr   s     r4   rw   zBottleneck.__init__  sf    +1+M((.(G%$V,,
+ 	5,V44DNNN	5 	5r6   r   r   c                     |                      |          }| j        r|fdz  S | j        r|                     |          }||||fS ||||fS )N   )r)  r   r(  r*  )r{   r   bottlenecked_hidden_statesshared_attention_inputs       r4   r   zBottleneck.forward  so    " &*ZZ%>%>"( 	].0144- 	]%)^^M%B%B"*,BMSmnn!=-A[\\r6   	r   r   r   rw   r^   r   r   r   r   r   s   @r4   r&  r&    sm        5 5 5 5 5]U\ ]eEL6I ] ] ] ] ] ] ] ]r6   r&  c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )	FFNOutputc                     t                                                       t          j        |j        |j                  | _        t          |j                 |j        |j	                  | _
        d S r   )rv   rw   r   r   r  r   r   r   r   r   r&   r   s     r4   rw   zFFNOutput.__init__  sY    Yv79PQQ
 !:;F<SY_Ynooor6   r   r   r   c                 `    |                      |          }|                     ||z             }|S ru   r$  r   s       r4   r   zFFNOutput.forward  s/    

=11}'FGGr6   r   r   s   @r4   r1  r1    sn        p p p p p
U\ EL UZUa        r6   r1  c                   B     e Zd Z fdZdej        dej        fdZ xZS )FFNLayerc                     t                                                       t          |          | _        t	          |          | _        d S ru   )rv   rw   r	  intermediater1  r   r   s     r4   rw   zFFNLayer.__init__  s<    26::''r6   r   r   c                 \    |                      |          }|                     ||          }|S ru   )r7  r   )r{   r   intermediate_outputr   s       r4   r   zFFNLayer.forward  s0    "//>>$7GGr6   r   r   s   @r4   r5  r5    s^        ( ( ( ( (
U\ el        r6   r5  c                        e Zd Z fdZ	 	 	 d	dej        deej                 deej                 dee         de	ej                 f
dZ
 xZS )
MobileBertLayerc                    t                                                       j        | _        j        | _        t	                    | _        t                    | _        t                    | _	        | j        rt                    | _        j        dk    r<t          j        fdt          j        dz
            D                       | _        d S d S )Nr   c                 .    g | ]}t                    S r1   )r5  r2   _rb   s     r4   
<listcomp>z,MobileBertLayer.__init__.<locals>.<listcomp>  s!    %k%k%k1hv&6&6%k%k%kr6   )rv   rw   r   num_feedforward_networksr   r*  r	  r7  r  r   r&  r  r   
ModuleListranger$   r   s    `r4   rw   zMobileBertLayer.__init__  s    $3(.(G%,V4426::&v.. 	1(00DO*Q..}%k%k%k%kfFehiFi@j@j%k%k%kllDHHH /.r6   Nr   r   r   r   r   c           	         | j         r|                     |          \  }}}}n|gdz  \  }}}}|                     |||||||          }	|	d         }
|
f}|	dd          }| j        dk    r+t	          | j                  D ]\  }} ||
          }
||
fz  }|                     |
          }|                     ||
|          }|f|z   t          j	        d          |||||
|fz   |z   }|S )Nr,  )r   r   r   i  )
r   r  r*  rA  	enumerater$   r7  r   r^   tensor)r{   r   r   r   r   r   r   r   r  self_attention_outputsr  sr   i
ffn_moduler9  r  s                    r4   r   zMobileBertLayer.forward  sZ     	VBF//R_B`B`?L*lKKCP/TUBU?L*lK!%/ "0 "
 "
 2!4(,(A--!*48!4!4 ) ):#-:.>#?#? &(("//0@AA{{#68H-XXO T"" #
  	 r6   r   )r   r   r   rw   r^   r   r   r   r   r   r   r   r   s   @r4   r;  r;    s        m m m m m  7;15,0. .|. !!23. E-.	.
 $D>. 
u|	. . . . . . . .r6   r;  c                        e Zd Z fdZ	 	 	 	 	 ddej        deej                 deej                 dee         d	ee         d
ee         de	e
ef         fdZ xZS )MobileBertEncoderc                     t                                                       t          j        fdt	          j                  D                       | _        d S )Nc                 .    g | ]}t                    S r1   )r;  r>  s     r4   r@  z.MobileBertEncoder.__init__.<locals>.<listcomp>3  s!    #e#e#eOF$;$;#e#e#er6   )rv   rw   r   rB  rC  num_hidden_layerslayerr   s    `r4   rw   zMobileBertEncoder.__init__1  sO    ]#e#e#e#eU6KcEdEd#e#e#eff


r6   NFTr   r   r   r   output_hidden_statesreturn_dictr   c                 "   |rdnd }|rdnd }t          | j                  D ]7\  }	}
|r||fz   } |
||||	         |          }|d         }|r||d         fz   }8|r||fz   }|st          d |||fD                       S t          |||          S )Nr1   r   r   c              3      K   | ]}||V  	d S ru   r1   )r2   vs     r4   r5   z,MobileBertEncoder.forward.<locals>.<genexpr>T  s(      hhqZ[ZgZgZgZgZghhr6   )last_hidden_stater   
attentions)rE  rP  tupler   )r{   r   r   r   r   rQ  rR  all_hidden_statesall_attentionsrI  layer_moduler   s               r4   r   zMobileBertEncoder.forward5  s    #7@BBD0:d(44 	F 	FOA|# I$58H$H!(L!!	 M *!,M  F!/=3C2E!E   	E 1]4D D 	ihh]4E~$Vhhhhhh+;LYg
 
 
 	
r6   )NNFFT)r   r   r   rw   r^   r   r   r   r   r   r   r   r   r   r   s   @r4   rL  rL  0  s        g g g g g 7;15,1/4&*"
 "
|"
 !!23"
 E-.	"

 $D>"
 'tn"
 d^"
 
uo%	&"
 "
 "
 "
 "
 "
 "
 "
r6   rL  c                   B     e Zd Z fdZdej        dej        fdZ xZS )MobileBertPoolerc                     t                                                       |j        | _        | j        r&t	          j        |j        |j                  | _        d S d S ru   )rv   rw   classifier_activationdo_activater   r   r   r   r   s     r4   rw   zMobileBertPooler.__init__[  sY    !7 	K6#5v7IJJDJJJ	K 	Kr6   r   r   c                     |d d df         }| j         s|S |                     |          }t          j        |          }|S )Nr   )r`  r   r^   tanh)r{   r   first_token_tensorpooled_outputs       r4   r   zMobileBertPooler.forwarda  sO     +111a40 	!%% JJ'9::M!J}55M  r6   r   r   s   @r4   r]  r]  Z  sc        K K K K K	!U\ 	!el 	! 	! 	! 	! 	! 	! 	! 	!r6   r]  c                   B     e Zd Z fdZdej        dej        fdZ xZS )!MobileBertPredictionHeadTransformc                 X   t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _
        n|j        | _
        t          d         |j        |j                  | _        d S )Nr   r   )rv   rw   r   r   r   r   r  r  r  r   transform_act_fnr   r   r&   r   s     r4   rw   z*MobileBertPredictionHeadTransform.__init__n  s    Yv163EFF
f'-- 	6$*6+<$=D!!$*$5D! .v/AvG\]]]r6   r   r   c                     |                      |          }|                     |          }|                     |          }|S ru   )r   rh  r&   r  s     r4   r   z)MobileBertPredictionHeadTransform.forwardw  s=    

=11--m<<}55r6   r   r   s   @r4   rf  rf  m  sc        ^ ^ ^ ^ ^U\ el        r6   rf  c                   J     e Zd Z fdZddZdej        dej        fdZ xZS )MobileBertLMPredictionHeadc                    t                                                       t          |          | _        t	          j        |j        |j        |j        z
  d          | _	        t	          j        |j        |j        d          | _
        t	          j        t          j        |j                            | _        | j        | j
        _        d S )NF)r<   )rv   rw   rf  	transformr   r   r   r   r   r   decoderrx   r^   ry   r<   r   s     r4   rw   z#MobileBertLMPredictionHead.__init__  s    :6BB Yv0&2DvG\2\chiii
y!68IPUVVVLV->!?!?@@	 Ir6   r   Nc                 (    | j         | j        _         d S ru   )r<   rn  r{   s    r4   _tie_weightsz'MobileBertLMPredictionHead._tie_weights  s     Ir6   r   c                     |                      |          }|                    t          j        | j        j                                        | j        j        gd                    }|| j        j        z  }|S )Nr   r   )	rm  r   r^   r   rn  r9   tr   r<   r  s     r4   r   z"MobileBertLMPredictionHead.forward  sh    }55%,,UY8K8M8M8O8OQUQ[Qb7cij-k-k-kll**r6   )r   N)	r   r   r   rw   rq  r^   r   r   r   r   s   @r4   rk  rk  ~  sr        	& 	& 	& 	& 	&& & & &U\ el        r6   rk  c                   B     e Zd Z fdZdej        dej        fdZ xZS )MobileBertOnlyMLMHeadc                 p    t                                                       t          |          | _        d S ru   )rv   rw   rk  predictionsr   s     r4   rw   zMobileBertOnlyMLMHead.__init__  s/    5f==r6   sequence_outputr   c                 0    |                      |          }|S ru   )rw  )r{   rx  prediction_scoress      r4   r   zMobileBertOnlyMLMHead.forward  s     ,,_==  r6   r   r   s   @r4   ru  ru    s^        > > > > >!u| ! ! ! ! ! ! ! ! !r6   ru  c                   \     e Zd Z fdZdej        dej        deej                 fdZ xZS )MobileBertPreTrainingHeadsc                     t                                                       t          |          | _        t	          j        |j        d          | _        d S Nr@   )rv   rw   rk  rw  r   r   r   seq_relationshipr   s     r4   rw   z#MobileBertPreTrainingHeads.__init__  sF    5f== "	&*<a @ @r6   rx  rd  r   c                 ^    |                      |          }|                     |          }||fS ru   )rw  r  )r{   rx  rd  rz  seq_relationship_scores        r4   r   z"MobileBertPreTrainingHeads.forward  s6     ,,_==!%!6!6}!E!E "888r6   r/  r   s   @r4   r|  r|    st        A A A A A
9u| 9EL 9UZ[`[gUh 9 9 9 9 9 9 9 9r6   r|  c                   $    e Zd ZdZeZeZdZd Z	dS )MobileBertPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    r)   c                    t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j        	                                 dS dS t          |t          j
                  r_|j        j                            d| j        j                   |j        +|j        j        |j                 	                                 dS dS t          |t          j        t          f          r?|j        j        	                                 |j        j                            d           dS dS )zInitialize the weightsr   )meanstdNg      ?)r  r   r   r9   r`   normal_rb   initializer_ranger<   zero_r   r   r&   rs   fill_)r{   modules     r4   _init_weightsz'MobileBertPreTrainedModel._init_weights  s.   fbi(( 	* M&&CT[5R&SSS{& &&((((( '&-- 	*M&&CT[5R&SSS!-"6#56<<>>>>> .-v 677 	*K""$$$M$$S)))))	* 	*r6   N)
r   r   r   r   r   config_classrq   load_tf_weightsbase_model_prefixr  r1   r6   r4   r  r    s?         
 $L3O$* * * * *r6   r  c                       e Zd ZU dZdZeej                 ed<   dZ	ej        ed<   dZ
ej        ed<   dZeeej                          ed<   dZeeej                          ed<   dS )MobileBertForPreTrainingOutputab  
    Output type of [`MobileBertForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlossprediction_logitsseq_relationship_logitsr   rW  )r   r   r   r   r  r   r^   r   __annotations__r  r  r   r   rW  r1   r6   r4   r  r    s          2 )-D(5$
%,,,+/u(///15U.5558<M8E%"345<<<59Ju01299999r6   r  aD  

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`MobileBertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a5
  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zdThe bare MobileBert Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd ZdZd fd	Zd Zd Zd Z ee	
                    d                     eeee          	 	 	 	 	 	 	 	 	 dd
eej                 deej                 deej                 deej                 deej                 deej                 dee         dee         dee         deeef         fd                        Z xZS )MobileBertModelz.
    https://arxiv.org/pdf/2004.02984.pdf
    Tc                     t                                          |           || _        t          |          | _        t          |          | _        |rt          |          nd | _        | 	                                 d S ru   )
rv   rw   rb   r   r   rL  encoderr]  pooler	post_init)r{   rb   add_pooling_layerr~   s      r4   rw   zMobileBertModel.__init__2  sr       .v66(002CM&v... 	r6   c                     | j         j        S ru   r   r   rp  s    r4   get_input_embeddingsz$MobileBertModel.get_input_embeddings=  s    ..r6   c                     || j         _        d S ru   r  )r{   r   s     r4   set_input_embeddingsz$MobileBertModel.set_input_embeddings@  s    */'''r6   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  rP  r*  r  )r{   heads_to_prunerP  r  s       r4   _prune_headszMobileBertModel._prune_headsC  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr6   batch_size, sequence_length
checkpointoutput_typer  Nr   r   r   r   r   r   rQ  r   rR  r   c
                 P   ||n| j         j        }||n| j         j        }|	|	n| j         j        }	||t	          d          |+|                     ||           |                                }
n.||                                d d         }
nt	          d          ||j        n|j        }|t          j	        |
|          }|!t          j
        |
t          j        |          }|                     ||
          }|                     || j         j                  }|                     ||||          }|                     ||||||	          }|d         }| j        |                     |          nd }|	s||f|d	d          z   S t%          |||j        |j        
          S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)r   r   )r   r   r   r   )r   r   r   rQ  rR  r   r   )rV  pooler_outputr   rW  )rb   r   rQ  use_return_dict
ValueError%warn_if_padding_and_no_attention_maskr   r   r^   rz   ry   r   get_extended_attention_maskget_head_maskrO  r   r  r  r   r   rW  )r{   r   r   r   r   r   r   rQ  r   rR  r   r   extended_attention_maskembedding_outputencoder_outputsrx  rd  s                    r4   r   zMobileBertModel.forwardK  s	   $ 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] ]%>cddd"66y.QQQ#..**KK&',,..ss3KKTUUU%.%:!!@T!"ZFCCCN!"[EJvVVVN 150P0PQ_al0m0m &&y$+2OPP	??l>iv + 
 
 ,,2/!5# ' 
 
 *!,8<8OO444UY 	J#]3oabb6III)-')7&1	
 
 
 	
r6   )T)	NNNNNNNNN)r   r   r   r   rw   r  r  r  r   MOBILEBERT_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   r^   r   r   r   r   r   r   r   r   s   @r4   r  r  )  s       
 	 	 	 	 	 	/ / /0 0 0C C C +*+F+M+MNk+l+lmm&.$   156:59371559/3,0&*D
 D
E,-D
 !!23D
 !!12	D

 u/0D
 E-.D
   12D
 'tnD
 $D>D
 d^D
 
u00	1D
 D
 D
  nmD
 D
 D
 D
 D
r6   r  z
    MobileBert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `next sentence prediction (classification)` head.
    c                       e Zd ZddgZ fdZd Zd Zddee         de	j
        f fd	Z ee                    d
                     eee          	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 deej                 deej                 deej                 deej                 deej                 deeef         fd                        Z xZS )MobileBertForPreTrainingcls.predictions.decoder.weightcls.predictions.decoder.biasc                     t                                          |           t          |          | _        t	          |          | _        |                                  d S ru   )rv   rw   r  r)   r|  clsr  r   s     r4   rw   z!MobileBertForPreTraining.__init__  sQ       )&11-f55 	r6   c                 $    | j         j        j        S ru   r  rw  rn  rp  s    r4   get_output_embeddingsz.MobileBertForPreTraining.get_output_embeddings      x#++r6   c                 T    || j         j        _        |j        | j         j        _        d S ru   r  rw  rn  r<   r{   new_embeddingss     r4   set_output_embeddingsz.MobileBertForPreTraining.set_output_embeddings  %    '5$$2$7!!!r6   Nnew_num_tokensr   c                     |                      | j        j        j        |d          | j        j        _        t	                                          |          S NT)r  
transposed)r  _get_resized_lm_headr  rw  r   rv   resize_token_embeddingsr{   r  r~   s     r4   r  z0MobileBertForPreTraining.resize_token_embeddings  sR    %)%>%>H &~RV &? &
 &
" ww..n.MMMr6   r  r  r  r   r   r   r   r   r   labelsnext_sentence_labelr   rQ  rR  c                 .   ||n| j         j        }|                     |||||||	|
|	  	        }|dd         \  }}|                     ||          \  }}d}||t	                      } ||                    d| j         j                  |                    d                    } ||                    dd          |                    d                    }||z   }|s||f|dd         z   }||f|z   n|S t          ||||j        |j	                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, MobileBertForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
        >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")

        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
        >>> # Batch size 1
        >>> outputs = model(input_ids)

        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```Nr   r   r   r   r   r   rQ  rR  r@   r   )r  r  r  r   rW  )
rb   r  r)   r  r	   r   r   r  r   rW  )r{   r   r   r   r   r   r   r  r  r   rQ  rR  r   rx  rd  rz  r  
total_lossloss_fctmasked_lm_lossnext_sentence_lossr   s                         r4   r   z MobileBertForPreTraining.forward  sn   \ &1%<kk$+B]//))%'/!5# " 

 

 *1!&48HH_m4\4\11
"5"A'))H%X&7&<&<RAW&X&XZ`ZeZefhZiZijjN!)*@*E*Eb!*L*LNaNfNfgiNjNj!k!k'*<<J 	R')?@7122;NF/9/EZMF**6Q-/$:!/)
 
 
 	
r6   ru   NNNNNNNNNNN)r   r   r   _tied_weights_keysrw   r  r  r   rY   r   r   r  r   r  r  r   r  r  r^   r   r   r   r   r   r   r   s   @r4   r  r    s        ;<Z[    , , ,8 8 8N Nhsm Nr| N N N N N N +*+F+M+MNk+l+lmm+IXghhh 156:59371559-1:>9=<@37M
 M
E,-M
 !!23M
 !!12	M

 u/0M
 E-.M
   12M
 )*M
 &e&67M
 $E$56M
 'u'89M
 e/0M
 
u44	5M
 M
 M
 ih nmM
 M
 M
 M
 M
r6   r  z8MobileBert Model with a `language modeling` head on top.c                       e Zd ZddgZ fdZd Zd Zddee         de	j
        f fd	Z ee                    d
                     eeeedd          	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 deej                 dee         dee         dee         deeef         fd                        Z xZS )MobileBertForMaskedLMr  r  c                     t                                          |           t          |d          | _        t	          |          | _        || _        |                                  d S NF)r  )rv   rw   r  r)   ru  r  rb   r  r   s     r4   rw   zMobileBertForMaskedLM.__init__  s]       )&EJJJ(00 	r6   c                 $    | j         j        j        S ru   r  rp  s    r4   r  z+MobileBertForMaskedLM.get_output_embeddings  r  r6   c                 T    || j         j        _        |j        | j         j        _        d S ru   r  r  s     r4   r  z+MobileBertForMaskedLM.set_output_embeddings  r  r6   Nr  r   c                     |                      | j        j        j        |d          | j        j        _        t	                                          |          S r  r  r  s     r4   r  z-MobileBertForMaskedLM.resize_token_embeddings  sR    %)%>%>H &~RV &? &
 &
" ww..n.MMMr6   r  z'paris'g=
ףp=?r  r  r  expected_outputexpected_lossr   r   r   r   r   r   r  r   rQ  rR  c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }d}|Kt	                      } ||                    d| j         j                  |                    d                    }|
s|f|dd         z   }||f|z   n|S t          |||j        |j	                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr  r   r   r@   r  logitsr   rW  )
rb   r  r)   r  r	   r   r   r   r   rW  )r{   r   r   r   r   r   r   r  r   rQ  rR  r   rx  rz  r  r  r   s                    r4   r   zMobileBertForMaskedLM.forward&  s   6 &1%<kk$+B]//))%'/!5# " 

 

 "!* HH_55'))H%X&7&<&<RAW&X&XZ`ZeZefhZiZijjN 	Z')GABBK7F3A3M^%..SYY$!/)	
 
 
 	
r6   ru   
NNNNNNNNNN)r   r   r   r  rw   r  r  r   rY   r   r   r  r   r  r  r   r  r   r  r^   r   r   r   r   r   r   r   r   s   @r4   r  r    s       :<Z[    , , ,8 8 8N Nhsm Nr| N N N N N N +*+F+M+MNk+l+lmm&"$!   156:59371559-1,0/3&*2
 2
E,-2
 !!232
 !!12	2

 u/02
 E-.2
   122
 )*2
 $D>2
 'tn2
 d^2
 
un$	%2
 2
 2
  nm2
 2
 2
 2
 2
r6   r  c                   B     e Zd Z fdZdej        dej        fdZ xZS )MobileBertOnlyNSPHeadc                     t                                                       t          j        |j        d          | _        d S r~  )rv   rw   r   r   r   r  r   s     r4   rw   zMobileBertOnlyNSPHead.__init__d  s6     "	&*<a @ @r6   rd  r   c                 0    |                      |          }|S ru   )r  )r{   rd  r  s      r4   r   zMobileBertOnlyNSPHead.forwardh  s    !%!6!6}!E!E%%r6   r   r   s   @r4   r  r  c  sc        A A A A A&U\ &el & & & & & & & &r6   r  zPMobileBert Model with a `next sentence prediction (classification)` head on top.c                       e Zd Z fdZ ee                    d                     eee	          	 	 	 	 	 	 	 	 	 	 dde
ej                 de
ej                 de
ej                 de
ej                 d	e
ej                 d
e
ej                 de
ej                 de
e         de
e         de
e         deeef         fd                        Z xZS )#MobileBertForNextSentencePredictionc                     t                                          |           t          |          | _        t	          |          | _        |                                  d S ru   )rv   rw   r  r)   r  r  r  r   s     r4   rw   z,MobileBertForNextSentencePrediction.__init__r  sQ       )&11(00 	r6   r  r  Nr   r   r   r   r   r   r  r   rQ  rR  r   c                    d|v r/t          j        dt                     |                    d          }|
|
n| j        j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }d}|At                      } ||	                    dd          |	                    d                    }|
s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )	a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring) Indices should be in `[0, 1]`.

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, MobileBertForNextSentencePrediction
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
        >>> model = MobileBertForNextSentencePrediction.from_pretrained("google/mobilebert-uncased")

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
        >>> loss = outputs.loss
        >>> logits = outputs.logits
        ```r  zoThe `next_sentence_label` argument is deprecated and will be removed in a future version, use `labels` instead.Nr  r   r   r@   r  )warningswarnFutureWarningpoprb   r  r)   r  r	   r   r   r   rW  )r{   r   r   r   r   r   r   r  r   rQ  rR  kwargsr   rd  r  r  r  r   s                     r4   r   z+MobileBertForNextSentencePrediction.forward{  sI   X !F**M%  
 ZZ 566F%0%<kk$+B]//))%'/!5# " 

 

  
!%-!8!8!'))H!)*@*E*Eb!*L*LfkkZ\oo!^!^ 	b,.<F7I7U')F22[aa*#)!/)	
 
 
 	
r6   r  )r   r   r   rw   r   r  r  r   r   r  r   r^   r   r   r   r   r   r   r   r   s   @r4   r  r  m  s       
     +*+F+M+MNk+l+lmm+FUdeee 156:59371559-1,0/3&*Q
 Q
E,-Q
 !!23Q
 !!12	Q

 u/0Q
 E-.Q
   12Q
 )*Q
 $D>Q
 'tnQ
 d^Q
 
u11	2Q
 Q
 Q
 fe nmQ
 Q
 Q
 Q
 Q
r6   r  z
    MobileBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                       e Zd Z fdZ ee                    d                     eee	e
ee          	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 dee         dee         dee         deeej                 e	f         fd                        Z xZS )#MobileBertForSequenceClassificationc                 d   t                                          |           |j        | _        || _        t	          |          | _        |j        |j        n|j        }t          j	        |          | _
        t          j        |j        |j                  | _        |                                  d S ru   )rv   rw   
num_labelsrb   r  r)   classifier_dropoutr   r   r   r   r   r   r?   r  r{   rb   r  r~   s      r4   rw   z,MobileBertForSequenceClassification.__init__  s        +)&11)/)B)NF%%TZTn 	 z"455)F$68IJJ 	r6   r  r  Nr   r   r   r   r   r   r  r   rQ  rR  r   c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }d}|Z| j         j        f| j        dk    rd| j         _        nN| j        dk    r7|j        t          j	        k    s|j        t          j
        k    rd| j         _        nd| j         _        | j         j        dk    rWt                      }| j        dk    r1 ||                                |                                          }n |||          }n| j         j        dk    rGt                      } ||                    d| j                  |                    d                    }n*| j         j        dk    rt                      } |||          }|
s|f|dd         z   }||f|z   n|S t!          |||j        |j        	          S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   
regressionsingle_label_classificationmulti_label_classificationr   r@   r  )rb   r  r)   r   r?   problem_typer  r   r^   r   rY   r
   squeezer	   r   r   r   r   rW  )r{   r   r   r   r   r   r   r  r   rQ  rR  r   rd  r  r  r  r   s                    r4   r   z+MobileBertForSequenceClassification.forward  s   6 &1%<kk$+B]//))%'/!5# " 

 

  
]33//{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FY,F)-)9TGf$$vE'!/)	
 
 
 	
r6   r  )r   r   r   rw   r   r  r  r   '_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATIONr   r  _SEQ_CLASS_EXPECTED_OUTPUT_SEQ_CLASS_EXPECTED_LOSSr   r^   r   r   r   r   r   r   r   s   @r4   r  r    s            +*+F+M+MNk+l+lmm:,$2.   -11515/3,004)-,0/3&*E
 E
EL)E
 !.E
 !.	E

 u|,E
 EL)E
  -E
 &E
 $D>E
 'tnE
 d^E
 
uU\"$<<	=E
 E
 E
  nmE
 E
 E
 E
 E
r6   r  z
    MobileBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
    linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd Z fdZ ee                    d                     eee	e
eeee          	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 deej                 dee         dee         dee         deeej                 e	f         fd                        Z xZS )MobileBertForQuestionAnsweringc                     t                                          |           |j        | _        t          |d          | _        t          j        |j        |j                  | _        | 	                                 d S r  )
rv   rw   r  r  r)   r   r   r   
qa_outputsr  r   s     r4   rw   z'MobileBertForQuestionAnswering.__init__B  sj        +)&EJJJ)F$68IJJ 	r6   r  )r  r  r  qa_target_start_indexqa_target_end_indexr  r  Nr   r   r   r   r   r   start_positionsend_positionsr   rQ  rR  r   c                    ||n| j         j        }|                     |||||||	|
|	  	        }|d         }|                     |          }|                    dd          \  }}|                    d                                          }|                    d                                          }d}||t          |                                          dk    r|                    d          }t          |                                          dk    r|                    d          }|                    d          }|	                    d|          }|	                    d|          }t          |          } |||          } |||          }||z   dz  }|s||f|dd         z   }||f|z   n|S t          ||||j        |j        	          S )
a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        Nr  r   r   r   r   )ignore_indexr@   )r  start_logits
end_logitsr   rW  )rb   r  r)   r  rR   r  r   rX   r   clampr	   r   r   rW  )r{   r   r   r   r   r   r   r  r  r   rQ  rR  r   rx  r  r  r  r  ignored_indexr  
start_lossend_lossr   s                          r4   r   z&MobileBertForQuestionAnswering.forwardL  s   D &1%<kk$+B]//))%'/!5# " 

 

 "!*11#)<<r<#:#: j#++B//::<<''++6688

&=+D?''))**Q.."1"9"9""="==%%''((1,, - 5 5b 9 9(--a00M-33A}EEO)//=AAM']CCCH!,@@Jx
M::H$x/14J 	R"J/'!""+=F/9/EZMF**6Q+%!!/)
 
 
 	
r6   r  )r   r   r   rw   r   r  r  r   _CHECKPOINT_FOR_QAr   r  _QA_TARGET_START_INDEX_QA_TARGET_END_INDEX_QA_EXPECTED_OUTPUT_QA_EXPECTED_LOSSr   r^   r   r   r   r   r   r   r   s   @r4   r  r  9  s            +*+F+M+MNk+l+lmm%0$40+'   -11515/3,0042604,0/3&*H
 H
EL)H
 !.H
 !.	H

 u|,H
 EL)H
  -H
 "%,/H
  -H
 $D>H
 'tnH
 d^H
 
uU\"$@@	AH
 H
 H
  nmH
 H
 H
 H
 H
r6   r  z
    MobileBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
    a softmax) e.g. for RocStories/SWAG tasks.
    c                       e Zd Z fdZ ee                    d                     eee	e
          	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 dee         dee         dee         deeej                 e	f         fd                        Z xZS )MobileBertForMultipleChoicec                 4   t                                          |           t          |          | _        |j        |j        n|j        }t          j        |          | _        t          j	        |j
        d          | _        |                                  d S )Nr   )rv   rw   r  r)   r  r   r   r   r   r   r   r?   r  r  s      r4   rw   z$MobileBertForMultipleChoice.__init__  s       )&11)/)B)NF%%TZTn 	 z"455)F$6:: 	r6   z(batch_size, num_choices, sequence_lengthr  Nr   r   r   r   r   r   r  r   rQ  rR  r   c                    |
|
n| j         j        }
||j        d         n|j        d         }|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|=|                    d|                    d          |                    d                    nd}|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }|                    d|          }d}|t                      } |||          }|
s|f|dd         z   }||f|z   n|S t          |||j
        |j                  S )aJ  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r   r   r  r@   r  )rb   r  r[   r   r   r)   r   r?   r	   r   r   rW  )r{   r   r   r   r   r   r   r  r   rQ  rR  num_choicesr   rd  r  reshaped_logitsr  r  r   s                      r4   r   z#MobileBertForMultipleChoice.forward  s*   6 &1%<kk$+B],5,Aioa((}GZ[\G]>G>SINN2y~~b'9'9:::Y]	M[Mg,,R1D1DR1H1HIIImqM[Mg,,R1D1DR1H1HIIImqGSG_|((\->->r-B-BCCCei ( r=#5#5b#9#9=;M;Mb;Q;QRRR 	 //))%'/!5# " 

 

  
]33// ++b+66'))H8OV44D 	F%''!""+5F)-)9TGf$$vE("!/)	
 
 
 	
r6   r  )r   r   r   rw   r   r  r  r   r  r   r  r   r^   r   r   r   r   r   r   r   s   @r4   r  r    s            +*#**+UVV   &-$   -11515/3,004)-,0/3&*@
 @
EL)@
 !.@
 !.	@

 u|,@
 EL)@
  -@
 &@
 $D>@
 'tn@
 d^@
 
uU\"$==	>@
 @
 @
  @
 @
 @
 @
 @
r6   r  z
    MobileBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
    for Named-Entity-Recognition (NER) tasks.
    c                       e Zd Z fdZ ee                    d                     eee	e
ee          	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 dee         dee         dee         deeej                 e	f         fd                        Z xZS ) MobileBertForTokenClassificationc                 Z   t                                          |           |j        | _        t          |d          | _        |j        |j        n|j        }t          j        |          | _	        t          j
        |j        |j                  | _        |                                  d S r  )rv   rw   r  r  r)   r  r   r   r   r   r   r   r?   r  r  s      r4   rw   z)MobileBertForTokenClassification.__init__  s        +)&EJJJ)/)B)NF%%TZTn 	 z"455)F$68IJJ 	r6   r  r  Nr   r   r   r   r   r   r  r   rQ  rR  r   c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }d}|Ft                      } ||                    d| j                  |                    d                    }|
s|f|dd         z   }||f|z   n|S t          |||j	        |j
                  S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r   r@   r  )rb   r  r)   r   r?   r	   r   r  r   r   rW  )r{   r   r   r   r   r   r   r  r   rQ  rR  r   rx  r  r  r  r   s                    r4   r   z(MobileBertForTokenClassification.forward  s   2 &1%<kk$+B]//))%'/!5# " 

 

 "!*,,7711'))H8FKKDO<<fkk"ooNND 	FY,F)-)9TGf$$vE$!/)	
 
 
 	
r6   r  )r   r   r   rw   r   r  r  r   $_CHECKPOINT_FOR_TOKEN_CLASSIFICATIONr   r  _TOKEN_CLASS_EXPECTED_OUTPUT_TOKEN_CLASS_EXPECTED_LOSSr   r^   r   r   r   r   r   r   r   s   @r4   r!  r!    s            +*+F+M+MNk+l+lmm7)$40   -11515/3,004)-,0/3&*2
 2
EL)2
 !.2
 !.	2

 u|,2
 EL)2
  -2
 &2
 $D>2
 'tn2
 d^2
 
uU\"$99	:2
 2
 2
  nm2
 2
 2
 2
 2
r6   r!  )\r   rH   r  dataclassesr   typingr   r   r   r^   r   torch.nnr   r	   r
   activationsr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   r   r   r   configuration_mobilebertr   
get_loggerr   rF   r  r  r$  r%  r&  r  r  r  r  r  r  r  r  rq   Modulers   r&   r   r   r   r   r   r	  r  r  r   r&  r1  r5  r;  rL  r]  rf  rk  ru  r|  r  r  MOBILEBERT_START_DOCSTRINGr  r  r  r  r  r  r  r  r  r!  r1   r6   r4   <module>r3     s  .  				  ! ! ! ! ! ! ) ) ) ) ) ) ) ) ) )        A A A A A A A A A A ! ! ! ! ! !	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 . - - - - - Q Q Q Q Q Q Q Q                7 6 6 6 6 6 
	H	%	%1 $ (J $l !  ; '     +D '' ! K K K\6 6 6 6 6RY 6 6 6 &
9
9I I I I I29 I I IX7 7 7 7 7bi 7 7 7t    29   "/ / / / /") / / /d    RY       ry       ry   0	 	 	 	 	bi 	 	 	!] !] !] !] !] !] !] !]H	 	 	 	 		 	 	 		 	 	 	 	ry 	 	 	< < < < <bi < < <~'
 '
 '
 '
 '
	 '
 '
 '
T! ! ! ! !ry ! ! !&    	   "       ,! ! ! ! !BI ! ! !	9 	9 	9 	9 	9 	9 	9 	9* * * * * * * *6 : : : : :[ : : :B  / d j h
 h
 h
 h
 h
/ h
 h
	 h
V   i
 i
 i
 i
 i
8 i
 i
 i
X TVpqqT
 T
 T
 T
 T
5 T
 T
 rqT
n& & & & &BI & & & Z ]
 ]
 ]
 ]
 ]
*C ]
 ]
	 ]
@   ]
 ]
 ]
 ]
 ]
*C ]
 ]
 ]
@   ]
 ]
 ]
 ]
 ]
%> ]
 ]
 ]
@   V
 V
 V
 V
 V
"; V
 V
 V
r   I
 I
 I
 I
 I
'@ I
 I
 I
 I
 I
r6   