
    g                        d Z ddlZddlZddlmZ ddlmZmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZmZmZ ddlmZmZ dd	lmZmZmZmZmZmZ dd
lmZmZ ddlmZmZm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'  e%j(        e)          Z*dZ+dZ,d Z- G d dej.                  Z/ G d de          Z0 G d dej.                  Z1 G d dej.                  Z2 G d dej.                  Z3 G d dej.                  Z4 G d dej.                  Z5 G d  d!ej.                  Z6 G d" d#ej.                  Z7 G d$ d%ej.                  Z8 G d& d'ej.                  Z9 G d( d)ej.                  Z:d*Z;d+Z< e#d,e;           G d- d.e0                      Z= G d/ d0ej.                  Z> e#d1e;           G d2 d3e0                      Z? G d4 d5ej.                  Z@ e#d6e;           G d7 d8e0                      ZA e#d9e;           G d: d;e0                      ZB e#d<e;           G d= d>e0                      ZC e#d?e;           G d@ dAe0                      ZDdS )BzPyTorch ConvBERT model.    N)
attrgetter)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNget_activation)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModelSequenceSummary)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )ConvBertConfigzYituTech/conv-bert-baser   c                    	 ddl }n)# t          $ r t                              d            w xY wt          j                            |          }t                              d|            |j        	                    |          }i }|D ]E\  }}t                              d| d|            |j        
                    ||          }	|	||<   Fddd	d
dddd}
|j        dk    rd}nd}t          |j                  D ]:}d| d|
d| d<   d| d|
d| d<   d| d|
d| d<   d| d|
d| d<   d| d|
d| d<   d| d|
d| d<   d| d |
d| d!<   d| d"|
d| d#<   d| d$|
d| d%<   d| d&|
d| d'<   d| d(|
d| d)<   d| d*|
d| d+<   d| d,|
d| d-<   d| d.|
d| d/<   d| d0|
d| d1<   d| d2|
d| d3<   d| d4|
d| d5<   d| d6| d7|
d| d8<   d| d6| d9|
d| d:<   d| d;| d7|
d| d<<   d| d;| d9|
d| d=<   d| d>|
d| d?<   d| d@|
d| dA<   <|                                 D ]7}|d         }t          |          } ||           }|
|         }t!          j        ||                   }t                              dB| dC| dD           |                    d7          r1|                    dE          s|                    dF          s|j        }|                    dG          r|                    ddHd          }|                    dI          r|                    dHdd          }|                    dJ          r|                    dK          }||_        9| S )Lz'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape z"electra/embeddings/word_embeddingsz&electra/embeddings/position_embeddingsz(electra/embeddings/token_type_embeddingsz"electra/embeddings/LayerNorm/gammaz!electra/embeddings/LayerNorm/betaz!electra/embeddings_project/kernelzelectra/embeddings_project/bias)z!embeddings.word_embeddings.weightz%embeddings.position_embeddings.weightz'embeddings.token_type_embeddings.weightzembeddings.LayerNorm.weightzembeddings.LayerNorm.biaszembeddings_project.weightzembeddings_project.biasr   g_densedensezelectra/encoder/layer_z/attention/self/query/kernelzencoder.layer.z.attention.self.query.weightz/attention/self/query/biasz.attention.self.query.biasz/attention/self/key/kernelz.attention.self.key.weightz/attention/self/key/biasz.attention.self.key.biasz/attention/self/value/kernelz.attention.self.value.weightz/attention/self/value/biasz.attention.self.value.biasz./attention/self/conv_attn_key/depthwise_kernelz4.attention.self.key_conv_attn_layer.depthwise.weightz./attention/self/conv_attn_key/pointwise_kernelz4.attention.self.key_conv_attn_layer.pointwise.weightz"/attention/self/conv_attn_key/biasz(.attention.self.key_conv_attn_layer.biasz'/attention/self/conv_attn_kernel/kernelz(.attention.self.conv_kernel_layer.weightz%/attention/self/conv_attn_kernel/biasz&.attention.self.conv_kernel_layer.biasz&/attention/self/conv_attn_point/kernelz%.attention.self.conv_out_layer.weightz$/attention/self/conv_attn_point/biasz#.attention.self.conv_out_layer.biasz/attention/output/dense/kernelz.attention.output.dense.weightz!/attention/output/LayerNorm/gammaz".attention.output.LayerNorm.weightz/attention/output/dense/biasz.attention.output.dense.biasz /attention/output/LayerNorm/betaz .attention.output.LayerNorm.biasz/intermediate/z/kernelz.intermediate.dense.weightz/biasz.intermediate.dense.biasz/output/z.output.dense.weightz.output.dense.biasz/output/LayerNorm/gammaz.output.LayerNorm.weightz/output/LayerNorm/betaz.output.LayerNorm.biaszTF: z, PT:  z/intermediate/g_dense/kernelz/output/g_dense/kernelz/depthwise_kernel   z/pointwise_kernelz/conv_attn_key/bias)
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variable
num_groupsrangenum_hidden_layersnamed_parametersr   torch
from_numpyendswithTpermute	unsqueezedata)modelconfigtf_checkpoint_pathtftf_path	init_varstf_datanameshapearrayparam_mappinggroup_dense_namejparam
param_name	retrieverresulttf_namevalues                      j/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/convbert/modeling_convbert.pyload_tf_weights_in_convbertrO   0   sX      Q	
 	
 	
 	 goo011G
KKBBBCCC''00IG   eBBB5BBCCC&&w55 .R1Y3]'K%H%H#D M 1$"6+,, Cw CwDQDDD 	FqFFFG CQBBB 	DqDDDE CQBBB 	DqDDDE AQ@@@ 	BqBBBC EQDDD 	FqFFFG CQBBB 	DqDDDE WQVVV 	^q^^^_ WQVVV 	^q^^^_ KQJJJ 	RqRRRS PQOOO 	RqRRRS NQMMM 	PqPPPQ OQNNN 	OqOOOP MQLLL 	MqMMMN GQFFF 	HqHHHI JQIII 	LqLLLM EQDDD 	FqFFFG IQHHH 	JqJJJK PQOO6FOOO 	DqDDDE NQMM6FMMM 	BqBBBC JQII0@III 	>q>>>? HQGG0@GGG 	<q<<<= @Q??? 	BqBBBC Ew]^DvDvDv@q@@@AA''))  1X
z**	5!!
+ !1227777*777888I&& 	$##$BCC $''(@AA $!GE/00 	+MM!Q**E/00 	+MM!Q**E122 	(OOB''ELs    &-c                        e Zd ZdZ fdZ	 	 	 	 d
deej                 deej                 deej                 deej                 dej        f
d	Z	 xZ
S )ConvBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j        |j
        |j                  | _        t          j        |j        |j                  | _        t          j        |j                  | _        |                     dt%          j        |j                                      d          d           |                     dt%          j        | j                                        t$          j                  d           d S )	N)padding_idxepsposition_ids)r   r$   F)
persistenttoken_type_ids)dtype)super__init__r   	Embedding
vocab_sizeembedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferr4   arangeexpandzerosrV   sizelongselfr<   	__class__s     rN   r[   zConvBertEmbeddings.__init__   s5   !|F,=v?Tbhbuvvv#%<0NPVPe#f#f %'\&2H&J_%`%`" f&;AVWWWz&"<==EL)GHHOOPWXXej 	 	
 	
 	
 	ek$*;*@*@*B*B%*UUUbg 	 	
 	
 	
 	
 	
    N	input_idsrX   rV   inputs_embedsreturnc                 j   ||                                 }n|                                 d d         }|d         }|| j        d d d |f         }|mt          | d          r2| j        d d d |f         }|                    |d         |          }|}n+t          j        |t
          j        | j        j                  }|| 	                    |          }| 
                    |          }	|                     |          }
||	z   |
z   }|                     |          }|                     |          }|S )Nr$   r   rX   r   rY   device)rn   rV   hasattrrX   rl   r4   rm   ro   ry   r`   rb   rd   re   ri   )rq   rt   rX   rV   ru   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedrb   rd   
embeddingss               rN   forwardzConvBertEmbeddings.forward   sQ     #..**KK',,..ss3K ^
,QQQ^<L
 !t-.. m*.*=aaa*n*M'3J3Q3QR]^_R`bl3m3m0!A!&[
SWSdSk!l!l!l  00;;M"66|DD $ : :> J J"%88;PP
^^J//
\\*--
rs   )NNNN)__name__
__module____qualname____doc__r[   r   r4   
LongTensorFloatTensorr   __classcell__rr   s   @rN   rQ   rQ      s        QQ
 
 
 
 
( 15593759$ $E,-$ !!12$ u/0	$
   12$ 
	$ $ $ $ $ $ $ $rs   rQ   c                   (    e Zd ZdZeZeZdZdZ	d Z
dS )ConvBertPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    convbertTc                    t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j        	                                 dS dS t          |t          j
                  r_|j        j                            d| j        j                   |j        +|j        j        |j                 	                                 dS dS t          |t          j                  r?|j        j        	                                 |j        j                            d           dS dS )zInitialize the weights        meanstdNg      ?)
isinstancer   Linearweightr:   normal_r<   initializer_rangebiaszero_r\   rS   re   fill_)rq   modules     rN   _init_weightsz%ConvBertPreTrainedModel._init_weights   s)   fbi(( 	* M&&CT[5R&SSS{& &&((((( '&-- 	*M&&CT[5R&SSS!-"6#56<<>>>>> .--- 	*K""$$$M$$S)))))	* 	*rs   N)r   r   r   r   r   config_classrO   load_tf_weightsbase_model_prefixsupports_gradient_checkpointingr    rs   rN   r   r      sE         
 "L1O"&*#* * * * *rs   r   c                   F     e Zd ZdZ fdZdej        dej        fdZ xZS )SeparableConv1DzSThis class implements separable convolution, i.e. a depthwise and a pointwise layerc                    t                                                       t          j        |||||dz  d          | _        t          j        ||dd          | _        t          j        t          j        |d                    | _	        | j        j
        j                            d|j                   | j        j
        j                            d|j                   d S )Nr#   F)kernel_sizegroupspaddingr   r   )r   r   r   r   )rZ   r[   r   Conv1d	depthwise	pointwise	Parameterr4   rm   r   r   r:   r   r   )rq   r<   input_filtersoutput_filtersr   kwargsrr   s         rN   r[   zSeparableConv1D.__init__  s    # 1$
 
 
 =.aV[\\\L^Q!?!?@@	"**9Q*RRR"**9Q*RRRRRrs   hidden_statesrv   c                 n    |                      |          }|                     |          }|| j        z  }|S N)r   r   r   )rq   r   xs      rN   r   zSeparableConv1D.forward  s4    NN=))NN1	TYrs   	r   r   r   r   r[   r4   Tensorr   r   r   s   @rN   r   r     si        ]]S S S S S U\ el        rs   r   c                        e Zd Z fdZd Z	 	 	 	 ddej        deej                 deej                 deej                 d	ee	         d
e
ej        eej                 f         fdZ xZS )ConvBertSelfAttentionc                 p   t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          |j        |j        z  }|dk     r|j        | _        d| _        n|| _        |j        | _        |j        | _        |j        | j        z  dk    rt          d          |j        | j        z  dz  | _        | j        | j        z  | _	        t          j        |j        | j	                  | _        t          j        |j        | j	                  | _        t          j        |j        | j	                  | _        t          ||j        | j	        | j                  | _        t          j        | j	        | j        | j        z            | _        t          j        |j        | j	                  | _        t          j        | j        dgt)          | j        dz
  dz            dg	          | _        t          j        |j                  | _        d S )
Nr   r^   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   z6hidden_size should be divisible by num_attention_headsr#   )r   r   )rZ   r[   hidden_sizenum_attention_headsrz   
ValueError
head_ratioconv_kernel_sizeattention_head_sizeall_head_sizer   r   querykeyrM   r   key_conv_attn_layerconv_kernel_layerconv_out_layerUnfoldintunfoldrg   attention_probs_dropout_probri   )rq   r<   new_num_attention_headsrr   s      rN   r[   zConvBertSelfAttention.__init__  s&    ::a??PVXhHiHi?8F$6 8 8 48 8 8  
 #)"<@Q"Q"Q&&$8DO'(D$$'>D$$/DO & 7 88A==UVVV$*$6$:R$RWX#X !58PPYv143EFF
9V/1CDDYv143EFF
#2F&(:D<Q$
 $
  "$4+=t?WZ^Zo?o!p!p i(:D<NOOi.2S$BWZ[B[_`A`=a=acd<e
 
 
 z&"EFFrs   c                     |                                 d d         | j        | j        fz   } |j        | }|                    dddd          S )Nr$   r   r#   r   r   )rn   r   r   viewr8   )rq   r   new_x_shapes      rN   transpose_for_scoresz*ConvBertSelfAttention.transpose_for_scoresF  sM    ffhhssmt'?AY&ZZAFK yyAq!$$$rs   NFr   attention_mask	head_maskencoder_hidden_statesoutput_attentionsrv   c                    |                      |          }|                    d          }|+|                     |          }|                     |          }	n*|                     |          }|                     |          }	|                     |                    dd                    }
|
                    dd          }
|                     |          }|                     |          }|                     |	          }t          j        |
|          }| 	                    |          }t          j
        |d| j        dg          }t          j        |d          }|                     |          }t          j
        ||d| j        g          }|                    dd                                                              d          }t"          j                            || j        dgd| j        dz
  dz  dgd          }|                    dd          
                    |d| j        | j                  }t          j
        |d| j        | j        g          }t          j        ||          }t          j
        |d| j        g          }t          j        ||                    dd                    }|t-          j        | j                  z  }|||z   }t"          j                            |d          }|                     |          }|||z  }t          j        ||          }|                    dddd                                          }t          j
        ||d| j        | j        g          }t          j        ||gd          }|                                d d         | j        | j        z  dz  fz   } |j        | }|r||fn|f}|S )	Nr   r   r#   r$   dim)r   dilationr   strider   )r   rn   r   rM   r   	transposer   r4   multiplyr   reshaper   softmaxr   r   
contiguousr9   r   
functionalr   r   matmulmathsqrtri   r8   r   catr   )rq   r   r   r   r   r   mixed_query_layer
batch_sizemixed_key_layermixed_value_layermixed_key_conv_attn_layerquery_layer	key_layervalue_layerconv_attn_layerr   r   attention_scoresattention_probscontext_layerconv_outnew_context_layer_shapeoutputss                          rN   r   zConvBertSelfAttention.forwardK  s    !JJ}55"''**
 !,"hh'<==O $

+@ A A"hh}55O $

= 9 9$($<$<]=T=TUVXY=Z=Z$[$[!$=$G$G1$M$M!//0ABB--o>>	//0ABB.)BDUVV 22?CC!M*;b$BWYZ=[\\!M*;CCC,,];;~
BHZ7[\\'11!Q77BBDDNNrRR--.2+a/A5q9 . 
 
 (11!Q77??D.0E
 
 ~D<TVZVk7lmmn6GHH~D<N7OPP !<Y5H5HR5P5PQQ+di8P.Q.QQ%/.@ -//0@b/II ,,77  -	9O_kBB%--aAq99DDFF=*b$BZ\`\t1uvv	=(";Q?? #0"4"4"6"6ss";$t'??!C?
 #
 +*,CD6G]=/22mM]rs   NNNF)r   r   r   r[   r   r4   r   r   r   boolr   r   r   r   s   @rN   r   r     s        %G %G %G %G %GN% % % 7;158<,1P P|P !!23P E-.	P
  (5P $D>P 
u|Xel33	4P P P P P P P Prs   r   c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )ConvBertSelfOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j	                  | _
        d S NrT   )rZ   r[   r   r   r   r!   re   rf   rg   rh   ri   rp   s     rN   r[   zConvBertSelfOutput.__init__  sf    Yv163EFF
f&8f>STTTz&"<==rs   r   input_tensorrv   c                     |                      |          }|                     |          }|                     ||z             }|S r   r!   ri   re   rq   r   r   s      rN   r   zConvBertSelfOutput.forward  @    

=11]33}|'CDDrs   r   r   r   r[   r4   r   r   r   r   s   @rN   r   r     si        > > > > >U\  RWR^        rs   r   c                        e Zd Z fdZd Z	 	 	 	 ddej        deej                 deej                 deej                 d	ee	         d
e
ej        eej                 f         fdZ xZS )ConvBertAttentionc                     t                                                       t          |          | _        t	          |          | _        t                      | _        d S r   )rZ   r[   r   rq   r   outputsetpruned_headsrp   s     rN   r[   zConvBertAttention.__init__  sI    )&11	(00EErs   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   r   )lenr   rq   r   r   r   r   r   r   rM   r   r!   r   union)rq   headsindexs      rN   prune_headszConvBertAttention.prune_heads  s    u::??F7490$)2OQUQb
 
u
 -TY_eDD	*49=%@@	,TY_eDD	.t{/@%QOOO )-	(EE

(R	%"&)"?$)B_"_	 -33E::rs   NFr   r   r   r   r   rv   c                     |                      |||||          }|                     |d         |          }|f|dd          z   }|S )Nr   r   )rq   r   )	rq   r   r   r   r   r   self_outputsattention_outputr   s	            rN   r   zConvBertAttention.forward  s[     yy!
 
  ;;|AFF#%QRR(88rs   r   )r   r   r   r[   r  r4   r   r   r   r   r   r   r   r   s   @rN   r   r     s        " " " " "; ; ;* 7;158<,1 | !!23 E-.	
  (5 $D> 
u|Xe&788	9       rs   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )GroupedLinearLayerc                    t                                                       || _        || _        || _        | j        | j        z  | _        | j        | j        z  | _        t          j        t          j
        | j        | j        | j                            | _        t          j        t          j
        |                    | _        d S r   )rZ   r[   
input_sizeoutput_sizer0   group_in_dimgroup_out_dimr   r   r4   emptyr   r   )rq   r  r  r0   rr   s       rN   r[   zGroupedLinearLayer.__init__  s    $&$ Ot>!-@l5;t@QSWSe#f#fggL[!9!9::			rs   r   rv   c                 v   t          |                                          d         }t          j        |d| j        | j        g          }|                    ddd          }t          j        || j                  }|                    ddd          }t          j        ||d| j	        g          }|| j
        z   }|S )Nr   r$   r   r#   )listrn   r4   r   r0   r  r8   r   r   r  r   )rq   r   r   r   s       rN   r   zGroupedLinearLayer.forward  s    -,,..//2
M-"dot?P)QRRIIaALDK((IIaAM!j"d.>?@@	Mrs   r   r   s   @rN   r  r    s^        ; ; ; ; ;U\ el        rs   r  c                   B     e Zd Z fdZdej        dej        fdZ xZS )ConvBertIntermediatec                 r   t                                                       |j        dk    r%t          j        |j        |j                  | _        n&t          |j        |j        |j                  | _        t          |j
        t                    rt          |j
                 | _        d S |j
        | _        d S )Nr   r  r  r0   )rZ   r[   r0   r   r   r   intermediate_sizer!   r  r   
hidden_actstrr   intermediate_act_fnrp   s     rN   r[   zConvBertIntermediate.__init__  s    !!6#5v7OPPDJJ+!-6;S`f`q  DJ f'-- 	9'-f.?'@D$$$'-'8D$$$rs   r   rv   c                 Z    |                      |          }|                     |          }|S r   )r!   r  rq   r   s     rN   r   zConvBertIntermediate.forward  s,    

=1100??rs   r   r   s   @rN   r  r    s^        9 9 9 9 9U\ el        rs   r  c                   P     e Zd Z fdZdej        dej        dej        fdZ xZS )ConvBertOutputc                 z   t                                                       |j        dk    r%t          j        |j        |j                  | _        n&t          |j        |j        |j                  | _        t          j	        |j        |j
                  | _	        t          j        |j                  | _        d S )Nr   r  rT   )rZ   r[   r0   r   r   r  r   r!   r  re   rf   rg   rh   ri   rp   s     rN   r[   zConvBertOutput.__init__  s    !!6#;V=OPPDJJ+!3AS`f`q  DJ f&8f>STTTz&"<==rs   r   r   rv   c                     |                      |          }|                     |          }|                     ||z             }|S r   r   r   s      rN   r   zConvBertOutput.forward  r   rs   r   r   s   @rN   r  r    si        	> 	> 	> 	> 	>U\  RWR^        rs   r  c                        e Zd Z fdZ	 	 	 	 	 ddej        deej                 deej                 deej                 deej                 d	ee         d
e	ej        eej                 f         fdZ
d Z xZS )ConvBertLayerc                 ~   t                                                       |j        | _        d| _        t	          |          | _        |j        | _        |j        | _        | j        r-| j        st          |  d          t	          |          | _	        t          |          | _        t          |          | _        d S )Nr   z> should be used as a decoder model if cross attention is added)rZ   r[   chunk_size_feed_forwardseq_len_dimr   	attention
is_decoderadd_cross_attention	TypeErrorcrossattentionr  intermediater  r   rp   s     rN   r[   zConvBertLayer.__init__  s    '-'E$*622 +#)#= # 	<? i4 g g ghhh"3F";";D088$V,,rs   NFr   r   r   r   encoder_attention_maskr   rv   c                 ^   |                      ||||          }|d         }|dd          }	| j        rS|Qt          | d          st          d|  d          |                     |||||          }
|
d         }|	|
dd          z   }	t          | j        | j        | j        |          }|f|	z   }	|	S )N)r   r   r   r+  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)	r'  r(  rz   AttributeErrorr+  r   feed_forward_chunkr%  r&  )rq   r   r   r   r   r-  r   self_attention_outputsr	  r   cross_attention_outputslayer_outputs               rN   r   zConvBertLayer.forward%  s    "&/	 "0 "
 "
 2!4(,? 	<4@4!122 $Dd D D D   '+&9&9 &%!' '#  7q9 7 ;;G0#T%A4CSUe
 
  /G+rs   c                 \    |                      |          }|                     ||          }|S r   )r,  r   )rq   r	  intermediate_outputr3  s       rN   r0  z ConvBertLayer.feed_forward_chunkM  s2    "//0@AA{{#68HIIrs   )NNNNF)r   r   r   r[   r4   r   r   r   r   r   r   r0  r   r   s   @rN   r#  r#    s        - - - - -" 7;158<9=,1& &|& !!23& E-.	&
  (5& !) 6& $D>& 
u|Xe&788	9& & & &P      rs   r#  c                        e Zd Z fdZ	 	 	 	 	 	 	 ddej        deej                 deej                 deej                 d	eej                 d
ee         dee         dee         de	e
ef         fdZ xZS )ConvBertEncoderc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S r   )r#  ).0_r<   s     rN   
<listcomp>z,ConvBertEncoder.__init__.<locals>.<listcomp>W  s!    #c#c#caM&$9$9#c#c#crs   F)	rZ   r[   r<   r   
ModuleListr1   r2   layergradient_checkpointingrp   s    `rN   r[   zConvBertEncoder.__init__T  s`    ]#c#c#c#c5IaCbCb#c#c#cdd
&+###rs   NFTr   r   r   r   r-  r   output_hidden_statesreturn_dictrv   c	           
         |rdnd }	|rdnd }
|r| j         j        rdnd }t          | j                  D ]\  }}|r|	|fz   }	|||         nd }| j        r(| j        r!|                     |j        ||||||          }n |||||||          }|d         }|r$|
|d         fz   }
| j         j        r||d         fz   }|r|	|fz   }	|st          d ||	|
|fD                       S t          ||	|
|          S )Nr   r   r   r#   c              3      K   | ]}||V  	d S r   r   )r:  vs     rN   	<genexpr>z*ConvBertEncoder.forward.<locals>.<genexpr>  s0        =  === rs   )last_hidden_stater   
attentionscross_attentions)
r<   r)  	enumerater>  r?  training_gradient_checkpointing_func__call__tupler   )rq   r   r   r   r   r-  r   r@  rA  all_hidden_statesall_self_attentionsall_cross_attentionsilayer_modulelayer_head_masklayer_outputss                   rN   r   zConvBertEncoder.forwardZ  s    #7@BBD$5?bb4%6d4;;Zdrr`d(44 	V 	VOA|# I$58H$H!.7.CillO* t}  $ A A )!"#)*%! ! !-!"#)*%! ! *!,M  V&9]1=M<O&O#;2 V+?=QRCSBU+U( 	E 1]4D D 	  '):<OQef     
 2++*1	
 
 
 	
rs   )NNNNFFT)r   r   r   r[   r4   r   r   r   r   r   r   r   r   r   r   s   @rN   r7  r7  S  s        , , , , , 7;158<9=,1/4&*;
 ;
|;
 !!23;
 E-.	;

  (5;
 !) 6;
 $D>;
 'tn;
 d^;
 
u88	9;
 ;
 ;
 ;
 ;
 ;
 ;
 ;
rs   r7  c                   B     e Zd Z fdZdej        dej        fdZ xZS )ConvBertPredictionHeadTransformc                 V   t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _
        n|j        | _
        t          j        |j        |j                  | _        d S r   )rZ   r[   r   r   r   r!   r   r  r  r   transform_act_fnre   rf   rp   s     rN   r[   z(ConvBertPredictionHeadTransform.__init__  s    Yv163EFF
f'-- 	6$*6+<$=D!!$*$5D!f&8f>STTTrs   r   rv   c                     |                      |          }|                     |          }|                     |          }|S r   )r!   rX  re   r  s     rN   r   z'ConvBertPredictionHeadTransform.forward  s=    

=11--m<<}55rs   r   r   s   @rN   rV  rV    sc        U U U U UU\ el        rs   rV  aK  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`ConvBertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a8
  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:


            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:


            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:


            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zbThe bare ConvBERT Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd Z fdZd Zd Zd Z ee	                    d                     e
eee          	 	 	 	 	 	 	 	 	 ddeej                 d	eej                 d
eej                 deej                 deej                 deej                 dee         dee         dee         deeef         fd                        Z xZS )ConvBertModelc                 8   t                                          |           t          |          | _        |j        |j        k    r$t          j        |j        |j                  | _        t          |          | _
        || _        |                                  d S r   )rZ   r[   rQ   r   r^   r   r   r   embeddings_projectr7  encoderr<   	post_initrp   s     rN   r[   zConvBertModel.__init__  s       ,V44 F$666&(i0EvGY&Z&ZD#&v..rs   c                     | j         j        S r   r   r`   rq   s    rN   get_input_embeddingsz"ConvBertModel.get_input_embeddings  s    ..rs   c                     || j         _        d S r   ra  )rq   rM   s     rN   set_input_embeddingsz"ConvBertModel.set_input_embeddings  s    */'''rs   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr^  r>  r'  r  )rq   heads_to_pruner>  r  s       rN   _prune_headszConvBertModel._prune_heads   sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Crs   batch_size, sequence_length
checkpointoutput_typer   Nrt   r   rX   rV   r   ru   r   r@  rA  rv   c
                    ||n| j         j        }||n| j         j        }|	|	n| j         j        }	||t	          d          |+|                     ||           |                                }
n.||                                d d         }
nt	          d          |
\  }}||j        n|j        }|t          j	        |
|          }|gt          | j        d          r1| j        j        d d d |f         }|                    ||          }|}n!t          j        |
t          j        |          }|                     ||
          }|                     || j         j                  }|                     ||||          }t          | d          r|                     |          }|                     ||||||		          }|S )
NzDYou cannot specify both input_ids and inputs_embeds at the same timer$   z5You have to specify either input_ids or inputs_embeds)ry   rX   rx   )rt   rV   rX   ru   r]  )r   r   r   r@  rA  )r<   r   r@  use_return_dictr   %warn_if_padding_and_no_attention_maskrn   ry   r4   onesrz   r   rX   rl   rm   ro   get_extended_attention_maskget_head_maskr2   r]  r^  )rq   rt   r   rX   rV   r   ru   r   r@  rA  r{   r   r|   ry   r}   r~   extended_attention_maskr   s                     rN   r   zConvBertModel.forward  s   $ 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] ]%>cddd"66y.QQQ#..**KK&',,..ss3KKTUUU!,
J%.%:!!@T!"ZFCCCN!t(899 [*./*HKZK*X'3J3Q3QR\^h3i3i0!A!&[
SY!Z!Z!Z"&"B"B>S^"_"_&&y$+2OPP	l>iv ( 
 
 4-.. 	C 33MBBM2/!5# % 
 
 rs   )	NNNNNNNNN)r   r   r   r[   rc  re  ri  r   CONVBERT_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   r4   r   r   r   r   r   r   r   r   s   @rN   r[  r[    s       

 
 
 
 
/ / /0 0 0C C C +*+D+K+KLi+j+jkk&6$   156:59371559,0/3&*< <E,-< !!23< !!12	<
 u/0< E-.<   12< $D>< 'tn< d^< 
u88	9< < <  lk< < < < <rs   r[  c                   F     e Zd ZdZ fdZdej        dej        fdZ xZS )ConvBertGeneratorPredictionszAPrediction module for the generator, made up of two dense layers.c                    t                                                       t          d          | _        t	          j        |j        |j                  | _        t	          j        |j	        |j                  | _
        d S )NgelurT   )rZ   r[   r   
activationr   re   r^   rf   r   r   r!   rp   s     rN   r[   z%ConvBertGeneratorPredictions.__init__P  sa    (00f&;AVWWWYv163HII


rs   generator_hidden_statesrv   c                     |                      |          }|                     |          }|                     |          }|S r   )r!   r}  re   )rq   r~  r   s      rN   r   z$ConvBertGeneratorPredictions.forwardW  s<    

#:;;66}55rs   )	r   r   r   r   r[   r4   r   r   r   r   s   @rN   rz  rz  M  sk        KKJ J J J Ju/@ UEV        rs   rz  z6ConvBERT Model with a `language modeling` head on top.c                       e Zd ZdgZ fdZd Zd Z ee	                    d                     e
eee          	 	 	 	 	 	 	 	 	 	 ddeej                 d	eej                 d
eej                 deej                 deej                 deej                 deej                 dee         dee         dee         deeef         fd                        Z xZS )ConvBertForMaskedLMzgenerator.lm_head.weightc                 
   t                                          |           t          |          | _        t	          |          | _        t          j        |j        |j	                  | _
        |                                  d S r   )rZ   r[   r[  r   rz  generator_predictionsr   r   r^   r]   generator_lm_headr_  rp   s     rN   r[   zConvBertForMaskedLM.__init__c  sj       %f--%A&%I%I"!#6+@&BS!T!Trs   c                     | j         S r   r  rb  s    rN   get_output_embeddingsz)ConvBertForMaskedLM.get_output_embeddingsm  s    %%rs   c                     || _         d S r   r  )rq   r`   s     rN   set_output_embeddingsz)ConvBertForMaskedLM.set_output_embeddingsp  s    !0rs   rj  rk  Nrt   r   rX   rV   r   ru   labelsr   r@  rA  rv   c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }d}|Pt          j                    } ||                    d| j         j                  |                    d                    }|
s|f|dd         z   }||f|z   n|S t          |||j
        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   r$   r   losslogitsr   rG  )r<   ro  r   r  r  r   r	   r   r]   r   r   rG  )rq   rt   r   rX   rV   r   ru   r  r   r@  rA  r~  generator_sequence_outputprediction_scoresr  loss_fctr   s                    rN   r   zConvBertForMaskedLM.forwards  s-   2 &1%<kk$+B]"&-- 
#
 
#
 %<A$>! 667PQQ 223DEE*,,H8-222t{7MNNPVP[P[\^P_P_``D 	F'),CABB,GGF)-)9TGf$$vE$1?.9	
 
 
 	
rs   
NNNNNNNNNN)r   r   r   _tied_weights_keysr[   r  r  r   ru  rv  r   rw  r   rx  r   r4   r   r   r   r   r   r   r   r   s   @rN   r  r  _  s       45    & & &1 1 1 +*+D+K+KLi+j+jkk&"$   156:59371559-1,0/3&*4
 4
E,-4
 !!234
 !!12	4

 u/04
 E-.4
   124
 )*4
 $D>4
 'tn4
 d^4
 
un$	%4
 4
 4
  lk4
 4
 4
 4
 4
rs   r  c                   F     e Zd ZdZ fdZdej        dej        fdZ xZS )ConvBertClassificationHeadz-Head for sentence-level classification tasks.c                 B   t                                                       t          j        |j        |j                  | _        |j        |j        n|j        }t          j        |          | _	        t          j        |j        |j
                  | _        || _        d S r   )rZ   r[   r   r   r   r!   classifier_dropoutrh   rg   ri   
num_labelsout_projr<   rq   r<   r  rr   s      rN   r[   z#ConvBertClassificationHead.__init__  s    Yv163EFF
)/)B)NF%%TZTn 	 z"455	&"4f6GHHrs   r   rv   c                 
   |d d dd d f         }|                      |          }|                     |          }t          | j        j                 |          }|                      |          }|                     |          }|S )Nr   )ri   r!   r   r<   r  r  )rq   r   r   r   s       rN   r   z"ConvBertClassificationHead.forward  st    !!!Q'"LLOOJJqMM4;)*1--LLOOMM!rs   r   r   s   @rN   r  r    sd        77	 	 	 	 	U\         rs   r  z
    ConvBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                       e Zd Z fdZ ee                    d                     eee	e
          	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 dee         dee         dee         deee	f         fd                        Z xZS )!ConvBertForSequenceClassificationc                     t                                          |           |j        | _        || _        t	          |          | _        t          |          | _        |                                  d S r   )	rZ   r[   r  r<   r[  r   r  
classifierr_  rp   s     rN   r[   z*ConvBertForSequenceClassification.__init__  sb        +%f--4V<< 	rs   rj  rk  Nrt   r   rX   rV   r   ru   r  r   r@  rA  rv   c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }d}|Z| j         j        f| j        dk    rd| j         _        nN| j        dk    r7|j        t          j        k    s|j        t          j	        k    rd| j         _        nd| j         _        | j         j        dk    rWt                      }| j        dk    r1 ||                                |                                          }n |||          }n| j         j        dk    rGt                      } ||                    d| j                  |                    d                    }n*| j         j        dk    rt                      } |||          }|
s|f|dd         z   }||f|z   n|S t          |||j        |j        	          S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   rX   rV   r   ru   r   r@  rA  r   r   
regressionsingle_label_classificationmulti_label_classificationr$   r  )r<   ro  r   r  problem_typer  rY   r4   ro   r   r
   squeezer	   r   r   r   r   rG  rq   rt   r   rX   rV   r   ru   r  r   r@  rA  r   sequence_outputr  r  r  r   s                    rN   r   z)ConvBertForSequenceClassification.forward  s   2 &1%<kk$+B]--))%'/!5#   

 

 "!*11{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FY,F)-)9TGf$$vE'!/)	
 
 
 	
rs   r  )r   r   r   r[   r   ru  rv  r   rw  r   rx  r   r4   r   r   r   r   r   r   r   r   s   @rN   r  r    s            +*+D+K+KLi+j+jkk&,$   156:59371559-1,0/3&*D
 D
E,-D
 !!23D
 !!12	D

 u/0D
 E-.D
   12D
 )*D
 $D>D
 'tnD
 d^D
 
u..	/D
 D
 D
  lkD
 D
 D
 D
 D
rs   r  z
    ConvBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                       e Zd Z fdZ ee                    d                     eee	e
          	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 dee         dee         dee         deee	f         fd                        Z xZS )ConvBertForMultipleChoicec                     t                                          |           t          |          | _        t	          |          | _        t          j        |j        d          | _	        | 
                                 d S )Nr   )rZ   r[   r[  r   r   sequence_summaryr   r   r   r  r_  rp   s     rN   r[   z"ConvBertForMultipleChoice.__init__/  sh       %f-- / 7 7)F$6:: 	rs   z(batch_size, num_choices, sequence_lengthrk  Nrt   r   rX   rV   r   ru   r  r   r@  rA  rv   c                    |
|
n| j         j        }
||j        d         n|j        d         }|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|=|                    d|                    d          |                    d                    nd}|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }|                    d|          }d}|t                      } |||          }|
s|f|dd         z   }||f|z   n|S t          |||j
        |j                  S )aJ  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r$   r   r  r   r  )r<   ro  rC   r   rn   r   r  r  r	   r   r   rG  )rq   rt   r   rX   rV   r   ru   r  r   r@  rA  num_choicesr   r  pooled_outputr  reshaped_logitsr  r  r   s                       rN   r   z!ConvBertForMultipleChoice.forward9  s,   6 &1%<kk$+B],5,Aioa((}GZ[\G]>G>SINN2y~~b'9'9:::Y]	M[Mg,,R1D1DR1H1HIIImqM[Mg,,R1D1DR1H1HIIImqGSG_|((\->->r-B-BCCCei ( r=#5#5b#9#9=;M;Mb;Q;QRRR 	 --))%'/!5#   

 

 "!*--o>>// ++b+66'))H8OV44D 	F%''!""+5F)-)9TGf$$vE("!/)	
 
 
 	
rs   r  )r   r   r   r[   r   ru  rv  r   rw  r   rx  r   r4   r   r   r   r   r   r   r   r   s   @rN   r  r  '  s            +*!(()STT   &-$   156:59371559-1,0/3&*@
 @
E,-@
 !!23@
 !!12	@

 u/0@
 E-.@
   12@
 )*@
 $D>@
 'tn@
 d^@
 
u//	0@
 @
 @
  @
 @
 @
 @
 @
rs   r  z
    ConvBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                       e Zd Z fdZ ee                    d                     eee	e
          	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 dee         dee         dee         deee	f         fd                        Z xZS )ConvBertForTokenClassificationc                 V   t                                          |           |j        | _        t          |          | _        |j        |j        n|j        }t          j        |          | _	        t          j
        |j        |j                  | _        |                                  d S r   )rZ   r[   r  r[  r   r  rh   r   rg   ri   r   r   r  r_  r  s      rN   r[   z'ConvBertForTokenClassification.__init__  s        +%f--)/)B)NF%%TZTn 	 z"455)F$68IJJ 	rs   rj  rk  Nrt   r   rX   rV   r   ru   r  r   r@  rA  rv   c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }d}|Ft                      } ||                    d| j                  |                    d                    }|
s|f|dd         z   }||f|z   n|S t          |||j	        |j
                  S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r$   r   r  )r<   ro  r   ri   r  r	   r   r  r   r   rG  r  s                    rN   r   z&ConvBertForTokenClassification.forward  s   . &1%<kk$+B]--))%'/!5#   

 

 "!*,,7711'))H8FKKDO<<fkk"ooNND 	FY,F)-)9TGf$$vE$!/)	
 
 
 	
rs   r  )r   r   r   r[   r   ru  rv  r   rw  r   rx  r   r4   r   r   r   r   r   r   r   r   s   @rN   r  r    s}            +*+D+K+KLi+j+jkk&)$   156:59371559-1,0/3&*2
 2
E,-2
 !!232
 !!12	2

 u/02
 E-.2
   122
 )*2
 $D>2
 'tn2
 d^2
 
u++	,2
 2
 2
  lk2
 2
 2
 2
 2
rs   r  z
    ConvBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd Z fdZ ee                    d                     eee	e
          	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 deej                 dee         dee         dee         deee	f         fd                        Z xZS )ConvBertForQuestionAnsweringc                     t                                          |           |j        | _        t          |          | _        t          j        |j        |j                  | _        | 	                                 d S r   )
rZ   r[   r  r[  r   r   r   r   
qa_outputsr_  rp   s     rN   r[   z%ConvBertForQuestionAnswering.__init__  se        +%f--)F$68IJJ 	rs   rj  rk  Nrt   r   rX   rV   r   ru   start_positionsend_positionsr   r@  rA  rv   c                    ||n| j         j        }|                     |||||||	|
|	  	        }|d         }|                     |          }|                    dd          \  }}|                    d                                          }|                    d                                          }d}||t          |                                          dk    r|                    d          }t          |                                          dk    r|                    d          }|                    d          }|	                    d|          }|	                    d|          }t          |          } |||          } |||          }||z   dz  }|s||f|dd         z   }||f|z   n|S t          ||||j        |j        	          S )
a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        Nr  r   r   r$   r   )ignore_indexr#   )r  start_logits
end_logitsr   rG  )r<   ro  r   r  splitr  r   r  rn   clampr	   r   r   rG  )rq   rt   r   rX   rV   r   ru   r  r  r   r@  rA  r   r  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                          rN   r   z$ConvBertForQuestionAnswering.forward  s   < &1%<kk$+B]--))%'/!5#   

 

 "!*11#)<<r<#:#: j#++B//::<<''++6688

&=+D?''))**Q.."1"9"9""="==%%''((1,, - 5 5b 9 9(--a00M-33A}EEO)//=AAM']CCCH!,@@Jx
M::H$x/14J 	R"J/'!""+=F/9/EZMF**6Q+%!!/)
 
 
 	
rs   )NNNNNNNNNNN)r   r   r   r[   r   ru  rv  r   rw  r   rx  r   r4   r   r   r   r   r   r   r   r   s   @rN   r  r    s            +*+D+K+KLi+j+jkk&0$   156:593715596:48,0/3&*H
 H
E,-H
 !!23H
 !!12	H

 u/0H
 E-.H
   12H
 "%"23H
   01H
 $D>H
 'tnH
 d^H
 
u22	3H
 H
 H
  lkH
 H
 H
 H
 H
rs   r  )Er   r   r)   operatorr   typingr   r   r   r4   torch.utils.checkpointr   torch.nnr   r	   r
   activationsr   r   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   utilsr   r   r   r   configuration_convbertr   
get_loggerr   r'   rw  rx  rO   ModulerQ   r   r   r   r   r   r  r  r  r#  r7  rV  CONVBERT_START_DOCSTRINGru  r[  rz  r  r  r  r  r  r  r   rs   rN   <module>r     sB      				       ) ) ) ) ) ) ) ) ) )            A A A A A A A A A A 1 1 1 1 1 1 1 1                ? > > > > > > > l l l l l l l l l l u u u u u u u u u u u u 2 2 2 2 2 2 
	H	%	%/ "y y yx9 9 9 9 9 9 9 9x* * * * *o * * *8    bi   4} } } } }BI } } }@       * * * * *	 * * *Z       ,    29   (    RY   &: : : : :BI : : :zB
 B
 B
 B
 B
bi B
 B
 B
J    bi   "	 2 j h ] ] ] ] ]+ ] ]	 ]@    29   $ RTlmmM
 M
 M
 M
 M
1 M
 M
 nmM
`       0   U
 U
 U
 U
 U
(? U
 U
 U
p   S
 S
 S
 S
 S
 7 S
 S
 S
l   G
 G
 G
 G
 G
%< G
 G
 G
T   Y
 Y
 Y
 Y
 Y
#: Y
 Y
 Y
 Y
 Y
rs   