
    gI                        d Z ddlmZ ddlmZmZmZ ddlZddlZddlm	Z	 ddl
mZmZmZ ddlmZ dd	lmZmZmZmZmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlm Z   ej!        e"          Z#dZ$dZ%dZ&dZ'dZ(dZ)dZ*dZ+dZ,dZ- G d de	j.                  Z/ G d dej0        j1                  Z2 G d d          Z3d Z4 G d  d!ej0        j1                  Z5 G d" d#e	j.                  Z6 G d$ d%e	j.                  Z7 G d& d'e	j.                  Z8 G d( d)e	j.                  Z9 G d* d+e	j.                  Z: G d, d-e	j.                  Z; G d. d/e	j.                  Z< G d0 d1e	j.                  Z=d2 Z>ej?        j@        d3             ZAej?        j@        d4             ZBej?        j@        d5             ZC G d6 d7e	j.                  ZD G d8 d9e	j.                  ZE G d: d;e          ZFd<ZGd=ZH ed>eG           G d? d@eF                      ZI edAeG           G dB dCeF                      ZJ G dD dEe	j.                  ZK G dF dGe	j.                  ZL G dH dIe	j.                  ZM edJeG           G dK dLeF                      ZN edMeG           G dN dOeF                      ZO edPeG           G dQ dReF                      ZPdS )SzPyTorch DeBERTa model.    )Sequence)OptionalTupleUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputMaskedLMOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)softmax_backward_data)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )DebertaConfigr   zmicrosoft/deberta-basez!lsanochkin/deberta-large-feedbackz' Paris'z0.54z#Palak/microsoft_deberta-large_squadz' a nice puppet'gQ?      c                   :     e Zd Z fdZd Zed             Z xZS )ContextPoolerc                     t                                                       t          j        |j        |j                  | _        t          |j                  | _        || _	        d S N)
super__init__r   Linearpooler_hidden_sizedenseStableDropoutpooler_dropoutdropoutconfigselfr(   	__class__s     h/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/deberta/modeling_deberta.pyr!   zContextPooler.__init__9   sO    Yv8&:STT
$V%:;;    c                     |d d df         }|                      |          }|                     |          }t          | j        j                 |          }|S Nr   )r'   r$   r   r(   pooler_hidden_act)r*   hidden_statescontext_tokenpooled_outputs       r,   forwardzContextPooler.forward?   sU     &aaad+]33

=11t{<=mLLr-   c                     | j         j        S r   )r(   hidden_sizer*   s    r,   
output_dimzContextPooler.output_dimI   s    {&&r-   )__name__
__module____qualname__r!   r4   propertyr8   __classcell__r+   s   @r,   r   r   8   sb               ' ' X' ' ' ' 'r-   r   c                   T    e Zd ZdZed             Zed             Zed             ZdS )XSoftmaxa  
    Masked Softmax which is optimized for saving memory

    Args:
        input (`torch.tensor`): The input tensor that will apply softmax.
        mask (`torch.IntTensor`):
            The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
        dim (int): The dimension that will apply softmax

    Example:

    ```python
    >>> import torch
    >>> from transformers.models.deberta.modeling_deberta import XSoftmax

    >>> # Make a tensor
    >>> x = torch.randn([4, 20, 100])

    >>> # Create a mask
    >>> mask = (x > 0).int()

    >>> # Specify the dimension to apply softmax
    >>> dim = -1

    >>> y = XSoftmax.apply(x, mask, dim)
    ```c                 f   || _         |                    t          j                   }|                    |t          j        t          j        |j                  j                            }t          j	        || j                   }|
                    |d           |                     |           |S r/   )dimtotorchboolmasked_filltensorfinfodtypeminsoftmaxmasked_fill_save_for_backward)ctxinputmaskrB   rmaskoutputs         r,   r4   zXSoftmax.forwardj   s    ''%*%%&""5%,u{5;7O7O7S*T*TUUvsw//E1%%%f%%%r-   c                 N    | j         \  }t          | ||| j        |          }|d d fS r   )saved_tensorsr   rB   )rN   grad_outputrR   	inputGrads       r,   backwardzXSoftmax.backwardu   s0    %	)#{FCGVTT	$$$r-   c                    dd l mc m} ddlm}m} |                     d||j        d                   }|                     d|                     d|                     dt          j	        dt          j
        	          
          |          |j        d                   } || |||                     dt          j	        t          j        |                                                                          j                  
                    }	 || |	|          }	 || |	||                     dt          j	        dt          j        	          
                    S )Nr   )rF   rK   CastLong)to_iSubConstantr   rI   )value_tBool)torch.onnx.symbolic_helperonnxsymbolic_helpertorch.onnx.symbolic_opset9rF   rK   opcast_pytorch_to_onnxrD   rG   int64rH   typerI   rJ   rE   )
gr*   rP   rB   sym_helprF   rK   mask_cast_valuer_maskrR   s
             r,   symboliczXSoftmax.symbolic{   sV   555555555CCCCCCCC$$vt(2OPV2W$XXDDZau{1S1S1STTVeff.v6  
 

 tVQTT*el5;tyy{{O`O`ObObCcCcCg6h6hTii
 
 FC(({1ffadd:u|TU]b]gGhGhGhd.i.ijjjr-   N)r9   r:   r;   __doc__staticmethodr4   rW   rm    r-   r,   r@   r@   N   st         6   \ % % \%
 k k \k k kr-   r@   c                       e Zd Zd ZdS )DropoutContextc                 >    d| _         d | _        d| _        d| _        d S )Nr   r   T)r'   rP   scale
reuse_maskr7   s    r,   r!   zDropoutContext.__init__   s"    	
r-   N)r9   r:   r;   r!   rp   r-   r,   rr   rr      s#            r-   rr   c                 j   t          |t                    s|}d }n!|j        }||j        z  }|j        r|j        nd }|dk    rL|Jdt          j        |                               d|z
            z
  	                    t          j
                  }t          |t                    r|j        ||_        ||fS )Nr   r   )
isinstancerr   r'   rt   ru   rP   rD   
empty_like
bernoulli_rC   rE   )rO   local_contextr'   rP   s       r,   get_maskr{      s    m^44 H'=&&%2%=G}!!4{{t|E$U++66q7{CCCGG
SS-00 &%!%M=r-   c            	           e Zd ZdZed             Zed             Zedej        j	        dej        j
        deeef         dej        j
        fd            Zd	S )
XDropoutzlOptimized dropout function to save computation and memory by using mask operation instead of multiplication.c                     t          ||          \  }}dd|z
  z  | _        |dk    r3|                     |           |                    |d          | j        z  S |S )Ng      ?r   r   )r{   rt   rM   rF   )rN   rO   	local_ctxrP   r'   s        r,   r4   zXDropout.forward   sd     	22g1w;'	Q;;!!$'''$$T1--	99Lr-   c                 r    | j         dk    r)| j        \  }|                    |d          | j         z  d fS |d fS )Nr   r   )rt   rT   rF   )rN   rU   rP   s      r,   rW   zXDropout.backward   sD    9q=='GT**433ci?EE$$r-   ri   rO   r   returnc                 ~    ddl m} |}t          |t                    r|j        }d}|                    | |||          S )Nr   )symbolic_opset12T)
torch.onnxr   rw   rr   r'   )ri   rO   r   r   	dropout_ptrains         r,   rm   zXDropout.symbolic   sT    //////	i00 	*!)I  ''5)UCCCr-   N)r9   r:   r;   rn   ro   r4   rW   rD   _CGraphValuer   floatrr   rm   rp   r-   r,   r}   r}      s        vv  \ % % \% DEHN D58> DeESaLaFb Dglgogu D D D \D D Dr-   r}   c                   <     e Zd ZdZ fdZd Zd Zd	dZd Z xZ	S )
r%   z
    Optimized dropout module for stabilizing the training

    Args:
        drop_prob (float): the dropout probabilities
    c                 r    t                                                       || _        d| _        d | _        d S r/   )r    r!   	drop_probcountcontext_stack)r*   r   r+   s     r,   r!   zStableDropout.__init__   s5    "
!r-   c                     | j         r8| j        dk    r-t                              ||                                           S |S )zr
        Call the module

        Args:
            x (`torch.tensor`): The input tensor to apply dropout
        r   )trainingr   r}   applyget_context)r*   xs     r,   r4   zStableDropout.forward   s?     = 	9T^a//>>!T%5%5%7%7888r-   c                 "    d| _         d | _        d S r/   )r   r   r7   s    r,   clear_contextzStableDropout.clear_context   s    
!r-   Tr   c                 `    | j         g | _         d| _        | j         D ]}||_        ||_        d S r/   )r   r   ru   rt   )r*   ru   rt   cs       r,   init_contextzStableDropout.init_context   sF    %!#D
# 	 	A%ALAGG	 	r-   c                    | j         s| j        t          | j                   k    r&| j                             t	                                 | j         | j                 }| j        |_        | xj        dz  c_        |S | j        S )Nr   )r   r   lenappendrr   r   r'   )r*   rN   s     r,   r   zStableDropout.get_context   st    )zS!34444")).*:*:;;;$TZ0C.CKJJ!OJJJ>!r-   )Tr   )
r9   r:   r;   rn   r!   r4   r   r   r   r=   r>   s   @r,   r%   r%      s         " " " " "	 	 	" " "   	" 	" 	" 	" 	" 	" 	"r-   r%   c                   *     e Zd ZdZd fd	Zd Z xZS )DebertaLayerNormzBLayerNorm module in the TF style (epsilon inside the square root).-q=c                    t                                                       t          j        t	          j        |                    | _        t          j        t	          j        |                    | _        || _	        d S r   )
r    r!   r   	ParameterrD   onesweightzerosbiasvariance_epsilon)r*   sizeepsr+   s      r,   r!   zDebertaLayerNorm.__init__  s]    l5:d#3#344LT!2!233	 #r-   c                 V   |j         }|                                }|                    dd          }||z
                      d                              dd          }||z
  t	          j        || j        z             z  }|                    |          }| j        |z  | j	        z   }|S )NT)keepdim   )
rI   r   meanpowrD   sqrtr   rC   r   r   )r*   r1   
input_typer   varianceys         r,   r4   zDebertaLayerNorm.forward	  s    "(
%++--!!"d!33!D(--a0055b$5GG&-HtG\<\1]1]]%((44K-'$)3r-   )r   r9   r:   r;   rn   r!   r4   r=   r>   s   @r,   r   r      sR        LL$ $ $ $ $ $      r-   r   c                   $     e Zd Z fdZd Z xZS )DebertaSelfOutputc                     t                                                       t          j        |j        |j                  | _        t          |j        |j                  | _        t          |j
                  | _        d S r   )r    r!   r   r"   r6   r$   r   layer_norm_eps	LayerNormr%   hidden_dropout_probr'   r)   s     r,   r!   zDebertaSelfOutput.__init__  s_    Yv163EFF
)&*<f>STT$V%?@@r-   c                     |                      |          }|                     |          }|                     ||z             }|S r   r$   r'   r   r*   r1   input_tensors      r,   r4   zDebertaSelfOutput.forward  @    

=11]33}|'CDDr-   r9   r:   r;   r!   r4   r=   r>   s   @r,   r   r     sL        A A A A A      r-   r   c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )DebertaAttentionc                     t                                                       t          |          | _        t	          |          | _        || _        d S r   )r    r!   DisentangledSelfAttentionr*   r   rR   r(   r)   s     r,   r!   zDebertaAttention.__init__#  sB    -f55	'//r-   FNc                     |                      ||||||          }|r|\  }}||}|                     ||          }	|r|	|fS |	S )N)query_statesrelative_posrel_embeddings)r*   rR   )
r*   r1   attention_maskoutput_attentionsr   r   r   self_output
att_matrixattention_outputs
             r,   r4   zDebertaAttention.forward)  sz     ii%%)   
 
  	2&1#K(L;;{LAA 	$$j11##r-   FNNNr   r>   s   @r,   r   r   "  sZ              $ $ $ $ $ $ $ $r-   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )DebertaIntermediatec                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r   )r    r!   r   r"   r6   intermediate_sizer$   rw   
hidden_actstrr   intermediate_act_fnr)   s     r,   r!   zDebertaIntermediate.__init__H  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r-   r1   r   c                 Z    |                      |          }|                     |          }|S r   )r$   r   r*   r1   s     r,   r4   zDebertaIntermediate.forwardP  s,    

=1100??r-   )r9   r:   r;   r!   rD   Tensorr4   r=   r>   s   @r,   r   r   G  s^        9 9 9 9 9U\ el        r-   r   c                   $     e Zd Z fdZd Z xZS )DebertaOutputc                    t                                                       t          j        |j        |j                  | _        t          |j        |j                  | _	        t          |j                  | _        || _        d S r   )r    r!   r   r"   r   r6   r$   r   r   r   r%   r   r'   r(   r)   s     r,   r!   zDebertaOutput.__init__W  sf    Yv79KLL
)&*<f>STT$V%?@@r-   c                     |                      |          }|                     |          }|                     ||z             }|S r   r   r   s      r,   r4   zDebertaOutput.forward^  r   r-   r   r>   s   @r,   r   r   V  sG                  r-   r   c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )DebertaLayerc                     t                                                       t          |          | _        t	          |          | _        t          |          | _        d S r   )r    r!   r   	attentionr   intermediater   rR   r)   s     r,   r!   zDebertaLayer.__init__f  sK    )&11/77#F++r-   NFc                     |                      ||||||          }|r|\  }}|                     |          }	|                     |	|          }
|r|
|fS |
S )Nr   r   r   r   )r   r   rR   )r*   r1   r   r   r   r   r   r   r   intermediate_outputlayer_outputs              r,   r4   zDebertaLayer.forwardl  s      >>/%%) * 
 
  	<+;(j"//0@AA{{#68HII 	  *--r-   )NNNFr   r>   s   @r,   r   r   e  sZ        , , , , ,                r-   r   c                   H     e Zd ZdZ fdZd Zd Zd
dZ	 	 	 	 	 dd	Z xZ	S )DebertaEncoderz8Modified BertEncoder with relative position bias supportc                    t                                                       t          j        fdt	          j                  D                       | _        t          dd          | _        | j        rTt          dd          | _	        | j	        dk     rj
        | _	        t          j        | j	        dz  j                  | _        d| _        d S )Nc                 .    g | ]}t                    S rp   )r   ).0_r(   s     r,   
<listcomp>z+DebertaEncoder.__init__.<locals>.<listcomp>  s!    #b#b#bQL$8$8#b#b#br-   relative_attentionFmax_relative_positionsr   r   r   )r    r!   r   
ModuleListrangenum_hidden_layerslayergetattrr   r   max_position_embeddings	Embeddingr6   r   gradient_checkpointingr)   s    `r,   r!   zDebertaEncoder.__init__  s    ]#b#b#b#b%H`BaBa#b#b#bcc
")&2F"N"N" 	d*1&:RTV*W*WD'*Q...4.L+"$,t/JQ/NPVPb"c"cD&+###r-   c                 0    | j         r| j        j        nd }|S r   )r   r   r   )r*   r   s     r,   get_rel_embeddingz DebertaEncoder.get_rel_embedding  s!    7;7NX,33TXr-   c                 8   |                                 dk    rT|                    d                              d          }||                    d                              d          z  }n-|                                 dk    r|                    d          }|S )Nr   r   r   r   )rB   	unsqueezesqueeze)r*   r   extended_attention_masks      r,   get_attention_maskz!DebertaEncoder.get_attention_mask  s    1$$&4&>&>q&A&A&K&KA&N&N#47N7V7VWY7Z7Z7d7deg7h7hhNN!!Q&&+55a88Nr-   Nc                     | j         rW|U||                    d          n|                    d          }t          ||                    d          |j                  }|S )Nr   )r   r   build_relative_positiondevice)r*   r1   r   r   qs        r,   get_rel_poszDebertaEncoder.get_rel_pos  sg    " 	d|';)5)A!!"%%%}GYGYZ\G]G]A21m6H6H6L6LmNbccLr-   TFc           
         |                      |          }|                     |||          }|rdnd }|rdnd }	t          |t                    r	|d         }
n|}
|                                 }t          | j                  D ]\  }}|r||fz   }| j        r(| j        r!| 	                    |j
        |
|||||          }n ||
|||||          }|r|\  }}|@|}t          |t                    r(|dz   t          | j                  k     r||dz            nd }
n|}
|r|	|fz   }	|r||fz   }|st          d |||	fD                       S t          |||	          S )Nrp   r   )r   r   r   r   r   c              3      K   | ]}||V  	d S r   rp   )r   vs     r,   	<genexpr>z)DebertaEncoder.forward.<locals>.<genexpr>  s(      hhqZ[ZgZgZgZgZghhr-   last_hidden_stater1   
attentions)r   r  rw   r   r   	enumerater   r   r   _gradient_checkpointing_func__call__r   tupler   )r*   r1   r   output_hidden_statesr   r   r   return_dictall_hidden_statesall_attentionsnext_kvr   ilayer_moduleatt_ms                  r,   r4   zDebertaEncoder.forward  s    00@@''|\RR"6@BBD0:dmX.. 	$#A&GG#G//11(44 #	; #	;OA|# I$58H$H!* t}  $ A A )"  "%! ! !-"!-!-#1&7! ! ! ! 5'4$u',mX66 X67!ec$*oo6M6MmAE22SWG'  ;!/5(!: 	E 1]4D D 	ihh]4E~$Vhhhhhh+;LYg
 
 
 	
r-   )NN)TFNNT)
r9   r:   r;   rn   r!   r   r   r  r4   r=   r>   s   @r,   r   r     s        BB	, 	, 	, 	, 	,        "A
 A
 A
 A
 A
 A
 A
 A
r-   r   c                 >   t          j        | t           j        |          }t          j        |t           j        |          }|dddf         |                    dd                              | d          z
  }|d| ddf         }|                    d          }|S )a  
    Build relative position according to the query and key

    We assume the absolute position of query \(P_q\) is range from (0, query_size) and the absolute position of key
    \(P_k\) is range from (0, key_size), The relative positions from query to key is \(R_{q \rightarrow k} = P_q -
    P_k\)

    Args:
        query_size (int): the length of query
        key_size (int): the length of key

    Return:
        `torch.LongTensor`: A tensor with shape [1, query_size, key_size]

    rI   r   Nr   r   r   )rD   arangelongviewrepeatr   )
query_sizekey_sizer   q_idsk_idsrel_pos_idss         r,   r   r     s    " L5:fEEEELFCCCE4.5::a#4#4#;#;J#J#JJKkzk111n-K''**Kr-   c                     |                      |                    d          |                    d          |                    d          |                    d          g          S )Nr   r   r   r   expandr   )c2p_posquery_layerr   s      r,   c2p_dynamic_expandr&    sX    >>;++A..0@0@0C0C[EUEUVWEXEXZfZkZklnZoZopqqqr-   c                     |                      |                    d          |                    d          |                    d          |                    d          g          S )Nr   r   r   r"  )r$  r%  	key_layers      r,   p2c_dynamic_expandr)  
  sV    >>;++A..0@0@0C0CY^^TVEWEWYbYgYghjYkYklmmmr-   c                     |                      |                                d d         |                     d          |                    d          fz             S )Nr   r   r"  )	pos_indexp2c_attr(  s      r,   pos_dynamic_expandr-    sJ    GLLNN2A2.)..2D2DinnUWFXFX1YYZZZr-   c                   >     e Zd ZdZ fdZd Z	 	 	 	 ddZd Z xZS )	r   a  
    Disentangled self-attention module

    Parameters:
        config (`str`):
            A model config class instance with the configuration to build a new model. The schema is similar to
            *BertConfig*, for more details, please refer [`DebertaConfig`]

    c                    t                                                       |j        |j        z  dk    r t	          d|j         d|j         d          |j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t          j	        |j        | j        dz  d          | _
        t          j        t          j        | j        t          j                            | _        t          j        t          j        | j        t          j                            | _        |j        |j        ng | _        t%          |d	d          | _        t%          |d
d          | _        | j        rLt          j	        |j        |j        d          | _        t          j	        |j        |j        d          | _        | j        rt%          |dd          | _        | j        dk     r|j        | _        t3          |j                  | _        d| j        v r&t          j	        |j        | j        d          | _        d| j        v r$t          j	        |j        | j                  | _        t3          |j                  | _        d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   Fr   r^   r   talking_headr   r   r   c2pp2c) r    r!   r6   num_attention_heads
ValueErrorintattention_head_sizeall_head_sizer   r"   in_projr   rD   r   r   q_biasv_biaspos_att_typer   r   r2  head_logits_projhead_weights_projr   r   r%   r   pos_dropoutpos_proj
pos_q_projattention_probs_dropout_probr'   r)   s     r,   r!   z"DisentangledSelfAttention.__init__  sY    ::a??8F$6 8 8 48 8 8   $*#= #&v'9F<V'V#W#W !58PPy!3T5G!5KRWXXXl5;0B5;#W#W#WXXl5;0B5;#W#W#WXX393F3RF//XZ")&2F"N"N#FNEBB 	s$&If.H&Jdkp$q$q$qD!%'Yv/I6Kelq%r%r%rD"" 		T*1&:RTV*W*WD'*Q...4.L+,V-GHHD))) "	&*<d>PW\ ] ] ])))"$)F,>@R"S"S$V%HIIr-   c                     |                                 d d         | j        dfz   }|                    |          }|                    dddd          S )Nr   r   r   r   r   )r   r5  r  permute)r*   r   new_x_shapes      r,   transpose_for_scoresz.DisentangledSelfAttention.transpose_for_scoresB  sM    ffhhssmt'?&DDFF;yyAq!$$$r-   FNc                     |D                                }                     |                              dd          \  }}	}
nd  j         j                             j        dz  d           fdt          d          D             dgdz   d         d         |                    d         j                            }fd	t          d
d          D             \  }} fd|||fD             \  }}	}
|                      j        ddddf                   z   }|
                      j	        ddddf                   z   }
d}d
t           j                  z   }t          j        t          j        |                    d          t          j                  |z            }||                    |j                  z  }t          j        ||	                    dd                    } j        r.                     |          }                     ||	|||          }|||z   } j        rA                     |                    dddd
                                        ddd
d          }t2                              ||d          }                     |          } j        rA                     |                    dddd
                                        ddd
d          }t          j        ||
          }|                    ddd
d                                          }|                                dd         dz   }|                    |          }|r||fS |S )a  
        Call the module

        Args:
            hidden_states (`torch.FloatTensor`):
                Input states to the module usually the output from previous layer, it will be the Q,K and V in
                *Attention(Q,K,V)*

            attention_mask (`torch.BoolTensor`):
                An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
                th token.

            output_attentions (`bool`, *optional*):
                Whether return the attention matrix.

            query_states (`torch.FloatTensor`, *optional*):
                The *Q* state in *Attention(Q,K,V)*.

            relative_pos (`torch.LongTensor`):
                The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
                values ranging in [*-max_relative_positions*, *max_relative_positions*].

            rel_embeddings (`torch.FloatTensor`):
                The embedding of relative distances. It's a tensor of shape [\(2 \times
                \text{max_relative_positions}\), *hidden_size*].


        Nr   r   rB   c                     |<t          j        ||                                           |                                z   S t          j        ||                                           S r   )rD   matmult)wbr   s      r,   linearz1DisentangledSelfAttention.forward.<locals>.linearr  sH    = <1335511ACCEE99 <13355111r-   r   c                 |    g | ]7t          j        fd t          j                  D             d          8S )c                 ,    g | ]}|d z  z            S )r   rp   )r   r  kwss     r,   r   z@DisentangledSelfAttention.forward.<locals>.<listcomp>.<listcomp>y  s%    VVVr!a%!)}VVVr-   r   rI  )rD   catr   r5  )r   rR  r*   rS  s    @r,   r   z5DisentangledSelfAttention.forward.<locals>.<listcomp>y  sN    qqqcdEIVVVVVeD<T6U6UVVV\]^^^qqqr-   r^   c           
          g | ]:} |         |                              |         j                             ;S )r^   )rC   rI   )r   r  r1   rO  qkvbqkvws     r,   r   z5DisentangledSelfAttention.forward.<locals>.<listcomp>}  sG    iiiXYFF47DG]-=-=DGM-=-R-RSSiiir-   r   c                 :    g | ]}                     |          S rp   )rG  )r   r   r*   s     r,   r   z5DisentangledSelfAttention.forward.<locals>.<listcomp>~  s(    2c2c2cTU43L3LQ3O3O2c2c2cr-   r   r   r   )r:  rG  chunkr   r5  r   rC   rI   r;  r<  r   r=  rD   r   rG   r   r   rK  	transposer   r@  disentangled_att_biasr2  r>  rE  r@   r   r'   r?  
contiguousr  )r*   r1   r   r   r   r   r   qpr%  r(  value_layerr  rR  r  rel_attscale_factorrt   attention_scoresattention_probscontext_layernew_context_layer_shaperO  rV  rW  rS  s   ``                   @@@@r,   r4   z!DisentangledSelfAttention.forwardG  s   L m,,B262K2KB2O2O2U2UVW]_2U2`2`/KKK2 2 2 $**4+Ca+GQ*OOBqqqqqhmnohphpqqqD6A:DtAwQtAw})M)MNNAiiiiiii]bcdfg]h]hiiiDAq2c2c2c2cZ[]^`aYb2c2c2c/KK!D$=$=dk$PTVWVWVW->X$Y$YY!D$=$=dk$PTVWVWVW->X$Y$YY3t0111
5<(8(8(<(<EKPPPS__``!EHH;3DH$E$EE <Y5H5HR5P5PQQ" 	u!--n==N00iWegsttG/'9  	o#445E5M5MaQRTUWX5Y5YZZbbcdfgijlmnn"..)9>2NN,,77 	n"44_5L5LQPQSTVW5X5XYYaabcefhiklmmO_kBB%--aAq99DDFF"/"4"4"6"6ss";e"C%**+BCC 	!!?33  r-   c           	      |   |>|                     d          }t          ||                     d          |j                  }|                                dk    r)|                    d                              d          }nj|                                dk    r|                    d          }n<|                                dk    r$t          d|                                           t          t          |                     d          |                     d                    | j                  }|	                                
                    |j                  }|| j        |z
  | j        |z   d d f                             d          }d}d| j        v r|                     |          }	|                     |	          }	t          j        ||	                    d	d                    }
t          j        ||z   d|dz  dz
            }t          j        |
d	t'          |||          
          }
||
z  }d| j        v r|                     |          }|                     |          }|t          j        t          j        |                     d	          t          j                  |z            z  }|                     d          |                     d          k    r=t          |                     d          |                     d          |j                  }n|}t          j        | |z   d|dz  dz
            }t          j        ||                    d	d          
                    |j                            }t          j        |d	t3          |||          
                              d	d          }|                     d          |                     d          k    rK|d d d d d d df                             d	          }t          j        |dt5          |||          
          }||z  }|S )Nr   r   r   r   r      z2Relative position ids must be of dim 2 or 3 or 4. r3  r   )rB   indexr4  r^   )r   r   r   rB   r   r6  rJ   maxr   r  rC   r=  rA  rG  rD   rK  r[  clampgatherr&  rB  r   rG   r   rI   r)  r-  )r*   r%  r(  r   r   ra  r  att_spanscorepos_key_layerc2p_attr$  pos_query_layerr_posp2c_posr,  r+  s                    r,   r\  z/DisentangledSelfAttention.disentangled_att_bias  s     $$A21innR6H6H+J\]]L""'11!44>>qAALL1$$'11!44LL1$$fR^RbRbRdRdffgggs;++B//1C1CDDdFabb#((**--k.@AA''(2T5PS[5[[]^]^]^^

)A,, 	  D%%% MM.99M 55mDDMl;0G0GB0O0OPPGk,"91hlQ>NOOGl7:LWVaco:p:pqqqGWE D%%%"oon==O"77HHOuz%,7K7KB7O7OW\Wb*c*c*cfr*rsssO##y~~b'9'999/	r0B0BINNSUDVDVXcXjkk$k5&8"3Q1q8HIIGl9o.G.GB.O.O.R.RYbYh.R.i.ijjGlR'9';PY'Z'Z  iB  ##y~~b'9'999(AAAqqq!4>>rBB	,wB>PQZ\cen>o>opppWEr-   r   )	r9   r:   r;   rn   r!   rG  r4   r\  r=   r>   s   @r,   r   r     s         !J !J !J !J !JF% % %  Y! Y! Y! Y!v1 1 1 1 1 1 1r-   r   c                   *     e Zd ZdZ fdZddZ xZS )DebertaEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 J   t                                                       t          |dd          }t          |d|j                  | _        t          j        |j        | j        |          | _        t          |dd          | _	        | j	        sd | _
        n$t          j        |j        | j                  | _
        |j        dk    r$t          j        |j        | j                  | _        | j        |j        k    r&t          j        | j        |j        d          | _        t!          |j        |j                  | _        t'          |j                  | _        || _        |                     d	t1          j        |j                                      d
          d           d S )Npad_token_idr   embedding_size)padding_idxposition_biased_inputTFr1  position_ids)r   r   )
persistent)r    r!   r   r6   rw  r   r   
vocab_sizeword_embeddingsry  position_embeddingsr   type_vocab_sizetoken_type_embeddingsr"   
embed_projr   r   r   r%   r   r'   r(   register_bufferrD   r  r#  )r*   r(   rv  r+   s      r,   r!   zDebertaEmbeddings.__init__  s~   v~q99%f.>@RSS!|F,=t?R`lmmm%,V5Ld%S%S") 	i'+D$$')|F4RTXTg'h'hD$!A%%)+f6LdNa)b)bD&&"444 i(;V=OV[\\\DO)&*<f>STT$V%?@@ 	EL)GHHOOPWXXej 	 	
 	
 	
 	
 	
r-   Nc                    ||                                 }n|                                 d d         }|d         }|| j        d d d |f         }|+t          j        |t          j        | j        j                  }||                     |          }| j        (|                     |                                          }nt          j        |          }|}	| j	        r|	|z  }	| j
        j        dk    r|                     |          }
|	|
z  }	| j        | j
        j        k    r|                     |	          }	|                     |	          }	||                                |	                                k    rU|                                dk    r(|                    d                              d          }|                    d          }|                    |	j                  }|	|z  }	|                     |	          }	|	S )Nr   r   r  r   rg  r   )r   rz  rD   r   r  r   r}  r~  
zeros_likery  r(   r  r  rw  r6   r  r   rB   r   r   rC   rI   r'   )r*   	input_idstoken_type_idsrz  rP   inputs_embedsinput_shape
seq_lengthr~  
embeddingsr  s              r,   r4   zDebertaEmbeddings.forward  s    #..**KK',,..ss3K ^
,QQQ^<L!"[EJtO`OghhhN  00;;M#/"&":":<;L;L;N;N"O"O"'"2="A"A"
% 	.--J;&**$($>$>~$N$N!//J$+"99944J^^J//
xxzzZ^^----88::??<<??22155D~~a((77:+,,D#d*J\\*--
r-   )NNNNNr   r>   s   @r,   rt  rt    sR        QQ
 
 
 
 
4, , , , , , , ,r-   rt  c                   *    e Zd ZdZeZdZdgZdZd Z	dS )DebertaPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    debertar~  Tc                    t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j        	                                 dS dS t          |t          j
                  r]|j        j                            d| j        j                   |j        -|j        j        |j                 	                                 dS dS dS )zInitialize the weights.g        )r   stdN)rw   r   r"   r   datanormal_r(   initializer_ranger   zero_r   rx  )r*   modules     r,   _init_weightsz$DebertaPreTrainedModel._init_weights-  s    fbi(( 		? M&&CT[5R&SSS{& &&((((( '&-- 	?M&&CT[5R&SSS!-"6#56<<>>>>>	? 	?--r-   N)
r9   r:   r;   rn   r   config_classbase_model_prefix"_keys_to_ignore_on_load_unexpectedsupports_gradient_checkpointingr  rp   r-   r,   r  r  "  sI         
 !L!*?)@&&*#? ? ? ? ?r-   r  a  
    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
    Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build
    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
    improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.


    Parameters:
        config ([`DebertaConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zaThe bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd Z fdZd Zd Zd Z ee	                    d                     e
eee          	 	 	 	 	 	 	 	 ddeej                 d	eej                 d
eej                 deej                 deej                 dee         dee         dee         deeef         fd                        Z xZS )DebertaModelc                     t                                          |           t          |          | _        t	          |          | _        d| _        || _        |                                  d S r/   )	r    r!   rt  r  r   encoderz_stepsr(   	post_initr)   s     r,   r!   zDebertaModel.__init__}  s]       +F33%f--r-   c                     | j         j        S r   r  r}  r7   s    r,   get_input_embeddingsz!DebertaModel.get_input_embeddings  s    ..r-   c                     || j         _        d S r   r  r*   new_embeddingss     r,   set_input_embeddingsz!DebertaModel.set_input_embeddings  s    *8'''r-   c                      t          d          )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        z7The prune function is not implemented in DeBERTa model.)NotImplementedError)r*   heads_to_prunes     r,   _prune_headszDebertaModel._prune_heads  s    
 ""[\\\r-   batch_size, sequence_length
checkpointoutput_typer  Nr  r   r  rz  r  r   r  r  r   c	           	      <    ||n j         j        }||n j         j        }||n j         j        }||t	          d          |+                     ||           |                                }	n.||                                d d         }	nt	          d          ||j        n|j        }
|t          j	        |	|
          }|!t          j
        |	t          j        |
          }                     |||||          }                     ||d||          }|d	         } j        d	k    r|d
         } fdt           j                  D             }|d         } j                                        } j                            |          } j                            |          }|d	d          D ](} |||d|||          }|                    |           )|d         }|s|f||rd	ndd          z   S t)          ||r|j        nd |j                  S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)r   r  )r  r  rz  rP   r  T)r  r   r  r   r   c                 4    g | ]}j         j        d          S rY  )r  r   )r   r   r*   s     r,   r   z(DebertaModel.forward.<locals>.<listcomp>  s#    JJJdl(,JJJr-   Fr   r   r  )r(   r   r  use_return_dictr6  %warn_if_padding_and_no_attention_maskr   r   rD   r   r   r  r  r  r  r   r   r   r  r   r   r1   r	  )r*   r  r   r  rz  r  r   r  r  r  r   embedding_outputencoder_outputsencoded_layersr1   layersr   r   rel_posr   sequence_outputs   `                    r,   r4   zDebertaModel.forward  s   " 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] ]%>cddd"66y.QQQ#..**KK&',,..ss3KKTUUU%.%:!!@T!"ZFCCCN!"[EJvVVVN??)%' + 
 
 ,,!%/# ' 
 
 )+<!*2.MJJJJeDL6I6IJJJF)"-L!\;;==N!\<<^LLNl../?@@G 	4 	4$u!"&+!-!(#1      %%l3333(, 	^#%>R9YXY8\8\(]]]-;OY/77UY&1
 
 
 	
r-   )NNNNNNNN)r9   r:   r;   r!   r  r  r  r   DEBERTA_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   rD   r   rE   r   r   r4   r=   r>   s   @r,   r  r  x  s       
    / / /9 9 9] ] ] +*+C+J+JKh+i+ijj&#$   -11515/304,0/3&*N
 N
EL)N
 !.N
 !.	N

 u|,N
  -N
 $D>N
 'tnN
 d^N
 
uo%	&N
 N
 N
  kjN
 N
 N
 N
 N
r-   r  z5DeBERTa Model with a `language modeling` head on top.c                       e Zd ZddgZ fdZd Zd Z ee	                    d                     e
eeedee          	 	 	 	 	 	 	 	 	 dd
eej                 deej                 deej                 deej                 deej                 deej                 dee         dee         dee         deeef         fd                        Z xZS )DebertaForMaskedLMzcls.predictions.decoder.weightzcls.predictions.decoder.biasc                     t                                          |           t          |          | _        t	          |          | _        |                                  d S r   )r    r!   r  r  DebertaOnlyMLMHeadclsr  r)   s     r,   r!   zDebertaForMaskedLM.__init__  sQ       #F++%f-- 	r-   c                 $    | j         j        j        S r   )r  predictionsdecoderr7   s    r,   get_output_embeddingsz(DebertaForMaskedLM.get_output_embeddings  s    x#++r-   c                 T    || j         j        _        |j        | j         j        _        d S r   )r  r  r  r   r  s     r,   set_output_embeddingsz(DebertaForMaskedLM.set_output_embeddings  s%    '5$$2$7!!!r-   r  z[MASK])r  r  r  rP   expected_outputexpected_lossNr  r   r  rz  r  labelsr   r  r  r   c
           
         |	|	n| j         j        }	|                     ||||||||	          }
|
d         }|                     |          }d}|Kt	                      } ||                    d| j         j                  |                    d                    }|	s|f|
dd         z   }||f|z   n|S t          |||
j        |
j	                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   r  rz  r  r   r  r  r   r   r   losslogitsr1   r	  )
r(   r  r  r  r	   r  r|  r   r1   r	  )r*   r  r   r  rz  r  r  r   r  r  outputsr  prediction_scoresmasked_lm_lossloss_fctrR   s                   r,   r4   zDebertaForMaskedLM.forward  s   8 &1%<kk$+B],,))%'/!5#  	
 	
 "!* HH_55'))H%X&7&<&<RAW&X&XZ`ZeZefhZiZijjN 	Z')GABBK7F3A3M^%..SYY$!/)	
 
 
 	
r-   	NNNNNNNNN)r9   r:   r;   _tied_weights_keysr!   r  r  r   r  r  r   _CHECKPOINT_FOR_MASKED_LMr   r  _MASKED_LM_EXPECTED_OUTPUT_MASKED_LM_EXPECTED_LOSSr   rD   r   rE   r   r   r4   r=   r>   s   @r,   r  r    s       :<Z[    , , ,8 8 8 +*+C+J+JKh+i+ijj,"$2.   -11515/304)-,0/3&*1
 1
EL)1
 !.1
 !.	1

 u|,1
  -1
 &1
 $D>1
 'tn1
 d^1
 
un$	%1
 1
 1
  kj1
 1
 1
 1
 1
r-   r  c                   $     e Zd Z fdZd Z xZS )DebertaPredictionHeadTransformc                    t                                                       t          |d|j                  | _        t          j        |j        | j                  | _        t          |j	        t                    rt          |j	                 | _        n|j	        | _        t          j        | j        |j                  | _        d S )Nrw  )r   )r    r!   r   r6   rw  r   r"   r$   rw   r   r   r   transform_act_fnr   r   r)   s     r,   r!   z'DebertaPredictionHeadTransform.__init__=  s    %f.>@RSSYv143FGG
f'-- 	6$*6+<$=D!!$*$5D!d&9v?TUUUr-   c                     |                      |          }|                     |          }|                     |          }|S r   )r$   r  r   r   s     r,   r4   z&DebertaPredictionHeadTransform.forwardH  s=    

=11--m<<}55r-   r   r>   s   @r,   r  r  <  sL        	V 	V 	V 	V 	V      r-   r  c                   *     e Zd Z fdZd Zd Z xZS )DebertaLMPredictionHeadc                 t   t                                                       t          |          | _        t	          |d|j                  | _        t          j        | j        |j	        d          | _
        t          j        t          j        |j	                            | _        | j        | j
        _        d S )Nrw  Fr1  )r    r!   r  	transformr   r6   rw  r   r"   r|  r  r   rD   r   r   r)   s     r,   r!   z DebertaLMPredictionHead.__init__P  s    7??%f.>@RSS y!4f6GeTTTLV->!?!?@@	 !Ir-   c                 (    | j         | j        _         d S r   )r   r  r7   s    r,   _tie_weightsz$DebertaLMPredictionHead._tie_weights^  s     Ir-   c                 Z    |                      |          }|                     |          }|S r   )r  r  r   s     r,   r4   zDebertaLMPredictionHead.forwarda  s*    }55]33r-   )r9   r:   r;   r!   r  r4   r=   r>   s   @r,   r  r  O  sV        & & & & && & &      r-   r  c                   $     e Zd Z fdZd Z xZS )r  c                 p    t                                                       t          |          | _        d S r   )r    r!   r  r  r)   s     r,   r!   zDebertaOnlyMLMHead.__init__i  s/    26::r-   c                 0    |                      |          }|S r   )r  )r*   r  r  s      r,   r4   zDebertaOnlyMLMHead.forwardm  s     ,,_==  r-   r   r>   s   @r,   r  r  h  sG        ; ; ; ; ;! ! ! ! ! ! !r-   r  z
    DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                       e Zd Z fdZd Zd Z ee                    d                     e	e
ee          	 	 	 	 	 	 	 	 	 ddeej                 deej                 d	eej                 d
eej                 deej                 deej                 dee         dee         dee         deeef         fd                        Z xZS ) DebertaForSequenceClassificationc                    t                                          |           t          |dd          }|| _        t	          |          | _        t          |          | _        | j        j        }t          j
        ||          | _        t          |dd           }|| j        j        n|}t          |          | _        |                                  d S )N
num_labelsr   cls_dropout)r    r!   r   r  r  r  r   poolerr8   r   r"   
classifierr(   r   r%   r'   r  )r*   r(   r  r8   drop_outr+   s        r,   r!   z)DebertaForSequenceClassification.__init__z  s       V\155
$#F++#F++[+
)J
;;6=$776>6F4;22H$X.. 	r-   c                 4    | j                                         S r   )r  r  r7   s    r,   r  z5DebertaForSequenceClassification.get_input_embeddings  s    |00222r-   c                 :    | j                             |           d S r   )r  r  r  s     r,   r  z5DebertaForSequenceClassification.set_input_embeddings  s    )).99999r-   r  r  Nr  r   r  rz  r  r  r   r  r  r   c
           
         |	|	n| j         j        }	|                     ||||||||	          }
|
d         }|                     |          }|                     |          }|                     |          }d}|| j         j        (| j        dk    rat          j	                    }|
                    d                              |j                  } |||
                    d                    }n|                                dk    s|                    d          dk    rA|dk                                    }|                                }|                    d          dk    rt#          j        |d|                    |                    d          |                    d                              }t#          j        |d|
                    d                    }t)                      } ||
                    d| j                                                  |
                    d                    }nZt#          j        d                              |          }n1t          j        d          } ||          |z                      d                                           }n| j         j        dk    rWt                      }| j        dk    r1 ||                                |                                          }n |||          }n| j         j        dk    rGt)                      } ||
                    d| j                  |
                    d                    }n*| j         j        dk    rt7                      } |||          }|	s|f|
dd         z   }||f|z   n|S t9          |||
j        |
j        	          S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r  r   rz  r  r   r  r  r   r   r   
regressionsingle_label_classificationmulti_label_classificationr  )r(   r  r  r  r'   r  problem_typer  r   r
   r  rC   rI   rB   r   nonzeror  rD   rk  r#  r	   r   rG   
LogSoftmaxsumr   r   r   r   r1   r	  )r*   r  r   r  rz  r  r  r   r  r  r  encoder_layerr3   r  r  loss_fnlabel_indexlabeled_logitsr  log_softmaxrR   s                        r,   r4   z(DebertaForSequenceClassification.forward  s   0 &1%<kk$+B],,))%'/!5#  	
 	
  
M22]33//{'/?a'' jllG#[[__//==F"766;;r??;;DDZZ\\Q&&&++b//Q*>*>#)Q;"7"7"9"9K#[[]]F"''**Q..)."A{'9'9+:J:J1:M:Mv{{[\~~'^'^* * "'fa9I9I"9M9M!N!N#3#5#5'x(;(;B(P(P(V(V(X(XZ`ZeZefhZiZijj$|A11&99"$-"3"3K)k&11F:??CCIIKKKDD)\99"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FY,F)-)9TGf$$vE'fG4IV]Vh
 
 
 	
r-   r  )r9   r:   r;   r!   r  r  r   r  r  r   r  r   r  r   rD   r   rE   r   r   r4   r=   r>   s   @r,   r  r  r  s           $3 3 3: : : +*+C+J+JKh+i+ijj&,$   -11515/304)-,0/3&*M
 M
EL)M
 !.M
 !.	M

 u|,M
  -M
 &M
 $D>M
 'tnM
 d^M
 
u..	/M
 M
 M
  kjM
 M
 M
 M
 M
r-   r  z
    DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                       e Zd Z fdZ ee                    d                     eee	e
          	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 d	eej                 d
eej                 dee         dee         dee         deee	f         fd                        Z xZS )DebertaForTokenClassificationc                 6   t                                          |           |j        | _        t          |          | _        t          j        |j                  | _        t          j	        |j
        |j                  | _        |                                  d S r   )r    r!   r  r  r  r   Dropoutr   r'   r"   r6   r  r  r)   s     r,   r!   z&DebertaForTokenClassification.__init__  sy        +#F++z&"<==)F$68IJJ 	r-   r  r  Nr  r   r  rz  r  r  r   r  r  r   c
           
         |	|	n| j         j        }	|                     ||||||||	          }
|
d         }|                     |          }|                     |          }d}|Ft                      } ||                    d| j                  |                    d                    }|	s|f|
dd         z   }||f|z   n|S t          |||
j	        |
j
                  S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r   r   r  )r(   r  r  r'   r  r	   r  r  r   r1   r	  )r*   r  r   r  rz  r  r  r   r  r  r  r  r  r  r  rR   s                   r,   r4   z%DebertaForTokenClassification.forward  s   , &1%<kk$+B],,))%'/!5#  	
 	
 "!*,,7711'))H8FKKDO<<fkk"ooNND 	FY,F)-)9TGf$$vE$fG4IV]Vh
 
 
 	
r-   r  )r9   r:   r;   r!   r   r  r  r   r  r   r  r   rD   r   rE   r   r   r4   r=   r>   s   @r,   r   r     sb       	 	 	 	 	 +*+C+J+JKh+i+ijj&)$   -11515/304)-,0/3&*-
 -
EL)-
 !.-
 !.	-

 u|,-
  --
 &-
 $D>-
 'tn-
 d^-
 
u++	,-
 -
 -
  kj-
 -
 -
 -
 -
r-   r   z
    DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd Z fdZ ee                    d                     eee	e
eeee          	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 dee         dee         dee         deee	f         fd                        Z xZS )DebertaForQuestionAnsweringc                     t                                          |           |j        | _        t          |          | _        t          j        |j        |j                  | _        | 	                                 d S r   )
r    r!   r  r  r  r   r"   r6   
qa_outputsr  r)   s     r,   r!   z$DebertaForQuestionAnswering.__init__9  se        +#F++)F$68IJJ 	r-   r  )r  r  r  r  r  qa_target_start_indexqa_target_end_indexNr  r   r  rz  r  start_positionsend_positionsr   r  r  r   c           
         |
|
n| j         j        }
|                     |||||||	|
          }|d         }|                     |          }|                    dd          \  }}|                    d                                          }|                    d                                          }d}||t          |                                          dk    r|                    d          }t          |                                          dk    r|                    d          }|                    d          }|	                    d|          }|	                    d|          }t          |          } |||          } |||          }||z   dz  }|
s||f|dd         z   }||f|z   n|S t          ||||j        |j        	          S )
a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        Nr  r   r   r   rI  )ignore_indexr   )r  start_logits
end_logitsr1   r	  )r(   r  r  r  splitr   r]  r   r   rj  r	   r   r1   r	  )r*   r  r   r  rz  r  r
  r  r   r  r  r  r  r  r  r  
total_lossignored_indexr  
start_lossend_lossrR   s                         r,   r4   z#DebertaForQuestionAnswering.forwardC  s   B &1%<kk$+B],,))%'/!5#  	
 	
 "!*11#)<<r<#:#: j#++B//::<<''++6688

&=+D?''))**Q.."1"9"9""="==%%''((1,, - 5 5b 9 9(--a00M-33A}EEO)//=AAM']CCCH!,@@Jx
M::H$x/14J 	R"J/'!""+=F/9/EZMF**6Q+%!!/)
 
 
 	
r-   )
NNNNNNNNNN)r9   r:   r;   r!   r   r  r  r   _CHECKPOINT_FOR_QAr   r  _QA_EXPECTED_OUTPUT_QA_EXPECTED_LOSS_QA_TARGET_START_INDEX_QA_TARGET_END_INDEXr   rD   r   rE   r   r   r4   r=   r>   s   @r,   r  r  1  s            +*+C+J+JKh+i+ijj%0$+'40   -11515/3042604,0/3&*F
 F
EL)F
 !.F
 !.	F

 u|,F
  -F
 "%,/F
  -F
 $D>F
 'tnF
 d^F
 
u22	3F
 F
 F
  kjF
 F
 F
 F
 F
r-   r  )Qrn   collections.abcr   typingr   r   r   rD   torch.utils.checkpointr   torch.nnr   r	   r
   activationsr   modeling_outputsr   r   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   r   configuration_debertar   
get_loggerr9   loggerr  r  r  r  r  r  r  r  r  r  Moduler   autogradFunctionr@   rr   r{   r}   r%   r   r   r   r   r   r   r   r   jitscriptr&  r)  r-  r   rt  r  DEBERTA_START_DOCSTRINGr  r  r  r  r  r  r  r   r  rp   r-   r,   <module>r,     s%     $ $ $ $ $ $ ) ) ) ) ) ) ) ) ) )            A A A A A A A A A A ! ! ! ! ! !              . - - - - - 2 2 2 2 2 2 u u u u u u u u u u u u 0 0 0 0 0 0 
	H	%	%!.  @ ' !  ; (    ' ' ' ' 'BI ' ' ',<k <k <k <k <ku~& <k <k <k~         &$D $D $D $D $Du~& $D $D $DN." ." ." ." ."BI ." ." ."b    ry   (    	   !$ !$ !$ !$ !$ry !$ !$ !$J    ")       BI            29      Db
 b
 b
 b
 b
RY b
 b
 b
J  2 r r r n n n [ [ [    	   DI I I I I	 I I IX? ? ? ? ?_ ? ? ?2 ") X g l
 l
 l
 l
 l
) l
 l
	 l
^ QSjkkM
 M
 M
 M
 M
/ M
 M
 lkM
`    RY   &    bi   2! ! ! ! ! ! ! !   l
 l
 l
 l
 l
'= l
 l
 l
^   ?
 ?
 ?
 ?
 ?
$: ?
 ?
 ?
D   [
 [
 [
 [
 [
"8 [
 [
 [
 [
 [
r-   