
    g                       d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	 ddl
Z
ddlZ
ddl
mZ ddlmZmZmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZ ddlmZmZmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'  e$j(        e)          Z*dZ+dZ,g dZ-e G d de                      Z.d Z/ G d dej0                  Z1 G d dej0                  Z2 G d dej0                  Z3 G d dej0                  Z4 G d dej0                  Z5 G d d ej0                  Z6 G d! d"ej0                  Z7 G d# d$ej0                  Z8 G d% d&ej0                  Z9 G d' d(ej0                  Z: G d) d*ej0                  Z; G d+ d,ej0                  Z< G d- d.ej0                  Z= G d/ d0ej0                  Z> G d1 d2e          Z?d3Z@d4ZA e"d5e@           G d6 d7e?                      ZB e"d8e@           G d9 d:e?                      ZC e"d;e@           G d< d=e?                      ZD e"d>e@           G d? d@e?                      ZE e"dAe@           G dB dCe?                      ZFdS )DzPyTorch CANINE model.    N)	dataclass)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputModelOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )CanineConfigzgoogle/canine-sr   )   +   ;   =   I   a   g   q                           c                       e Zd ZU dZdZej        ed<   dZej        ed<   dZ	e
eej                          ed<   dZe
eej                          ed<   dS )CanineModelOutputWithPoolinga  
    Output type of [`CanineModel`]. Based on [`~modeling_outputs.BaseModelOutputWithPooling`], but with slightly
    different `hidden_states` and `attentions`, as these also include the hidden states and attentions of the shallow
    Transformer encoders.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model (i.e. the output of the final
            shallow Transformer encoder).
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Hidden-state of the first token of the sequence (classification token) at the last layer of the deep
            Transformer encoder, further processed by a Linear layer and a Tanh activation function. The Linear layer
            weights are trained from the next sentence prediction (classification) objective during pretraining.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the input to each encoder + one for the output of each layer of each
            encoder) of shape `(batch_size, sequence_length, hidden_size)` and `(batch_size, sequence_length //
            config.downsampling_rate, hidden_size)`. Hidden-states of the model at the output of each layer plus the
            initial input to each Transformer encoder. The hidden states of the shallow encoders have length
            `sequence_length`, but the hidden states of the deep encoder have length `sequence_length` //
            `config.downsampling_rate`.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of the 3 Transformer encoders of shape `(batch_size,
            num_heads, sequence_length, sequence_length)` and `(batch_size, num_heads, sequence_length //
            config.downsampling_rate, sequence_length // config.downsampling_rate)`. Attentions weights after the
            attention softmax, used to compute the weighted average in the self-attention heads.
    Nlast_hidden_statepooler_outputhidden_states
attentions)__name__
__module____qualname____doc__r0   torchFloatTensor__annotations__r1   r2   r   r   r3        f/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/canine/modeling_canine.pyr/   r/   ;   s          6 ,0u(///'+M5$+++8<M8E%"345<<<59Ju01299999r<   r/   c           	         	 ddl }ddl}ddl}n)# t          $ r t                              d            w xY wt          j                            |          }t          	                    d|            |j
                            |          }g }g }	|D ]j\  }
}t          	                    d|
 d|            |j
                            ||
          }|                    |
           |	                    |           kt          ||	          D ]\  }
}|
                    d          }
t!          d |
D                       r1t          	                    d	d                    |
                      e|
d         d
k    rd|
d<   ny|
d         dk    r|
                    |
d                    nQ|
d         dk    rd|
d<   n?|
d         dk    rdg|
dd         z   }
n$|
d         dk    r|
d         dv rdg|
dd         z   }
| }|
D ].}|                    d|          rd|vr|                    d|          }n|g}|d         dk    s|d         dk    rt)          |d          }n|d         dk    s|d         dk    rt)          |d          }nu|d         dk    rt)          |d          }nX	 t)          ||d                   }n@# t*          $ r3 t          	                    d	d                    |
                      Y w xY wt-          |          d k    rt/          |d                   }||         }0|d!d         d"k    rt)          |d          }nO|d#d         d$ t1          d%          D             v rt)          |d          }n|dk    r|                    |          }|j        |j        k    r t7          d&|j         d'|j         d(          t          	                    d)|
            t9          j        |          |_        | S )*z'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape /c              3      K   | ]}|d v V  	dS ))adam_vadam_mAdamWeightDecayOptimizerAdamWeightDecayOptimizer_1global_stepclsautoregressive_decoderchar_output_weightsNr;   ).0ns     r=   	<genexpr>z,load_tf_weights_in_canine.<locals>.<genexpr>|   sB       
 
  	

 
 
 
 
 
r<   z	Skipping bertencoderr   
embeddingssegment_embeddingstoken_type_embeddingsinitial_char_encoderchars_to_moleculesfinal_char_encoder)	LayerNormconv
projectionz[A-Za-z]+_\d+Embedderz_(\d+)kernelgammaweightoutput_biasbetabiasoutput_weights   i_embeddingsic                     g | ]}d | S )	Embedder_r;   )rI   is     r=   
<listcomp>z-load_tf_weights_in_canine.<locals>.<listcomp>   s    @@@!o!oo@@@r<      zPointer shape z and array shape z mismatchedzInitialize PyTorch weight )renumpy
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variableappendzipsplitanyjoinremove	fullmatchgetattrAttributeErrorlenintrange	transposeshape
ValueErrorr8   
from_numpydata)modelconfigtf_checkpoint_pathrg   nptftf_path	init_varsnamesarraysnamer   arraypointerm_namescope_namesnums                    r=   load_tf_weights_in_caniner   ^   s   
			   Q	
 	
 	
 	 goo011G
KKBBBCCC''00IEF   eBBB5BBCCC&&w55Te5&)) D/ D/ezz#  
 
 
 
 
 
 
 	 KK4CHHTNN445557fDGG!W$$KKQ    !W,,,-DGG!W...()DI5DD!W,,,a<Q1Q1Q >DH,D 	' 	'F-v66 'Jf<T<T hhy&99%h1~))[^w-F-F!'844Q=00KNf4L4L!'622Q#333!'844%g{1~>>GG%   KK <CHHTNN < <===H ;1$$+a.))!#,#$$<=((gx00GGCDD\@@uQxx@@@@@gx00GGxLL''E=EK''fgmffekfffggg777888'..Ls    &5*K:K>=K>c                        e Zd ZdZ fdZdedefdZdededefdZ	 	 	 	 dd	ee	j
                 d
ee	j
                 dee	j
                 dee	j                 de	j        f
dZ xZS )CanineEmbeddingsz<Construct the character, position and token_type embeddings.c           	         t                                                       || _        |j        |j        z  }t          |j                  D ]0}d| }t          | |t          j        |j	        |                     1t          j        |j	        |j                  | _
        t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j                  | _        |                     dt'          j        |j                                      d          d           t/          |dd          | _        d S )	NHashBucketCodepointEmbedder_epsposition_ids)r   F)
persistentposition_embedding_typeabsolute)super__init__r   hidden_sizenum_hash_functionsr   setattrr   	Embeddingnum_hash_bucketschar_position_embeddingstype_vocab_sizerP   rU   layer_norm_epsDropouthidden_dropout_probdropoutregister_bufferr8   arangemax_position_embeddingsexpandr{   r   )selfr   shard_embedding_sizerd   r   	__class__s        r=   r   zCanineEmbeddings.__init__   sH     &1V5NNv011 	] 	]A5!55DD$V-DFZ [ [\\\\(*V5LfN`(a(a%%'\&2H&J\%]%]" f&8f>STTTz&"<== 	EL)GHHOOPWXXej 	 	
 	
 	
 (/v7PR\']']$$$r<   
num_hashesnum_bucketsc                     |t          t                    k    r$t          dt          t                               t          d|         }g }|D ]"}|dz   |z  |z  }|                    |           #|S )a  
        Converts ids to hash bucket ids via multiple hashing.

        Args:
            input_ids: The codepoints or other IDs to be hashed.
            num_hashes: The number of hash functions to use.
            num_buckets: The number of hash buckets (i.e. embeddings in each table).

        Returns:
            A list of tensors, each of which is the hash bucket IDs from one hash function.
        z`num_hashes` must be <= Nr   )r}   _PRIMESr   rt   )r   	input_idsr   r   primesresult_tensorsprimehasheds           r=   _hash_bucket_tensorsz%CanineEmbeddings._hash_bucket_tensors   s     G$$FGFFGGG*% 	* 	*E 1}-<F!!&))))r<   embedding_sizec                 0   ||z  dk    rt          d| d| d          |                     |||          }g }t          |          D ]8\  }}d| }	 t          | |	          |          }
|                    |
           9t          j        |d          S )	zDConverts IDs (e.g. codepoints) into embeddings via multiple hashing.r   zExpected `embedding_size` (z) % `num_hashes` (z) == 0)r   r   r   r   dim)r   r   	enumerater{   rt   r8   cat)r   r   r   r   r   hash_bucket_tensorsembedding_shardsrd   hash_bucket_idsr   shard_embeddingss              r=   _embed_hash_bucketsz$CanineEmbeddings._embed_hash_buckets   s    J&!++o>oo]goooppp"77	jfq7rr"+,?"@"@ 	6 	6A5!55D2wtT22?CC##$45555y)r2222r<   Nr   token_type_idsr   inputs_embedsreturnc                 B   ||                                 }n|                                 d d         }|d         }|| j        d d d |f         }|+t          j        |t          j        | j        j                  }|6|                     || j        j        | j        j	        | j        j
                  }|                     |          }||z   }| j        dk    r|                     |          }	||	z  }|                     |          }|                     |          }|S )Nr   r   dtypedevicer   )sizer   r8   zeroslongr   r   r   r   r   r   rP   r   r   rU   r   )
r   r   r   r   r   input_shape
seq_lengthrP   rN   position_embeddingss
             r=   forwardzCanineEmbeddings.forward   s,     #..**KK',,..ss3K ^
,QQQ^<L!"[EJtO`OghhhN  444;2DK4RTXT_Tp M !% : :> J J"%::
':55"&"?"?"M"M--J^^J//
\\*--
r<   )NNNN)r4   r5   r6   r7   r   r~   r   r   r   r8   
LongTensorr9   r   __classcell__r   s   @r=   r   r      s	       FF^ ^ ^ ^ ^0# C    .3S 3c 3`c 3 3 3 3  15593759" "E,-" !!12" u/0	"
   12" 
	" " " " " " " "r<   r   c                   F     e Zd ZdZ fdZdej        dej        fdZ xZS )CharactersToMoleculeszeConvert character sequence to initial molecule sequence (i.e. downsample) using strided convolutions.c                 "   t                                                       t          j        |j        |j        |j        |j                  | _        t          |j                 | _	        t          j
        |j        |j                  | _
        d S )Nin_channelsout_channelskernel_sizestrider   )r   r   r   Conv1dr   downsampling_raterV   r   
hidden_act
activationrU   r   r   r   r   s     r=   r   zCharactersToMolecules.__init__'  s|    I*+0+	
 
 
	 !!23 f&8f>STTTr<   char_encodingr   c                 P   |d d ddd d f         }t          j        |dd          }|                     |          }t          j        |dd          }|                     |          }|d d ddd d f         }t          j        ||gd          }|                     |          }|S )Nr   r   r`   r   r   )r8   r   rV   r   r   rU   )r   r   cls_encodingdownsampleddownsampled_truncatedresults         r=   r   zCharactersToMolecules.forward6  s    $QQQ!QQQY/ q!<<ii..ok1a88ook22 !,AAAqtQQQJ 7 L*?@aHHH''r<   )	r4   r5   r6   r7   r   r8   Tensorr   r   r   s   @r=   r   r   $  si        ooU U U U UU\ el        r<   r   c                   d     e Zd ZdZ fdZ	 ddej        deej                 dej        fdZ xZ	S )	ConvProjectionz
    Project representations from hidden_size*2 back to hidden_size across a window of w = config.upsampling_kernel_size
    characters.
    c                 h   t                                                       || _        t          j        |j        dz  |j        |j        d          | _        t          |j	                 | _
        t          j        |j        |j                  | _        t          j        |j                  | _        d S )Nr`   r   r   r   )r   r   r   r   r   r   upsampling_kernel_sizerV   r   r   r   rU   r   r   r   r   r   s     r=   r   zConvProjection.__init__X  s    I*Q.+5	
 
 
	 !!23 f&8f>STTTz&"<==r<   Ninputsfinal_seq_char_positionsr   c                    t          j        |dd          }| j        j        dz
  }|dz  }||z
  }t	          j        ||fd          }|                      ||                    }t          j        |dd          }|                     |          }|                     |          }| 	                    |          }|}|t          d          |}	|	S )Nr   r`   r   z,CanineForMaskedLM is currently not supported)r8   r   r   r   r   ConstantPad1drV   r   rU   r   NotImplementedError)
r   r   r   	pad_totalpad_begpad_endpadr   final_char_seq	query_seqs
             r=   r   zConvProjection.forwardg  s     A..
 K6:	q.g%115533v;;''A..((''f%%#/
 &&TUUU&Ir<   N)
r4   r5   r6   r7   r   r8   r   r   r   r   r   s   @r=   r   r   R  s         
> > > > >$ <@" "" #+5<"8" 
	" " " " " " " "r<   r   c                        e Zd Z fdZd Z	 	 	 ddej        dej        deej                 deej                 d	ee	         d
e
ej        eej                 f         fdZ xZS )CanineSelfAttentionc                 ,   t                                                       |j        |j        z  dk    r0t	          |d          s t          d|j         d|j         d          |j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t          j
        |j        | j                  | _        t          j
        |j        | j                  | _        t          j
        |j        | j                  | _        t          j        |j                  | _        t#          |dd          | _        | j        dk    s| j        d	k    r8|j        | _        t          j        d
|j        z  dz
  | j                  | _        d S d S )Nr   r   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   r   relative_keyrelative_key_queryr`   r   )r   r   r   num_attention_headshasattrr   r~   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probr   r{   r   r   r   distance_embeddingr   s     r=   r   zCanineSelfAttention.__init__  s    ::a??PVXhHiHi?8F$6 8 8 48 8 8  
 $*#= #&v'9F<V'V#W#W !58PPYv143EFF
9V/1CDDYv143EFF
z&"EFF'.v7PR\']']$'>99T=Y]q=q=q+1+ID(&(l1v7U3UXY3Y[_[s&t&tD### >r=qr<   c                     |                                 d d         | j        | j        fz   } |j        | }|                    dddd          S )Nr   r   r`   r   r   )r   r  r  viewpermute)r   xnew_x_shapes      r=   transpose_for_scoresz(CanineSelfAttention.transpose_for_scores  sM    ffhhssmt'?AY&ZZAFK yyAq!$$$r<   NFfrom_tensor	to_tensorattention_mask	head_maskoutput_attentionsr   c                 >   |                      |          }|                     |                     |                    }|                     |                     |                    }|                     |          }	t	          j        |	|                    dd                    }
| j        dk    s| j        dk    r4|                                d         }t	          j	        |t          j
        |j                                      dd          }t	          j	        |t          j
        |j                                      dd          }||z
  }|                     || j        z   dz
            }|                    |	j                  }| j        dk    rt	          j        d|	|          }|
|z   }
n?| j        dk    r4t	          j        d|	|          }t	          j        d	||          }|
|z   |z   }
|
t%          j        | j                  z  }
|\|j        d
k    rLt	          j        |d          }d|                                z
  t	          j        |
j                  j        z  }|
|z   }
t4          j                            |
d          }|                     |          }|||z  }t	          j        ||          }|                    dddd
                                          }|                                d d         | j         fz   } |j        | }|r||fn|f}|S )Nr   rS   r  r  r   r   )r   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   r         ?r   r`   )!r	  r  r
  r  r8   matmulr   r   r   r   r   r   r  r  r   tor   einsummathsqrtr  ndim	unsqueezefloatfinfominr   
functionalsoftmaxr   r  
contiguousr  )r   r  r  r  r  r  mixed_query_layer	key_layervalue_layerquery_layerattention_scoresr   position_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                          r=   r   zCanineSelfAttention.forward  s$    !JJ{33 --dhhy.A.ABB	//

90E0EFF//0ABB !<Y5H5HR5P5PQQ'>99T=Y]q=q=q$))++A.J"\*EJ{OabbbgghjlmnnN"\*EJ{OabbbgghikmnnN%6H#'#:#:8dFb;bef;f#g#g #7#:#:AR#:#S#S +~==+0<8H+Wk+l+l(#36N#N  -1EEE16>NP[]q1r1r./4|<LiYm/n/n,#36T#TWs#s +di8P.Q.QQ%"a''!&Q!G!G!G #&(<(<(>(>">%+N^NdBeBeBi!i/.@ -//0@b/II ,,77  -	9O_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S**,CD6G]=/22mM]r<   NNF)r4   r5   r6   r   r  r8   r   r   r9   boolr   r   r   r   s   @r=   r   r     s        u u u u u,% % % 7;15,1E E\E <E !!23	E
 E-.E $D>E 
u|Xel33	4E E E E E E E Er<   r   c                   v     e Zd Z fdZdeej                 dej        deej        ej        f         fdZ xZS )CanineSelfOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j        |j	                  | _
        d S Nr   )r   r   r   r  r   denserU   r   r   r   r   r   s     r=   r   zCanineSelfOutput.__init__  sf    Yv163EFF
f&8f>STTTz&"<==r<   r2   input_tensorr   c                     |                      |          }|                     |          }|                     ||z             }|S r   r>  r   rU   r   r2   r?  s      r=   r   zCanineSelfOutput.forward  sB     

=11]33}|'CDDr<   	r4   r5   r6   r   r   r8   r9   r   r   r   s   @r=   r;  r;    s        > > > > >"5#45EJEV	u %"33	4       r<   r;  c                        e Zd ZdZ	 	 	 	 	 	 	 ddededededed	ef fd
Zd Z	 	 	 ddee	j
                 dee	j
                 dee	j
                 dee         dee	j
        ee	j
                 f         f
dZ xZS )CanineAttentionav  
    Additional arguments related to local attention:

        - **local** (`bool`, *optional*, defaults to `False`) -- Whether to apply local attention.
        - **always_attend_to_first_position** (`bool`, *optional*, defaults to `False`) -- Should all blocks be able to
          attend
        to the `to_tensor`'s first position (e.g. a [CLS] position)? - **first_position_attends_to_all** (`bool`,
        *optional*, defaults to `False`) -- Should the *from_tensor*'s first position be able to attend to all
        positions within the *from_tensor*? - **attend_from_chunk_width** (`int`, *optional*, defaults to 128) -- The
        width of each block-wise chunk in `from_tensor`. - **attend_from_chunk_stride** (`int`, *optional*, defaults to
        128) -- The number of elements to skip when moving to the next block in `from_tensor`. -
        **attend_to_chunk_width** (`int`, *optional*, defaults to 128) -- The width of each block-wise chunk in
        *to_tensor*. - **attend_to_chunk_stride** (`int`, *optional*, defaults to 128) -- The number of elements to
        skip when moving to the next block in `to_tensor`.
    F   always_attend_to_first_positionfirst_position_attends_to_allattend_from_chunk_widthattend_from_chunk_strideattend_to_chunk_widthattend_to_chunk_stridec	                 t   t                                                       t          |          | _        t	          |          | _        t                      | _        || _        ||k     rt          d          ||k     rt          d          || _
        || _        || _        || _        || _        || _        d S )Nze`attend_from_chunk_width` < `attend_from_chunk_stride` would cause sequence positions to get skipped.z``attend_to_chunk_width` < `attend_to_chunk_stride`would cause sequence positions to get skipped.)r   r   r   r   r;  outputsetpruned_headslocalr   rG  rH  rI  rJ  rK  rL  
r   r   rQ  rG  rH  rI  rJ  rK  rL  r   s
            r=   r   zCanineAttention.__init__  s     	'//	&v..EE 
"%===w   !#999r   0O,-J*'>$(@%%:"&<###r<   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   r   )r}   r   r   r  r  rP  r   r	  r
  r  rN  r>  r  union)r   headsindexs      r=   prune_headszCanineAttention.prune_heads2  s    u::??F7490$)2OQUQb
 
u
 -TY_eDD	*49=%@@	,TY_eDD	.t{/@%QOOO )-	(EE

(R	%"&)"?$)B_"_	 -33E::r<   Nr2   r  r  r  r   c                 8   | j         s#|                     |||||          }|d         }n/|j        d         x}}|x}	}
g }| j        r|                    d           d}nd}t          ||| j                  D ]1}t          ||| j        z             }|                    ||f           2g }| j        r|                    d|f           t          d|| j	                  D ]1}t          ||| j
        z             }|                    ||f           2t          |          t          |          k    rt          d| d| d          g }g }t          ||          D ]\  \  }}\  }}|	d d ||d d f         }|
d d ||d d f         }|d d ||||f         }| j        rR|d d ||ddf         }t          j        ||gd          }|
d d ddd d f         }t          j        ||gd          }|                     |||||          }|                    |d                    |r|                    |d                    t          j        |d          }|                     ||          }|f}| j         s||dd          z   }n|t%          |          z   }|S )	Nr   r   )r   r   z/Expected to have same number of `from_chunks` (z) and `to_chunks` (z). Check strides.r`   r   )rQ  r   r   rH  rt   r   rJ  r$  rI  rL  rK  r}   r   ru   rG  r8   r   rN  tuple)r   r2   r  r  r  self_outputsattention_outputfrom_seq_lengthto_seq_lengthr  r  from_chunks
from_startchunk_start	chunk_end	to_chunksattention_output_chunksattention_probs_chunksfrom_endto_startto_endfrom_tensor_chunkto_tensor_chunkattention_mask_chunkcls_attention_maskcls_positionattention_outputs_chunkr7  s                               r=   r   zCanineAttention.forwardD  s    z 9	I99]M>S\^oppL+A.;.A!.DDOm&33K) K1 ""6*** 


$Z$B_`` = =t?[1[\\	""K#;<<<< I1 5  !]!3444$Qt7RSS ; ;{T=W/WXX	  +y!9::::;3y>>11 Ck C C$/C C C   ')#%'">A+y>Y>Y N N:&X(:6$/:h3F0I$J!"+AAAx,A"B (6aaaH9LhW]o6]'^$7 X)7:h;NPQRSPS8S)T&+096HJ^5_ef+g+g+g(#,QQQ!QQQY#7L&+i0OUV&W&W&WO*.))%8LiYj+ +' (../Fq/IJJJ$ N*112I!2LMMM$y)@aHHH;;'7GG#%z 	>QRR 00GG&< = ==Gr<   FFFrF  rF  rF  rF  r8  )r4   r5   r6   r7   r9  r~   r   rW  r   r8   r9   r   r   r   r   s   @r=   rE  rE     sB        & 05.3'*(+%(&)= = *.	=
 (,= "%= #&=  #= !$= = = = = =B; ; ;* 7;15,1H HU./H !!23H E-.	H
 $D>H 
u (5+<"==	>H H H H H H H Hr<   rE  c                   B     e Zd Z fdZdej        dej        fdZ xZS )CanineIntermediatec                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r   )r   r   r   r  r   intermediate_sizer>  
isinstancer   strr   intermediate_act_fnr   s     r=   r   zCanineIntermediate.__init__  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r<   r2   r   c                 Z    |                      |          }|                     |          }|S r   )r>  ru  r   r2   s     r=   r   zCanineIntermediate.forward  s,    

=1100??r<   )r4   r5   r6   r   r8   r9   r   r   r   s   @r=   rp  rp    s`        9 9 9 9 9U%6 5;L        r<   rp  c                   \     e Zd Z fdZdeej                 dej        dej        fdZ xZS )CanineOutputc                    t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _        t          j	        |j
                  | _        d S r=  )r   r   r   r  rr  r   r>  rU   r   r   r   r   r   s     r=   r   zCanineOutput.__init__  sf    Yv79KLL
f&8f>STTTz&"<==r<   r2   r?  r   c                     |                      |          }|                     |          }|                     ||z             }|S r   rA  rB  s      r=   r   zCanineOutput.forward  s@    

=11]33}|'CDDr<   rC  r   s   @r=   ry  ry    sp        > > > > >U5+<%= UM^ chct        r<   ry  c                        e Zd Z fdZ	 	 	 ddeej                 deej                 deej                 dee         deej        eej                 f         f
d	Z	d
 Z
 xZS )CanineLayerc	           
          t                                                       |j        | _        d| _        t	          ||||||||          | _        t          |          | _        t          |          | _	        d S Nr   )
r   r   chunk_size_feed_forwardseq_len_dimrE  	attentionrp  intermediatery  rN  rR  s
            r=   r   zCanineLayer.__init__  s|     	'-'E$(+)#$!"	
 	
 /v66"6**r<   NFr2   r  r  r  r   c                     |                      ||||          }|d         }|dd          }t          | j        | j        | j        |          }|f|z   }|S )N)r  r   r   )r  r   feed_forward_chunkr  r  )	r   r2   r  r  r  self_attention_outputsr[  r7  layer_outputs	            r=   r   zCanineLayer.forward  sy     "&/	 "0 "
 "
 2!4(,0#T%A4CSUe
 
  /G+r<   c                 \    |                      |          }|                     ||          }|S r   )r  rN  )r   r[  intermediate_outputr  s       r=   r  zCanineLayer.feed_forward_chunk  s2    "//0@AA{{#68HIIr<   r8  )r4   r5   r6   r   r   r8   r9   r   r9  r   r  r   r   s   @r=   r}  r}    s        + + + + +< 7;15,1 U./ !!23 E-.	
 $D> 
u (5+<"==	>   0      r<   r}  c                        e Zd Z	 	 	 	 	 	 	 d fd	Z	 	 	 	 	 ddeej                 deej                 deej                 d	ee         d
ee         dee         de	ee
f         fdZ xZS )CanineEncoderFrF  c	           
          t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 <    g | ]}t          	          S r;   )r}  )
rI   _rG  rJ  rI  rL  rK  r   rH  rQ  s
     r=   re   z*CanineEncoder.__init__.<locals>.<listcomp>  sM         31+,)*	 	  r<   F)	r   r   r   r   
ModuleListr   num_hidden_layerslayergradient_checkpointingrR  s
    ````````r=   r   zCanineEncoder.__init__  s     	]           v788  
 

 ',###r<   NTr2   r  r  r  output_hidden_statesreturn_dictr   c                    |rdnd }|rdnd }t          | j                  D ]j\  }	}
|r||fz   }|||	         nd }| j        r&| j        r|                     |
j        ||||          }n |
||||          }|d         }|r||d         fz   }k|r||fz   }|st          d |||fD                       S t          |||          S )Nr;   r   r   c              3      K   | ]}||V  	d S r   r;   rI   vs     r=   rK   z(CanineEncoder.forward.<locals>.<genexpr>*  s(      mmq_`_l_l_l_l_lmmr<   )r0   r2   r3   )r   r  r  training_gradient_checkpointing_func__call__rY  r   )r   r2   r  r  r  r  r  all_hidden_statesall_self_attentionsrd   layer_modulelayer_head_masklayer_outputss                r=   r   zCanineEncoder.forward  sN    #7@BBD$5?bb4(44 	P 	POA|# I$58H$H!.7.CillO* 	pt} 	p $ A A )!"#%! ! !-]NO]n o o)!,M  P&9]1=M<O&O# 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r<   rn  )NNFFT)r4   r5   r6   r   r   r8   r9   r   r9  r   r   r   r   r   s   @r=   r  r    s         (-&+ #!$!", , , , , ,B 7;15,1/4&**
 *
U./*
 !!23*
 E-.	*

 $D>*
 'tn*
 d^*
 
uo%	&*
 *
 *
 *
 *
 *
 *
 *
r<   r  c                   N     e Zd Z fdZdeej                 dej        fdZ xZS )CaninePoolerc                     t                                                       t          j        |j        |j                  | _        t          j                    | _        d S r   )r   r   r   r  r   r>  Tanhr   r   s     r=   r   zCaninePooler.__init__3  sC    Yv163EFF
'))r<   r2   r   c                 r    |d d df         }|                      |          }|                     |          }|S )Nr   )r>  r   )r   r2   first_token_tensorpooled_outputs       r=   r   zCaninePooler.forward8  s@     +111a40

#56666r<   rC  r   s   @r=   r  r  2  se        $ $ $ $ $
U5+<%= %BS        r<   r  c                   N     e Zd Z fdZdeej                 dej        fdZ xZS )CaninePredictionHeadTransformc                 V   t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _
        n|j        | _
        t          j        |j        |j                  | _        d S r=  )r   r   r   r  r   r>  rs  r   rt  r   transform_act_fnrU   r   r   s     r=   r   z&CaninePredictionHeadTransform.__init__B  s    Yv163EFF
f'-- 	6$*6+<$=D!!$*$5D!f&8f>STTTr<   r2   r   c                     |                      |          }|                     |          }|                     |          }|S r   )r>  r  rU   rw  s     r=   r   z%CaninePredictionHeadTransform.forwardK  s=    

=11--m<<}55r<   rC  r   s   @r=   r  r  A  sj        U U U U UU5+<%= %BS        r<   r  c                   N     e Zd Z fdZdeej                 dej        fdZ xZS )CanineLMPredictionHeadc                 >   t                                                       t          |          | _        t	          j        |j        |j        d          | _        t	          j	        t          j        |j                            | _        | j        | j        _        d S )NF)r^   )r   r   r  	transformr   r  r   
vocab_sizedecoder	Parameterr8   r   r^   r   s     r=   r   zCanineLMPredictionHead.__init__S  sz    6v>> y!3V5FUSSSLV->!?!?@@	 !Ir<   r2   r   c                 Z    |                      |          }|                     |          }|S r   )r  r  rw  s     r=   r   zCanineLMPredictionHead.forward`  s*    }55]33r<   rC  r   s   @r=   r  r  R  se        & & & & &U5+<%= %BS        r<   r  c                   Z     e Zd Z fdZdeej                 deej                 fdZ xZS )CanineOnlyMLMHeadc                 p    t                                                       t          |          | _        d S r   )r   r   r  predictionsr   s     r=   r   zCanineOnlyMLMHead.__init__g  s/    1&99r<   sequence_outputr   c                 0    |                      |          }|S r   )r  )r   r  prediction_scoress      r=   r   zCanineOnlyMLMHead.forwardk  s     !,,_==  r<   )	r4   r5   r6   r   r   r8   r   r   r   r   s   @r=   r  r  f  sl        : : : : :!u|,! 
u|	! ! ! ! ! ! ! !r<   r  c                   (    e Zd ZdZeZeZdZdZ	d Z
dS )CaninePreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    canineTc                    t          |t          j        t          j        f          rT|j        j                            d| j        j                   |j	         |j	        j        
                                 dS dS t          |t          j                  r_|j        j                            d| j        j                   |j        +|j        j        |j                 
                                 dS dS t          |t          j                  r?|j	        j        
                                 |j        j                            d           dS dS )zInitialize the weightsg        )meanstdNr  )rs  r   r  r   r[   r   normal_r   initializer_ranger^   zero_r   padding_idxrU   fill_)r   modules     r=   _init_weightsz#CaninePreTrainedModel._init_weights~  s0   fry")455 	* M&&CT[5R&SSS{& &&((((( '&-- 	*M&&CT[5R&SSS!-"6#56<<>>>>> .--- 	*K""$$$M$$S)))))	* 	*r<   N)r4   r5   r6   r7   r   config_classr   load_tf_weightsbase_model_prefixsupports_gradient_checkpointingr  r;   r<   r=   r  r  s  sE         
  L/O &*#* * * * *r<   r  aI  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`CanineConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a5
  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z`The bare CANINE Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd Zd fd	Zd Zd Zdej        defdZ	dej        d	ej        d
ej        fdZ
 ee                    d                     eeee          	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 deej                 dee         dee         dee         d
eeef         fd                        Z xZS )CanineModelTc           
         t                                          |           || _        t          j        |          }d|_        t          |          | _        t          |ddd|j	        |j	        |j	        |j	                  | _
        t          |          | _        t          |          | _        t          |          | _        t          |          | _        |rt#          |          nd | _        |                                  d S )Nr   TF)rQ  rG  rH  rI  rJ  rK  rL  )r   r   r   copydeepcopyr  r   char_embeddingsr  local_transformer_striderQ   r   rR   rM   r   rW   rT   r  pooler	post_init)r   r   add_pooling_layershallow_configr   s       r=   r   zCanineModel.__init__  s       v..+,(/77$1,1*/$*$C%+%D"("A#)#B	%
 	%
 	%
! #8"?"?$V,,(00"/"?"?.?Il6***T 	r<   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrM   r  r  rW  )r   heads_to_pruner  rU  s       r=   _prune_headszCanineModel._prune_heads  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr<   c                    |j         d         |j         d         }}|j         d         }t          j        ||d|f                                          }t          j        ||dft          j        |j                  }||z  }|S )aP  
        Create 3D attention mask from a 2D tensor mask.

        Args:
            from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
            to_mask: int32 Tensor of shape [batch_size, to_seq_length].

        Returns:
            float Tensor of shape [batch_size, from_seq_length, to_seq_length].
        r   r   )r   r   r   )r   r8   reshaper"  onesfloat32r   )r   r  to_mask
batch_sizer\  r]  broadcast_onesmasks           r=   )_create_3d_attention_mask_from_input_maskz5CanineModel._create_3d_attention_mask_from_input_mask  s     '2&7&:K<Ma<PO
a(-*a)GHHNNPP
 *oq)IQVQ^gnguvvv 'r<   char_attention_maskr   c                     |j         \  }}t          j        ||d|f          }t          j                            ||          |                                          }t          j        |d          }|S )z[Downsample 2D character attention mask to 2D molecule attention mask using MaxPool1d layer.r   )r   r   r   r   )r   r8   r  r   	MaxPool1dr"  squeeze)r   r  r   r  char_seq_lenpoolable_char_maskpooled_molecule_maskmolecule_attention_masks           r=   _downsample_attention_maskz&CanineModel._downsample_attention_mask  s     $7#< 
L"]+>QP\@]^^  %x11>OXi1jj$$&& 
  

 #(-0D""M"M"M&&r<   	moleculeschar_seq_lengthr   c                    | j         j        }|ddddddf         }t          j        ||d          }|ddddddf         }t          j        t          j        |          t          j        |                                                    }t          j        |||z   d          }t          j        ||gd          S )zDRepeats molecules to make them the same length as the char sequence.Nr   rS   )repeatsr   r   r   )r   r   r8   repeat_interleavefmodtensoritemr   )	r   r  r  ratemolecules_without_extra_clsrepeatedlast_moleculeremainder_lengthremainder_repeateds	            r=   _repeat_moleculeszCanineModel._repeat_molecules#  s     {,&/122qqq&9#*+FPTZ\]]] "!!!RSS!!!), :el?&C&CU\RVEWEWXX]]__"4$t+	
 
 
 y($67R@@@@r<   batch_size, sequence_length
checkpointoutput_typer  Nr   r  r   r   r  r   r  r  r  c
                 &   ||n| j         j        }||n| j         j        }|rdnd }
|rdnd }|	|	n| j         j        }	||t	          d          |+|                     ||           |                                }n.||                                d d         }nt	          d          |\  }}||j        n|j        }|t          j	        ||f|          }|!t          j
        |t          j        |          }|                     ||          }|                     || j         j                  }|                     |||j        d         f          }|                     || j         j                  }|                     ||||          }|                     ||n||          }|                     ||||	          }|j        }|                     |          }|                     ||||||	
          }|d         }| j        |                     |          nd }|                     ||d                   }t          j        ||gd          }|                     |          }|                     ||||	          }|j        }|r&|	r|j        n|d         }|
|j        z   |z   |j        z   }
|r&|	r|j        n|d         } ||j        z   | z   |j        z   }|	s$||f}!|!t?          d |
|fD                       z  }!|!S tA          |||
|          S )Nr;   zDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)r   r   )r   )r   r   r   r   )r  r  r  )r  r  r  r  r  r   )r  r   r   c              3      K   | ]}||V  	d S r   r;   r  s     r=   rK   z&CanineModel.forward.<locals>.<genexpr>  s(      aa!STS`AS`S`S`S`aar<   )r0   r1   r2   r3   )!r   r  r  use_return_dictr   %warn_if_padding_and_no_attention_maskr   r   r8   r  r   r   get_extended_attention_maskr  r   r   get_head_maskr  r  r  rQ   r0   rR   rM   r  r  r   rW   rT   r2   r3   rY  r/   )"r   r   r  r   r   r  r   r  r  r  r  r  r   r  r   r   extended_attention_maskr   extended_molecule_attention_maskinput_char_embeddingsr  init_chars_encoder_outputsinput_char_encodinginit_molecule_encodingencoder_outputsmolecule_sequence_outputr  repeated_moleculesconcatr  final_chars_encoder_outputsdeep_encoder_hidden_statesdeep_encoder_self_attentionsrN  s"                                     r=   r   zCanineModel.forward<  s6   $ 2C1N--TXT_Tq$8$D  $+Jj 	 #7@BBD$5?bb4%0%<kk$+B] ]%>cddd"66y.QQQ#..**KK&',,..ss3KKTUUU!,
J%.%:!!@T!"Z*j)A6RRRN!"[EJvVVVN 150P0PQ_al0m0m"&"A"Adk.K #B #
 #
 :>9Y9Y#j2I2OPR2S%T:
 :
( &&y$+2OPP	 !% 4 4%)'	 !5 !
 !
 #LL".IIM>
 
 &*%>%>!./!5	 &? &
 &
" 9J  "&!8!89L!M!M ,,";/!5# ' 
 
 $31#5 AEAX$<===^b "334L^ijl^m3nn /1CD"MMM //&11 '+&=&=2/!5	 '> '
 '
# 6G 	JU)m)F)F[jkl[m&!,:;,- .;<   	IT+m?+E+EZijlZm(#,78./ .89    	%}5Feaa(9;N'OaaaaaaFM+-'+*	
 
 
 	
r<   )T)	NNNNNNNNN)r4   r5   r6   r   r  r  r8   r   r~   r  r  r   CANINE_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr/   _CONFIG_FOR_DOCr   r   r9   r9  r   r   r   r   r   s   @r=   r  r    s       
     <C C C  6'el '_b ' ' ' '"A5< A%, A[`[g A A A A2 +*+B+I+IJg+h+hii&0$   156:59371559,0/3&*\
 \
E,-\
 !!23\
 !!12	\

 u/0\
 E-.\
   12\
 $D>\
 'tn\
 d^\
 
u22	3\
 \
 \
  ji\
 \
 \
 \
 \
r<   r  z
    CANINE Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                       e Zd Z fdZ ee                    d                     eee	e
          	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 dee         dee         dee         deee	f         fd                        Z xZS )CanineForSequenceClassificationc                 6   t                                          |           |j        | _        t          |          | _        t          j        |j                  | _        t          j	        |j
        |j                  | _        |                                  d S r   r   r   
num_labelsr  r  r   r   r   r   r  r   
classifierr  r   s     r=   r   z(CanineForSequenceClassification.__init__  y        +!&))z&"<==)F$68IJJ 	r<   r  r  Nr   r  r   r   r  r   labelsr  r  r  r   c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }d}|Z| j         j        f| j        dk    rd| j         _        nN| j        dk    r7|j        t          j	        k    s|j        t          j
        k    rd| j         _        nd| j         _        | j         j        dk    rWt                      }| j        dk    r1 ||                                |                                          }n |||          }n| j         j        dk    rGt                      } ||                    d| j                  |                    d                    }n*| j         j        dk    rt                      } |||          }|
s|f|dd         z   }||f|z   n|S t!          |||j        |j        	          S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   r  r   r  r  r  r   
regressionsingle_label_classificationmulti_label_classificationr   r`   losslogitsr2   r3   )r   r  r  r   r  problem_typer  r   r8   r   r~   r
   r  r	   r  r   r   r2   r3   )r   r   r  r   r   r  r   r  r  r  r  r7  r  r'  r&  loss_fctrN  s                    r=   r   z'CanineForSequenceClassification.forward  s   2 &1%<kk$+B]++))%'/!5#  

 

  
]33//{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FY,F)-)9TGf$$vE'!/)	
 
 
 	
r<   
NNNNNNNNNN)r4   r5   r6   r   r   r  r  r   r  r   r  r   r8   r   r9   r9  r   r   r   r   r   s   @r=   r  r    s       	 	 	 	 	 +*+B+I+IJg+h+hii&,$   156:59371559-1,0/3&*E
 E
E,-E
 !!23E
 !!12	E

 u/0E
 E-.E
   12E
 )*E
 $D>E
 'tnE
 d^E
 
u..	/E
 E
 E
  jiE
 E
 E
 E
 E
r<   r  z
    CANINE Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                       e Zd Z fdZ ee                    d                     eee	e
          	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 d	eej                 d
eej                 deej                 dee         dee         dee         deee	f         fd                        Z xZS )CanineForMultipleChoicec                    t                                          |           t          |          | _        t	          j        |j                  | _        t	          j        |j	        d          | _
        |                                  d S r  )r   r   r  r  r   r   r   r   r  r   r  r  r   s     r=   r   z CanineForMultipleChoice.__init__J  sl       !&))z&"<==)F$6:: 	r<   z(batch_size, num_choices, sequence_lengthr  Nr   r  r   r   r  r   r  r  r  r  r   c                    |
|
n| j         j        }
||j        d         n|j        d         }|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|)|                    d|                    d                    nd}|=|                    d|                    d          |                    d                    nd}|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }|                    d|          }d}|t                      } |||          }|
s|f|dd         z   }||f|z   n|S t          |||j
        |j                  S )aJ  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r   rS   r!  r`   r%  )r   r  r   r  r   r  r   r  r	   r   r2   r3   )r   r   r  r   r   r  r   r  r  r  r  num_choicesr7  r  r'  reshaped_logitsr&  r)  rN  s                      r=   r   zCanineForMultipleChoice.forwardT  s*   2 &1%<kk$+B],5,Aioa((}GZ[\G]>G>SINN2y~~b'9'9:::Y]	M[Mg,,R1D1DR1H1HIIImqM[Mg,,R1D1DR1H1HIIImqGSG_|((\->->r-B-BCCCei ( r=#5#5b#9#9=;M;Mb;Q;QRRR 	 ++))%'/!5#  

 

  
]33// ++b+66'))H8OV44D 	F%''!""+5F)-)9TGf$$vE("!/)	
 
 
 	
r<   r*  )r4   r5   r6   r   r   r  r  r   r  r   r  r   r8   r   r9   r9  r   r   r   r   r   s   @r=   r,  r,  B  s            +*+B+I+IJt+u+uvv&-$   156:59371559-1,0/3&*@
 @
E,-@
 !!23@
 !!12	@

 u/0@
 E-.@
   12@
 )*@
 $D>@
 'tn@
 d^@
 
u//	0@
 @
 @
  wv@
 @
 @
 @
 @
r<   r,  z
    CANINE Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                       e Zd Z fdZ ee                    d                     eee	          	 	 	 	 	 	 	 	 	 	 dde
ej                 de
ej                 de
ej                 de
ej                 d	e
ej                 d
e
ej                 de
ej                 de
e         de
e         de
e         deeef         fd                        Z xZS )CanineForTokenClassificationc                 6   t                                          |           |j        | _        t          |          | _        t          j        |j                  | _        t          j	        |j
        |j                  | _        |                                  d S r   r  r   s     r=   r   z%CanineForTokenClassification.__init__  r  r<   r  )r   r  Nr   r  r   r   r  r   r  r  r  r  r   c                    |
|
n| j         j        }
|                     ||||||||	|
	  	        }|d         }|                     |          }|                     |          }d}|Ft                      } ||                    d| j                  |                    d                    }|
s|f|dd         z   }||f|z   n|S t          |||j	        |j
                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CanineForTokenClassification
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/canine-s")
        >>> model = CanineForTokenClassification.from_pretrained("google/canine-s")

        >>> inputs = tokenizer(
        ...     "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="pt"
        ... )

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_token_class_ids = logits.argmax(-1)

        >>> # Note that tokens are classified rather then input words which means that
        >>> # there might be more predicted token classes than words.
        >>> # Multiple token classes might account for the same word
        >>> predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
        >>> predicted_tokens_classes  # doctest: +SKIP
        ```

        ```python
        >>> labels = predicted_token_class_ids
        >>> loss = model(**inputs, labels=labels).loss
        >>> round(loss.item(), 2)  # doctest: +SKIP
        ```Nr!  r   r   r`   r%  )r   r  r  r   r  r	   r  r  r   r2   r3   )r   r   r  r   r   r  r   r  r  r  r  r7  r  r'  r&  r)  rN  s                    r=   r   z$CanineForTokenClassification.forward  s   f &1%<kk$+B]++))%'/!5#  

 

 "!*,,7711'))H8FKKDO<<fkk"ooNND 	FY,F)-)9TGf$$vE$!/)	
 
 
 	
r<   r*  )r4   r5   r6   r   r   r  r  r   r   r  r   r8   r   r9   r9  r   r   r   r   r   s   @r=   r2  r2    s       	 	 	 	 	 +*+B+I+IJg+h+hii+@___ 156:59371559-1,0/3&*R
 R
E,-R
 !!23R
 !!12	R

 u/0R
 E-.R
   12R
 )*R
 $D>R
 'tnR
 d^R
 
u++	,R
 R
 R
 `_ jiR
 R
 R
 R
 R
r<   r2  z
    CANINE Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd Z fdZ ee                    d                     edee	dd          	 	 	 	 	 	 	 	 	 	 	 dde
ej                 d	e
ej                 d
e
ej                 de
ej                 de
ej                 de
ej                 de
ej                 de
ej                 de
e         de
e         de
e         deeef         fd                        Z xZS )CanineForQuestionAnsweringc                     t                                          |           |j        | _        t          |          | _        t          j        |j        |j                  | _        | 	                                 d S r   )
r   r   r  r  r  r   r  r   
qa_outputsr  r   s     r=   r   z#CanineForQuestionAnswering.__init__  se        +!&)))F$68IJJ 	r<   r  zSplend1dchan/canine-c-squadz'nice puppet'gQ!@)r  r   r  expected_outputexpected_lossNr   r  r   r   r  r   start_positionsend_positionsr  r  r  r   c                 h   ||n| j         j        }|                     |||||||	|
|	  	        }|d         }|                     |          }|                    dd          \  }}|                    d          }|                    d          }d}||t          |                                          dk    r|                    d          }t          |                                          dk    r|                    d          }|                    d          }|                    d|           |                    d|           t          |          } |||          } |||          }||z   dz  }|s||f|dd         z   }||f|z   n|S t          ||||j        |j        	          S )
a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        Nr!  r   r   r   r   )ignore_indexr`   )r&  start_logits
end_logitsr2   r3   )r   r  r  r8  rv   r  r}   r   clamp_r	   r   r2   r3   )r   r   r  r   r   r  r   r;  r<  r  r  r  r7  r  r'  r?  r@  
total_lossignored_indexr)  
start_lossend_lossrN  s                          r=   r   z"CanineForQuestionAnswering.forward  s   @ &1%<kk$+B]++))%'/!5#  

 

 "!*11#)<<r<#:#: j#++B//''++

&=+D?''))**Q.."1"9"9""="==%%''((1,, - 5 5b 9 9(--a00M""1m444  M222']CCCH!,@@Jx
M::H$x/14J 	R"J/'!""+=F/9/EZMF**6Q+%!!/)
 
 
 	
r<   )NNNNNNNNNNN)r4   r5   r6   r   r   r  r  r   r   r  r   r8   r   r9   r9  r   r   r   r   r   s   @r=   r6  r6    s            +*+B+I+IJg+h+hii00$'   156:593715596:48,0/3&*H
 H
E,-H
 !!23H
 !!12	H

 u/0H
 E-.H
   12H
 "%"23H
   01H
 $D>H
 'tnH
 d^H
 
u22	3H
 H
 H
  jiH
 H
 H
 H
 H
r<   r6  )Gr7   r  r  rm   dataclassesr   typingr   r   r   r8   torch.utils.checkpointr   torch.nnr   r	   r
   activationsr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   r   configuration_caniner   
get_loggerr4   rk   r  r  r   r/   r   Moduler   r   r   r   r;  rE  rp  ry  r}  r  r  r  r  r  r  CANINE_START_DOCSTRINGr  r  r  r,  r2  r6  r;   r<   r=   <module>rS     s       				 ! ! ! ! ! ! ) ) ) ) ) ) ) ) ) )            A A A A A A A A A A ! ! ! ! ! !                . - - - - - l l l l l l l l l l              / . . . . . 
	H	%	%'   U
T
T : : : : :; : : :D^ ^ ^Bb b b b bry b b bJ+ + + + +BI + + +\7 7 7 7 7RY 7 7 7ta a a a a") a a aH    ry    L L L L Lbi L L L^           29   7 7 7 7 7") 7 7 7tI
 I
 I
 I
 I
BI I
 I
 I
X    29       BI   "    RY   (
! 
! 
! 
! 
!	 
! 
! 
!* * * * *O * * *8	 / d f N
 N
 N
 N
 N
' N
 N
	 N
b   W
 W
 W
 W
 W
&; W
 W
 W
t   Q
 Q
 Q
 Q
 Q
3 Q
 Q
 Q
h   `
 `
 `
 `
 `
#8 `
 `
 `
F   [
 [
 [
 [
 [
!6 [
 [
 [
 [
 [
r<   