
    g p              	          d Z ddlZddlmZ ddlmZmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZmZmZ ddlmZmZmZ dd	lmZmZ dd
lmZmZmZ ddlmZ ddlmZ  ej        e           Z!dZ"dZ#g dZ$dZ%dZ&e G d de                      Z'dAde	j(        de)de*de	j(        fdZ+ G d dej,                  Z- G d dej,                  Z. G d d ej,                  Z/ G d! d"ej,                  Z0 G d# d$ej,                  Z1 G d% d&ej,                  Z2 G d' d(ej,                  Z3 G d) d*ej,                  Z4 G d+ d,ej,                  Z5 G d- d.ej,                  Z6 G d/ d0ej,                  Z7 G d1 d2ej,                  Z8 G d3 d4ej,                  Z9 G d5 d6ej,                  Z: G d7 d8e          Z;d9Z<d:Z= ed;e<           G d< d=e;                      Z> ed>e<           G d? d@e;                      Z?dS )BzPyTorch CvT model.    N)	dataclass)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forward)$ImageClassifierOutputWithNoAttentionModelOutput)PreTrainedModel find_pruneable_heads_and_indicesprune_linear_layer)logging   )	CvtConfigr   zmicrosoft/cvt-13)r   i     r   ztabby, tabby catc                   x    e Zd ZU dZdZej        ed<   dZej        ed<   dZ	e
eej        df                  ed<   dS )BaseModelOutputWithCLSTokena  
    Base class for model's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        cls_token_value (`torch.FloatTensor` of shape `(batch_size, 1, hidden_size)`):
            Classification token at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
    Nlast_hidden_statecls_token_value.hidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   r   r        `/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/cvt/modeling_cvt.pyr   r   /   sg           ,0u(///)-OU&---=AM8E%"3S"89:AAAAAr%   r           Finput	drop_probtrainingreturnc                     |dk    s|s| S d|z
  }| j         d         fd| j        dz
  z  z   }|t          j        || j        | j                  z   }|                                 |                     |          |z  }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r'   r   r   )r   )dtypedevice)shapendimr!   randr-   r.   floor_div)r(   r)   r*   	keep_probr/   random_tensoroutputs          r&   	drop_pathr7   E   s     CxII[^
Q 77E
5EL Y Y YYMYYy!!M1FMr%   c                   j     e Zd ZdZd	dee         ddf fdZdej        dej        fdZ	de
fdZ xZS )
CvtDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr)   r+   c                 V    t                                                       || _        d S N)super__init__r)   )selfr)   	__class__s     r&   r=   zCvtDropPath.__init__]   s$    "r%   r   c                 8    t          || j        | j                  S r;   )r7   r)   r*   )r>   r   s     r&   forwardzCvtDropPath.forwarda   s    FFFr%   c                 6    d                     | j                  S )Nzp={})formatr)   )r>   s    r&   
extra_reprzCvtDropPath.extra_reprd   s    }}T^,,,r%   r;   )r   r   r   r    r   floatr=   r!   TensorrA   strrD   __classcell__r?   s   @r&   r9   r9   Z   s        bb# #(5/ #T # # # # # #GU\ Gel G G G G-C - - - - - - - -r%   r9   c                   (     e Zd ZdZ fdZd Z xZS )CvtEmbeddingsz'
    Construct the CvT embeddings.
    c                     t                                                       t          |||||          | _        t	          j        |          | _        d S )N)
patch_sizenum_channels	embed_dimstridepadding)r<   r=   CvtConvEmbeddingsconvolution_embeddingsr   Dropoutdropout)r>   rM   rN   rO   rP   rQ   dropout_rater?   s          r&   r=   zCvtEmbeddings.__init__m   sT    &7!	Z`jq'
 '
 '
# z,//r%   c                 Z    |                      |          }|                     |          }|S r;   )rS   rU   )r>   pixel_valueshidden_states      r&   rA   zCvtEmbeddings.forwardt   s,    22<@@||L11r%   r   r   r   r    r=   rA   rH   rI   s   @r&   rK   rK   h   sQ         0 0 0 0 0      r%   rK   c                   (     e Zd ZdZ fdZd Z xZS )rR   z"
    Image to Conv Embedding.
    c                    t                                                       t          |t          j        j                  r|n||f}|| _        t          j        |||||          | _	        t          j
        |          | _        d S )N)kernel_sizerP   rQ   )r<   r=   
isinstancecollectionsabcIterablerM   r   Conv2d
projection	LayerNormnormalization)r>   rM   rN   rO   rP   rQ   r?   s         r&   r=   zCvtConvEmbeddings.__init__   sz    #-j+/:R#S#SqZZZdfpYq
$)L)\blsttt\)44r%   c                 <   |                      |          }|j        \  }}}}||z  }|                    |||                              ddd          }| j        r|                     |          }|                    ddd                              ||||          }|S Nr      r   )rc   r/   viewpermutere   )r>   rX   
batch_sizerN   heightwidthhidden_sizes          r&   rA   zCvtConvEmbeddings.forward   s    |442>2D/
L&%un#((\;OOWWXY[\^_`` 	<--l;;L#++Aq!4499*lTZ\abbr%   rZ   rI   s   @r&   rR   rR   z   sQ         5 5 5 5 5
 
 
 
 
 
 
r%   rR   c                   $     e Zd Z fdZd Z xZS )CvtSelfAttentionConvProjectionc           	          t                                                       t          j        |||||d|          | _        t          j        |          | _        d S )NF)r]   rQ   rP   biasgroups)r<   r=   r   rb   convolutionBatchNorm2dre   )r>   rO   r]   rQ   rP   r?   s        r&   r=   z'CvtSelfAttentionConvProjection.__init__   sa    9#
 
 
  ^I66r%   c                 Z    |                      |          }|                     |          }|S r;   )rt   re   r>   rY   s     r&   rA   z&CvtSelfAttentionConvProjection.forward   s.    ''55)),77r%   r   r   r   r=   rA   rH   rI   s   @r&   rp   rp      sG        7 7 7 7 7      r%   rp   c                       e Zd Zd ZdS ) CvtSelfAttentionLinearProjectionc                     |j         \  }}}}||z  }|                    |||                              ddd          }|S rg   )r/   ri   rj   )r>   rY   rk   rN   rl   rm   rn   s          r&   rA   z(CvtSelfAttentionLinearProjection.forward   sN    2>2D/
L&%un#((\;OOWWXY[\^_``r%   N)r   r   r   rA   r$   r%   r&   rz   rz      s#            r%   rz   c                   &     e Zd Zd fd	Zd Z xZS )CvtSelfAttentionProjectiondw_bnc                     t                                                       |dk    rt          ||||          | _        t	                      | _        d S )Nr~   )r<   r=   rp   convolution_projectionrz   linear_projection)r>   rO   r]   rQ   rP   projection_methodr?   s         r&   r=   z#CvtSelfAttentionProjection.__init__   sQ    ''*HT_ahjp*q*qD'!A!C!Cr%   c                 Z    |                      |          }|                     |          }|S r;   )r   r   rw   s     r&   rA   z"CvtSelfAttentionProjection.forward   s.    22<@@--l;;r%   )r~   rx   rI   s   @r&   r}   r}      sR        D D D D D D      r%   r}   c                   .     e Zd Z	 d fd	Zd Zd Z xZS )CvtSelfAttentionTc                    t                                                       |dz  | _        || _        || _        || _        t          |||||dk    rdn|          | _        t          |||||          | _        t          |||||          | _	        t          j        |||	          | _        t          j        |||	          | _        t          j        |||	          | _        t          j        |
          | _        d S )Ng      avglinear)r   )rr   )r<   r=   scalewith_cls_tokenrO   	num_headsr}   convolution_projection_queryconvolution_projection_keyconvolution_projection_valuer   Linearprojection_queryprojection_keyprojection_valuerT   rU   )r>   r   rO   r]   	padding_q
padding_kvstride_q	stride_kvqkv_projection_methodqkv_biasattention_drop_rater   kwargsr?   s                r&   r=   zCvtSelfAttention.__init__   s     	_
,"",F*?5*H*HhhNc-
 -
 -
) +E{J	Mb+
 +
 +
' -G{J	Mb-
 -
 -
) !#	)YX N N N i	98LLL "	)YX N N Nz"566r%   c                     |j         \  }}}| j        | j        z  }|                    ||| j        |                              dddd          S )Nr   rh   r   r   )r/   rO   r   ri   rj   )r>   rY   rk   rn   _head_dims         r&   "rearrange_for_multi_head_attentionz3CvtSelfAttention.rearrange_for_multi_head_attention   sS    %1%7"
K>T^3  [$.(SS[[\]_`bcefgggr%   c                 r   | j         rt          j        |d||z  gd          \  }}|j        \  }}}|                    ddd                              ||||          }|                     |          }|                     |          }	|                     |          }
| j         rHt          j	        ||	fd          }	t          j	        ||fd          }t          j	        ||
fd          }
| j
        | j        z  }|                     |                     |	                    }	|                     |                     |                    }|                     |                     |
                    }
t          j        d|	|g          | j        z  }t          j        j                            |d          }|                     |          }t          j        d||
g          }|j        \  }}}}|                    dddd                                                              ||| j        |z            }|S )	Nr   r   rh   dimzbhlk,bhtk->bhltzbhlt,bhtv->bhlvr   )r   r!   splitr/   rj   ri   r   r   r   catrO   r   r   r   r   r   einsumr   r   
functionalsoftmaxrU   
contiguous)r>   rY   rl   rm   	cls_tokenrk   rn   rN   keyqueryvaluer   attention_scoreattention_probscontextr   s                   r&   rA   zCvtSelfAttention.forward   s"    	X&+k,FUN@SUV&W&W#I|0<0B-
K#++Aq!4499*lTZ\abb--l;;11,??11,?? 	9Iy%0a888E)Y,!444CIy%0a888E>T^3778M8Me8T8TUU55d6I6I#6N6NOO778M8Me8T8TUU,'85#,GG$*T(-55o25NN,,77,0?E2JKK&}1k1//!Q1--88::??
KY]YgjrYrssr%   T)r   r   r   r=   r   rA   rH   rI   s   @r&   r   r      sd         '7 '7 '7 '7 '7 '7Rh h h      r%   r   c                   (     e Zd ZdZ fdZd Z xZS )CvtSelfOutputz
    The residual connection is defined in CvtLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    c                     t                                                       t          j        ||          | _        t          j        |          | _        d S r;   )r<   r=   r   r   denserT   rU   )r>   rO   	drop_rater?   s      r&   r=   zCvtSelfOutput.__init__  sA    Yy)44
z),,r%   c                 Z    |                      |          }|                     |          }|S r;   r   rU   r>   rY   input_tensors      r&   rA   zCvtSelfOutput.forward  s*    zz,//||L11r%   rZ   rI   s   @r&   r   r     sQ         
- - - - -
      r%   r   c                   .     e Zd Z	 d fd	Zd Zd Z xZS )CvtAttentionTc                     t                                                       t          |||||||||	|
|          | _        t	          ||          | _        t                      | _        d S r;   )r<   r=   r   	attentionr   r6   setpruned_heads)r>   r   rO   r]   r   r   r   r   r   r   r   r   r   r?   s                r&   r=   zCvtAttention.__init__   sr     	)!
 
 $Iy99EEr%   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   r   )lenr   r   num_attention_headsattention_head_sizer   r   r   r   r   r6   r   all_head_sizeunion)r>   headsindexs      r&   prune_headszCvtAttention.prune_heads@  s   u::??F74>5t~7Y[_[l
 
u
  2$.2FNN/0BEJJ1$.2FNN.t{/@%QOOO .2^-ORUV[R\R\-\*'+~'IDNLn'n$ -33E::r%   c                 `    |                      |||          }|                     ||          }|S r;   )r   r6   )r>   rY   rl   rm   self_outputattention_outputs         r&   rA   zCvtAttention.forwardR  s1    nn\65AA;;{LAAr%   r   )r   r   r   r=   r   rA   rH   rI   s   @r&   r   r     sa         " " " " " "@; ; ;$             r%   r   c                   $     e Zd Z fdZd Z xZS )CvtIntermediatec                     t                                                       t          j        |t	          ||z                      | _        t          j                    | _        d S r;   )r<   r=   r   r   intr   GELU
activation)r>   rO   	mlp_ratior?   s      r&   r=   zCvtIntermediate.__init__Y  sJ    Yy#i).C*D*DEE
'))r%   c                 Z    |                      |          }|                     |          }|S r;   )r   r   rw   s     r&   rA   zCvtIntermediate.forward^  s*    zz,//|44r%   rx   rI   s   @r&   r   r   X  sG        $ $ $ $ $
      r%   r   c                   $     e Zd Z fdZd Z xZS )	CvtOutputc                     t                                                       t          j        t	          ||z            |          | _        t          j        |          | _        d S r;   )r<   r=   r   r   r   r   rT   rU   )r>   rO   r   r   r?   s       r&   r=   zCvtOutput.__init__e  sN    Ys9y#8999EE
z),,r%   c                 d    |                      |          }|                     |          }||z   }|S r;   r   r   s      r&   rA   zCvtOutput.forwardj  s4    zz,//||L11#l2r%   rx   rI   s   @r&   r   r   d  sG        - - - - -
      r%   r   c                   ,     e Zd ZdZ	 d fd	Zd Z xZS )CvtLayerzb
    CvtLayer composed by attention layers, normalization and multi-layer perceptrons (mlps).
    Tc                    t                                                       t          |||||||||	|
||          | _        t	          ||          | _        t          |||          | _        |dk    rt          |          nt          j
                    | _        t          j        |          | _        t          j        |          | _        d S )Nr'   )r)   )r<   r=   r   r   r   intermediater   r6   r9   r   Identityr7   rd   layernorm_beforelayernorm_after)r>   r   rO   r]   r   r   r   r   r   r   r   r   r   drop_path_rater   r?   s                  r&   r=   zCvtLayer.__init__v  s    " 	%!
 
 ,IyAA	9i@@BPSVBVBV~>>>>\^\g\i\i "Y 7 7!|I66r%   c                 <   |                      |                     |          ||          }|}|                     |          }||z   }|                     |          }|                     |          }|                     ||          }|                     |          }|S r;   )r   r   r7   r   r   r6   )r>   rY   rl   rm   self_attention_outputr   layer_outputs          r&   rA   zCvtLayer.forward  s     $!!,//!
 !

 1>>*:;; (,6 ++L99((66 {{<>>~~l33r%   r   rZ   rI   s   @r&   r   r   q  s\         & %7 %7 %7 %7 %7 %7N      r%   r   c                   $     e Zd Z fdZd Z xZS )CvtStagec           	      "    t                                                        _        | _         j        j         j                 r=t          j        t          j        dd j        j	        d                              _        t          j         j                 j         j                  j        dk    rj        nj	         j        dz
           j	         j                 j         j                 j         j                            _        d t          j        dj         j                 j        |                   D             t          j         fdt+          j         j                           D               _        d S )Nr   r   r   )rM   rP   rN   rO   rQ   rV   c                 6    g | ]}|                                 S r$   )item).0xs     r&   
<listcomp>z%CvtStage.__init__.<locals>.<listcomp>  s     www16688wwwr%   c                     g | ]}t          j        j                 j        j                 j        j                 j        j                 j        j                 j        j                 j        j                 j	        j                 j
        j                 j        j                 j        j                 j                 j        j                 j        j                            S ))r   rO   r]   r   r   r   r   r   r   r   r   r   r   r   )r   r   stagerO   
kernel_qkvr   r   r   r   r   r   r   r   r   r   )r   r   configdrop_path_ratesr>   s     r&   r   z%CvtStage.__init__.<locals>.<listcomp>  s       " ! $.tz:$.tz: & 1$* =$.tz:%0<$.tz:#_TZ8*0*Ftz*R#_TZ8(.(B4:(N$.tz:#24:#>$.tz:#)#3DJ#?    r%   )r<   r=   r   r   r   r   	Parameterr!   randnrO   rK   patch_sizespatch_striderN   patch_paddingr   	embeddinglinspacer   depth
Sequentialrangelayers)r>   r   r   r   r?   s   `` @r&   r=   zCvtStage.__init__  sv   
; , 	X\%+aDK<QRT<U*V*VWWDN&)$*5&tz204
a,,VEUVZV`cdVdEe&tz2(4)$*5
 
 
 xwU^Av?TUYU_?`bhbnotbu-v-vwwwm     " v|DJ788#  
r%   c                 :   d }|                      |          }|j        \  }}}}|                    ||||z                                ddd          }| j        j        | j                 r4| j                            |dd          }t          j	        ||fd          }| j
        D ]} ||||          }|}| j        j        | j                 rt          j        |d||z  gd          \  }}|                    ddd                              ||||          }||fS )Nr   rh   r   r   r   )r   r/   ri   rj   r   r   r   expandr!   r   r   r   )	r>   rY   r   rk   rN   rl   rm   layerlayer_outputss	            r&   rA   zCvtStage.forward  s:   	~~l332>2D/
L&%#((\6E>RRZZ[\^_abcc; , 	G--j"bAAI 9i%>AFFFL[ 	) 	)E!E,>>M(LL; , 	X&+k,FUN@SUV&W&W#I|#++Aq!4499*lTZ\abbY&&r%   rx   rI   s   @r&   r   r     sH        &
 &
 &
 &
 &
P' ' ' ' ' ' 'r%   r   c                   &     e Zd Z fdZddZ xZS )
CvtEncoderc                     t                                                       || _        t          j        g           | _        t          t          |j                            D ]*}| j        	                    t          ||                     +d S r;   )r<   r=   r   r   
ModuleListstagesr   r   r   appendr   )r>   r   	stage_idxr?   s      r&   r=   zCvtEncoder.__init__  s    mB''s6<0011 	< 	<IKx	::;;;;	< 	<r%   FTc                     |rdnd }|}d }t          | j                  D ]\  }} ||          \  }}|r||fz   }|st          d |||fD                       S t          |||          S )Nr$   c              3      K   | ]}||V  	d S r;   r$   )r   vs     r&   	<genexpr>z%CvtEncoder.forward.<locals>.<genexpr>  s(      bbqTUTaTaTaTaTabbr%   r   r   r   )	enumerater  tupler   )	r>   rX   output_hidden_statesreturn_dictall_hidden_statesrY   r   r   stage_modules	            r&   rA   zCvtEncoder.forward  s    "6@BBD#	!*4;!7!7 	H 	HA&2l<&@&@#L)# H$5$G! 	cbb\9>O$Pbbbbbb**%+
 
 
 	
r%   )FTrx   rI   s   @r&   r   r     sL        < < < < <
 
 
 
 
 
 
 
r%   r   c                   *    e Zd ZdZeZdZdZdgZd Z	dS )CvtPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    cvtrX   r   c                    t          |t          j        t          j        f          rit          j                            |j        j        d| j        j	                  |j        _        |j
         |j
        j                                         dS dS t          |t          j                  r?|j
        j                                         |j        j                            d           dS t          |t                    rs| j        j        |j                 r^t          j                            t#          j        dd| j        j        d                   d| j        j	                  |j        _        dS dS dS )zInitialize the weightsr'   )meanstdNg      ?r   r   )r^   r   r   rb   inittrunc_normal_weightdatar   initializer_rangerr   zero_rd   fill_r   r   r   r!   zerosrO   )r>   modules     r&   _init_weightsz CvtPreTrainedModel._init_weights  sJ   fry")455 	!#!6!6v}7IPSY]YdYv!6!w!wFM{& &&((((( '&-- 	K""$$$M$$S))))))) 	{$V\2 (*(=(=K1dk&;B&?@@sPTP[Pm )> ) ) %%%	 	 r%   N)
r   r   r   r    r   config_classbase_model_prefixmain_input_name_no_split_modulesr   r$   r%   r&   r  r    sG         
 L$O#    r%   r  aE  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`CvtConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
aE  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CvtImageProcessor.__call__`]
            for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
z]The bare Cvt Model transformer outputting raw hidden-states without any specific head on top.c                        e Zd Zd fd	Zd Z ee           eee	e
de          	 	 	 ddeej                 dee         d	ee         d
eee	f         fd                        Z xZS )CvtModelTc                     t                                          |           || _        t          |          | _        |                                  d S r;   )r<   r=   r   r   encoder	post_init)r>   r   add_pooling_layerr?   s      r&   r=   zCvtModel.__init__D  sG       !&))r%   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr(  r   r   r   )r>   heads_to_pruner   r   s       r&   _prune_headszCvtModel._prune_headsJ  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr%   vision)
checkpointoutput_typer!  modalityexpected_outputNrX   r  r  r+   c                     ||n| j         j        }||n| j         j        }|t          d          |                     |||          }|d         }|s|f|dd          z   S t          ||j        |j                  S )Nz You have to specify pixel_valuesr  r  r   r   r
  )r   r  use_return_dict
ValueErrorr(  r   r   r   )r>   rX   r  r  encoder_outputssequence_outputs         r&   rA   zCvtModel.forwardR  s     %9$D  $+Jj 	 &1%<kk$+B]?@@@,,!5# ' 
 

 *!, 	<#%(;;;*-+;)7
 
 
 	
r%   r   )NNN)r   r   r   r=   r.  r   CVT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   r!   rF   boolr   r   rA   rH   rI   s   @r&   r&  r&  ?  s        
     C C C +*+?@@&/$.   04/3&*	
 
u|,
 'tn
 d^	

 
u11	2
 
 
  A@
 
 
 
 
r%   r&  z
    Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    c                        e Zd Z fdZ ee           eeee	e
          	 	 	 	 d
deej                 deej                 dee         dee         deeef         f
d	                        Z xZS )CvtForImageClassificationc                    t                                          |           |j        | _        t          |d          | _        t          j        |j        d                   | _        |j        dk    r%t          j	        |j        d         |j                  nt          j
                    | _        |                                  d S )NF)r*  r   r   )r<   r=   
num_labelsr&  r  r   rd   rO   	layernormr   r   
classifierr)  )r>   r   r?   s     r&   r=   z"CvtForImageClassification.__init__  s        +Fe<<<f&6r&:;; CIBSVWBWBWBIf&r*F,=>>>]_]h]j]j 	
 	r%   )r0  r1  r!  r3  NrX   labelsr  r  r+   c                     ||n| j         j        }|                     |||          }|d         }|d         }| j         j        d         r|                     |          }nP|j        \  }}	}
}|                    ||	|
|z                                ddd          }|                     |          }|                    d          }| 	                    |          }d}|n| j         j
        p| j         j        dk    rd| j         _
        nS| j         j        dk    r7|j        t          j        k    s|j        t          j        k    rd	| j         _
        nd
| j         _
        | j         j
        dk    r\t!                      }| j         j        dk    r1 ||                                |                                          }n |||          }n| j         j
        d	k    rLt%                      } ||                    d| j         j                  |                    d                    }n*| j         j
        d
k    rt'                      } |||          }|s|f|dd         z   }||f|z   n|S t)          |||j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr5  r   r   r   rh   r   
regressionsingle_label_classificationmulti_label_classification)losslogitsr   )r   r6  r  r   rC  r/   ri   rj   r  rD  problem_typerB  r-   r!   longr   r
   squeezer	   r   r   r   )r>   rX   rE  r  r  outputsr9  r   rk   rN   rl   rm   sequence_output_meanrK  rJ  loss_fctr6   s                    r&   rA   z!CvtForImageClassification.forward  s   ( &1%<kk$+B]((!5#  
 
 "!*AJ	; $ 	>"nnY77OO6E6K3Jfe-22:|VV[^\\ddefhiklmmO"nn_==O.333::!566{'/;)Q../;DK,,[+a//V\UZ5O5OSYS_chclSlSl/LDK,,/KDK,{'<77"99;)Q..#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB0F G GUWYY)-III,..x// 	FY,F)-)9TGf$$vE3f\c\qrrrrr%   )NNNN)r   r   r   r=   r   r:  r   _IMAGE_CLASS_CHECKPOINTr   r<  _IMAGE_CLASS_EXPECTED_OUTPUTr   r!   rF   r>  r   r   rA   rH   rI   s   @r&   r@  r@  y  s            +*+?@@*8$4	   04)-/3&*<s <su|,<s &<s 'tn	<s
 d^<s 
u::	;<s <s <s  A@<s <s <s <s <sr%   r@  )r'   F)@r    collections.abcr_   dataclassesr   typingr   r   r   r!   torch.utils.checkpointr   torch.nnr   r	   r
   
file_utilsr   r   r   modeling_outputsr   r   modeling_utilsr   r   r   utilsr   configuration_cvtr   
get_loggerr   loggerr<  r;  r=  rR  rS  r   rF   rE   r>  r7   Moduler9   rK   rR   rp   rz   r}   r   r   r   r   r   r   r   r   r  CVT_START_DOCSTRINGr:  r&  r@  r$   r%   r&   <module>rb     s'         ! ! ! ! ! ! ) ) ) ) ) ) ) ) ) )            A A A A A A A A A A q q q q q q q q q q Q Q Q Q Q Q Q Q c c c c c c c c c c       ( ( ( ( ( ( 
	H	%	%  ) )))  - 1  B B B B B+ B B B* U\ e T V[Vb    *- - - - -") - - -    BI   $    	   2    RY   (    ry   
 
 
 
 
 
 
 
N N N N Nry N N Nb    BI   "6  6  6  6  6 29 6  6  6 r	 	 	 	 	bi 	 	 	
 
 
 
 
	 
 
 
? ? ? ? ?ry ? ? ?D:' :' :' :' :'ry :' :' :'z
 
 
 
 
 
 
 
8       6	 
  c 3
 3
 3
 3
 3
! 3
 3
	 3
l   Rs Rs Rs Rs Rs 2 Rs Rs Rs Rs Rsr%   