
    gY                        d Z ddlZddlZddlmZmZmZmZm	Z	m
Z
 ddlZddlZddlmZ ddlmZmZmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&  e"j'        e(          Z)dZ*dZ+g dZ,dZ-dZ. G d dej/                  Z0 G d dej/                  Z1 G d dej/                  Z2 G d de2          Z3 G d dej/                  Z4 G d dej/                  Z5 G d d e5          Z6 G d! d"ej/                  Z7 G d# d$ej/                  Z8e5e6d%Z9 G d& d'ej/                  Z: G d( d)ej/                  Z; G d* d+e          Z<d,Z=d-Z> e d.e=           G d/ d0e<                      Z? G d1 d2ej/                  Z@ e d3e=           G d4 d5e<                      ZA e d6e=           G d7 d8e<                      ZBdS )9zPyTorch ViT model.    N)DictListOptionalSetTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputMaskedImageModelingOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings	torch_int   )	ViTConfigr   z!google/vit-base-patch16-224-in21k)r      i   zgoogle/vit-base-patch16-224zEgyptian catc            	            e Zd ZdZddededdf fdZdej        d	e	d
e	dej        fdZ
	 	 ddej        deej                 dedej        fdZ xZS )ViTEmbeddingszb
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    Fconfiguse_mask_tokenreturnNc                 $   t                                                       t          j        t	          j        dd|j                            | _        |r-t          j        t	          j        dd|j                            nd | _	        t          |          | _        | j        j        }t          j        t	          j        d|dz   |j                            | _        t          j        |j                  | _        |j        | _        || _        d S )Nr   )super__init__r	   	Parametertorchrandnhidden_size	cls_tokenzeros
mask_tokenViTPatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropout
patch_sizer!   )selfr!   r"   r0   	__class__s       `/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/vit/modeling_vit.pyr&   zViTEmbeddings.__init__A   s    ek!Q8J&K&KLLQ_i",u{1a9K'L'LMMMei 26 : :+7#%<A{QPVPb0c0c#d#d z&"<== +    
embeddingsheightwidthc                    |j         d         dz
  }| j        j         d         dz
  }t          j                                        s||k    r||k    r| j        S | j        ddddf         }| j        ddddf         }|j         d         }|| j        z  }	|| j        z  }
t          |dz            }|                    d|||          }|                    dddd          }t          j
                            ||	|
fdd	
          }|                    dddd                              dd|          }t          j        ||fd          S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   N      ?r   r      bicubicF)sizemodealign_cornersdim)shaper1   r(   jit
is_tracingr5   r   reshapepermuter	   
functionalinterpolateviewcat)r6   r:   r;   r<   r0   num_positionsclass_pos_embedpatch_pos_embedrF   
new_height	new_widthsqrt_num_positionss               r8   interpolate_pos_encodingz&ViTEmbeddings.interpolate_pos_encodingM   sr    !&q)A-06q9A= y##%% 	,+*F*F6UZ??++2111bqb592111abb59r"t.
T_,	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNy/?;CCCCr9   pixel_valuesbool_masked_posrV   c                    |j         \  }}}}|                     ||          }|_|j         d         }	| j                            ||	d          }
|                    d                              |
          }|d|z
  z  |
|z  z   }| j                            |dd          }t          j        ||fd          }|r|| 	                    |||          z   }n
|| j
        z   }|                     |          }|S )N)rV   r   r>         ?rE   )rG   r/   r-   expand	unsqueezetype_asr+   r(   rO   rV   r1   r4   )r6   rW   rX   rV   
batch_sizenum_channelsr;   r<   r:   
seq_lengthmask_tokensmask
cls_tokenss                r8   forwardzViTEmbeddings.forwardu   s    3?2D/
L&%**<Rj*kk
&#)!,J/00ZLLK",,R0088EED#sTz2[45GGJ ^**:r2>>
Y
J7Q???
 $ 	?#d&C&CJPVX]&^&^^JJ#d&>>J\\*--
r9   FNF)__name__
__module____qualname____doc__r   boolr&   r(   TensorintrV   r   
BoolTensorrd   __classcell__r7   s   @r8   r    r    <   s         
 
y 
$ 
4 
 
 
 
 
 
&D5< &D &DUX &D]b]i &D &D &D &DV 7;).	 l "%"23 #'	
 
       r9   r    c                   L     e Zd ZdZ fdZddej        dedej        fdZ xZ	S )	r.   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t                                                       |j        |j        }}|j        |j        }}t          |t          j        j	                  r|n||f}t          |t          j        j	                  r|n||f}|d         |d         z  |d         |d         z  z  }|| _        || _        || _        || _
        t          j        ||||          | _        d S )Nr   r   )kernel_sizestride)r%   r&   
image_sizer5   r_   r*   
isinstancecollectionsabcIterabler0   r	   Conv2d
projection)r6   r!   ru   r5   r_   r*   r0   r7   s          r8   r&   zViTPatchEmbeddings.__init__   s    !'!2F4EJ
$*$79Kk#-j+/:R#S#SqZZZdfpYq
#-j+/:R#S#SqZZZdfpYq
!!}
15*Q-:VW=:XY$$(&)L+:^hiiir9   FrW   rV   r#   c                    |j         \  }}}}|| j        k    rt          d| j         d| d          |sT|| j        d         k    s|| j        d         k    r2t          d| d| d| j        d          d| j        d          d		          |                     |                              d
                              dd
          }|S )NzoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r   r   zInput image size (*z) doesn't match model (z).r@   )rG   r_   
ValueErrorru   r{   flatten	transpose)r6   rW   rV   r^   r_   r;   r<   r:   s           r8   rd   zViTPatchEmbeddings.forward   s*   2>2D/
L&%4,,,I!.I I9EI I I   ( 	+++u8J/J/J E E E% E E+E E.2oa.@E E E   __\22::1==GG1MM
r9   re   )
rg   rh   ri   rj   r&   r(   rl   rk   rd   ro   rp   s   @r8   r.   r.      s{         j j j j j EL D ]b]i        r9   r.   c            
            e Zd Zdeddf fdZdej        dej        fdZ	 ddeej                 d	e	de
eej        ej        f         eej                 f         fd
Z xZS )ViTSelfAttentionr!   r#   Nc                    t                                                       |j        |j        z  dk    r1t	          |d          s!t          d|j        f d|j         d          |j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t          j
        |j        | j        |j                  | _        t          j
        |j        | j        |j                  | _        t          j
        |j        | j        |j                  | _        t          j        |j                  | _        d S )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads r}   )bias)r%   r&   r*   num_attention_headshasattrr   rm   attention_head_sizeall_head_sizer	   Linearqkv_biasquerykeyvaluer2   attention_probs_dropout_probr4   r6   r!   r7   s     r8   r&   zViTSelfAttention.__init__   s1    ::a??PVXhHiHi?76#5"7 7 737 7 7  
 $*#= #&v'9F<V'V#W#W !58PPYv143EFO\\\
9V/1C&/ZZZYv143EFO\\\
z&"EFFr9   xc                     |                                 d d         | j        | j        fz   }|                    |          }|                    dddd          S )Nr>   r   r@   r   r   )rB   r   r   rN   rK   )r6   r   new_x_shapes      r8   transpose_for_scoresz%ViTSelfAttention.transpose_for_scores   sP    ffhhssmt'?AY&ZZFF;yyAq!$$$r9   F	head_maskoutput_attentionsc                    |                      |          }|                     |                     |                    }|                     |                     |                    }|                     |          }t	          j        ||                    dd                    }|t          j        | j	                  z  }t          j                            |d          }	|                     |	          }	||	|z  }	t	          j        |	|          }
|
                    dddd                                          }
|
                                d d         | j        fz   }|
                    |          }
|r|
|	fn|
f}|S )Nr>   rE   r   r@   r   r   )r   r   r   r   r(   matmulr   mathsqrtr   r	   rL   softmaxr4   rK   
contiguousrB   r   rN   )r6   hidden_statesr   r   mixed_query_layer	key_layervalue_layerquery_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                r8   rd   zViTSelfAttention.forward   sr    !JJ}55--dhh}.E.EFF	//

=0I0IJJ//0ABB !<Y5H5HR5P5PQQ+di8P.Q.QQ -//0@b/II ,,77  -	9O_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S%**+BCC6G]=/22mM]r9   rf   )rg   rh   ri   r   r&   r(   rl   r   r   rk   r   r   rd   ro   rp   s   @r8   r   r      s        Gy GT G G G G G G$%el %u| % % % % bg! !(0(>!Z^!	uU\5</0%2EE	F! ! ! ! ! ! ! !r9   r   c                        e Zd Zdeddf fdZ	 	 d
dej        deej                 de	de
eej        ej        f         eej                 f         f fd	Z xZS )ViTSdpaSelfAttentionr!   r#   Nc                 b    t                                          |           |j        | _        d S N)r%   r&   r   r   s     r8   r&   zViTSdpaSelfAttention.__init__   s,       ,2,O)))r9   Fr   r   r   c           	         |s|>t                               d           t                                          |||          S |                     |          }|                     |                     |                    }|                     |                     |                    }|                     |          }t          j	        j
                            ||||| j        r| j        nddd           }|                    dddd	                                          }|                                d d
         | j        fz   }	|                    |	          }|d fS )Na  `ViTSdpaAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.)r   r   r           F)	is_causalscaler   r@   r   r   r   )loggerwarning_oncer%   rd   r   r   r   r   r(   r	   rL   scaled_dot_product_attentiontrainingr   rK   r   rB   r   rN   )r6   r   r   r   r   r   r   r   r   r   r7   s             r8   rd   zViTSdpaSelfAttention.forward   sp     		 5w   77??+#"3 #    !JJ}55--dhh}.E.EFF	//

=0I0IJJ//0ABB+HH15GD--C I 
 
 &--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S%**+BCCd""r9   rf   )rg   rh   ri   r   r&   r(   FloatTensorr   rl   rk   r   r   rd   ro   rp   s   @r8   r   r      s        Py PT P P P P P P -1"'	'# '#('# EL)'#  	'#
 
uU\5</0%2EE	F'# '# '# '# '# '# '# '# '# '#r9   r   c                   ^     e Zd ZdZdeddf fdZdej        dej        dej        fdZ xZ	S )	ViTSelfOutputz
    The residual connection is defined in ViTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r!   r#   Nc                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _        d S r   )	r%   r&   r	   r   r*   denser2   r3   r4   r   s     r8   r&   zViTSelfOutput.__init__,  sJ    Yv163EFF
z&"<==r9   r   input_tensorc                 Z    |                      |          }|                     |          }|S r   r   r4   r6   r   r   s      r8   rd   zViTSelfOutput.forward1  s*    

=11]33r9   )
rg   rh   ri   rj   r   r&   r(   rl   rd   ro   rp   s   @r8   r   r   &  s         
>y >T > > > > > >
U\  RWR^        r9   r   c                        e Zd Zdeddf fdZdee         ddfdZ	 	 ddej	        d	e
ej	                 d
edeeej	        ej	        f         eej	                 f         fdZ xZS )ViTAttentionr!   r#   Nc                     t                                                       t          |          | _        t	          |          | _        t                      | _        d S r   )r%   r&   r   	attentionr   outputsetpruned_headsr   s     r8   r&   zViTAttention.__init__9  sI    )&11#F++EEr9   headsc                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   rE   )lenr   r   r   r   r   r   r   r   r   r   r   r   union)r6   r   indexs      r8   prune_headszViTAttention.prune_heads?  s   u::??F74>5t~7Y[_[l
 
u
  2$.2FNN/0BEJJ1$.2FNN.t{/@%QOOO .2^-ORUV[R\R\-\*'+~'IDNLn'n$ -33E::r9   Fr   r   r   c                     |                      |||          }|                     |d         |          }|f|dd          z   }|S )Nr   r   )r   r   )r6   r   r   r   self_outputsattention_outputr   s          r8   rd   zViTAttention.forwardQ  sM     ~~mY@QRR;;|AFF#%QRR(88r9   rf   )rg   rh   ri   r   r&   r   rm   r   r(   rl   r   rk   r   r   rd   ro   rp   s   @r8   r   r   8  s        "y "T " " " " " ";S ;d ; ; ; ;* -1"'	 | EL)  	
 
uU\5</0%2EE	F       r9   r   c                   (     e Zd Zdeddf fdZ xZS )ViTSdpaAttentionr!   r#   Nc                 r    t                                          |           t          |          | _        d S r   )r%   r&   r   r   r   s     r8   r&   zViTSdpaAttention.__init__`  s.       -f55r9   )rg   rh   ri   r   r&   ro   rp   s   @r8   r   r   _  sK        6y 6T 6 6 6 6 6 6 6 6 6 6r9   r   c                   L     e Zd Zdeddf fdZdej        dej        fdZ xZS )ViTIntermediater!   r#   Nc                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r   )r%   r&   r	   r   r*   intermediate_sizer   rv   
hidden_actstrr   intermediate_act_fnr   s     r8   r&   zViTIntermediate.__init__f  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r9   r   c                 Z    |                      |          }|                     |          }|S r   )r   r   )r6   r   s     r8   rd   zViTIntermediate.forwardn  s,    

=1100??r9   	rg   rh   ri   r   r&   r(   rl   rd   ro   rp   s   @r8   r   r   e  sq        9y 9T 9 9 9 9 9 9U\ el        r9   r   c                   Z     e Zd Zdeddf fdZdej        dej        dej        fdZ xZS )	ViTOutputr!   r#   Nc                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _	        d S r   )
r%   r&   r	   r   r   r*   r   r2   r3   r4   r   s     r8   r&   zViTOutput.__init__v  sJ    Yv79KLL
z&"<==r9   r   r   c                 d    |                      |          }|                     |          }||z   }|S r   r   r   s      r8   rd   zViTOutput.forward{  s4    

=11]33%4r9   r   rp   s   @r8   r   r   u  s|        >y >T > > > > > >
U\  RWR^        r9   r   )eagersdpac                        e Zd ZdZdeddf fdZ	 	 ddej        deej                 d	e	de
eej        ej        f         eej                 f         fd
Z xZS )ViTLayerz?This corresponds to the Block class in the timm implementation.r!   r#   Nc                    t                                                       |j        | _        d| _        t	          |j                 |          | _        t          |          | _        t          |          | _
        t          j        |j        |j                  | _        t          j        |j        |j                  | _        d S )Nr   eps)r%   r&   chunk_size_feed_forwardseq_len_dimVIT_ATTENTION_CLASSES_attn_implementationr   r   intermediater   r   r	   	LayerNormr*   layer_norm_epslayernorm_beforelayernorm_afterr   s     r8   r&   zViTLayer.__init__  s    '-'E$.v/JKFSS+F33'' "V-?VEZ [ [ [!|F,>FDYZZZr9   Fr   r   r   c                    |                      |                     |          ||          }|d         }|dd          }||z   }|                     |          }|                     |          }|                     ||          }|f|z   }|S )N)r   r   r   )r   r   r   r   r   )r6   r   r   r   self_attention_outputsr   r   layer_outputs           r8   rd   zViTLayer.forward  s     "&!!-00/ "0 "
 "

 2!4(, )=8 ++M::((66 {{<??/G+r9   rf   )rg   rh   ri   rj   r   r&   r(   rl   r   rk   r   r   rd   ro   rp   s   @r8   r   r     s        II[y [T [ [ [ [ [ [ -1"'	 | EL)  	
 
uU\5</0%2EE	F       r9   r   c                        e Zd Zdeddf fdZ	 	 	 	 ddej        deej                 d	ed
edede	e
ef         fdZ xZS )
ViTEncoderr!   r#   Nc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S  )r   ).0_r!   s     r8   
<listcomp>z'ViTEncoder.__init__.<locals>.<listcomp>  s!    #^#^#^HV$4$4#^#^#^r9   F)	r%   r&   r!   r	   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   s    `r8   r&   zViTEncoder.__init__  s`    ]#^#^#^#^eFD\>]>]#^#^#^__
&+###r9   FTr   r   r   output_hidden_statesreturn_dictc                    |rdnd }|rdnd }t          | j                  D ]h\  }}	|r||fz   }|||         nd }
| j        r%| j        r|                     |	j        ||
|          }n |	||
|          }|d         }|r||d         fz   }i|r||fz   }|st          d |||fD                       S t          |||          S )Nr   r   r   c              3      K   | ]}||V  	d S r   r   )r   vs     r8   	<genexpr>z%ViTEncoder.forward.<locals>.<genexpr>  s(      mmq_`_l_l_l_l_lmmr9   )last_hidden_stater   
attentions)	enumerater   r   r   _gradient_checkpointing_func__call__tupler   )r6   r   r   r   r   r   all_hidden_statesall_self_attentionsilayer_modulelayer_head_masklayer_outputss               r8   rd   zViTEncoder.forward  sI    #7@BBD$5?bb4(44 	P 	POA|# I$58H$H!.7.CillO* `t} ` $ A A )!#%	! ! !-]OM^ _ _)!,M  P&9]1=M<O&O# 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r9   )NFFT)rg   rh   ri   r   r&   r(   rl   r   rk   r   r	  r   rd   ro   rp   s   @r8   r   r     s        ,y ,T , , , , , , -1"'%* )
 )
|)
 EL))
  	)

 #)
 )
 
uo%	&)
 )
 )
 )
 )
 )
 )
 )
r9   r   c                   n    e Zd ZdZeZdZdZdZddgZ	dZ
deej        ej        ej        f         dd	fd
Zd	S )ViTPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    vitrW   Tr    r   moduler#   Nc                 `   t          |t          j        t          j        f          rt          j                            |j        j                            t          j
                  d| j        j                                      |j        j                  |j        _        |j         |j        j                                         dS dS t          |t          j                  r?|j        j                                         |j        j                            d           dS t          |t$                    rt          j                            |j        j                            t          j
                  d| j        j                                      |j        j                  |j        _        t          j                            |j        j                            t          j
                  d| j        j                                      |j        j                  |j        _        dS dS )zInitialize the weightsr   )meanstdNrZ   )rv   r	   r   rz   inittrunc_normal_weightdatator(   float32r!   initializer_rangedtyper   zero_r   fill_r    r1   r+   )r6   r  s     r8   _init_weightsz ViTPreTrainedModel._init_weights  s   fry")455 	) "$!6!6"%%em443DKDa "7 " "b$%% M {& &&((((( '&-- 	)K""$$$M$$S))))).. 	).0g.C.C*/225=AAK1 /D / / b+122	 &+ %'G$9$9 %((77K1 %: % % b!'((	 !!!	) 	)r9   )rg   rh   ri   rj   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpar   r	   r   rz   r   r!  r   r9   r8   r  r    s{         
 L$O&*#(*5N)E")RY*L$M )RV ) ) ) ) ) )r9   r  aE  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`ViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
            for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        interpolate_pos_encoding (`bool`, *optional*):
            Whether to interpolate the pre-trained position encodings.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z]The bare ViT Model transformer outputting raw hidden-states without any specific head on top.c                   n    e Zd Zddededef fdZdefdZd	ee	e
e	         f         dd
fdZ ee           eeeede          	 	 	 	 	 	 	 ddeej                 deej                 deej                 dee         dee         dee         dee         deeef         fd                        Z xZS )ViTModelTFr!   add_pooling_layerr"   c                 N   t                                          |           || _        t          ||          | _        t          |          | _        t          j        |j	        |j
                  | _        |rt          |          nd | _        |                                  d S )N)r"   r   )r%   r&   r!   r    r:   r   encoderr	   r   r*   r   	layernorm	ViTPoolerpooler	post_init)r6   r!   r*  r"   r7   s       r8   r&   zViTModel.__init__8  s       '~NNN!&))f&8f>STTT+<Fi'''$ 	r9   r#   c                     | j         j        S r   )r:   r/   )r6   s    r8   get_input_embeddingszViTModel.get_input_embeddingsE  s    //r9   heads_to_pruneNc                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr,  r   r   r   )r6   r3  r   r   s       r8   _prune_headszViTModel._prune_headsH  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr9   vision)
checkpointoutput_typer"  modalityexpected_outputrW   rX   r   r   r   rV   r   c                 ~   ||n| j         j        }||n| j         j        }||n| j         j        }|t	          d          |                     || j         j                  }| j        j        j	        j
        j        }|j        |k    r|                    |          }|                     |||          }	|                     |	||||          }
|
d         }|                     |          }| j        |                     |          nd}|s|||fn|f}||
dd         z   S t!          |||
j        |
j                  S )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)rX   rV   )r   r   r   r   r   r   )r  pooler_outputr   r  )r!   r   r   use_return_dictr   get_head_maskr   r:   r/   r{   r  r  r  r,  r-  r/  r   r   r  )r6   rW   rX   r   r   r   rV   r   expected_dtypeembedding_outputencoder_outputssequence_outputpooled_outputhead_outputss                 r8   rd   zViTModel.forwardP  s   , 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]?@@@ &&y$+2OPP	 9DKQ//'??>::L??/Tl + 
 
 ,,/!5# ' 
 
 *!,..998<8OO444UY 	6?L?XO];;_n^pL/!"""555)-')7&1	
 
 
 	
r9   )TFNNNNNNN)rg   rh   ri   r   rk   r&   r.   r2  r   rm   r   r6  r   VIT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   r(   rl   rn   r   r   rd   ro   rp   s   @r8   r)  r)  3  s       
 y T Z^      0&8 0 0 0 0C4T#Y+? CD C C C C +*+?@@&.$.   046:,0,0/337&*;
 ;
u|,;
 "%"23;
 EL)	;

 $D>;
 'tn;
 #+4.;
 d^;
 
u00	1;
 ;
 ;
  A@;
 ;
 ;
 ;
 ;
r9   r)  c                   *     e Zd Zdef fdZd Z xZS )r.  r!   c                     t                                                       t          j        |j        |j                  | _        t          j                    | _        d S r   )r%   r&   r	   r   r*   r   Tanh
activationr   s     r8   r&   zViTPooler.__init__  sC    Yv163EFF
'))r9   c                 r    |d d df         }|                      |          }|                     |          }|S )Nr   )r   rN  )r6   r   first_token_tensorrD  s       r8   rd   zViTPooler.forward  s@     +111a40

#56666r9   )rg   rh   ri   r   r&   rd   ro   rp   s   @r8   r.  r.    sS        $y $ $ $ $ $ $
      r9   r.  aV  ViT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    c                   *    e Zd Zdeddf fdZ ee           eee	          	 	 	 	 	 	 	 dde
ej                 de
ej                 de
ej                 d	e
e         d
e
e         de
e         de
e         deeef         fd                        Z xZS )ViTForMaskedImageModelingr!   r#   Nc                 V   t                                          |           t          |dd          | _        t	          j        t	          j        |j        |j        dz  |j	        z  d          t	          j
        |j                            | _        |                                  d S )NFT)r*  r"   r@   r   )in_channelsout_channelsrs   )r%   r&   r)  r  r	   
Sequentialrz   r*   encoder_strider_   PixelShuffledecoderr0  r   s     r8   r&   z"ViTForMaskedImageModeling.__init__  s       FeDQQQ}I".#2A58KK  
 OF122
 
 	r9   )r9  r"  rW   rX   r   r   r   rV   r   c           	      6   ||n| j         j        }|D| j         j        | j         j        k    r*t	          d| j         j         d| j         j         d          |                     |||||||          }|d         }	|	ddddf         }	|	j        \  }
}}t          j        |dz            x}}|		                    dd	d          
                    |
|||          }	|                     |	          }d}|| j         j        | j         j        z  }|
                    d
||          }|                    | j         j        d                              | j         j        d	                              d                                          }t           j                            ||d          }||z                                  |                                dz   z  | j         j        z  }|s|f|dd         z   }||f|z   n|S t+          |||j        |j                  S )a=  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, ViTForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
        >>> model = ViTForMaskedImageModeling.from_pretrained("google/vit-base-patch16-224-in21k")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 224, 224]
        ```NzWhen `bool_masked_pos` is provided, `patch_size` must be equal to `encoder_stride` to ensure that the reconstructed image has the same dimensions as the input. Got `patch_size` = z and `encoder_stride` = r}   )rX   r   r   r   rV   r   r   r   r?   r@   r>   none)	reductiongh㈵>)lossreconstructionr   r  )r!   r>  r5   rW  r   r  rG   r   floorrK   rJ   rY  ru   repeat_interleaver\   r   r	   rL   l1_losssumr_   r   r   r  )r6   rW   rX   r   r   r   rV   r   r   rC  r^   sequence_lengthr_   r;   r<   reconstructed_pixel_valuesmasked_im_lossrB   rb   reconstruction_lossr   s                        r8   rd   z!ViTForMaskedImageModeling.forward  sf   R &1%<kk$+B]&DK,BdkF`,`,`t&*k&<t tVZVaVpt t t   ((+/!5%=#  
 
 "!* *!!!QRR%04C4I1
O\OS$8999)11!Q::BB:|]cejkk &*\\/%B%B"&;)T[-CCD-55b$EEO11$+2H!LL""4;#91==1	  #%-"7"7F`lr"7"s"s1D8==??488::PTCTUX\XcXppN 	Z02WQRR[@F3A3M^%..SYY(5!/)	
 
 
 	
r9   rF  )rg   rh   ri   r   r&   r   rG  r   r   rI  r   r(   rl   rn   rk   r   r	  rd   ro   rp   s   @r8   rR  rR    sC       y T      " +*+?@@+DSbccc 046:,0,0/337&*[
 [
u|,[
 "%"23[
 EL)	[

 $D>[
 'tn[
 #+4.[
 d^[
 
u//	0[
 [
 [
 dc A@[
 [
 [
 [
 [
r9   rR  a  
    ViT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune ViT on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    c                   .    e Zd Zdeddf fdZ ee           eee	e
e          	 	 	 	 	 	 	 ddeej                 deej                 deej                 d	ee         d
ee         dee         dee         deee	f         fd                        Z xZS )ViTForImageClassificationr!   r#   Nc                 :   t                                          |           |j        | _        t          |d          | _        |j        dk    rt          j        |j        |j                  nt          j                    | _	        | 
                                 d S )NF)r*  r   )r%   r&   
num_labelsr)  r  r	   r   r*   Identity
classifierr0  r   s     r8   r&   z"ViTForImageClassification.__init__3  s        +Fe<<< OUN_bcNcNc")F$68IJJJikitiviv 	r9   )r8  r9  r"  r;  rW   r   labelsr   r   rV   r   c                    ||n| j         j        }|                     ||||||          }|d         }	|                     |	dddddf                   }
d}|t|                    |
j                  }| j         j        f| j        dk    rd| j         _        nN| j        dk    r7|j        t          j
        k    s|j        t          j        k    rd| j         _        nd| j         _        | j         j        dk    rWt                      }| j        dk    r1 ||
                                |                                          }n ||
|          }n| j         j        dk    rGt                      } ||
                    d| j                  |                    d                    }n*| j         j        dk    rt!                      } ||
|          }|s|
f|dd         z   }||f|z   n|S t#          ||
|j        |j        	          S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r   r   rV   r   r   r   
regressionsingle_label_classificationmulti_label_classificationr>   )r]  logitsr   r  )r!   r>  r  rl  r  deviceproblem_typerj  r  r(   longrm   r   squeezer   rN   r
   r   r   r  )r6   rW   r   rm  r   r   rV   r   r   rC  rr  r]  loss_fctr   s                 r8   rd   z!ViTForImageClassification.forward?  s(   . &1%<kk$+B]((/!5%=#  
 
 "!*Aqqq!9::YYv}--F{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FY,F)-)9TGf$$vE$!/)	
 
 
 	
r9   rF  )rg   rh   ri   r   r&   r   rG  r   _IMAGE_CLASS_CHECKPOINTr   rI  _IMAGE_CLASS_EXPECTED_OUTPUTr   r(   rl   rk   r   r	  rd   ro   rp   s   @r8   rh  rh  #  sO        
y 
T 
 
 
 
 
 
 +*+?@@*)$4	   04,0)-,0/337&*A
 A
u|,A
 EL)A
 &	A

 $D>A
 'tnA
 #+4.A
 d^A
 
u++	,A
 A
 A
  A@A
 A
 A
 A
 A
r9   rh  )Crj   collections.abcrw   r   typingr   r   r   r   r   r   r(   torch.utils.checkpointr	   torch.nnr
   r   r   activationsr   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   r   r   r   configuration_vitr   
get_loggerrg   r   rI  rH  rJ  rx  ry  Moduler    r.   r   r   r   r   r   r   r   r   r   r   r  VIT_START_DOCSTRINGrG  r)  r.  rR  rh  r   r9   r8   <module>r     s1          : : : : : : : : : : : : : : : :            A A A A A A A A A A ! ! ! ! ! !            . - - - - - Q Q Q Q Q Q Q Q                ) ( ( ( ( ( 
	H	%	%  : &  8 - U U U U UBI U U Up$ $ $ $ $ $ $ $N9 9 9 9 9ry 9 9 9x,# ,# ,# ,# ,#+ ,# ,# ,#^    BI   $$ $ $ $ $29 $ $ $N6 6 6 6 6| 6 6 6    bi        	       ' ' ' ' 'ry ' ' 'T0
 0
 0
 0
 0
 0
 0
 0
f%) %) %) %) %) %) %) %)P	  2 c \
 \
 \
 \
 \
! \
 \
	 \
~    	      o
 o
 o
 o
 o
 2 o
 o
 o
d   U
 U
 U
 U
 U
 2 U
 U
 U
 U
 U
r9   