
    gͨ                     $   d Z ddlZddlZddlmZ ddlmZmZm	Z	m
Z
 ddlZddlZddlmZ ddlmZmZmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'  e#j(        e)          Z*dZ+dZ,g dZ-dZ.dZ/ G d dej0                  Z1 G d dej0                  Z2 G d dej0                  Z3 G d de3          Z4 G d dej0                  Z5 G d dej0                  Z6 G d d e6          Z7 G d! d"ej0                  Z8 G d# d$ej0                  Z9e6e7d%Z: G d& d'ej0                  Z; G d( d)ej0                  Z< G d* d+e          Z=d,Z>d-Z? e!d.e>           G d/ d0e=                      Z@ G d1 d2ej0                  ZA e!d3e>           G d4 d5e=                      ZB e!d6e>           G d7 d8e=                      ZCe G d9 d:e                      ZD e!d;e>           G d< d=e=                      ZEdS )>zPyTorch DeiT model.    N)	dataclass)OptionalSetTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputMaskedImageModelingOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputadd_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings	torch_int   )
DeiTConfigr   z(facebook/deit-base-distilled-patch16-224)r      i   ztabby, tabby catc            	            e Zd ZdZddededdf fdZdej        d	e	d
e	dej        fdZ
	 	 ddej        deej                 dedej        fdZ xZS )DeiTEmbeddingszv
    Construct the CLS token, distillation token, position and patch embeddings. Optionally, also the mask token.
    Fconfiguse_mask_tokenreturnNc                 z   t                                                       t          j        t	          j        dd|j                            | _        t          j        t	          j        dd|j                            | _        |r-t          j        t	          j        dd|j                            nd | _	        t          |          | _        | j        j        }t          j        t	          j        d|dz   |j                            | _        t          j        |j                  | _        |j        | _        d S )Nr      )super__init__r   	Parametertorchzeroshidden_size	cls_tokendistillation_token
mask_tokenDeiTPatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropout
patch_size)selfr!   r"   r1   	__class__s       b/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/deit/modeling_deit.pyr'   zDeiTEmbeddings.__init__C   s    ek!Q8J&K&KLL"$,u{1aAS/T/T"U"UQ_i",u{1a9K'L'LMMMei 3F ; ;+7#%<A{QPVPb0c0c#d#d z&"<== +    
embeddingsheightwidthc                    |j         d         dz
  }| j        j         d         dz
  }t          j                                        s||k    r||k    r| j        S | j        ddddf         }| j        ddddf         }|j         d         }|| j        z  }	|| j        z  }
t          |dz            }|                    d|||          }|                    dddd          }t          j
                            ||	|
fdd	
          }|                    dddd                              dd|          }t          j        ||fd          S )a  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and 2 class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r%   N      ?r   r   bicubicF)sizemodealign_cornersdim)shaper2   r)   jit
is_tracingr6   r   reshapepermuter   
functionalinterpolateviewcat)r7   r;   r<   r=   r1   num_positionsclass_and_dist_pos_embedpatch_pos_embedrF   
new_height	new_widthsqrt_num_positionss               r9   interpolate_pos_encodingz'DeiTEmbeddings.interpolate_pos_encodingO   st    !&q)A-06q9A= y##%% 	,+*F*F6UZ??++#'#;AAArrE#B 2111abb59r"t.
T_,	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNy2OD!LLLLr:   pixel_valuesbool_masked_posrV   c                 6   |j         \  }}}}|                     |          }|                                \  }}	}|R| j                            ||	d          }
|                    d                              |
          }|d|z
  z  |
|z  z   }| j                            |dd          }| j                            |dd          }t          j
        |||fd          }| j        }|r|                     |||          }||z   }|                     |          }|S )Nr?         ?r   rE   )rG   r0   rB   r.   expand	unsqueezetype_asr,   r-   r)   rO   r2   rV   r5   )r7   rW   rX   rV   _r<   r=   r;   
batch_size
seq_lengthmask_tokensmask
cls_tokensdistillation_tokensposition_embeddings                  r9   forwardzDeiTEmbeddings.forwardw   s4    +01fe**<88
$.OO$5$5!
J&/00ZLLK",,R0088EED#sTz2[45GGJ^**:r2>>
"5<<ZRPPY
,?LRSTTT
!5# 	Z!%!>!>z6SX!Y!Y"44
\\*--
r:   )FNF)__name__
__module____qualname____doc__r   boolr'   r)   TensorintrV   r   
BoolTensorrf   __classcell__r8   s   @r9   r    r    >   s         
, 
,z 
,4 
,D 
, 
, 
, 
, 
, 
,&M5< &M &MUX &M]b]i &M &M &M &MV 7;).	 l "%"23 #'	
 
       r:   r    c                   F     e Zd ZdZ fdZdej        dej        fdZ xZS )r/   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t                                                       |j        |j        }}|j        |j        }}t          |t          j        j	                  r|n||f}t          |t          j        j	                  r|n||f}|d         |d         z  |d         |d         z  z  }|| _        || _        || _        || _
        t          j        ||||          | _        d S )Nr   r   )kernel_sizestride)r&   r'   
image_sizer6   num_channelsr+   
isinstancecollectionsabcIterabler1   r   Conv2d
projection)r7   r!   rv   r6   rw   r+   r1   r8   s          r9   r'   zDeiTPatchEmbeddings.__init__   s    !'!2F4EJ
$*$79Kk#-j+/:R#S#SqZZZdfpYq
#-j+/:R#S#SqZZZdfpYq
!!}
15*Q-:VW=:XY$$(&)L+:^hiiir:   rW   r#   c                     |j         \  }}}}|| j        k    rt          d          |                     |                              d                              dd          }|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r%   r   )rG   rw   
ValueErrorr}   flatten	transpose)r7   rW   r_   rw   r<   r=   xs          r9   rf   zDeiTPatchEmbeddings.forward   sl    2>2D/
L&%4,,,w   OOL))11!44>>q!DDr:   )	rh   ri   rj   rk   r'   r)   rm   rf   rp   rq   s   @r9   r/   r/      sm         j j j j jEL U\        r:   r/   c            
            e Zd Zdeddf fdZdej        dej        fdZ	 ddeej                 d	e	de
eej        ej        f         eej                 f         fd
Z xZS )DeiTSelfAttentionr!   r#   Nc                    t                                                       |j        |j        z  dk    r1t	          |d          s!t          d|j        f d|j         d          |j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t          j
        |j        | j        |j                  | _        t          j
        |j        | j        |j                  | _        t          j
        |j        | j        |j                  | _        t          j        |j                  | _        d S )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .)bias)r&   r'   r+   num_attention_headshasattrr   rn   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyvaluer3   attention_probs_dropout_probr5   r7   r!   r8   s     r9   r'   zDeiTSelfAttention.__init__   s1    ::a??PVXhHiHi?76#5"7 7 737 7 7  
 $*#= #&v'9F<V'V#W#W !58PPYv143EFO\\\
9V/1C&/ZZZYv143EFO\\\
z&"EFFr:   r   c                     |                                 d d         | j        | j        fz   }|                    |          }|                    dddd          S )Nr?   r   r%   r   r   )rB   r   r   rN   rK   )r7   r   new_x_shapes      r9   transpose_for_scoresz&DeiTSelfAttention.transpose_for_scores   sP    ffhhssmt'?AY&ZZFF;yyAq!$$$r:   F	head_maskoutput_attentionsc                    |                      |          }|                     |                     |                    }|                     |                     |                    }|                     |          }t	          j        ||                    dd                    }|t          j        | j	                  z  }t          j                            |d          }	|                     |	          }	||	|z  }	t	          j        |	|          }
|
                    dddd                                          }
|
                                d d         | j        fz   }|
                    |          }
|r|
|	fn|
f}|S )Nr?   rE   r   r%   r   r   )r   r   r   r   r)   matmulr   mathsqrtr   r   rL   softmaxr5   rK   
contiguousrB   r   rN   )r7   hidden_statesr   r   mixed_query_layer	key_layervalue_layerquery_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                r9   rf   zDeiTSelfAttention.forward   sr    !JJ}55--dhh}.E.EFF	//

=0I0IJJ//0ABB !<Y5H5HR5P5PQQ+di8P.Q.QQ -//0@b/II ,,77  -	9O_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S%**+BCC6G]=/22mM]r:   rg   )rh   ri   rj   r   r'   r)   rm   r   r   rl   r   r   rf   rp   rq   s   @r9   r   r      s        Gz Gd G G G G G G$%el %u| % % % % bg! !(0(>!Z^!	uU\5</0%2EE	F! ! ! ! ! ! ! !r:   r   c                        e Zd Zdeddf fdZ	 	 d
dej        deej                 de	de
eej        ej        f         eej                 f         f fd	Z xZS )DeiTSdpaSelfAttentionr!   r#   Nc                 b    t                                          |           |j        | _        d S N)r&   r'   r   r   s     r9   r'   zDeiTSdpaSelfAttention.__init__   s,       ,2,O)))r:   Fr   r   r   c           	         |s|>t                               d           t                                          |||          S |                     |          }|                     |                     |                    }|                     |                     |                    }|                     |          }t          j	        j
                            ||||| j        r| j        nddd           }|                    dddd	                                          }|                                d d
         | j        fz   }	|                    |	          }|d fS )Na  `DeiTSdpaAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.)r   r   r           F)	is_causalscaler   r%   r   r   r   )loggerwarning_oncer&   rf   r   r   r   r   r)   r   rL   scaled_dot_product_attentiontrainingr   rK   r   rB   r   rN   )r7   r   r   r   r   r   r   r   r   r   r8   s             r9   rf   zDeiTSdpaSelfAttention.forward   sp     		 5w   77??+#"3 #    !JJ}55--dhh}.E.EFF	//

=0I0IJJ//0ABB+HH15GD--C I 
 
 &--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S%**+BCCd""r:   rg   )rh   ri   rj   r   r'   r)   FloatTensorr   rm   rl   r   r   rf   rp   rq   s   @r9   r   r      s        Pz Pd P P P P P P -1"'	'# '#('# EL)'#  	'#
 
uU\5</0%2EE	F'# '# '# '# '# '# '# '# '# '#r:   r   c                   ^     e Zd ZdZdeddf fdZdej        dej        dej        fdZ xZ	S )	DeiTSelfOutputz
    The residual connection is defined in DeiTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r!   r#   Nc                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _        d S r   )	r&   r'   r   r   r+   denser3   r4   r5   r   s     r9   r'   zDeiTSelfOutput.__init__+  sJ    Yv163EFF
z&"<==r:   r   input_tensorc                 Z    |                      |          }|                     |          }|S r   r   r5   r7   r   r   s      r9   rf   zDeiTSelfOutput.forward0  s*    

=11]33r:   )
rh   ri   rj   rk   r   r'   r)   rm   rf   rp   rq   s   @r9   r   r   %  s         
>z >d > > > > > >
U\  RWR^        r:   r   c                        e Zd Zdeddf fdZdee         ddfdZ	 	 ddej	        d	e
ej	                 d
edeeej	        ej	        f         eej	                 f         fdZ xZS )DeiTAttentionr!   r#   Nc                     t                                                       t          |          | _        t	          |          | _        t                      | _        d S r   )r&   r'   r   	attentionr   outputsetpruned_headsr   s     r9   r'   zDeiTAttention.__init__9  sI    *622$V,,EEr:   headsc                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   rE   )lenr   r   r   r   r   r   r   r   r   r   r   r   union)r7   r   indexs      r9   prune_headszDeiTAttention.prune_heads?  s   u::??F74>5t~7Y[_[l
 
u
  2$.2FNN/0BEJJ1$.2FNN.t{/@%QOOO .2^-ORUV[R\R\-\*'+~'IDNLn'n$ -33E::r:   Fr   r   r   c                     |                      |||          }|                     |d         |          }|f|dd          z   }|S )Nr   r   )r   r   )r7   r   r   r   self_outputsattention_outputr   s          r9   rf   zDeiTAttention.forwardQ  sM     ~~mY@QRR;;|AFF#%QRR(88r:   rg   )rh   ri   rj   r   r'   r   rn   r   r)   rm   r   rl   r   r   rf   rp   rq   s   @r9   r   r   8  s        "z "d " " " " " ";S ;d ; ; ; ;* -1"'	 | EL)  	
 
uU\5</0%2EE	F       r:   r   c                   (     e Zd Zdeddf fdZ xZS )DeiTSdpaAttentionr!   r#   Nc                 r    t                                          |           t          |          | _        d S r   )r&   r'   r   r   r   s     r9   r'   zDeiTSdpaAttention.__init__a  s.       .v66r:   )rh   ri   rj   r   r'   rp   rq   s   @r9   r   r   `  sK        7z 7d 7 7 7 7 7 7 7 7 7 7r:   r   c                   L     e Zd Zdeddf fdZdej        dej        fdZ xZS )DeiTIntermediater!   r#   Nc                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r   )r&   r'   r   r   r+   intermediate_sizer   rx   
hidden_actstrr   intermediate_act_fnr   s     r9   r'   zDeiTIntermediate.__init__h  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r:   r   c                 Z    |                      |          }|                     |          }|S r   )r   r   )r7   r   s     r9   rf   zDeiTIntermediate.forwardp  s,    

=1100??r:   	rh   ri   rj   r   r'   r)   rm   rf   rp   rq   s   @r9   r   r   g  sq        9z 9d 9 9 9 9 9 9U\ el        r:   r   c                   Z     e Zd Zdeddf fdZdej        dej        dej        fdZ xZS )
DeiTOutputr!   r#   Nc                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _	        d S r   )
r&   r'   r   r   r   r+   r   r3   r4   r5   r   s     r9   r'   zDeiTOutput.__init__y  sJ    Yv79KLL
z&"<==r:   r   r   c                 d    |                      |          }|                     |          }||z   }|S r   r   r   s      r9   rf   zDeiTOutput.forward~  s4    

=11]33%4r:   r   rq   s   @r9   r   r   x  s|        >z >d > > > > > >
U\  RWR^        r:   r   )eagersdpac                        e Zd ZdZdeddf fdZ	 	 ddej        deej                 d	e	de
eej        ej        f         eej                 f         fd
Z xZS )	DeiTLayerz?This corresponds to the Block class in the timm implementation.r!   r#   Nc                    t                                                       |j        | _        d| _        t	          |j                 |          | _        t          |          | _        t          |          | _
        t          j        |j        |j                  | _        t          j        |j        |j                  | _        d S )Nr   eps)r&   r'   chunk_size_feed_forwardseq_len_dimDEIT_ATTENTION_CLASSES_attn_implementationr   r   intermediater   r   r   	LayerNormr+   layer_norm_epslayernorm_beforelayernorm_afterr   s     r9   r'   zDeiTLayer.__init__  s    '-'E$/0KLVTT,V44 (( "V-?VEZ [ [ [!|F,>FDYZZZr:   Fr   r   r   c                    |                      |                     |          ||          }|d         }|dd          }||z   }|                     |          }|                     |          }|                     ||          }|f|z   }|S )N)r   r   r   )r   r   r   r   r   )r7   r   r   r   self_attention_outputsr   r   layer_outputs           r9   rf   zDeiTLayer.forward  s     "&!!-00/ "0 "
 "

 2!4(, )=8 ++M::((66 {{<??/G+r:   rg   )rh   ri   rj   rk   r   r'   r)   rm   r   rl   r   r   rf   rp   rq   s   @r9   r   r     s        II[z [d [ [ [ [ [ [ -1"'	 | EL)  	
 
uU\5</0%2EE	F       r:   r   c                        e Zd Zdeddf fdZ	 	 	 	 ddej        deej                 d	ed
edede	e
ef         fdZ xZS )DeiTEncoderr!   r#   Nc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S  )r   ).0r^   r!   s     r9   
<listcomp>z(DeiTEncoder.__init__.<locals>.<listcomp>  s!    #_#_#_!If$5$5#_#_#_r:   F)	r&   r'   r!   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   s    `r9   r'   zDeiTEncoder.__init__  s`    ]#_#_#_#_uVE]?^?^#_#_#_``
&+###r:   FTr   r   r   output_hidden_statesreturn_dictc                    |rdnd }|rdnd }t          | j                  D ]h\  }}	|r||fz   }|||         nd }
| j        r%| j        r|                     |	j        ||
|          }n |	||
|          }|d         }|r||d         fz   }i|r||fz   }|st          d |||fD                       S t          |||          S )Nr   r   r   c              3      K   | ]}||V  	d S r   r   )r   vs     r9   	<genexpr>z&DeiTEncoder.forward.<locals>.<genexpr>  s(      mmq_`_l_l_l_l_lmmr:   )last_hidden_stater   
attentions)	enumerater   r   r   _gradient_checkpointing_func__call__tupler   )r7   r   r   r   r   r   all_hidden_statesall_self_attentionsilayer_modulelayer_head_masklayer_outputss               r9   rf   zDeiTEncoder.forward  sI    #7@BBD$5?bb4(44 	P 	POA|# I$58H$H!.7.CillO* `t} ` $ A A )!#%	! ! !-]OM^ _ _)!,M  P&9]1=M<O&O# 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r:   )NFFT)rh   ri   rj   r   r'   r)   rm   r   rl   r   r	  r   rf   rp   rq   s   @r9   r   r     s        ,z ,d , , , , , , -1"'%* )
 )
|)
 EL))
  	)

 #)
 )
 
uo%	&)
 )
 )
 )
 )
 )
 )
 )
r:   r   c                   l    e Zd ZdZeZdZdZdZdgZ	dZ
deej        ej        ej        f         ddfd	ZdS )
DeiTPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    deitrW   Tr   moduler#   Nc                 J   t          |t          j        t          j        f          rt          j                            |j        j                            t          j
                  d| j        j                                      |j        j                  |j        _        |j         |j        j                                         dS dS t          |t          j                  r?|j        j                                         |j        j                            d           dS dS )zInitialize the weightsr   )meanstdNrZ   )rx   r   r   r|   inittrunc_normal_weightdatator)   float32r!   initializer_rangedtyper   zero_r   fill_)r7   r  s     r9   _init_weightsz!DeiTPreTrainedModel._init_weights  s    fry")455 
	* "$!6!6"%%em443DKDa "7 " "b$%% M {& &&((((( '&-- 	*K""$$$M$$S)))))	* 	*r:   )rh   ri   rj   rk   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpar   r   r   r|   r   r!  r   r:   r9   r  r    sx         
 L$O&*#$N*E")RY*L$M *RV * * * * * *r:   r  aF  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`DeiTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`DeiTImageProcessor.__call__`] for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
            Whether to interpolate the pre-trained position encodings.
z^The bare DeiT Model transformer outputting raw hidden-states without any specific head on top.c                   @    e Zd Zddedededdf fdZdefd	Zd
 Z e	e
           eeeede          	 	 	 	 	 	 	 ddeej                 deej                 deej                 dee         dee         dee         dedeeef         fd                        Z xZS )	DeiTModelTFr!   add_pooling_layerr"   r#   Nc                 N   t                                          |           || _        t          ||          | _        t          |          | _        t          j        |j	        |j
                  | _        |rt          |          nd | _        |                                  d S )N)r"   r   )r&   r'   r!   r    r;   r   encoderr   r   r+   r   	layernorm
DeiTPoolerpooler	post_init)r7   r!   r*  r"   r8   s       r9   r'   zDeiTModel.__init__1  s       (OOO"6**f&8f>STTT,=Gj(((4 	r:   c                     | j         j        S r   )r;   r0   )r7   s    r9   get_input_embeddingszDeiTModel.get_input_embeddings>  s    //r:   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr,  r   r   r   )r7   heads_to_pruner   r   s       r9   _prune_headszDeiTModel._prune_headsA  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr:   vision)
checkpointoutput_typer"  modalityexpected_outputrW   rX   r   r   r   r   rV   c                 ~   ||n| j         j        }||n| j         j        }||n| j         j        }|t	          d          |                     || j         j                  }| j        j        j	        j
        j        }|j        |k    r|                    |          }|                     |||          }	|                     |	||||          }
|
d         }|                     |          }| j        |                     |          nd}|s|||fn|f}||
dd         z   S t!          |||
j        |
j                  S )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)rX   rV   )r   r   r   r   r   r   )r  pooler_outputr   r  )r!   r   r   use_return_dictr   get_head_maskr   r;   r0   r}   r  r  r  r,  r-  r/  r   r   r  )r7   rW   rX   r   r   r   r   rV   expected_dtypeembedding_outputencoder_outputssequence_outputpooled_outputhead_outputss                 r9   rf   zDeiTModel.forwardI  s   , 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]?@@@ &&y$+2OPP	 9DKQ//'??>::L??/Tl + 
 
 ,,/!5# ' 
 
 *!,..998<8OO444UY 	6?L?XO];;_n^pL/!"""555)-')7&1	
 
 
 	
r:   )TFNNNNNNF)rh   ri   rj   r   rl   r'   r/   r2  r6  r   DEIT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   r)   rm   ro   r   r   rf   rp   rq   s   @r9   r)  r)  ,  s       
 z d [_ lp      0&9 0 0 0 0C C C +*+@AA&.$.   046:,0,0/3&*).;
 ;
u|,;
 "%"23;
 EL)	;

 $D>;
 'tn;
 d^;
 #';
 
u00	1;
 ;
 ;
  BA;
 ;
 ;
 ;
 ;
r:   r)  c                   *     e Zd Zdef fdZd Z xZS )r.  r!   c                     t                                                       t          j        |j        |j                  | _        t          j                    | _        d S r   )r&   r'   r   r   r+   r   Tanh
activationr   s     r9   r'   zDeiTPooler.__init__  sC    Yv163EFF
'))r:   c                 r    |d d df         }|                      |          }|                     |          }|S )Nr   )r   rN  )r7   r   first_token_tensorrD  s       r9   rf   zDeiTPooler.forward  s@     +111a40

#56666r:   )rh   ri   rj   r   r'   rf   rp   rq   s   @r9   r.  r.    sS        $z $ $ $ $ $ $
      r:   r.  aW  DeiT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    c                       e Zd Zdeddf fdZ ee           eee	          	 	 	 	 	 	 	 dde
ej                 de
ej                 d	e
ej                 d
e
e         de
e         de
e         dedeeef         fd                        Z xZS )DeiTForMaskedImageModelingr!   r#   Nc                 V   t                                          |           t          |dd          | _        t	          j        t	          j        |j        |j        dz  |j	        z  d          t	          j
        |j                            | _        |                                  d S )NFT)r*  r"   r%   r   )in_channelsout_channelsrt   )r&   r'   r)  r  r   
Sequentialr|   r+   encoder_striderw   PixelShuffledecoderr0  r   s     r9   r'   z#DeiTForMaskedImageModeling.__init__  s       fdSSS	}I".#2A58KK  
 OF122
 
 	r:   r9  r"  FrW   rX   r   r   r   r   rV   c           	         ||n| j         j        }|                     |||||||          }|d         }	|	ddddf         }	|	j        \  }
}}t	          |dz            x}}|	                    ddd                              |
|||          }	|                     |	          }d}|| j         j        | j         j	        z  }|                    d||          }|
                    | j         j	        d          
                    | j         j	        d                              d                                          }t          j                            ||d	          }||z                                  |                                d
z   z  | j         j        z  }|s|f|dd         z   }||f|z   n|S t%          |||j        |j                  S )aM  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DeiTForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
        >>> model = DeiTForMaskedImageModeling.from_pretrained("facebook/deit-base-distilled-patch16-224")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 224, 224]
        ```N)rX   r   r   r   r   rV   r   r   r?   r@   r%   none)	reductiongh㈵>)lossreconstructionr   r  )r!   r>  r  rG   rn   rK   rJ   rY  rv   r6   repeat_interleaver\   r   r   rL   l1_losssumrw   r   r   r  )r7   rW   rX   r   r   r   r   rV   r   rC  r_   sequence_lengthrw   r<   r=   reconstructed_pixel_valuesmasked_im_lossrB   rb   reconstruction_lossr   s                        r9   rf   z"DeiTForMaskedImageModeling.forward  s   R &1%<kk$+B]))+/!5#%=  
 
 "!* *!!!QrT'24C4I1
O\_c1222)11!Q::BB:|]cejkk &*\\/%B%B"&;)T[-CCD-55b$EEO11$+2H!LL""4;#91==1	  #%-"7"7F`lr"7"s"s1D8==??488::PTCTUX\XcXppN 	Z02WQRR[@F3A3M^%..SYY(5!/)	
 
 
 	
r:   rF  )rh   ri   rj   r   r'   r   rG  r   r   rI  r   r)   rm   ro   rl   r   r	  rf   rp   rq   s   @r9   rR  rR    s?       z d      " +*+@AA+DSbccc 046:,0,0/3&*).T
 T
u|,T
 "%"23T
 EL)	T

 $D>T
 'tnT
 d^T
 #'T
 
u//	0T
 T
 T
 dc BAT
 T
 T
 T
 T
r:   rR  z
    DeiT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    c                       e Zd Zdeddf fdZ ee           eee	          	 	 	 	 	 	 	 dde
ej                 de
ej                 d	e
ej                 d
e
e         de
e         de
e         dedeeef         fd                        Z xZS )DeiTForImageClassificationr!   r#   Nc                 :   t                                          |           |j        | _        t          |d          | _        |j        dk    rt          j        |j        |j                  nt          j                    | _	        | 
                                 d S NF)r*  r   )r&   r'   
num_labelsr)  r  r   r   r+   Identity
classifierr0  r   s     r9   r'   z#DeiTForImageClassification.__init__  s        +f>>>	 OUN_bcNcNc")F$68IJJJikitiviv 	r:   rZ  FrW   r   labelsr   r   r   rV   c                    ||n| j         j        }|                     ||||||          }|d         }	|                     |	dddddf                   }
d}|t|                    |
j                  }| j         j        f| j        dk    rd| j         _        nN| j        dk    r7|j        t          j
        k    s|j        t          j        k    rd| j         _        nd| j         _        | j         j        dk    rWt                      }| j        dk    r1 ||
                                |                                          }n ||
|          }n| j         j        dk    rGt                      } ||
                    d| j                  |                    d                    }n*| j         j        dk    rt!                      } ||
|          }|s|
f|dd         z   }||f|z   n|S t#          ||
|j        |j        	          S )
al  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, DeiTForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # note: we are loading a DeiTForImageClassificationWithTeacher from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random
        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
        >>> model = DeiTForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the 1000 ImageNet classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: Polaroid camera, Polaroid Land camera
        ```Nr   r   r   r   rV   r   r   
regressionsingle_label_classificationmulti_label_classificationr?   )r^  logitsr   r  )r!   r>  r  rm  r  deviceproblem_typerk  r  r)   longrn   r   squeezer
   rN   r	   r   r   r  )r7   rW   r   rn  r   r   r   rV   r   rC  rt  r^  loss_fctr   s                 r9   rf   z"DeiTForImageClassification.forward*  s+   Z &1%<kk$+B]))/!5#%=  
 
 "!*Aqqq!9:: YYv}--F{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FY,F)-)9TGf$$vE$!/)	
 
 
 	
r:   rF  )rh   ri   rj   r   r'   r   rG  r   r   rI  r   r)   rm   rl   r   r	  rf   rp   rq   s   @r9   rh  rh    s=       
z 
d 
 
 
 
 
 
 +*+@AA+@___ 04,0)-,0/3&*).[
 [
u|,[
 EL)[
 &	[

 $D>[
 'tn[
 d^[
 #'[
 
u++	,[
 [
 [
 `_ BA[
 [
 [
 [
 [
r:   rh  c                       e Zd ZU dZdZej        ed<   dZej        ed<   dZ	ej        ed<   dZ
eeej                          ed<   dZeeej                          ed<   dS )+DeiTForImageClassificationWithTeacherOutputa5  
    Output type of [`DeiTForImageClassificationWithTeacher`].

    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores as the average of the cls_logits and distillation logits.
        cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
            class token).
        distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
            distillation token).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    Nrt  
cls_logitsdistillation_logitsr   r  )rh   ri   rj   rk   rt  r)   r   __annotations__r|  r}  r   r   r   r  r   r:   r9   r{  r{    s          , !%FE$$$$(J!(((-1*1118<M8E%"345<<<59Ju01299999r:   r{  a  
    DeiT Model transformer with image classification heads on top (a linear layer on top of the final hidden state of
    the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet.

    .. warning::

           This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
           supported.
    c                       e Zd Zdeddf fdZ ee           eee	e
e          	 	 	 	 	 	 ddeej                 deej                 d	ee         d
ee         dee         dedeee	f         fd                        Z xZS )%DeiTForImageClassificationWithTeacherr!   r#   Nc                    t                                          |           |j        | _        t          |d          | _        |j        dk    rt          j        |j        |j                  nt          j                    | _	        |j        dk    rt          j        |j        |j                  nt          j                    | _
        |                                  d S rj  )r&   r'   rk  r)  r  r   r   r+   rl  cls_classifierdistillation_classifierr0  r   s     r9   r'   z.DeiTForImageClassificationWithTeacher.__init__  s        +f>>>	 AG@QTU@U@UBIf(&*;<<<[][f[h[h 	 AG@QTU@U@UBIf(&*;<<<[][f[h[h 	$
 	r:   )r8  r9  r"  r;  FrW   r   r   r   r   rV   c                 d   ||n| j         j        }|                     ||||||          }|d         }|                     |d d dd d f                   }	|                     |d d dd d f                   }
|	|
z   dz  }|s||	|
f|dd          z   }|S t          ||	|
|j        |j                  S )Nrp  r   r   r%   )rt  r|  r}  r   r  )r!   r>  r  r  r  r{  r   r  )r7   rW   r   r   r   r   rV   r   rC  r|  r}  rt  r   s                r9   rf   z-DeiTForImageClassificationWithTeacher.forward  s     &1%<kk$+B]))/!5#%=  
 
 "!*((Aqqq)ABB
"::?111aQRQRQR7;STT 22a7 	j*=>LFM:! 3!/)
 
 
 	
r:   )NNNNNF)rh   ri   rj   r   r'   r   rG  r   _IMAGE_CLASS_CHECKPOINTr{  rI  _IMAGE_CLASS_EXPECTED_OUTPUTr   r)   rm   rl   r   r	  rf   rp   rq   s   @r9   r  r    s&       z d      " +*+@AA*?$4	   04,0,0/3&*).&
 &
u|,&
 EL)&
 $D>	&

 'tn&
 d^&
 #'&
 
uAA	B&
 &
 &
  BA&
 &
 &
 &
 &
r:   r  )Frk   collections.abcry   r   dataclassesr   typingr   r   r   r   r)   torch.utils.checkpointr   torch.nnr	   r
   r   activationsr   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   r   r   r   r   configuration_deitr   
get_loggerrh   r   rI  rH  rJ  r  r  Moduler    r/   r   r   r   r   r   r   r   r   r   r   r  DEIT_START_DOCSTRINGrG  r)  r.  rR  rh  r{  r  r   r:   r9   <module>r     s          ! ! ! ! ! ! . . . . . . . . . . . .            A A A A A A A A A A ! ! ! ! ! !            . - - - - - Q Q Q Q Q Q Q Q                  + * * * * * 
	H	%	%  A &  E 1 V V V V VRY V V Vr    ")   B9 9 9 9 9	 9 9 9z,# ,# ,# ,# ,#- ,# ,# ,#`    RY   &$ $ $ $ $BI $ $ $P7 7 7 7 7 7 7 7    ry   "           ' ' ' ' '	 ' ' 'V0
 0
 0
 0
 0
") 0
 0
 0
f* * * * */ * * *8	  2 d \
 \
 \
 \
 \
# \
 \
	 \
@          h
 h
 h
 h
 h
!4 h
 h
 h
V   j
 j
 j
 j
 j
!4 j
 j
 j
Z : : : : :+ : : :<   ?
 ?
 ?
 ?
 ?
,? ?
 ?
 ?
 ?
 ?
r:   