
    g              	          d Z ddlZddlZddlmZmZmZmZm	Z	m
Z
 ddlZddlZddlmZ ddlmZmZmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(  e"j)        e*          Z+dZ,dZ-g dZ.dZ/dZ0 G d dej1                  Z2 G d dej1                  Z3 G d dej1                  Z4 G d de4          Z5 G d dej1                  Z6 G d dej1                  Z7 G d  d!e7          Z8 G d" d#ej1                  Z9dDd&ej:        d'e;d(e<d)ej:        fd*Z= G d+ d,ej1                  Z> G d- d.ej1                  Z? G d/ d0ej1                  Z@e7e8d1ZA G d2 d3ej1                  ZB G d4 d5ej1                  ZC G d6 d7e          ZDd8ZEd9ZFd:ZG e d;eE           G d< d=eD                      ZH e d>eE           G d? d@eD                      ZI e dAeE           G dB dCeDe&                      ZJdS )EzPyTorch DINOv2 model.    N)DictListOptionalSetTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BackboneOutputBaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings	torch_int)BackboneMixin   )Dinov2Configr   zfacebook/dinov2-base)r   i  i   z(facebook/dinov2-small-imagenet1k-1-layerztabby, tabby catc                        e Zd ZdZdeddf fdZdej        dededej        fd	Z	dd
ej        de
ej                 dej        fdZ xZS )Dinov2EmbeddingszM
    Construct the CLS token, mask token, position and patch embeddings.
    configreturnNc                    t                                                       t          j        t	          j        dd|j                            | _        t          j        t	          j        d|j                            | _	        t          |          | _        | j        j        }t          j        t	          j        d|dz   |j                            | _        t          j        |j                  | _        |j        | _        || _        d S )Nr   )super__init__r	   	Parametertorchrandnhidden_size	cls_tokenzeros
mask_tokenDinov2PatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropout
patch_sizer!   )selfr!   r/   	__class__s      f/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/dinov2/modeling_dinov2.pyr%   zDinov2Embeddings.__init__B   s    ek!Q8J&K&KLL,u{1f6H'I'IJJ 5f = =+7#%<A{QPVPb0c0c#d#d z&"<== +    
embeddingsheightwidthc                    |j         d         dz
  }| j        j         d         dz
  }t          j                                        s||k    r||k    r| j        S | j        ddddf         }| j        ddddf         }|j         d         }|| j        z  }	|| j        z  }
t          |dz            }|                    d|||          }|                    dddd          }|j	        }t          j                            |                    t          j                  |	|
fdd	
                              |          }|                    dddd                              dd|          }t          j        ||fd          S )a-  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and interpolation at torch.float32 precision.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Ng      ?r   r      bicubicF)sizemodealign_cornersdtypedim)shaper0   r'   jit
is_tracingr4   r   reshapepermuterD   r	   
functionalinterpolatetofloat32viewcat)r5   r9   r:   r;   r/   num_positionsclass_pos_embedpatch_pos_embedrF   
new_height	new_widthsqrt_num_positionstarget_dtypes                r7   interpolate_pos_encodingz)Dinov2Embeddings.interpolate_pos_encodingN   s    !&q)A-06q9A= y##%% 	,+*F*F6UZ??++2111bqb592111abb59r"t.
T_,	&}c'9::)11!5GI[]`aa)11!Q1==&,-33u}--i(	 4 
 

 "<"
 
  	 *11!Q1==BB1b#NNy/?;CCCCr8   pixel_valuesbool_masked_posc                    |j         \  }}}}| j        j        j        j        }|                     |                    |                    }|Yt          j        |                    d          | j	                            |j                                      d          |          }| j
                            |dd          }	t          j        |	|fd          }||                     |||          z   }|                     |          }|S )NrC   r=   r   r   rE   )rG   r.   
projectionweightrD   rN   r'   where	unsqueezer,   r*   expandrQ   rY   r3   )
r5   rZ   r[   
batch_size_r:   r;   rX   r9   
cls_tokenss
             r7   forwardzDinov2Embeddings.forwardv   s    '3'9$
Avu,7>D**<???+N+NOO
&))"--t/A/A*BR/S/S/]/]^_/`/`bl J
 ^**:r2>>
Y
J7Q???
  $"?"?
FTY"Z"ZZ
\\*--
r8   N)__name__
__module____qualname____doc__r   r%   r'   TensorintrY   r   re   __classcell__r6   s   @r7   r    r    =   s         
| 
 
 
 
 
 
 
&D5< &D &DUX &D]b]i &D &D &D &DP EL 8ELCY ejeq        r8   r    c                   F     e Zd ZdZ fdZdej        dej        fdZ xZS )r-   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t                                                       |j        |j        }}|j        |j        }}t          |t          j        j	                  r|n||f}t          |t          j        j	                  r|n||f}|d         |d         z  |d         |d         z  z  }|| _        || _        || _        || _
        t          j        ||||          | _        d S )Nr   r   )kernel_sizestride)r$   r%   
image_sizer4   num_channelsr)   
isinstancecollectionsabcIterabler/   r	   Conv2dr]   )r5   r!   rs   r4   rt   r)   r/   r6   s          r7   r%   zDinov2PatchEmbeddings.__init__   s    !'!2F4EJ
$*$79Kk#-j+/:R#S#SqZZZdfpYq
#-j+/:R#S#SqZZZdfpYq
!!}
15*Q-:VW=:XY$$(&)L+:^hiiir8   rZ   r"   c                     |j         d         }|| j        k    rt          d| j         d| d          |                     |                              d                              dd          }|S )Nr   zoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r>   )rG   rt   
ValueErrorr]   flatten	transpose)r5   rZ   rt   r9   s       r7   re   zDinov2PatchEmbeddings.forward   s    #)!,4,,,I!.I I9EI I I   __\22::1==GG1MM
r8   )	rg   rh   ri   rj   r%   r'   rk   re   rm   rn   s   @r7   r-   r-      sm         j j j j jEL U\        r8   r-   c            
            e Zd Zdeddf fdZdej        dej        fdZ	 ddeej                 d	e	de
eej        ej        f         eej                 f         fd
Z xZS )Dinov2SelfAttentionr!   r"   Nc                    t                                                       |j        |j        z  dk    r1t	          |d          s!t          d|j        f d|j         d          |j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t          j
        |j        | j        |j                  | _        t          j
        |j        | j        |j                  | _        t          j
        |j        | j        |j                  | _        t          j        |j                  | _        d S )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads r{   bias)r$   r%   r)   num_attention_headshasattrr|   rl   attention_head_sizeall_head_sizer	   Linearqkv_biasquerykeyvaluer1   attention_probs_dropout_probr3   r5   r!   r6   s     r7   r%   zDinov2SelfAttention.__init__   s1    ::a??PVXhHiHi?76#5"7 7 737 7 7  
 $*#= #&v'9F<V'V#W#W !58PPYv143EFO\\\
9V/1C&/ZZZYv143EFO\\\
z&"EFFr8   xc                     |                                 d d         | j        | j        fz   }|                    |          }|                    dddd          S )Nr=   r   r>   r   r   )r@   r   r   rP   rK   )r5   r   new_x_shapes      r7   transpose_for_scoresz(Dinov2SelfAttention.transpose_for_scores   sP    ffhhssmt'?AY&ZZFF;yyAq!$$$r8   F	head_maskoutput_attentionsc                    |                      |          }|                     |                     |                    }|                     |                     |                    }|                     |          }t	          j        ||                    dd                    }|t          j        | j	                  z  }t          j                            |d          }	|                     |	          }	||	|z  }	t	          j        |	|          }
|
                    dddd                                          }
|
                                d d         | j        fz   }|
                    |          }
|r|
|	fn|
f}|S )Nr=   rE   r   r>   r   r   )r   r   r   r   r'   matmulr~   mathsqrtr   r	   rL   softmaxr3   rK   
contiguousr@   r   rP   )r5   hidden_statesr   r   mixed_query_layer	key_layervalue_layerquery_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                r7   re   zDinov2SelfAttention.forward   sr    !JJ}55--dhh}.E.EFF	//

=0I0IJJ//0ABB !<Y5H5HR5P5PQQ+di8P.Q.QQ -//0@b/II ,,77  -	9O_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S%**+BCC6G]=/22mM]r8   NF)rg   rh   ri   r   r%   r'   rk   r   r   boolr   r   re   rm   rn   s   @r7   r   r      s        G| G G G G G G G$%el %u| % % % % bg! !(0(>!Z^!	uU\5</0%2EE	F! ! ! ! ! ! ! !r8   r   c            
            e Zd Zdeddf fdZ	 d	deej                 dede	e
ej        ej        f         e
ej                 f         f fdZ xZS )
Dinov2SdpaSelfAttentionr!   r"   Nc                 b    t                                          |           |j        | _        d S rf   )r$   r%   r   r   s     r7   r%   z Dinov2SdpaSelfAttention.__init__   s,       ,2,O)))r8   Fr   r   c           	         |r>t                               d           t                                          |||          S |                     |          }|                     |                     |                    }|                     |                     |                    }|                     |          }t          j	        j
                            ||||| j        r| j        nddd           }|                    dddd	                                          }|                                d d
         | j        fz   }	|                    |	          }|d fS )Na  Dinov2Model is using Dinov2SdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.)r   r   r           F)	is_causalscaler   r>   r   r   r   )loggerwarning_oncer$   re   r   r   r   r   r'   r	   rL   scaled_dot_product_attentiontrainingr   rK   r   r@   r   rP   )r5   r   r   r   r   r   r   r   r   r   r6   s             r7   re   zDinov2SdpaSelfAttention.forward   sj     	[   77??+yTe #    !JJ}55--dhh}.E.EFF	//

=0I0IJJ//0ABB+HH15GD--C I 
 
 &--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S%**+BCCd""r8   r   )rg   rh   ri   r   r%   r   r'   rk   r   r   r   re   rm   rn   s   @r7   r   r      s        P| P P P P P P P
 bg!# !#(0(>!#Z^!#	uU\5</0%2EE	F!# !# !# !# !# !# !# !# !# !#r8   r   c                   ^     e Zd ZdZdeddf fdZdej        dej        dej        fdZ xZ	S )	Dinov2SelfOutputz
    The residual connection is defined in Dinov2Layer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r!   r"   Nc                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _        d S rf   )	r$   r%   r	   r   r)   denser1   r2   r3   r   s     r7   r%   zDinov2SelfOutput.__init__  sJ    Yv163EFF
z&"<==r8   r   input_tensorc                 Z    |                      |          }|                     |          }|S rf   )r   r3   )r5   r   r   s      r7   re   zDinov2SelfOutput.forward  s*    

=11]33r8   )
rg   rh   ri   rj   r   r%   r'   rk   re   rm   rn   s   @r7   r   r     s         
>| > > > > > > >
U\  RWR^        r8   r   c                        e Zd Zdeddf fdZdee         ddfdZ	 	 ddej	        d	e
ej	                 d
edeeej	        ej	        f         eej	                 f         fdZ xZS )Dinov2Attentionr!   r"   Nc                     t                                                       t          |          | _        t	          |          | _        t                      | _        d S rf   )r$   r%   r   	attentionr   outputsetpruned_headsr   s     r7   r%   zDinov2Attention.__init__(  sI    ,V44&v..EEr8   headsc                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   rE   )lenr   r   r   r   r   r   r   r   r   r   r   r   union)r5   r   indexs      r7   prune_headszDinov2Attention.prune_heads.  s   u::??F74>5t~7Y[_[l
 
u
  2$.2FNN/0BEJJ1$.2FNN.t{/@%QOOO .2^-ORUV[R\R\-\*'+~'IDNLn'n$ -33E::r8   Fr   r   r   c                     |                      |||          }|                     |d         |          }|f|dd          z   }|S )Nr   r   )r   r   )r5   r   r   r   self_outputsattention_outputr   s          r7   re   zDinov2Attention.forward@  sM     ~~mY@QRR;;|AFF#%QRR(88r8   r   )rg   rh   ri   r   r%   r   rl   r   r'   rk   r   r   r   r   re   rm   rn   s   @r7   r   r   '  s        "| " " " " " " ";S ;d ; ; ; ;* -1"'	 | EL)  	
 
uU\5</0%2EE	F       r8   r   c                   (     e Zd Zdeddf fdZ xZS )Dinov2SdpaAttentionr!   r"   Nc                 r    t                                          |           t          |          | _        d S rf   )r$   r%   r   r   r   s     r7   r%   zDinov2SdpaAttention.__init__P  s.       088r8   )rg   rh   ri   r   r%   rm   rn   s   @r7   r   r   O  sK        9| 9 9 9 9 9 9 9 9 9 9 9r8   r   c                   D     e Zd Zd fdZdej        dej        fdZ xZS )Dinov2LayerScaler"   Nc                     t                                                       t          j        |j        t          j        |j                  z            | _        d S rf   )	r$   r%   r	   r&   layerscale_valuer'   onesr)   lambda1r   s     r7   r%   zDinov2LayerScale.__init__V  sC    |F$;ejI[>\>\$\]]r8   hidden_statec                     || j         z  S rf   )r   r5   r   s     r7   re   zDinov2LayerScale.forwardZ  s    dl**r8   r"   Nrg   rh   ri   r%   r'   rk   re   rm   rn   s   @r7   r   r   U  si        ^ ^ ^ ^ ^ ^+EL +U\ + + + + + + + +r8   r   r   Finput	drop_probr   r"   c                     |dk    s|s| S d|z
  }| j         d         fd| j        dz
  z  z   }|t          j        || j        | j                  z   }|                                 |                     |          |z  }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   )r   )rD   device)rG   ndimr'   randrD   r   floor_div)r   r   r   	keep_probrG   random_tensorr   s          r7   	drop_pathr   _  s     CxII[^
Q 77E
5EL Y Y YYMYYy!!M1FMr8   c                   j     e Zd ZdZd	dee         ddf fdZdej        dej        fdZ	de
fdZ xZS )
Dinov2DropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r"   c                 V    t                                                       || _        d S rf   )r$   r%   r   )r5   r   r6   s     r7   r%   zDinov2DropPath.__init__w  s$    "r8   r   c                 8    t          || j        | j                  S rf   )r   r   r   )r5   r   s     r7   re   zDinov2DropPath.forward{  s    FFFr8   c                 6    d                     | j                  S )Nzp={})formatr   r5   s    r7   
extra_reprzDinov2DropPath.extra_repr~  s    }}T^,,,r8   rf   )rg   rh   ri   rj   r   floatr%   r'   rk   re   strr   rm   rn   s   @r7   r   r   t  s        bb# #(5/ #T # # # # # #GU\ Gel G G G G-C - - - - - - - -r8   r   c                   D     e Zd Zd fdZdej        dej        fdZ xZS )	Dinov2MLPr"   Nc                 ~   t                                                       |j        x}}t          |j        |j        z            }t          j        ||d          | _        t          |j	        t                    rt          |j	                 | _        n|j	        | _        t          j        ||d          | _        d S )NTr   )r$   r%   r)   rl   	mlp_ratior	   r   fc1ru   
hidden_actr   r   
activationfc2r5   r!   in_featuresout_featureshidden_featuresr6   s        r7   r%   zDinov2MLP.__init__  s    %+%77lf063CCDD9[/EEEf'-- 	0$V%67DOO$/DO9_lFFFr8   r   c                     |                      |          }|                     |          }|                     |          }|S rf   )r   r   r   r   s     r7   re   zDinov2MLP.forward  s;    xx--|44xx--r8   r   r   rn   s   @r7   r   r     si        	G 	G 	G 	G 	G 	GEL U\        r8   r   c                   D     e Zd Zd fdZdej        dej        fdZ xZS )Dinov2SwiGLUFFNr"   Nc                 D   t                                                       |j        x}}t          |j        |j        z            }t          |dz  dz            dz   dz  dz  }t          j        |d|z  d          | _        t          j        ||d          | _        d S )Nr>   r         Tr   )	r$   r%   r)   rl   r   r	   r   
weights_inweights_outr   s        r7   r%   zDinov2SwiGLUFFN.__init__  s    %+%77lf063CCDD2Q677!;AAE)K_1D4PPP9_lNNNr8   r   c                     |                      |          }|                    dd          \  }}t          j                            |          |z  }|                     |          S )Nr>   r=   rE   )r   chunkr	   rL   silur   )r5   r   x1x2hiddens        r7   re   zDinov2SwiGLUFFN.forward  s]    |44##A2#..B##B''",'''r8   r   r   rn   s   @r7   r   r     si        O O O O O O(EL (U\ ( ( ( ( ( ( ( (r8   r   )eagersdpac                        e Zd ZdZdeddf fdZ	 	 ddej        deej                 d	e	de
eej        ej        f         eej                 f         fd
Z xZS )Dinov2LayerzCThis corresponds to the Block class in the original implementation.r!   r"   Nc                 8   t                                                       t          j        |j        |j                  | _        t          |j                 |          | _	        t          |          | _        |j        dk    rt          |j                  nt          j                    | _        t          j        |j        |j                  | _        |j        rt%          |          | _        nt)          |          | _        t          |          | _        d S )Nepsr   )r$   r%   r	   	LayerNormr)   layer_norm_epsnorm1DINOV2_ATTENTION_CLASSES_attn_implementationr   r   layer_scale1drop_path_rater   Identityr   norm2use_swiglu_ffnr   mlpr   layer_scale2r   s     r7   r%   zDinov2Layer.__init__  s    \&"4&:OPPP
1&2MNvVV,V44BHBWZ]B]B](=>>>cecncpcp\&"4&:OPPP
  	)&v..DHH ((DH,V44r8   Fr   r   r   c                    |                      |                     |          ||          }|d         }|                     |          }|dd          }|                     |          |z   }|                     |          }|                     |          }|                     |          }|                     |          |z   }|f|z   }|S )N)r   r   r   )r   r  r  r   r  r  r  )r5   r   r   r   self_attention_outputsr   r   layer_outputs           r7   re   zDinov2Layer.forward  s     "&JJ}%%/ "0 "
 "

 2!4,,-=>>(, '788=H zz-00xx--((66 ~~l33mC/G+r8   r   )rg   rh   ri   rj   r   r%   r'   rk   r   r   r   r   re   rm   rn   s   @r7   r
  r
    s        MM5| 5 5 5 5 5 5 5& -1"'	 | EL)  	
 
uU\5</0%2EE	F       r8   r
  c                        e Zd Zdeddf fdZ	 	 	 	 ddej        deej                 d	ed
edede	e
ef         fdZ xZS )Dinov2Encoderr!   r"   Nc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S  )r
  .0rc   r!   s     r7   
<listcomp>z*Dinov2Encoder.__init__.<locals>.<listcomp>  s!    #a#a#aAK$7$7#a#a#ar8   F)	r$   r%   r!   r	   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   s    `r7   r%   zDinov2Encoder.__init__  s`    ]#a#a#a#avG_A`A`#a#a#abb
&+###r8   FTr   r   r   output_hidden_statesreturn_dictc                    |rdnd }|rdnd }t          | j                  D ]h\  }}	|r||fz   }|||         nd }
| j        r%| j        r|                     |	j        ||
|          }n |	||
|          }|d         }|r||d         fz   }i|r||fz   }|st          d |||fD                       S t          |||          S )Nr!  r   r   c              3      K   | ]}||V  	d S rf   r!  )r#  vs     r7   	<genexpr>z(Dinov2Encoder.forward.<locals>.<genexpr>  s(      mmq_`_l_l_l_l_lmmr8   )last_hidden_stater   
attentions)	enumerater(  r)  r   _gradient_checkpointing_func__call__tupler   )r5   r   r   r   r*  r+  all_hidden_statesall_self_attentionsilayer_modulelayer_head_masklayer_outputss               r7   re   zDinov2Encoder.forward  sI    #7@BBD$5?bb4(44 	P 	POA|# I$58H$H!.7.CillO* `t} ` $ A A )!#%	! ! !-]OM^ _ _)!,M  P&9]1=M<O&O# 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r8   )NFFT)rg   rh   ri   r   r%   r'   rk   r   r   r   r5  r   re   rm   rn   s   @r7   r  r    s        ,| , , , , , , , -1"'%* )
 )
|)
 EL))
  	)

 #)
 )
 
uo%	&)
 )
 )
 )
 )
 )
 )
 )
r8   r  c                   l    e Zd ZdZeZdZdZdZdgZ	dZ
deej        ej        ej        f         ddfd	ZdS )
Dinov2PreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    dinov2rZ   Tr   moduler"   Nc                 `   t          |t          j        t          j        f          rt          j                            |j        j                            t          j
                  d| j        j                                      |j        j                  |j        _        |j         |j        j                                         dS dS t          |t          j                  r?|j        j                                         |j        j                            d           dS t          |t$                    rt          j                            |j        j                            t          j
                  d| j        j                                      |j        j                  |j        _        t          j                            |j        j                            t          j
                  d| j        j                                      |j        j                  |j        _        dS dS )zInitialize the weightsr   )meanstdNg      ?)ru   r	   r   ry   inittrunc_normal_r^   datarN   r'   rO   r!   initializer_rangerD   r   zero_r  fill_r    r0   r*   )r5   r?  s     r7   _init_weightsz#Dinov2PreTrainedModel._init_weights   s   fry")455 	) "$!6!6"%%em443DKDa "7 " "b$%% M {& &&((((( '&-- 	)K""$$$M$$S))))) 011 	).0g.C.C*/225=AAK1 /D / / b+122	 &+ %'G$9$9 %((77K1 %: % % b!'((	 !!!	) 	)r8   )rg   rh   ri   rj   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpar   r	   r   ry   r  rI  r!  r8   r7   r=  r=    sy         
  L $O&*#*+N)E")RY*L$M )RV ) ) ) ) ) )r8   r=  aH  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`Dinov2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a4  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`BitImageProcessor.preprocess`] for details.

        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
            pre-training.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
aM  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`BitImageProcessor.preprocess`] for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z`The bare DINOv2 Model transformer outputting raw hidden-states without any specific head on top.c                   R    e Zd Zdef fdZdefdZdeee	e         f         ddfdZ
 ee           eeeede	          	 	 	 	 	 	 dd
eej                 deej                 deej                 dee         dee         dee         deeef         fd                        Z xZS )Dinov2Modelr!   c                    t                                          |           || _        t          |          | _        t          |          | _        t          j        |j	        |j
                  | _        |                                  d S )Nr  )r$   r%   r!   r    r9   r  encoderr	   r  r)   r  	layernorm	post_initr   s     r7   r%   zDinov2Model.__init__|  st       *622$V,,f&8f>STTT 	r8   r"   c                     | j         j        S rf   r9   r.   r   s    r7   get_input_embeddingsz Dinov2Model.get_input_embeddings      //r8   heads_to_pruneNc                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrS  r(  r   r   )r5   rZ  r(  r   s       r7   _prune_headszDinov2Model._prune_heads  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr8   vision)
checkpointoutput_typerJ  modalityexpected_outputrZ   r[   r   r   r*  r+  c                    ||n| j         j        }||n| j         j        }||n| j         j        }|t	          d          |                     || j         j                  }|                     ||          }|                     |||||          }|d         }	| 	                    |	          }	|	d d dd d f         }
|s|	|
f}||dd          z   S t          |	|
|j        |j                  S )Nz You have to specify pixel_values)r[   r   r   r*  r+  r   r   )r0  pooler_outputr   r1  )r!   r   r*  use_return_dictr|   get_head_maskr'  r9   rS  rT  r   r   r1  )r5   rZ   r[   r   r   r*  r+  embedding_outputencoder_outputssequence_outputpooled_outputhead_outputss               r7   re   zDinov2Model.forward  s<   " 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]?@@@ &&y$+2OPP	??<?YY,,/!5# ' 
 
 *!,..99'1aaa0 	6+];L/!"""555)-')7&1	
 
 
 	
r8   NNNNNN)rg   rh   ri   r   r%   r-   rX  r   rl   r   r]  r   DINOV2_BASE_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   r'   rk   r   r   r   re   rm   rn   s   @r7   rQ  rQ  w  sz       

| 
 
 
 
 
 
0&; 0 0 0 0C4T#Y+? CD C C C C +*+GHH&.$.   0426,0,0/3&*/
 /
u|,/
 "%,//
 EL)	/

 $D>/
 'tn/
 d^/
 
u00	1/
 /
 /
  IH/
 /
 /
 /
 /
r8   rQ  z
    Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
    of the [CLS] token) e.g. for ImageNet.
    c                       e Zd Zdeddf fdZ ee           eee	e
e          	 	 	 	 	 	 ddeej                 deej                 deej                 d	ee         d
ee         dee         deee	f         fd                        Z xZS )Dinov2ForImageClassificationr!   r"   Nc                 <   t                                          |           |j        | _        t          |          | _        |j        dk    r"t          j        |j        dz  |j                  nt          j                    | _	        | 
                                 d S )Nr   r>   )r$   r%   
num_labelsrQ  r>  r	   r   r)   r  
classifierrU  r   s     r7   r%   z%Dinov2ForImageClassification.__init__  s        +!&)) EKDUXYDYDYBIf(1,f.?@@@_a_j_l_l 	
 	r8   )r_  r`  rJ  rb  rZ   r   labelsr   r*  r+  c                 |   ||n| j         j        }|                     |||||          }|d         }|dddf         }	|ddddf         }
t          j        |	|
                    d          gd          }|                     |          }d}|t|                    |j                  }| j         j	        f| j
        dk    rd| j         _	        nN| j
        dk    r7|j        t          j        k    s|j        t          j        k    rd| j         _	        nd| j         _	        | j         j	        dk    rWt                      }| j
        dk    r1 ||                                |                                          }n |||          }n| j         j	        dk    rGt!                      } ||                    d	| j
                  |                    d	                    }n*| j         j	        dk    rt%                      } |||          }|s|f|d
d         z   }||f|z   n|S t'          |||j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nrd  r   r   rE   
regressionsingle_label_classificationmulti_label_classificationr=   r>   )losslogitsr   r1  )r!   rf  r>  r'   rQ   rA  rv  rN   r   problem_typeru  rD   longrl   r   squeezer   rP   r
   r   r   r1  )r5   rZ   r   rw  r   r*  r+  r   rj  r*   patch_tokenslinear_inputr}  r|  loss_fctr   s                   r7   re   z$Dinov2ForImageClassification.forward  sd   , &1%<kk$+B]++/!5#  
 
 "!*#AAAqD)	&qqq!""u-y)\->->1->-E-E!FANNN..YYv}--F{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FY,F)-)9TGf$$vE$!/)	
 
 
 	
r8   rm  )rg   rh   ri   r   r%   r   DINOV2_INPUTS_DOCSTRINGr   _IMAGE_CLASS_CHECKPOINTr   rp  _IMAGE_CLASS_EXPECTED_OUTPUTr   r'   rk   r   r   r5  re   rm   rn   s   @r7   rs  rs    s=       |        +*+BCC*)$4	   04,0)-,0/3&*D
 D
u|,D
 EL)D
 &	D

 $D>D
 'tnD
 d^D
 
u++	,D
 D
 D
  DCD
 D
 D
 D
 D
r8   rs  zO
    Dinov2 backbone, to be used with frameworks like DETR and MaskFormer.
    c                        e Zd Z fdZdefdZ ee           ee	e
          	 	 	 ddej        dee         dee         d	ee         de	f
d
                        Z xZS )Dinov2Backbonec                    t                                                     t                                                     fdt          j        dz             D             | _        t                    | _        t                    | _	        t          j        j        j                  | _        |                                  d S )Nc                     g | ]	}j         
S r!  )r)   r"  s     r7   r$  z+Dinov2Backbone.__init__.<locals>.<listcomp><  s    ]]]AV/]]]r8   r   r  )r$   r%   _init_backboner&  r'  num_featuresr    r9   r  rS  r	   r  r)   r  rT  rU  r   s    `r7   r%   zDinov2Backbone.__init__8  s       v&&&]]]]v?WZ[?[9\9\]]]*622$V,,f&8f>STTT 	r8   r"   c                     | j         j        S rf   rW  r   s    r7   rX  z#Dinov2Backbone.get_input_embeddingsE  rY  r8   )r`  rJ  NrZ   r*  r   r+  c                    ||n| j         j        }||n| j         j        }||n| j         j        }|                     |          }|                     |d||          }|r|j        n|d         }d}t          | j        |          D ]\  }	}
|	| j	        v r| j         j
        r|                     |
          }
| j         j        rn|
ddddf         }
|j        \  }}}}| j         j        }|
                    |||z  ||z  d          }
|
                    dddd	                                          }
||
fz  }|s!|r|f|dd         z   }n|f|d	d         z   }|S t%          ||r|j        nd|r|j        nd
          S )a7  
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
        >>> model = AutoBackbone.from_pretrained(
        ...     "facebook/dinov2-base", out_features=["stage2", "stage5", "stage8", "stage11"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 16, 16]
        ```NT)r*  r   r+  r   r!  r=   r   r   r>   )feature_mapsr   r1  )r!   rf  r*  r   r9   rS  r   zipstage_namesr   apply_layernormrT  reshape_hidden_statesrG   r4   rJ   rK   r   r   r1  )r5   rZ   r*  r   r+  rh  r   r   r  stager   rb   rc   r:   r;   r4   r   s                    r7   re   zDinov2Backbone.forwardH  s   F &1%<kk$+B]$8$D  $+Jj 	 2C1N--TXT_Tq??<88,,4K\ju  
 
 2=L--'!*#&t'7#G#G 	0 	0E<)));. @#'>>,#?#?L;4 Q#/122#6L 4@3E0J65!%!7J#/#7#7
FjDXZ_cmZmoq#r#rL#/#7#71a#C#C#N#N#P#PL/ 	# 7&7122;6&7122;6M%3GQ'//T->Hw))D
 
 
 	
r8   )NNN)rg   rh   ri   r%   r-   rX  r   r  r   r   rp  r'   rk   r   r   re   rm   rn   s   @r7   r  r  1  s            0&; 0 0 0 0 +*+BCC>XXX 04,0&*I
 I
lI
 'tnI
 $D>	I

 d^I
 
I
 I
 I
 YX DCI
 I
 I
 I
 I
r8   r  )r   F)Krj   collections.abcrv   r   typingr   r   r   r   r   r   r'   torch.utils.checkpointr	   torch.nnr
   r   r   activationsr   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   r   r   r   utils.backbone_utilsr   configuration_dinov2r   
get_loggerrg   r   rp  ro  rq  r  r  Moduler    r-   r   r   r   r   r   r   rk   r   r   r   r   r   r   r  r
  r  r=  DINOV2_START_DOCSTRINGrn  r  rQ  rs  r  r!  r8   r7   <module>r     s          : : : : : : : : : : : : : : : :            A A A A A A A A A A ! ! ! ! ! !            . - - - - - Q Q Q Q Q Q Q Q                2 1 1 1 1 1 . . . . . . 
	H	%	% ! - &  E 1 L L L L Lry L L L^    BI   D9 9 9 9 9") 9 9 9x&# &# &# &# &#1 &# &# &#T    ry   &$ $ $ $ $bi $ $ $P9 9 9 9 9/ 9 9 9+ + + + +ry + + + U\ e T V[Vb    *- - - - -RY - - -    	   &( ( ( ( (bi ( ( ($   0 0 0 0 0") 0 0 0h0
 0
 0
 0
 0
BI 0
 0
 0
f%) %) %) %) %)O %) %) %)P	   4 . f O
 O
 O
 O
 O
' O
 O
	 O
d   Z
 Z
 Z
 Z
 Z
#8 Z
 Z
 Z
z  	 \
 \
 \
 \
 \
*M \
 \
 \
 \
 \
r8   