
    g             	       p   d Z ddlZddlZddlmZ ddlmZmZm	Z	m
Z
 ddlZddlZddlmZmZ ddlmZmZmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZ ddlmZm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+  e%j,        e-          Z.dZ/dZ0g dZ1dZ2dZ3e G d de                      Z4dSdej        de5de6dej        fdZ7 G d dej8                  Z9 G d  d!ej8                  Z: G d" d#ej8                  Z; G d$ d%ej8                  Z< G d& d'ej8                  Z= G d( d)ej8                  Z> G d* d+ej8                  Z? G d, d-ej8                  Z@ G d. d/ej8                  ZA G d0 d1ej8                  ZB G d2 d3ej8                  ZC G d4 d5e          ZDd6ZEd7ZF e#d8eE           G d9 d:eD                      ZG G d; d<ej8                  ZH e#d=eE           G d> d?eD                      ZI e#d@eE           G dA dBeD                      ZJ G dC dDej8                  ZK G dE dFej8                  ZL G dG dHej8                  ZM G dI dJej8                  ZN G dK dLej8                  ZO e#dMeE           G dN dOeD                      ZP e#dPeE           G dQ dReDe)                      ZQdS )TzPyTorch BEiT model.    N)	dataclass)ListOptionalTupleUnion)Tensornn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BackboneOutputBaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputMaskedLMOutputSemanticSegmenterOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings	torch_int)BackboneMixin   )
BeitConfigr    z%microsoft/beit-base-patch16-224-pt22k)r      i   zmicrosoft/beit-base-patch16-224ztabby, tabby catc                       e Zd ZdZdS )BeitModelOutputWithPoolinga  
    Class for outputs of [`BeitModel`].

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
            *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
            will be returned.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    N)__name__
__module____qualname____doc__     b/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/beit/modeling_beit.pyr#   r#   @   s           r)   r#           Finput	drop_probtrainingreturnc                     |dk    s|s| S d|z
  }| j         d         fd| j        dz
  z  z   }|t          j        || j        | j                  z   }|                                 |                     |          |z  }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r+   r   r   r   )dtypedevice)shapendimtorchrandr2   r3   floor_div)r,   r-   r.   	keep_probr4   random_tensoroutputs          r*   	drop_pathr=   Z   s     CxII[^
Q 77E
5EL Y Y YYMYYy!!M1FMr)   c                   j     e Zd ZdZd	dee         ddf fdZdej        dej        fdZ	de
fdZ xZS )
BeitDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr-   r/   c                 V    t                                                       || _        d S N)super__init__r-   )selfr-   	__class__s     r*   rC   zBeitDropPath.__init__q   s$    "r)   hidden_statesc                 8    t          || j        | j                  S rA   )r=   r-   r.   rD   rF   s     r*   forwardzBeitDropPath.forwardu   s    FFFr)   c                 6    d                     | j                  S )Nzp={})formatr-   rD   s    r*   
extra_reprzBeitDropPath.extra_reprx   s    }}T^,,,r)   rA   )r$   r%   r&   r'   r   floatrC   r6   r   rI   strrM   __classcell__rE   s   @r*   r?   r?   n   s        bb# #(5/ #T # # # # # #GU\ Gel G G G G-C - - - - - - - -r)   r?   c            	            e Zd ZdZdeddf fdZdej        dededej        fd	Z		 	 ddej        de
ej                 dedej        fdZ xZS )BeitEmbeddingszc
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    configr/   Nc                    t                                                       t          j        t	          j        dd|j                            | _        |j        r3t          j        t	          j        dd|j                            | _	        nd | _	        t          |          | _        |j        | _        t          |j        t          j        j                  r|j        n|j        |j        f| _        | j        j        }|j        r6t          j        t	          j        d|dz   |j                            | _        nd | _        t          j        |j                  | _        d S )Nr   )rB   rC   r	   	Parameterr6   zeroshidden_size	cls_tokenuse_mask_token
mask_tokenBeitPatchEmbeddingspatch_embeddings
patch_size
isinstance
image_sizecollectionsabcIterablenum_patches use_absolute_position_embeddingsposition_embeddingsDropouthidden_dropout_probdropout)rD   rT   rd   rE   s      r*   rC   zBeitEmbeddings.__init__   s(   ek!Q8J&K&KLL  	# l5;q!V=O+P+PQQDOO"DO 3F ; ; + &+[_-EFF8F#V%67 	
 +72 	,')|EK;QR?TZTf4g4g'h'hD$$'+D$z&"<==r)   
embeddingsheightwidthc                    |j         d         dz
  }| j        j         d         dz
  }t          j                                        s||k    r||k    r| j        S | j        ddddf         }| j        ddddf         }|j         d         }|| j        z  }	|| j        z  }
t          |dz            }|                    d|||          }|                    dddd          }t          j
                            ||	|
fdd	
          }|                    dddd                              dd|          }t          j        ||fd          S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Ng      ?r   r      bicubicFsizemodealign_cornersdim)r4   rf   r6   jit
is_tracingr^   r   reshapepermuter	   
functionalinterpolateviewcat)rD   rj   rk   rl   rd   num_positionsclass_pos_embedpatch_pos_embedrv   
new_height	new_widthsqrt_num_positionss               r*   interpolate_pos_encodingz'BeitEmbeddings.interpolate_pos_encoding   sr    !&q)A-06q9A= y##%% 	,+*F*F6UZ??++2111bqb592111abb59r"t.
T_,	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNy/?;CCCCr)   Fpixel_valuesbool_masked_posr   c                 |   |j         \  }}}}|                     || j        | j        d d dd d d f         nd           \  }\  }}	|                                \  }
}}|R| j                            |
|d          }|                    d                              |          }|d|z
  z  ||z  z   }| j                            |
dd          }| j        6|r|| 	                    |||          z   }n|| j        d d d dd d f         z   }t          j        ||fd          }|                     |          }|||	ffS )Nr   rn   ru   )r4   r]   rf   rr   r[   expand	unsqueezetype_asrY   r   r6   r~   ri   )rD   r   r   r   _rk   rl   rj   patch_heightpatch_width
batch_sizeseq_lenmask_tokensw
cls_tokenss                  r*   rI   zBeitEmbeddings.forward   sy    +01fe262G2G@X@d$2111abb!!!8<<jn3
 3
/
/\; ",!2!2
GQ&/00WbIIK))"--55kBBA#q1u-a?J^**:r2>>
#/' M'$*G*G
TZ\a*b*bb

'$*B111bqb!!!8*LL
Y
J7Q???
\\*--
L+666r)   )NF)r$   r%   r&   r'   r    rC   r6   r   intr   r   
BoolTensorboolrI   rP   rQ   s   @r*   rS   rS   ~   s         
>z >d > > > > > >.&D5< &D &DUX &D]b]i &D &D &D &DV 7;).	7 7l7 "%"237 #'	7
 
7 7 7 7 7 7 7 7r)   rS   c                   d     e Zd ZdZ fdZ	 ddej        deej                 dej        fdZ xZ	S )	r\   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t                                                       |j        |j        }}|j        |j        }}t          |t          j        j	                  r|n||f}t          |t          j        j	                  r|n||f}|d         |d         z  |d         |d         z  z  }|d         |d         z  |d         |d         z  f}|| _        || _        || _        || _
        || _        t          j        ||||          | _        d S )Nr   r   kernel_sizestride)rB   rC   r`   r^   num_channelsrX   r_   ra   rb   rc   rd   patch_shaper	   Conv2d
projection)	rD   rT   r`   r^   r   rX   rd   r   rE   s	           r*   rC   zBeitPatchEmbeddings.__init__   s   !'!2F4EJ
$*$79Kk#-j+/:R#S#SqZZZdfpYq
#-j+/:R#S#SqZZZdfpYq
!!}
15*Q-:VW=:XY!!}
15z!}
ST7UV$$(&&)L+:^hiiir)   Nr   position_embeddingr/   c                    |j         \  }}}}|| j        k    rt          d          |                     |          }|j         d         |j         d         }	}|m|                    d| j        d         | j        d         d                              dddd          }t          j        	                    |||	fd          }||z   }|
                    d                              dd          }|||	ffS )	NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.ro   r   r   r   rn   rp   rr   rs   )r4   r   
ValueErrorr   r}   r   rz   r	   r{   r|   flatten	transpose)
rD   r   r   r   r   rk   rl   rj   r   r   s
             r*   rI   zBeitPatchEmbeddings.forward   s   
 3?2D/
L&%4,,,w   __\22
$.$4Q$79I!9Lk)!3!8!8D<LQ<OQUQabcQdfh!i!i!q!q1a" " "$!:!:",)D9 "; " " $&88J''**44Q::
L+666r)   rA   )
r$   r%   r&   r'   rC   r6   r   r   rI   rP   rQ   s   @r*   r\   r\      s         j j j j j( 6:7 7l7 %U\27 
	7 7 7 7 7 7 7 7r)   r\   c                        e Zd Zddedee         ddf fdZd Z	 	 	 	 	 ddej	        d	eej	                 d
e
ded         de
deee                  deeej	                 eej	        ej	        f         f         fdZ xZS )BeitSelfAttentionNrT   window_sizer/   c                    t                                                       || _        |j        |j        z  dk    r1t          |d          s!t          d|j        f d|j         d          |j        | _        t          |j        |j        z            | _        | j        | j        z  | _	        t          j        |j        | j	                  | _        t          j        |j        | j	        d          | _        t          j        |j        | j	                  | _        t          j        |j                  | _        |rt%          ||          | _        d S d | _        d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .F)biasr   )rB   rC   rT   rX   num_attention_headshasattrr   r   attention_head_sizeall_head_sizer	   Linearquerykeyvaluerg   attention_probs_dropout_probri   BeitRelativePositionBiasrelative_position_biasrD   rT   r   rE   s      r*   rC   zBeitSelfAttention.__init__  sV    ::a??PVXhHiHi?76#5"7 7 737 7 7  
 $*#= #&v'9F<V'V#W#W !58PPYv143EFF
9V/1C%PPPYv143EFF
z&"EFF 	/*B6Wb*c*c*cD'''*.D'''r)   c                     |                                 d d         | j        | j        fz   } |j        | }|                    dddd          S )Nrn   r   ro   r   r   )rr   r   r   r}   rz   )rD   xnew_x_shapes      r*   transpose_for_scoresz&BeitSelfAttention.transpose_for_scores1  sM    ffhhssmt'?AY&ZZAFK yyAq!$$$r)   FrF   	head_maskoutput_attentionsr   r   r   
resolutionc                    |                      |          }|                     |                     |                    }|                     |                     |                    }	|                     |          }
t	          j        |
|                    dd                    }|t          j        | j	                  z  }| j
        I|\  }}|| j        j        z  || j        j        z  f}|| 
                    |||j        d                   z   }|||z   }t          j                            |d          }|                     |          }|||z  }t	          j        ||	          }|                    dddd                                          }|                                d d         | j        fz   } |j        | }|r||fn|f}|S )	Nrn   r   )dim_sizeru   r   ro   r   )r   r   r   r   r6   matmulr   mathsqrtr   r   rT   r^   r4   r	   r{   softmaxri   rz   
contiguousrr   r   r}   )rD   rF   r   r   r   r   r   mixed_query_layer	key_layervalue_layerquery_layerattention_scoresrk   rl   r   attention_probscontext_layernew_context_layer_shapeoutputss                      r*   rI   zBeitSelfAttention.forward6  s    !JJ}55--dhh}.E.EFF	//

=0I0IJJ//0ABB !<Y5H5HR5P5PQQ+di8P.Q.QQ &2&MFE!T[%;;UdkF\=\]K/$2M2M5@STU@V 3N 3 3  
 "-/2HH -//0@b/II ,,77  -	9O_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S**,CD6G]=/22mM]r)   rA   NFNFN)r$   r%   r&   r    r   tuplerC   r   r6   r   r   r   r   r   rI   rP   rQ   s   @r*   r   r     s       / /z / /SW / / / / / /0% % % -1"'GK).+/3 3|3 EL)3  	3
 !))C D3 #'3 U3Z(3 
uU\"E%,*D$EE	F3 3 3 3 3 3 3 3r)   r   c                   `     e Zd ZdZdeddf fdZd	dej        dej        dej        fdZ xZ	S )
BeitSelfOutputz
    The residual connection is defined in BeitLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    rT   r/   Nc                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _        d S rA   )	rB   rC   r	   r   rX   denserg   rh   ri   rD   rT   rE   s     r*   rC   zBeitSelfOutput.__init__r  sJ    Yv163EFF
z&"<==r)   rF   input_tensorc                 Z    |                      |          }|                     |          }|S rA   r   ri   )rD   rF   r   gammas       r*   rI   zBeitSelfOutput.forwardw  *    

=11]33r)   rA   )
r$   r%   r&   r'   r    rC   r6   r   rI   rP   rQ   s   @r*   r   r   l  s         
>z >d > > > > > >
 U\  ^c^j        r)   r   c                        e Zd Zddedee         ddf fdZd Z	 	 	 	 	 ddej	        d	eej	                 d
e
ded         de
deee                  deeej	                 eej	        ej	        f         f         fdZ xZS )BeitAttentionNrT   r   r/   c                     t                                                       t          ||          | _        t	          |          | _        t                      | _        d S )Nr   )rB   rC   r   	attentionr   r<   setpruned_headsr   s      r*   rC   zBeitAttention.__init__  sN    *6{KKK$V,,EEr)   c                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   ru   )lenr   r   r   r   r   r   r   r   r   r<   r   r   union)rD   headsindexs      r*   prune_headszBeitAttention.prune_heads  s   u::??F74>5t~7Y[_[l
 
u
  2$.2FNN/0BEJJ1$.2FNN.t{/@%QOOO .2^-ORUV[R\R\-\*'+~'IDNLn'n$ -33E::r)   FrF   r   r   r   r   r   r   c                     |                      ||||||          }|                     |d         |          }|f|dd          z   }	|	S )Nr   r   )r   r<   )
rD   rF   r   r   r   r   r   self_outputsattention_outputr   s
             r*   rI   zBeitAttention.forward  s]     ~~9&79OQiku
 
  ;;|AFF#%QRR(88r)   rA   r   )r$   r%   r&   r    r   r   rC   r   r6   r   r   r   r   r   rI   rP   rQ   s   @r*   r   r   ~  s       " "z " "SW " " " " " "; ; ;* -1"'GK).+/ | EL)  	
 !))C D #' U3Z( 
uU\"E%,*D$EE	F       r)   r   c                   L     e Zd Zdeddf fdZdej        dej        fdZ xZS )BeitIntermediaterT   r/   Nc                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S rA   )rB   rC   r	   r   rX   intermediate_sizer   r_   
hidden_actrO   r   intermediate_act_fnr   s     r*   rC   zBeitIntermediate.__init__  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r)   rF   c                 Z    |                      |          }|                     |          }|S rA   )r   r   rH   s     r*   rI   zBeitIntermediate.forward  s,    

=1100??r)   	r$   r%   r&   r    rC   r6   r   rI   rP   rQ   s   @r*   r   r     sq        9z 9d 9 9 9 9 9 9U\ el        r)   r   c                   L     e Zd Zdeddf fdZdej        dej        fdZ xZS )
BeitOutputrT   r/   Nc                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _	        d S rA   )
rB   rC   r	   r   r   rX   r   rg   rh   ri   r   s     r*   rC   zBeitOutput.__init__  sJ    Yv79KLL
z&"<==r)   rF   c                 Z    |                      |          }|                     |          }|S rA   r   rH   s     r*   rI   zBeitOutput.forward  r   r)   r   rQ   s   @r*   r   r     sq        >z >d > > > > > >
U\ el        r)   r   c                        e Zd ZdZddedee         deddf fdZ	 	 	 	 	 dd
e	j
        dee	j
                 deded         dedeee                  deee	j
                 ee	j
        e	j
        f         f         fdZ xZS )	BeitLayerz?This corresponds to the Block class in the timm implementation.Nr+   rT   r   drop_path_rater/   c                    t                                                       |j        | _        d| _        t	          ||          | _        t          |          | _        t          |          | _	        t          j        |j        |j                  | _        |dk    rt          |          nt          j                    | _        t          j        |j        |j                  | _        |j        }|dk    rlt          j        |t+          j        |j                  z  d          | _        t          j        |t+          j        |j                  z  d          | _        d S d\  | _        | _        d S )	Nr   r   epsr+   r   T)requires_grad)NN)rB   rC   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r<   r	   	LayerNormrX   layer_norm_epslayernorm_beforer?   Identityr=   layernorm_afterlayer_scale_init_valuerV   r6   oneslambda_1lambda_2)rD   rT   r   r   init_valuesrE   s        r*   rC   zBeitLayer.__init__  s:   '-'E$&v;GGG,V44 (( "V-?VEZ [ [ [9G#9M9Mn555SUS^S`S`!|F,>FDYZZZ3??Luz6CU7W7W)WgklllDMLuz6CU7W7W)WgklllDMMM+5(DM4===r)   FrF   r   r   r   r   r   r   c                    |                      |                     |          |||||          }|d         }|dd          }	| j        
| j        |z  }|                     |          |z   }|                     |          }
|                     |
          }
|                     |
          }
| j        
| j        |
z  }
|                     |
          |z   }
|
f|	z   }	|	S )N)r   r   r   r   r   r   )r   r  r
  r=   r  r  r<   r  )rD   rF   r   r   r   r   r   self_attention_outputsr   r   layer_outputs              r*   rI   zBeitLayer.forward  s     "&!!-00/#9%=! "0 "
 "
 2!4(, =$#}/?? '788=H ++M::((66{{<00=$=<7L ~~l33mC/G+r)   )Nr+   r   )r$   r%   r&   r'   r    r   r   rN   rC   r6   r   r   r   r   r   rI   rP   rQ   s   @r*   r   r     s       II6 6z 6 6`e 6pt 6 6 6 6 6 6* -1"'GK).+/) )|) EL))  	)
 !))C D) #') U3Z() 
uU\"E%,*D$EE	F) ) ) ) ) ) ) )r)   r   c                   r     e Zd Zdededdf fdZdeeef         dej	        fdZ
d
dedej	        fd	Z xZS )r   rT   r   r/   Nc                    t                                                       || _        d|d         z  dz
  d|d         z  dz
  z  dz   | _        t	          j        t          j        | j        |j                            | _	        i | _
        d S )Nro   r   r   r   )rB   rC   r   num_relative_distancer	   rV   r6   rW   r   relative_position_bias_tablerelative_position_indicesr   s      r*   rC   z!BeitRelativePositionBias.__init__	  s    &&'+a.&81&<[QR^ASVWAW%X[\%\",.LK2F4NOO-
 -
)
 *,&&&r)   c                 ^   d|d         z  dz
  d|d         z  dz
  z  dz   }|d         |d         z  }t          j        t          j        |d                   t          j        |d                   d          }t          j        |          }t          j        |d          }|dddddf         |dddddf         z
  }|                    ddd                                          }|dddddfxx         |d         dz
  z  cc<   |dddddfxx         |d         dz
  z  cc<   |dddddfxx         d|d         z  dz
  z  cc<   t          j        |dz   fdz  |j                  }|	                    d	          |ddddf<   |dz
  |dddf<   |dz
  |dddf<   |dz
  |d
<   |S )z
        This method creates the relative position index, modified to support arbitrary window sizes,
        as introduced in [MiDaS v3.1](https://arxiv.org/abs/2307.14460).
        ro   r   r   r   ij)indexingN)rr   r2   rn   )r   r   )
r6   meshgridarangestackr   rz   r   rW   r2   sum)	rD   r   r  window_areagridcoordscoords_flattenrelative_coordsrelative_position_indexs	            r*    generate_relative_position_indexz9BeitRelativePositionBias.generate_relative_position_index  s   
 "#[^!3a!7AA<NQR<R SVW W "!n{1~5~el;q>::ELUV<X<XcghhhT""vq11(AAAt4~aaaqqqj7QQ)11!Q::EEGG111a   KNQ$66   111a   KNQ$66   111a   AA$6$::   "'+K!O3E3IQ`Qf"g"g"g*9*=*=b*A*AABB')>)B122&)>)BA&(=(A%&&r)   Fr   c                    d| j         d         z  dz
  }d| j         d         z  dz
  }d|d         z  dz
  }d|d         z  dz
  }| j        }| j        }	||z  dz   }
|d|	dz
           }|                    d||d                              dddd          }t
          j                            |t          |          t          |          fd          }|                    dddd                              |
dz
  d          }t          j
        |||	dz
  d         g          }|}|| j                                        vr|                     |          | j        |<   || j        |                             d                   }|                    |d         |d         z  dz   |d         |d         z  dz   d          }|                    ddd                                          }|rKt
          j                            |                    d          ||fdd	
                              d          }|                    d          S )zu
        Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes.
        ro   r   r   r   Nrn   bilinearr   Frq   )r   r  r  ry   rz   r	   r{   r|   r   r6   r~   r  keysr"  r}   r   r   squeeze)rD   r   r   r   
old_height	old_widthr   r    old_relative_position_bias_tableold_num_relative_distancenew_num_relative_distanceold_sub_tablenew_sub_table new_relative_position_bias_tabler   r   s                   r*   rI   z BeitRelativePositionBias.forward,  s    )!,,q0
(++a/	Q'!+
A&*	+/+L($($>!$.$:Q$>!89X;TWX;X9XY%--aJKKSSTUWXZ[]^__11:!6!6	)8L8L MT^ 2 
 
 &--aAq99AAB[^_B_acdd+09<=VYZ=Z=\=\]^,
 ,
( d499;;;;262W2WXc2d2dD*3/!A$B`adBeBjBjkmBnBn!o!7!<!<N[^+a/Q+a.1PST1TVX"
 "
 "8!?!?1a!H!H!S!S!U!U# 	%']%>%>&0033)#	 &? & &
 gajj # &//222r)   )FN)r$   r%   r&   r    r   rC   r   r   r6   r   r"  r   rI   rP   rQ   s   @r*   r   r     s        	,z 	, 	,$ 	, 	, 	, 	, 	, 	,'E#s(O 'PUP\ ' ' ' '0/3 /3T /3]b]i /3 /3 /3 /3 /3 /3 /3 /3r)   r   c                        e Zd Zddedee         ddf fdZ	 	 	 	 	 	 ddej        d	eej                 d
e	de	de	dee
e                  de	deeef         fdZ xZS )BeitEncoderNrT   r   r/   c                 z   t                                                       | _        j        rt	                    | _        nd | _        d t          j        dj        j	                  D             t          j        fdt          j	                  D                       | _        d| _        d S )Nr   c                 6    g | ]}|                                 S r(   )item).0r   s     r*   
<listcomp>z(BeitEncoder.__init__.<locals>.<listcomp>h  s     dddAqvvxxdddr)   r   c                 R    g | ]#}t          j        rnd |                   $S )N)r   r   )r   use_relative_position_bias)r4  irT   dprr   s     r*   r5  z(BeitEncoder.__init__.<locals>.<listcomp>j  sT         /5/P ZVZ#&q6    r)   F)rB   rC   rT   !use_shared_relative_position_biasr   r   r6   linspacer   num_hidden_layersr	   
ModuleListrangelayergradient_checkpointing)rD   rT   r   r9  rE   s    ``@r*   rC   zBeitEncoder.__init___  s    3 	/*B6Wb*c*c*cD''*.D' ed63H&Jb!c!cddd]      v788  	
 	

 ',###r)   FTrF   r   r   output_hidden_statesr   r   return_dictc           	      (   |rdnd }|rdnd }	t          | j                  D ]\  }
}|r||fz   }|||
         nd }| j        r%| j        r|                     |j        |||          }n_|\  }}|| j        j        z  || j        j        z  f}| j        #|                     |||j	        d                   nd } |||||||          }|d         }|r|	|d         fz   }	|r||fz   }|st          d |||	fD                       S t          |||	          S )Nr(   r   )r   r   r   c              3      K   | ]}||V  	d S rA   r(   )r4  vs     r*   	<genexpr>z&BeitEncoder.forward.<locals>.<genexpr>  s(      mmq_`_l_l_l_l_lmmr)   )last_hidden_staterF   
attentions)	enumerater?  r@  r.   _gradient_checkpointing_func__call__rT   r^   r   r4   r   r   )rD   rF   r   r   rA  r   r   rB  all_hidden_statesall_self_attentionsr8  layer_modulelayer_head_masklayer_outputsrk   rl   r   r   s                     r*   rI   zBeitEncoder.forwardu  s    #7@BBD$5?bb4(44 #	P #	POA|# I$58H$H!.7.CillO* t}  $ A A )!#%	! ! !+%)??$+J`A`a
 2> //#>Vanatuvaw 0     ' !-!#%*,! ! *!,M  P&9]1=M<O&O# 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r)   rA   )NFFFNT)r$   r%   r&   r    r   r   rC   r6   r   r   r   r   r   r   rI   rP   rQ   s   @r*   r0  r0  ^  s        , ,z , ,SW , , , , , ,2 -1"'%*).+/ ;
 ;
|;
 EL);
  	;

 #;
 #';
 U3Z(;
 ;
 
uo%	&;
 ;
 ;
 ;
 ;
 ;
 ;
 ;
r)   r0  c                   4    e Zd ZdZeZdZdZdZdgZ	dgZ
d ZdS )	BeitPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    beitr   Tr   z.*relative_position_index.*c                    t          |t          j        t          j        t          j        f          rT|j        j                            d| j        j	                   |j
         |j
        j                                         dS dS t          |t          j                  r_|j        j                            d| j        j	                   |j        +|j        j        |j                                                  dS dS t          |t          j                  r?|j
        j                                         |j        j                            d           dS dS )zInitialize the weightsr+   )meanstdNg      ?)r_   r	   r   r   ConvTranspose2dweightdatanormal_rT   initializer_ranger   zero_	Embeddingpadding_idxr  fill_)rD   modules     r*   _init_weightsz!BeitPreTrainedModel._init_weights  s5   fry")R5GHII 	* M&&CT[5R&SSS{& &&((((( '&-- 	*M&&CT[5R&SSS!-"6#56<<>>>>> .--- 	*K""$$$M$$S)))))	* 	*r)   N)r$   r%   r&   r'   r    config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules"_keys_to_ignore_on_load_unexpectedra  r(   r)   r*   rR  rR    sV         
 L$O&*#$*H)I&* * * * *r)   rR  aF  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`BeitConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`BeitImageProcessor.__call__`] for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
            Whether to interpolate the pre-trained position encodings.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z^The bare Beit Model transformer outputting raw hidden-states without any specific head on top.c                   (    e Zd Zddededdf fdZd Zd Z ee	           e
eeed	e
          	 	 	 	 	 	 ddej        deej                 deej                 dee         dee         dedee         deeef         fd                        Z xZS )	BeitModelTrT   add_pooling_layerr/   Nc                    t                                          |           || _        t          |          | _        t          || j        j        j                  | _        |j	        rt          j                    nt          j        |j        |j                  | _        |rt!          |          nd | _        |                                  d S )Nr   r   )rB   rC   rT   rS   rj   r0  r]   r   encoderuse_mean_poolingr	   r  r  rX   r  	layernorm
BeitPoolerpooler	post_init)rD   rT   rj  rE   s      r*   rC   zBeitModel.__init__  s       (00"6t7W7cddd $4uBKMMM",vGY_e_t:u:u:u 	 ->Gj(((4 	r)   c                     | j         j        S rA   rj   r]   rL   s    r*   get_input_embeddingszBeitModel.get_input_embeddings	      //r)   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrl  r?  r   r   )rD   heads_to_pruner?  r   s       r*   _prune_headszBeitModel._prune_heads  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr)   vision)
checkpointoutput_typerb  modalityexpected_outputFr   r   r   r   rA  r   rB  c           	         ||n| j         j        }||n| j         j        }||n| j         j        }|                     || j         j                  }|                     |||          \  }}	|j        dd         }
|                     |||||
||          }|d         }| 	                    |          }| j
        | 
                    |          nd}|s|||fn|f}||dd         z   S t          |||j        |j                  S )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        N)r   r   ro   )r   r   rA  r   rB  r   r   r   )rG  pooler_outputrF   rH  )rT   r   rA  use_return_dictget_head_maskr<  rj   r4   rl  rn  rp  r#   rF   rH  )rD   r   r   r   r   rA  r   rB  embedding_outputr   r   encoder_outputssequence_outputpooled_outputhead_outputss                  r*   rI   zBeitModel.forward  sd   , 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] &&y$+2OPP	"oo/Tl . 
 
! "'+
,,/!5!#%= ' 
 
 *!,..998<8OO444UY 	6?L?XO];;_n^pL/!"""555)-')7&1	
 
 
 	
r)   )T)NNNNFN)r$   r%   r&   r    r   rC   rt  ry  r   BEIT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr#   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr6   r   r   r   r   r   rI   rP   rQ   s   @r*   ri  ri    sc       
 z d d      0 0 0C C C +*+@AA&.$.   7;,0,0/3).&*6
 6
l6
 "%"236
 EL)	6

 $D>6
 'tn6
 #'6
 d^6
 
u00	16
 6
 6
  BA6
 6
 6
 6
 6
r)   ri  c                   L     e Zd Zdeddf fdZdej        dej        fdZ xZS )ro  rT   r/   Nc                     t                                                       |j        r t          j        |j        |j                  nd | _        d S )Nr   )rB   rC   rm  r	   r  rX   r  rn  r   s     r*   rC   zBeitPooler.__init__V  sJ    KQKblBL+1FGGGGhl 	r)   rF   c                     | j         :|d d dd d d f         }|                      |                    d                    }n|d d df         }|S )Nr   r   )rn  rU  )rD   rF   patch_tokensr  s       r*   rI   zBeitPooler.forward\  sa    >%(ABB2L NN<+<+<Q+?+?@@MM *!!!Q$/Mr)   r   rQ   s   @r*   ro  ro  U  sq        
z 
d 
 
 
 
 
 
	U\ 	el 	 	 	 	 	 	 	 	r)   ro  a  Beit Model transformer with a 'language' modeling head on top. BEiT does masked image modeling by predicting
    visual tokens of a Vector-Quantize Variational Autoencoder (VQ-VAE), whereas other vision models like ViT and DeiT
    predict RGB pixel values. As a result, this class is incompatible with [`AutoModelForMaskedImageModeling`], so you
    will need to use [`BeitForMaskedImageModeling`] directly if you wish to do masked image modeling with BEiT.c                   :    e Zd Zdeddf fdZ ee           eee	          	 	 	 	 	 	 	 	 dde
ej                 de
ej                 d	e
ej                 d
e
ej                 de
e         de
e         dede
e         deeef         fd                        Z xZS )BeitForMaskedImageModelingrT   r/   Nc                 H   t                                          |           |j        | _        t          |d          | _        t          j        |j        |j                  | _	        t          j
        |j        |j                  | _        |                                  d S )NFrj  r   )rB   rC   
num_labelsri  rS  r	   r  rX   r  rn  r   
vocab_sizelm_headrq  r   s     r*   rC   z#BeitForMaskedImageModeling.__init__p  s        +f>>>	 f&8f>STTTy!3V5FGG 	r)   r|  rb  Fr   r   r   labelsr   rA  r   rB  c	           	         ||n| j         j        }|                     |||||||          }	|	d         }
|                     |
          }
|                     |
ddddf                   }d}| t                      } |||         |          }|s|f|	dd         z   }||f|z   n|S t          |||	j        |	j                  S )a  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, BeitForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
        >>> model = BeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, logits = outputs.loss, outputs.logits
        >>> list(logits.shape)
        [1, 196, 8192]
        ```N)r   r   r   rA  r   rB  r   r   losslogitsrF   rH  )	rT   r  rS  rn  r  r   r   rF   rH  )rD   r   r   r   r  r   rA  r   rB  r   r  prediction_scoresmasked_lm_lossloss_fctr<   s                  r*   rI   z"BeitForMaskedImageModeling.forward}  s   ` &1%<kk$+B]))+/!5%=#  
 
 "!*..99 LLABB)?@@'))H%X&7&H&QQN 	Z')GABBK7F3A3M^%..SYY$!/)	
 
 
 	
r)   )NNNNNNFN)r$   r%   r&   r    rC   r   r  r   r   r  r   r6   r   r   r   r   r   rI   rP   rQ   s   @r*   r  r  h  sQ       z d       +*+@AA>XXX 046:,0)-,0/3).&*L
 L
u|,L
 "%"23L
 EL)	L

 &L
 $D>L
 'tnL
 #'L
 d^L
 
un$	%L
 L
 L
 YX BAL
 L
 L
 L
 L
r)   r  z
    Beit Model transformer with an image classification head on top (a linear layer on top of the average of the final
    hidden states of the patch tokens) e.g. for ImageNet.
    c                   "    e Zd Zdeddf fdZ ee           eee	e
e          	 	 	 	 	 	 	 ddeej                 deej                 d	eej                 d
ee         dee         dedee         deee	f         fd                        Z xZS )BeitForImageClassificationrT   r/   Nc                 :   t                                          |           |j        | _        t          |d          | _        |j        dk    rt          j        |j        |j                  nt          j                    | _	        | 
                                 d S )NTr  r   )rB   rC   r  ri  rS  r	   r   rX   r  
classifierrq  r   s     r*   rC   z#BeitForImageClassification.__init__  s        +f===	 OUN_bcNcNc")F$68IJJJikitiviv 	r)   )r{  r|  rb  r~  Fr   r   r  r   rA  r   rB  c                    ||n| j         j        }|                     ||||||          }|r|j        n|d         }	|                     |	          }
d}|Z| j         j        f| j        dk    rd| j         _        nN| j        dk    r7|j        t          j	        k    s|j        t          j
        k    rd| j         _        nd| j         _        | j         j        dk    rWt                      }| j        dk    r1 ||
                                |                                          }n ||
|          }n| j         j        dk    rGt                      } ||
                    d| j                  |                    d                    }n*| j         j        dk    rt                      } ||
|          }|s|
f|dd         z   }||f|z   n|S t!          ||
|j        |j        	          S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   rA  r   rB  r   
regressionsingle_label_classificationmulti_label_classificationrn   ro   r  )rT   r  rS  r  r  problem_typer  r2   r6   longr   r   r&  r   r}   r
   r   rF   rH  )rD   r   r   r  r   rA  r   rB  r   r  r  r  r  r<   s                 r*   rI   z"BeitForImageClassification.forward  s   . &1%<kk$+B]))/!5%=#  
 
 2=L--'!*//{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FY,F)-)9TGf$$vE$!/)	
 
 
 	
r)   NNNNNFN)r$   r%   r&   r    rC   r   r  r   _IMAGE_CLASS_CHECKPOINTr   r  _IMAGE_CLASS_EXPECTED_OUTPUTr   r6   r   r   r   r   rI   rP   rQ   s   @r*   r  r    s:       
z 
d 
 
 
 
 
 
 +*+@AA*)$4	   04,0)-,0/3).&*=
 =
u|,=
 EL)=
 &	=

 $D>=
 'tn=
 #'=
 d^=
 
u++	,=
 =
 =
  BA=
 =
 =
 =
 =
r)   r  c                        e Zd ZdZ	 	 	 ddededeeeeef         f         deeeeef         ef         d	ed
eeeeef         f         ddf fdZ	de
j        de
j        fdZ xZS )BeitConvModuleaD  
    A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
    layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    r   Fr   in_channelsout_channelsr   paddingr   dilationr/   Nc                     t                                                       t          j        ||||||          | _        t          j        |          | _        t          j                    | _        d S )N)r  r  r   r  r   r  )	rB   rC   r	   r   convBatchNorm2dbnReLU
activation)rD   r  r  r   r  r   r  rE   s          r*   rC   zBeitConvModule.__init__1  si     	I#%#
 
 
	 ...'))r)   r,   c                     |                      |          }|                     |          }|                     |          }|S rA   )r  r  r  )rD   r,   r<   s      r*   rI   zBeitConvModule.forwardF  s8    5!!((r)   )r   Fr   )r$   r%   r&   r'   r   r   r   rO   r   rC   r6   r   rI   rP   rQ   s   @r*   r  r  )  s          5601$ $$ $ 3c3h/0	$
 sE#s(OS01$ $ U38_,-$ 
$ $ $ $ $ $*U\ el        r)   r  c                   T     e Zd Zdedededdf fdZdej        dej        fdZ xZS )	BeitPyramidPoolingBlock
pool_scaler  channelsr/   Nc                    t                                                       t          j        |          t	          ||d          g| _        t          | j                  D ](\  }}|                     t          |          |           )d S )Nr   r   )	rB   rC   r	   AdaptiveAvgPool2dr  layersrI  
add_modulerO   )rD   r  r  r  r8  r?  rE   s         r*   rC   z BeitPyramidPoolingBlock.__init__O  s     ,,;a@@@
 "$+.. 	+ 	+HAuOOCFFE****	+ 	+r)   r,   c                 4    |}| j         D ]} ||          }|S rA   )r  )rD   r,   hidden_stater?  s       r*   rI   zBeitPyramidPoolingBlock.forwardX  s/    [ 	/ 	/E 5..LLr)   )	r$   r%   r&   r   rC   r6   r   rI   rP   rQ   s   @r*   r  r  N  s        +3 +S +C +D + + + + + +U\ el        r)   r  c            
       x     e Zd ZdZdeedf         dedededdf
 fd	Zd
ej	        de
ej	                 fdZ xZS )BeitPyramidPoolingModulea  
    Pyramid Pooling Module (PPM) used in PSPNet.

    Args:
        pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
            Module.
        in_channels (int): Input channels.
        channels (int): Channels after modules, before conv_seg.
        align_corners (bool): align_corners argument of F.interpolate.

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    pool_scales.r  r  rt   r/   Nc                 V   t                                                       || _        || _        || _        || _        g | _        t          |          D ]T\  }}t          |||          }| j        	                    |           | 
                    t          |          |           Ud S )N)r  r  r  )rB   rC   r  rt   r  r  blocksrI  r  appendr  rO   )	rD   r  r  r  rt   r8  r  blockrE   s	           r*   rC   z!BeitPyramidPoolingModule.__init__m  s    &*& &{33 	+ 	+MAz+z{emnnnEKu%%%OOCFFE****	+ 	+r)   r   c                     g }| j         D ]d} ||          }t          j                            ||                                dd          d| j                  }|                    |           e|S )Nro   r$  rq   )r  r	   r{   r|   rr   rt   r  )rD   r   ppm_outsppmppm_outupsampled_ppm_outs         r*   rI   z BeitPyramidPoolingModule.forwardy  s{    ; 	/ 	/Cc!ffG " 9 9affhhqrrl4K] !: ! ! OO-....r)   )r$   r%   r&   r'   r   r   r   rC   r6   r   r   rI   rP   rQ   s   @r*   r  r  _  s         
+E#s(O 
+# 
+QT 
+ei 
+nr 
+ 
+ 
+ 
+ 
+ 
+ $u|*<        r)   r  c                   V     e Zd ZdZdeddf fdZd Zdej        dej        fdZ	 xZ
S )	BeitUperHeadz
    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
    [UPerNet](https://arxiv.org/abs/1807.10221).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    rT   r/   Nc                    t                                                       |j        | _        |j        gdz  | _        |j        | _        d| _        t          j        | j        |j	        d          | _
        t          | j        | j        d         | j        | j                  | _        t          | j        d         t          | j                  | j        z  z   | j        dd          | _        t          j                    | _        t          j                    | _        | j        d d         D ]j}t          || j        d          }t          | j        | j        dd          }| j                            |           | j                            |           kt          t          | j                  | j        z  | j        dd          | _        d S )	N   Fr   r  rn   )rt   r   r   r  )rB   rC   r  rX   r  r  rt   r	   r   r  r  r  psp_modulesr  r   
bottleneckr=  lateral_convs	fpn_convsr  fpn_bottleneck)rD   rT   r  l_convfpn_convrE   s        r*   rC   zBeitUperHead.__init__  s   !-"./!3*")DM63DRSTTT 4R M,	
 
 
 )R 3t'7#8#84=#HHM	
 
 
  ]__+CRC0 	, 	,K#KANNNF%dmT]PQ[\]]]H%%f---N!!(++++, !!DM1M	
 
 
r)   c                     |d         }|g}|                     |                     |                     t          j        |d          }|                     |          }|S )Nrn   r   ru   )extendr  r6   r~   r  )rD   inputsr   psp_outsr<   s        r*   psp_forwardzBeitUperHead.psp_forward  s\    2J3((++,,,9X1---**r)   encoder_hidden_statesc                 B    fdt           j                  D                                                                           t	                    }t          |dz
  dd          D ]Z}|dz
           j        dd          }|dz
           t          j        	                    |         |d j
                  z   |dz
  <   [ fdt          |dz
            D             }|                    d                    t          |dz
  dd          D ]F}t          j        	                    ||         |d         j        dd          d j
                  ||<   Gt          j        |d	          }                     |          }                     |          }|S )
Nc                 8    g | ]\  }} ||                   S r(   r(   )r4  r8  lateral_convr  s      r*   r5  z(BeitUperHead.forward.<locals>.<listcomp>  s-    pppq,LL!6q!9::pppr)   r   r   rn   ro   r$  rq   c                 H    g | ]} j         |         |                   S r(   )r  )r4  r8  lateralsrD   s     r*   r5  z(BeitUperHead.forward.<locals>.<listcomp>  s/    \\\q%DN1%hqk22\\\r)   ru   )rI  r  r  r  r   r>  r4   r	   r{   r|   rt   r6   r~   r  r  )rD   r  used_backbone_levelsr8  
prev_shapefpn_outsr<   r  s   ``     @r*   rI   zBeitUperHead.forward  s   ppppR[\`\nRoRoppp(()>??@@@  #8}}+a/B77 	 	A!!a%.qrr2J&q1uo0I0I*:TM_ 1J 1 1 HQUOO
 ]\\\\EBVYZBZ<[<[\\\%%%+a/B77 	 	A-33(1+"3ABB"7jX\Xj 4  HQKK 9X1---$$X..((r)   )r$   r%   r&   r'   r    rC   r  r6   r   rI   rP   rQ   s   @r*   r  r    s         $
z $
d $
 $
 $
 $
 $
 $
L  U\ el        r)   r  c                        e Zd ZdZ	 ddedededeeeeef         f         d	d
f
 fdZde	j
        d	e	j
        fdZ xZS )BeitFCNHeada  
    Fully Convolution Networks for Semantic Segmentation. This head is implemented of
    [FCNNet](https://arxiv.org/abs/1411.4038>).

    Args:
        config (BeitConfig): Configuration.
        in_channels
        kernel_size (int): The kernel size for convs in the head. Default: 3.
        dilation (int): The dilation rate for convs in the head. Default: 1.


    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    ro   r   r   rT   in_indexr   r  r/   Nc           
         t                                                       |j        | _        |j        | _        |j        | _        |j        | _	        || _
        |dz  |z  }g }|                    t          | j        | j        |||                     t          | j        dz
            D ]3}|                    t          | j        | j        |||                     4| j        dk    rt          j                    | _        nt          j        | | _        | j	        r-t          | j        | j        z   | j        ||dz            | _        t          j        | j        |j        d          | _        d S )Nro   )r   r  r  r   r   r  r  )rB   rC   rX   r  auxiliary_channelsr  auxiliary_num_convs	num_convsauxiliary_concat_inputconcat_inputr  r  r  r>  r	   r  convs
Sequentialconv_catr   r  r  )	rD   rT   r  r   r  conv_paddingr  r8  rE   s	           r*   rC   zBeitFCNHead.__init__  s    	!-13"9 #q(H4 $-[R^iq  	
 	
 	

 t~)** 	 	ALLM4=kS_jr     
 >QDJJ.DJ 	* 4=0$-[bmqrbr  DM )DM63DRSTTTr)   r  c                     || j                  }|                     |          }| j        r+|                     t	          j        ||gd                    }|                     |          }|S )Nr   ru   )r  r  r  r  r6   r~   r  )rD   r  rF   r<   s       r*   rI   zBeitFCNHead.forward
  sf    -dm<M** 	N]]59mV-D!#L#L#LMMF((r)   )ro   r   r   )r$   r%   r&   r'   r    r   r   r   rC   r6   r   rI   rP   rQ   s   @r*   r  r    s          tu U  U  U,/ UBE UUZ[^`efiknfn`o[oUp U	 U  U  U  U  U  UDU\ el        r)   r  zf
    Beit Model transformer with a semantic segmentation head on top e.g. for ADE20k, CityScapes.
    c                   $    e Zd Zdeddf fdZd Z ee           ee	e
          	 	 	 	 	 	 	 ddeej                 d	eej                 d
eej                 dee         dee         dedee         deee	f         fd                        Z xZS )BeitForSemanticSegmentationrT   r/   Nc                 P   t                                          |           |j        | _        t          |d          | _        t          | j        j                  dk    rt          d          t          j
        t          j        |j        |j        dd          t          j        |j                  t          j                    t          j        |j        |j        dd                    | _        t          j
        t          j        |j        |j        dd                    | _        t          j                    | _        t          j        dd          | _        t+          |          | _        |j        rt1          |          nd | _        |                                  d S )NFr  r  zBeitForSemanticSegmentation requires config.out_indices to be a list of 4 integers, specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of a base-sized architecture.ro   r   )rB   rC   r  ri  rS  r   rT   out_indicesr   r	   r  rW  rX   r  GELUfpn1fpn2r  fpn3	MaxPool2dfpn4r  decode_headuse_auxiliary_headr  auxiliary_headrq  r   s     r*   rC   z$BeitForSemanticSegmentation.__init__  sw       +f>>>	 t{&''1,,-  
 Mv163EST]^___N6-..GIIv163EST]^___	
 
	 Mv163EST]^___
 
	 KMM	LQq999	 (//5;5NXk&111TX 	r)   c                 Z   t           j                            ||j        dd          dd          }|0t           j                            ||j        dd          dd          }t	          | j        j                  } |||          }|}| |||          }	|| j        j        |	z  z  }|S )Nr   r$  Frq   )ignore_index)r	   r{   r|   r4   r   rT   semantic_loss_ignore_indexauxiliary_loss_weight)
rD   r  auxiliary_logitsr  upsampled_logitsupsampled_auxiliary_logitsr  	main_lossr  auxiliary_losss
             r*   compute_lossz(BeitForSemanticSegmentation.compute_loss;  s    =44bcc*5 5 
 
 ')+)B)B v|BCC'8zY^ *C * *& $1WXXXH-v66	'%X&@&IINDK5FFDr)   r  Fr   r   r  r   rA  r   rB  c                 \    ||n j         j        }||n j         j        }| j         j        dk    rt	          d                               |||d||          }|r|j        n|d         }	 fdt          |	          D             }
|j        d          j         j	         j         j
        z  fd|
D             }
 j         j         j         j        g}t          t!          |
                    D ]} ||         |
|                   |
|<                        |
          }d} j                             |
          }d}|                     |||          }|s)|r|f|dd         z   }n|f|d	d         z   }||f|z   n|S t)          |||r|j        nd|j        
          S )aV  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, BeitForSemanticSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")
        >>> model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr  c                 <    g | ]\  }}|d z   j         j        v |S r1   )rT   r  )r4  idxfeaturerD   s      r*   r5  z7BeitForSemanticSegmentation.forward.<locals>.<listcomp>  s5    wwwWTWZ[T[_c_j_vTvTvGTvTvTvr)   r   c                     g | ]>}|d d dd d d f                              ddd                              d          ?S )Nr   r   ro   rn   )rz   ry   )r4  r   r   patch_resolutions     r*   r5  z7BeitForSemanticSegmentation.forward.<locals>.<listcomp>  sd     
 
 
ijAaaaQQQhK1a((00RAQScdd
 
 
r)   ro   r  )rT   r  rA  r  r   rS  rF   rI  r4   r`   r^   r  r   r  r  r>  r   r  r  r  r   rH  )rD   r   r   r  r   rA  r   rB  r   r  featuresopsr8  r  r  r  r<   r   r  s   `                @@r*   rI   z#BeitForSemanticSegmentation.forwardN  sB   J &1%<kk$+B]$8$D  $+Jj 	 $+"8A"="=NOOO))/!%%=#  
 
 :E T 5 5'RS* xwww	:O0P0Pwww!'*
;1T[5KK
 
 
 
 
nv
 
 

 y$)TY	:s8}}%% 	. 	.A #a&!--HQKK!!(++*#228<<$$V-=vFFD 	F# 1 WQRR[0 WQRR[0)-)9TGf$$vE&3GQ'//T)	
 
 
 	
r)   r  )r$   r%   r&   r    rC   r  r   r  r   r   r  r   r6   r   r   r   r   rI   rP   rQ   s   @r*   r  r    sN       z d      @  & +*+@AA+BQ`aaa 04,0)-,0/3).&*Z
 Z
u|,Z
 EL)Z
 &	Z

 $D>Z
 'tnZ
 #'Z
 d^Z
 
u--	.Z
 Z
 Z
 ba BAZ
 Z
 Z
 Z
 Z
r)   r  zM
    BEiT backbone, to be used with frameworks like DETR and MaskFormer.
    c                        e Zd Z fdZd Z ee           eee	          	 	 	 dde
dee         dee         dee         d	ef
d
                        Z xZS )BeitBackbonec                    t                                                     t                                                     fdt          j        dz             D             | _        t                    | _        t          | j        j	        j
                  | _        j        rt          | j        j                  dk    rt!          d          j        }t%          j        t%          j        ||dd          t%          j        |j                  t%          j                    t%          j        ||dd                    | _        t%          j        t%          j        ||dd                    | _        t%          j                    | _        t%          j        dd          | _        |                                  d S )	Nc                     g | ]	}j         
S r(   )rX   )r4  r   rT   s     r*   r5  z)BeitBackbone.__init__.<locals>.<listcomp>  s    ]]]AV/]]]r)   r   r   r  zBeitBackbone requires config.out_indices to be a list of 4 integers, specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of a base-sized architecture.ro   r   r   )rB   rC   _init_backboner>  r<  num_featuresrS   rj   r0  r]   r   rl  add_fpnr   rT   r  r   rX   r	   r  rW  r  batch_norm_epsr  r  r   r  r  r  r  rq  )rD   rT   rX   rE   s    ` r*   rC   zBeitBackbone.__init__  s      v&&&]]]]v?WZ[?[9\9\]]](00"6t7W7cddd> 	>4;*++q00 1  
 !,K";STUUU{0EFFF		";STUUU	 DI b&8k_`ij&k&k&kllDIDI1===DI 	r)   c                     | j         j        S rA   rs  rL   s    r*   rt  z!BeitBackbone.get_input_embeddings  ru  r)   r  Nr   rA  r   rB  r/   c                    ||n| j         j        }||n| j         j        }||n| j         j        }|j        d         }|                     |          \  }\  }}|j        dd         }	|                     |d||	|          }
|r|
j        n|
d         }d}t          | j	        |          D ]`\  }}|| j
        v rR| j         j        r@|ddddddf         }|                    ddd          }|                    |d||          }||fz  }a| j         j        ry|                     |d                   |                     |d                   |                     |d                   |                     |d	                   g}t'          |          }|s!|r|f|
dd         z   }n|f|
dd         z   }|S t)          ||r|
j        nd|
j        
          S )aL  
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224")
        >>> model = AutoBackbone.from_pretrained(
        ...     "microsoft/beit-base-patch16-224", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 14, 14]
        ```Nr   ro   T)rA  r   r   rB  r   r(   rn   r   )feature_mapsrF   rH  )rT   r  rA  r   r4   rj   rl  rF   zipstage_namesout_featuresreshape_hidden_statesrz   ry   r  r  r   r  r  r   r   rH  )rD   r   rA  r   rB  r   r  r   r   r   r   rF   r#  stager  r<   s                   r*   rI   zBeitBackbone.forward  s>   F &1%<kk$+B]$8$D  $+Jj 	 2C1N--TXT_Tq!'*
8<8U8U55<!'+
,,!%/!#  
 
 2=L--'!*#&t'7#G#G 	0 	0E<)));4 c#/122qqq#9L#/#7#71a#@#@L#/#7#7
BVa#b#bL/; 	/		,q/**		,q/**		,q/**		,q/**	L !..L 	# 7&7122;6&7122;6M%3GQ'//T)
 
 
 	
r)   )NNN)r$   r%   r&   rC   rt  r   r  r   r   r  r   r   r   rI   rP   rQ   s   @r*   r  r    s            <0 0 0 +*+@AA>XXX 04,0&*S
 S
S
 'tnS
 $D>	S

 d^S
 
S
 S
 S
 YX BAS
 S
 S
 S
 S
r)   r  )r+   F)Rr'   collections.abcra   r   dataclassesr   typingr   r   r   r   r6   torch.utils.checkpointr   r	   torch.nnr
   r   r   activationsr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   r   r   r   utils.backbone_utilsr   configuration_beitr    
get_loggerr$   loggerr  r  r  r  r  r#   rN   r   r=   Moduler?   rS   r\   r   r   r   r   r   r   r   r0  rR  BEIT_START_DOCSTRINGr  ri  ro  r  r  r  r  r  r  r  r  r  r(   r)   r*   <module>r9     sy          ! ! ! ! ! ! / / / / / / / / / / / /              A A A A A A A A A A ! ! ! ! ! !                . - - - - - Q Q Q Q Q Q Q Q                2 1 1 1 1 1 * * * * * * 
	H	%	%  > &  < 1      !;   2 U\ e T V[Vb    (- - - - -29 - - - b7 b7 b7 b7 b7RY b7 b7 b7J27 27 27 27 27") 27 27 27jQ Q Q Q Q	 Q Q Qh    RY   $) ) ) ) )BI ) ) )X    ry    
 
 
 
 
 
 
 
> > > > >	 > > >BS3 S3 S3 S3 S3ry S3 S3 S3lR
 R
 R
 R
 R
") R
 R
 R
j* * * * */ * * *<	  2 d Y
 Y
 Y
 Y
 Y
# Y
 Y
	 Y
x       & s  \
 \
 \
 \
 \
!4 \
 \
 \
~   Q
 Q
 Q
 Q
 Q
!4 Q
 Q
 Q
h" " " " "RY " " "J    bi   "" " " " "ry " " "JR R R R R29 R R Rj8 8 8 8 8") 8 8 8v  	 P
 P
 P
 P
 P
"5 P
 P
 P
f  	 w
 w
 w
 w
 w
& w
 w
 w
 w
 w
r)   