"""PyTorch Hiera model."""

import math
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BackboneOutput,
    BaseModelOutput,
    BaseModelOutputWithPooling,
    ImageClassifierOutput,
    ModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
    torch_int,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_hiera import HieraConfig


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "HieraConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/hiera-tiny-224-hf"
_EXPECTED_OUTPUT_SHAPE = [1, 49, 768]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "facebook/hiera-tiny-224-in1k-hf"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
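# Usage sketch (added for illustration, not part of the upstream file): how the base
# checkpoint above is typically exercised; the asserted shape mirrors `_EXPECTED_OUTPUT_SHAPE`.
#
#     from transformers import AutoImageProcessor, HieraModel
#
#     processor = AutoImageProcessor.from_pretrained("facebook/hiera-tiny-224-hf")
#     model = HieraModel.from_pretrained("facebook/hiera-tiny-224-hf")
#     inputs = processor(images=image, return_tensors="pt")
#     outputs = model(**inputs)
#     assert list(outputs.last_hidden_state.shape) == [1, 49, 768]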
d:ZA G d; d<e          ZBd=ZCd>ZD G d? d@ej2                  ZE edAeCdB           G dC dDeB                      ZF G dE dFej2                  ZG G dG dHej2                  ZH edIeC           G dJ dKeB                      ZI edLeC           G dM dNeB                      ZJ edOeC           G dP dQeBe#                      ZKdS )SzPyTorch Hiera model.    N)	dataclass)DictListOptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BackboneOutputBaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputModelOutput)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings	torch_int)BackboneMixin   )HieraConfigr   zfacebook/hiera-tiny-224-hf)r   1   i   zfacebook/hiera-tiny-224-in1k-hfztabby, tabby catc                       e Zd ZU dZdZej        ed<   dZe	e
ej        df                  ed<   dZe	e
ej        df                  ed<   dZe	e
ej        df                  ed<   dS )HieraEncoderOutputa  
    Hiera encoder's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`. These are the unrolled hidden states of the model.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class HieraModelOutput(ModelOutput):
    """
    Hiera model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
            Average pooling of the last layer hidden-state.
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
            Tensor indicating which patches are masked (0) and which are not (1).
        ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Tensor containing the original index of the (shuffled) masked patches.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`. These are the unrolled hidden states of the model.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    last_hidden_state: torch.FloatTensor = None
    pooler_output: Optional[torch.FloatTensor] = None
    bool_masked_pos: torch.BoolTensor = None
    ids_restore: torch.LongTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class HieraForImageClassificationOutput(ImageClassifierOutput):
    """
    Hiera image classification outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, `optional`):
            Loss value for the training task.
        logits (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
            Prediction scores of the classification head (logits of the output layer).
        hidden_states (`tuple(torch.FloatTensor)`, `optional`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`. These are the unrolled hidden states of the model.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, `optional`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, `optional`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class HieraForPreTrainingOutput(ModelOutput):
    """
    Class for HieraForPreTraining's outputs, with potential hidden states and attentions.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`):
            Pixel reconstruction loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, patch_size ** 2 * num_channels)`):
            Pixel reconstruction logits.
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
            Tensor indicating which patches are masked (0) and which are not (1).
        ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Tensor containing the original index of the (shuffled) masked patches.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, height, width, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs reshaped to include the spatial dimensions.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    bool_masked_pos: torch.BoolTensor = None
    ids_restore: torch.LongTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None


class HieraPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """
    Fis_maec                 8   t                                                       t          |j                  | _        | j        dk    rt          d| j         d          |j        | _        |j        dd          | _        d t          |j        |j	                  D             | _
        d t          | j
        |j                  D             | _        |j        | _        || _        t          j        | j        |j        |j        |j	        |j                  | _        d S )N   zAThe number of dimensions of the input image should be 2, but got .c                     g | ]
\  }}||z  S r,   r,   .0iss      r.   
<listcomp>z1HieraPatchEmbeddings.__init__.<locals>.<listcomp>   s     $d$d$d1Q!V$d$d$dr-   c                     g | ]
\  }}||z  S r,   r,   rD   s      r.   rH   z1HieraPatchEmbeddings.__init__.<locals>.<listcomp>   s     "n"n"nda16"n"n"nr-   )kernel_sizestridepadding)super__init__len
patch_sizespatial_dims
ValueErrornum_channels
image_sizezippatch_stridetokens_spatial_shapemasked_unit_sizemask_spatial_shape
mask_ratior>   r	   Conv2d	embed_dimpatch_padding
projection)selfconfigr>   	__class__s      r.   rN   zHieraPatchEmbeddings.__init__   s      122!!uaearuuuvvv"/ +BCC0$d$dF<MvOb8c8c$d$d$d!"n"nc$:SU[Ul6m6m"n"n"n +))&(
 
 
r-   Npixel_valuesr2   returnc                 "   ||                      |          S |j        dd         } |j        |j        d         dg| j        R  }t          j                            |                                |          }|                      ||z            S )zZero-out the masked regions of the input before conv.
        Prevents leakage of masked regions when using overlapping kernels.
        Nr@   r   r   )size)r^   shapeviewrY   r	   
functionalinterpolatefloat)r_   rb   r2   target_sizes       r.   masked_convz HieraPatchEmbeddings.masked_conv   s     "??<000"(,./.|/A!/Dab$Jabbb-33O4I4I4K4KR]3^^|o=>>>r-   noisec                    |j         d         }t          j        | j                  }t	          |d| j        z
  z            }|t          j        |||j                  }t          j	        |d          }t          j	        |d          
                    |j                  }t          j        ||g|j                  }d|ddd|f<   t          j        |d|                                          }||fS )a  
        Perform per-sample random masking by per-sample shuffling. Per-sample shuffling is done by argsort random
        noise.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`)
            noise (`torch.FloatTensor` of shape `(batch_size, num_mask_units)`, *optional*) which is
                mainly used for testing purposes to control randomness and maintain the reproducibility
        r   r   N)devicedim)rq   index)rf   mathprodrY   intrZ   r)   randro   argsorttozerosgatherbool)	r_   rb   rm   
batch_sizenum_windowslen_keepids_shuffler3   r2   s	            r.   random_maskingz#HieraPatchEmbeddings.random_masking   s     "'*
i 788{a$/&9:;;=Jz;|?RSSSE mEq111mKQ777::<;NOO  +z;&?H[\\\()9H9%,A[QQQVVXX++r-   c                     | j         r|                     ||          nd\  }}|                     ||          }|                    d                              dd          }|||fS )Nrm   )NNr@   r   )r>   r   rl   flatten	transpose)r_   rb   rm   r2   r3   
embeddingss         r.   forwardzHieraPatchEmbeddings.forward   sw     ?Ck[DE:::| 	'+ %%lODD
''**44Q::
?K77r-   FN)r%   r&   r'   r(   r{   rN   r)   r*   r   r4   Tensorrl   r   r5   r   r   __classcell__ra   s   @r.   r=   r=      sO        
 
t 
 
 
 
 
 
, ^b? ?!-?@HIY@Z?	? ? ? ?$ UY ,  ,!- ,6>u?P6Q ,	u!11	2 ,  ,  ,  ,J .28 8'8 )*8 
u|Xe&67%BR9SS	T	8 8 8 8 8 8 8 8r-   r=   c                   "    e Zd ZdZddededdf fdZdej        d	ej        d
e	de	dej        f
dZ
dej        d
e	de	dedej        f
dZ	 	 ddej        deej                 dedeej        eej                 eej                 f         fdZ xZS )HieraEmbeddingsz2
class HieraEmbeddings(nn.Module):
    """
    Construct position and patch embeddings.
    """

    def __init__(self, config: HieraConfig, is_mae: bool = False) -> None:
        super().__init__()
        self.patch_stride = config.patch_stride
        tokens_spatial_shape = [i // s for i, s in zip(config.image_size, config.patch_stride)]
        self.mask_spatial_shape = [i // s for i, s in zip(tokens_spatial_shape, config.masked_unit_size)]
        self.num_tokens = math.prod(tokens_spatial_shape)
        self.is_mae = is_mae

        self.patch_embeddings = HieraPatchEmbeddings(config, is_mae=is_mae)

        self.position_embeddings = nn.Parameter(torch.zeros(1, self.num_tokens, config.embed_dim))

    def interpolate_pos_encoding(
        self, embeddings: torch.Tensor, pos_embeds: torch.Tensor, height: int, width: int
    ) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing, no class embeddings, and different patch strides.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1]
        num_positions = pos_embeds.shape[1]

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return pos_embeds

        dim = embeddings.shape[-1]

        new_height = height // self.patch_stride[0]
        new_width = width // self.patch_stride[1]

        sqrt_num_positions = torch_int(num_positions**0.5)
        pos_embeds = pos_embeds.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        pos_embeds = pos_embeds.permute(0, 3, 1, 2)

        pos_embeds = nn.functional.interpolate(
            pos_embeds,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        pos_embeds = pos_embeds.permute(0, 2, 3, 1).view(1, -1, dim)

        return pos_embeds

    def get_position_embedding(
        self, embeddings: torch.Tensor, height: int, width: int, interpolate_pos_encoding: bool
    ) -> torch.FloatTensor:
        return (
            self.interpolate_pos_encoding(embeddings, self.position_embeddings, height, width)
            if interpolate_pos_encoding
            else self.position_embeddings
        )

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        noise: Optional[torch.FloatTensor] = None,
        interpolate_pos_encoding: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.BoolTensor], Optional[torch.LongTensor]]:
        height, width = pixel_values.shape[-2:]
        embeddings, bool_masked_pos, ids_restore = self.patch_embeddings(pixel_values, noise=noise)
        embeddings = embeddings + self.get_position_embedding(embeddings, height, width, interpolate_pos_encoding)
        return embeddings, bool_masked_pos, ids_restore
class HieraMaskUnitAttention(nn.Module):
    """
    Computes either Mask Unit or Global Attention. Also is able to perform query pooling.

    Note: this assumes the tokens have already been flattened and unrolled into mask units.
    """

    def __init__(
        self,
        hidden_size: int,
        hidden_size_output: int,
        num_heads: int,
        query_stride: int = 1,
        window_size: int = 0,
        use_mask_unit_attn: bool = False,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.query_stride = query_stride
        self.hidden_size_output = hidden_size_output

        self.head_dim = hidden_size_output // num_heads
        self.scale = self.head_dim**-0.5

        self.qkv = nn.Linear(hidden_size, 3 * hidden_size_output)
        self.proj = nn.Linear(hidden_size_output, hidden_size_output)

        self.window_size = window_size
        self.use_mask_unit_attn = use_mask_unit_attn

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input should be of shape [batch, tokens, channels]."""
        batch_size, seq_len, _ = hidden_states.shape

        num_windows = 1
        if self.use_mask_unit_attn:
            num_windows = seq_len // (self.query_stride * self.window_size)

        qkv = self.qkv(hidden_states)
        qkv = qkv.reshape(batch_size, -1, num_windows, 3, self.num_heads, self.head_dim)
        qkv = qkv.permute(3, 0, 4, 2, 1, 5)

        query, key, value = qkv.unbind(0)

        if self.query_stride > 1:
            # Refer to unroll to see how this performs a maxpool-Nd
            query = query.view(batch_size, self.num_heads, num_windows, self.query_stride, -1, self.head_dim)
            query = query.max(dim=3).values

        attn_weights = (query * self.scale) @ key.transpose(-1, -2)
        attn_weights = attn_weights.softmax(dim=-1)

        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_output = attn_weights @ value
        attn_output = attn_output.transpose(1, 3).reshape(batch_size, -1, self.hidden_size_output)
        attn_output = self.proj(attn_output)

        return (attn_output, attn_weights) if output_attentions else (attn_output, None)
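# Worked example (added, not part of the upstream file): with mask unit attention enabled,
# queries only attend within their local window, so the number of independent attention
# windows follows directly from the sequence length, query stride and window size.
def _window_count_sketch():  # pragma: no cover - documentation aid
    seq_len, query_stride, window_size = 64, 4, 4
    num_windows = seq_len // (query_stride * window_size)  # -> 4 independent windows
    return num_windows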
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class HieraDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class HieraMlp(nn.Module):
    def __init__(self, config, dim: int) -> None:
        super().__init__()
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(dim, int(dim * config.mlp_ratio))
        self.fc2 = nn.Linear(int(dim * config.mlp_ratio), dim)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class HieraLayer(nn.Module):
    def __init__(
        self,
        config,
        hidden_size: int,
        hidden_size_output: int,
        num_heads: int,
        drop_path: float = 0.0,
        query_stride: int = 1,
        window_size: int = 0,
        use_mask_unit_attn: bool = False,
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.hidden_size_output = hidden_size_output
        self.query_stride = query_stride

        self.layernorm_before = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
        self.attn = HieraMaskUnitAttention(
            hidden_size=hidden_size,
            hidden_size_output=hidden_size_output,
            num_heads=num_heads,
            query_stride=query_stride,
            window_size=window_size,
            use_mask_unit_attn=use_mask_unit_attn,
        )

        self.layernorm_after = nn.LayerNorm(hidden_size_output, eps=config.layer_norm_eps)
        self.mlp = HieraMlp(config, hidden_size_output)

        self.drop_path = HieraDropPath(drop_path) if drop_path > 0 else nn.Identity()
        if hidden_size != hidden_size_output:
            self.proj = nn.Linear(hidden_size, hidden_size_output)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        batch_size, seq_len, _ = hidden_states.shape
        # Attention + Q Pooling
        hidden_states_norm = self.layernorm_before(hidden_states)
        if self.hidden_size != self.hidden_size_output:
            hidden_states = self.proj(hidden_states_norm)
            # Refer to unroll to see how this performs a maxpool-Nd
            hidden_states = (
                hidden_states.view(batch_size, self.attn.query_stride, -1, self.hidden_size_output).max(dim=1).values
            )

        (hidden_states_norm, attn_weights) = self.attn(
            hidden_states_norm, head_mask, output_attentions=output_attentions
        )
        hidden_states = hidden_states + self.drop_path(hidden_states_norm)

        residual = hidden_states
        hidden_states = self.layernorm_after(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + self.drop_path(hidden_states)

        return (hidden_states, attn_weights)


class HieraStage(nn.Module):
    def __init__(
        self,
        config,
        depth: int,
        hidden_size: int,
        hidden_size_output: int,
        num_heads: int,
        drop_path: List[float],
        query_stride: List[int],
        window_size: int,
        use_mask_unit_attn: bool,
        stage_num: Optional[int] = None,
    ) -> None:
        super().__init__()
        # we need to know which stage we are in to know if the previous stage used masked attention
        previous_stage_used_masked_attention = False
        if stage_num is not None:
            previous_stage_used_masked_attention = config.masked_unit_attention[stage_num - 1 if stage_num > 0 else 0]
        self.layers = nn.ModuleList(
            [
                HieraLayer(
                    config=config,
                    hidden_size=hidden_size if i == 0 else hidden_size_output,
                    hidden_size_output=hidden_size_output,
                    num_heads=num_heads,
                    drop_path=drop_path[i],
                    query_stride=query_stride[i],
                    window_size=window_size,
                    use_mask_unit_attn=use_mask_unit_attn or (previous_stage_used_masked_attention and i == 0),
                )
                for i in range(depth)
            ]
        )

    def forward(
        self, hidden_states: torch.Tensor, head_mask: Optional[torch.FloatTensor], output_attentions: bool = False
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        for i, layer_module in enumerate(self.layers):
            layer_head_mask = head_mask[i] if head_mask is not None else None
            (hidden_states, attn_weights) = layer_module(
                hidden_states, layer_head_mask, output_attentions=output_attentions
            )

        return hidden_states, attn_weights
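# Numerical sketch (added, not part of the upstream file): stochastic depth keeps a sample's
# residual branch with probability keep_prob and rescales survivors by 1/keep_prob, so the
# expected output matches the deterministic path.
def _drop_path_expectation_sketch():  # pragma: no cover - documentation aid
    x = torch.ones(10000, 1)
    out = drop_path(x, drop_prob=0.2, training=True)
    # mean stays close to 1.0 because surviving samples are scaled by 1/0.8
    return out.mean()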
def undo_windowing(hidden_states: torch.Tensor, shape: List[int], mask_unit_shape: List[int]) -> torch.Tensor:
    """
    Restore spatial organization by undoing windowed organization of mask units.

    Args:
        hidden_states (`torch.Tensor`): The hidden states tensor of shape `[batch_size, num_mask_unit_height*num_mask_unit_width, hidden_size]`.
        shape (`List[int]`): The original shape of the hidden states tensor before windowing.
        mask_unit_shape (`List[int]`): The shape of the mask units used for windowing.

    Returns:
        torch.Tensor: The restored hidden states tensor of shape [batch_size, num_mask_unit_height*mask_unit_height, num_mask_unit_width*mask_unit_width, hidden_size].
    """
    batch_size, hidden_size = hidden_states.shape[0], hidden_states.shape[-1]
    # From: [batch_size, num_mask_units_height*num_mask_units_width, mask_unit_height, mask_unit_width, hidden_size]
    # To: [batch_size, num_mask_units_height, num_mask_units_width, mask_unit_height, mask_unit_width, hidden_size]
    num_mask_units = [s // mu for s, mu in zip(shape, mask_unit_shape)]
    hidden_states = hidden_states.view(batch_size, *num_mask_units, *mask_unit_shape, hidden_size)

    # From: [batch_size, num_mask_units_height, num_mask_units_width, mask_unit_height, mask_unit_width, hidden_size]
    # To: [batch_size, height, width, hidden_size]
    hidden_states = hidden_states.permute(0, 1, 3, 2, 4, 5)
    hidden_states = hidden_states.reshape(batch_size, *shape, hidden_size)

    return hidden_states


class HieraEncoder(nn.Module):
    def __init__(self, config: HieraConfig) -> None:
        super().__init__()
        total_depth = sum(config.depths)
        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, total_depth)]
        # query strides rule
        cumulative_depths = torch.tensor(config.depths).cumsum(0).tolist()
        query_pool_layer = cumulative_depths[: config.num_query_pool]
        query_strides = [math.prod(config.query_stride) if i in query_pool_layer else 1 for i in range(total_depth)]

        # Transformer blocks
        self.stages = nn.ModuleList()
        hidden_size = config.embed_dim
        stage_ends = [0] + cumulative_depths
        masked_unit_area = math.prod(config.masked_unit_size)
        query_stride_area = math.prod(config.query_stride)
        for idx_stage, depth in enumerate(config.depths):
            hidden_size_output = int(config.embed_dim * config.embed_dim_multiplier**idx_stage)

            stage = HieraStage(
                config=config,
                depth=depth,
                hidden_size=hidden_size,
                hidden_size_output=hidden_size_output,
                num_heads=config.num_heads[idx_stage],
                drop_path=dpr[stage_ends[idx_stage] : stage_ends[idx_stage + 1]],
                query_stride=query_strides[stage_ends[idx_stage] : stage_ends[idx_stage + 1]],
                window_size=int(masked_unit_area * query_stride_area**-idx_stage),
                use_mask_unit_attn=config.masked_unit_attention[idx_stage],
                stage_num=idx_stage,
            )

            hidden_size = hidden_size_output
            self.stages.append(stage)

        # Setting the reroll schedule: the first stage has to reverse everything,
        # the next stage has to reverse all but the first unroll, etc.
        stage_size = [i // s for i, s in zip(config.image_size, config.patch_stride)]
        unroll_schedule = [config.query_stride] * len(config.depths[:-1])

        self.schedule = {}
        for idx_stage in range(len(config.depths)):
            self.schedule[idx_stage] = unroll_schedule, stage_size
            if idx_stage < config.num_query_pool:
                stage_size = [i // s for i, s in zip(stage_size, config.query_stride)]
                unroll_schedule = unroll_schedule[1:]

        self.gradient_checkpointing = False

    def reroll(
        self, hidden_states: torch.Tensor, stage_idx: int, bool_masked_pos: Optional[torch.BoolTensor] = None
    ) -> torch.Tensor:
        """
        Roll the given tensor back up to spatial order assuming it's from the given block.

        If no bool_masked_pos is provided returns:
            - [batch_size, height, width, hidden_size]
        If a bool_masked_pos is provided returns:
            - [batch_size, num_mask_units, mask_unit_height, mask_unit_width, hidden_size]
        """
        schedule, size = self.schedule[stage_idx]
        batch_size, seq_len, hidden_size = hidden_states.shape

        num_dim = len(size)
        mask_unit_shape = [1] * num_dim

        for strides in schedule:
            # Extract the current patch from seq_len
            hidden_states = hidden_states.view(
                batch_size, *strides, seq_len // math.prod(strides), *mask_unit_shape, hidden_size
            )

            # Move that patch into the current mask unit.
            # Example in 2d:
            #   [batch_size, stride, stride, seq_len//(stride*stride), mask_unit_height, mask_unit_width, hidden_size]
            #   -> [batch_size, seq_len//(stride*stride), stride, mask_unit_height, stride, mask_unit_width, hidden_size]
            hidden_states = hidden_states.permute(0, 3, 1, 4, 2, 5, 6)

            # Reshape to [batch_size, seq_len//(stride*stride), *mask_units, hidden_size]
            for i in range(num_dim):
                mask_unit_shape[i] *= strides[i]
            hidden_states = hidden_states.reshape(batch_size, -1, *mask_unit_shape, hidden_size)
            seq_len = hidden_states.shape[1]

        # Current shape is [batch_size, num_mask_units, mask_unit_height, mask_unit_width, hidden_size]
        hidden_states = hidden_states.view(batch_size, seq_len, *mask_unit_shape, hidden_size)

        # If masked, return [batch_size, num_mask_units, mask_unit_height, mask_unit_width, hidden_size]
        if bool_masked_pos is not None:
            return hidden_states

        # If not masked, we can return [batch_size, height, width, hidden_size]
        hidden_states = undo_windowing(hidden_states, size, mask_unit_shape)

        return hidden_states

    def forward(
        self,
        hidden_states: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, HieraEncoderOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_reshaped_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)
            reshaped_hidden_states = self.reroll(hidden_states, stage_idx=0, bool_masked_pos=bool_masked_pos)
            all_reshaped_hidden_states = all_reshaped_hidden_states + (reshaped_hidden_states,)

        for i, stage_module in enumerate(self.stages):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    stage_module.__call__, hidden_states, layer_head_mask, output_attentions
                )
            else:
                layer_outputs = stage_module(hidden_states, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)
                reshaped_hidden_states = self.reroll(hidden_states, stage_idx=i, bool_masked_pos=bool_masked_pos)
                all_reshaped_hidden_states = all_reshaped_hidden_states + (reshaped_hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, all_hidden_states, all_self_attentions, all_reshaped_hidden_states]
                if v is not None
            )
        return HieraEncoderOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            reshaped_hidden_states=all_reshaped_hidden_states,
        )


def unroll(
    hidden_states: torch.Tensor,
    image_shape: Tuple[int, int],
    patch_stride: Tuple[int, int],
    schedule: List[List[int]],
) -> torch.Tensor:
    """
    Reorders the tokens such that patches are contiguous in memory.
    E.g., given [batch_size, (height, width), hidden_size] and stride of (stride, stride), this will re-order the tokens as
    [batch_size, (stride, stride, height // stride, width // stride), hidden_size]

    This allows operations like Max2d to be computed as x.view(batch_size, stride*stride, -1, hidden_size).max(dim=1).
    Not only is this faster, but it also makes it easy to support inputs of arbitrary
    dimensions in addition to patch-wise sparsity.

    Performing this operation multiple times in sequence puts entire windows as contiguous
    in memory. For instance, if you applied the stride (2, 2) 3 times, entire windows of
    size 8x8 would be contiguous in memory, allowing operations like mask unit attention
    computed easily and efficiently, while also allowing max to be applied sequentially.

    Note: This means that intermediate values of the model are not in height x width order, so they
    need to be re-rolled if you want to use the intermediate values as a height x width feature map.
    The last block of the network is fine though, since by then the strides are all consumed.
    """
    batch_size, _, hidden_size = hidden_states.shape

    size = [i // s for i, s in zip(image_shape, patch_stride)]

    current_size = size
    hidden_states = hidden_states.view(*([batch_size] + current_size + [hidden_size]))

    for strides in schedule:
        # Move patches with the given strides to the batch dimension

        # Create the new shape (e.g. [batch_size, height // stride, stride, width // stride, stride, hidden_size])
        current_size = [i // s for i, s in zip(current_size, strides)]
        new_shape = [item for pair in zip(current_size, strides) for item in pair]
        new_shape = [batch_size] + new_shape + [hidden_size]
        hidden_states = hidden_states.view(new_shape)

        # Move the patches to the batch dimension (e.g. [batch_size, stride, stride, height // stride, width // stride, hidden_size])
        num_dims = len(new_shape)
        permute = [0] + list(range(2, num_dims - 1, 2)) + list(range(1, num_dims - 1, 2)) + [num_dims - 1]
        hidden_states = hidden_states.permute(permute)

        # Now finally flatten the relevant dims into the batch dimension
        hidden_states = hidden_states.flatten(0, len(strides))
        batch_size *= math.prod(strides)

    hidden_states = hidden_states.reshape(-1, math.prod(size), hidden_size)
    return hidden_states
class HieraPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = HieraConfig
    base_model_prefix = "hiera"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module) -> None:
        """Initialize the weights"""
        std = self.config.initializer_range

        if isinstance(module, HieraEmbeddings):
            nn.init.trunc_normal_(module.position_embeddings, std=std)
        elif isinstance(module, HieraDecoder):
            nn.init.trunc_normal_(module.mask_token, std=std)
            nn.init.trunc_normal_(module.decoder_position_embeddings, std=std)
        elif isinstance(module, (nn.Linear, nn.Conv1d, nn.Conv2d)):
            nn.init.trunc_normal_(module.weight, std=std)
            if module.bias is not None:
                nn.init.constant_(module.bias, std)
        elif isinstance(module, nn.LayerNorm):
            nn.init.constant_(module.bias, std)
            nn.init.constant_(module.weight, self.config.layer_norm_init)


HIERA_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`HieraConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

HIERA_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`BitImageProcessor.__call__`]
            for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        interpolate_pos_encoding (`bool`, *optional*):
            Whether to interpolate the pre-trained position encodings.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


class HieraPooler(nn.Module):
    def __init__(self, config: HieraConfig):
        super().__init__()
        num_features = int(config.embed_dim * config.embed_dim_multiplier ** (len(config.depths) - 1))
        self.layernorm = nn.LayerNorm(num_features, eps=config.layer_norm_eps)
        self.pooler = nn.AdaptiveAvgPool1d(1)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = hidden_states.transpose(1, 2)
        pooled_output = self.pooler(hidden_states)
        pooled_output = torch.flatten(pooled_output, 1)
        pooled_output = self.layernorm(pooled_output)
        return pooled_output


@add_start_docstrings(
    "The bare Hiera Model transformer outputting raw hidden-states without any specific head on top.",
    HIERA_START_DOCSTRING,
    """
        add_pooling_layer (`bool`, *optional*, defaults to `True`):
                Whether or not to apply pooling layer.
        is_mae (`bool`, *optional*, defaults to `False`):
                Whether or not to run the model on MAE mode.
    """,
)
class HieraModel(HieraPreTrainedModel):
    def __init__(self, config: HieraConfig, add_pooling_layer: bool = True, is_mae: bool = False):
        super().__init__(config)
        self.num_features = int(config.embed_dim * config.embed_dim_multiplier ** (len(config.depths) - 1))

        self.embeddings = HieraEmbeddings(config, is_mae=is_mae)
        self.encoder = HieraEncoder(config)

        self.unroll_schedule = [config.query_stride] * len(config.depths[:-1])

        self.pooler = HieraPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> HieraPatchEmbeddings:
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(HIERA_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=HieraModelOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        noise: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, HieraModelOutput]:
        r"""
        noise (`torch.FloatTensor` of shape `(batch_size, num_mask_units)`, *optional*) which is
                mainly used for testing purposes to control randomness and maintain the reproducibility
                when is_mae is set to True.
        Nz You have to specify pixel_values)r   rm   rB   r   )rH  rV   r'  r   )r2   r   r   r:  r;  r   )r!   r1   r2   r3   r"   r#   r$   )r`   r   r:  use_return_dictrR   get_head_maskrO   r  r   rf   rR  rV   r3  rs   rt   rX   	unsqueezetilerg   rw  ro  r0   r"   r#   r$   )r_   rb   rm   r   r   r:  r   r;  embedding_outputr2   r3   rH  r"   mask_unit_arear|   r   r   	positionsencoder_outputssequence_outputrr  head_outputss                         r.   r   zHieraModel.forward  s>   . 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]?@@@ &&y#dk6H2I2IJJ	9=3KSX :I :
 :
6/; $)"-|/A"/EF#1)	
 
 
 &!Yt{'CDDN)6)<&J;'11"55::1nkZZI))4M)..z2{KKM,,+/!5# ' 
 
 *!,;" KK88M 	6?L?XO];;_n^pLAPA\===bn   /!"""555-'+#)7&1#2#I
 
 
 	
r-   )TFNNNNNNN)r%   r&   r'   r   r{   rN   r=   r{  r   ru   r   r  r   HIERA_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr0   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   r)   r   r*   r   r   r   r   r   r   s   @r.   rt  rt    s        { t TX      0&: 0 0 0 0C4T#Y+? CD C C C C +*+ABB&$$.   04-1,0,0/337&*O
 O
u|,O
 )*O
 EL)	O

 $D>O
 'tnO
 #+4.O
 d^O
 
u00	1O
 O
 O
  CBO
 O
 O
 O
 O
r-   rt  c                        e Zd Zdef fdZ	 	 ddej        dej        deej                 de	d	e
ej        ej        f         f
d
Z xZS )r\  r`   c                 v   t                                                       t          j        j        t          j                  dz
  z  z            }d t          j        j	                  D             }fdt          |j
                  D             | _        fdt          j        j
                  D             | _        t          j        |j                  | _        t          j        t'          j        ddj                            | _        t          j        t'          j        dt-          j        | j                  j                            | _        t3          j        j        j        j        ddgj        z  dgj        z  d	  	        | _        t          j        j        j        	          | _        j	        d
         j
        d
         j         z  z  | _!        | j!        t          j
                  z  j"        z  }t          j        j        |          | _#        d S )Nr   c                     g | ]
\  }}||z  S r,   r,   rD   s      r.   rH   z)HieraDecoder.__init__.<locals>.<listcomp>  r   r-   c                 0    g | ]\  }}||j         z  z  S r,   r#  rE   rF   rG   r`   s      r.   rH   z)HieraDecoder.__init__.<locals>.<listcomp>  s7     +
 +
 +
26!QAv,--+
 +
 +
r-   c                 0    g | ]\  }}||j         z  z  S r,   r  r  s      r.   rH   z)HieraDecoder.__init__.<locals>.<listcomp>!  7     .
 .
 .
26!QAv,--.
 .
 .
r-   Fr   r   )	r`   r   r   r   r   r   r   r   r   r   r   )$rM   rN   ru   r\   r%  rO   r  rU   rT   rV   r   tokens_spatial_shape_finalrX   mask_unit_spatial_shape_finalr	   r   decoder_hidden_sizedecoder_embeddingsr   r)   ry   r]  rs   rt   r^  r   decoder_num_headsdecoder_depthdecoder_blockr   r   decoder_normr#  pred_striderS   decoder_pred)r_   r`   rp  rW   pred_dimra   s    `   r.   rN   zHieraDecoder.__init__  s+   6+f.IcRXR_N`N`cdNd.eeff__3v7H&J]3^3^___+
 +
 +
 +
:=>RTZTg:h:h+
 +
 +
'.
 .
 .
 .
:=f>UW]Wj:k:k.
 .
 .
* #%)L&:T"U"U,u{1a9S'T'TUU+-<K49T%DEEvGabb,
 ,
( (2%9.&$ef22v33

 

 

 L)CI^___ ".r2f6I"6MQWQf6fg$F,?(@(@@FDWWIf&@(KKr-   NFencoder_hidden_statesr2   r   r   rc   c           	         |                      |          }|j        dd          \  }}}|j        \  }	}
t          j        |	|
||||j        |j                  }| j                            ddddd          }|                    |	|
ddd          }|	                    dd|||          }|
                                ||<   d|                                z
  |z  |                                |z  z   }t          || j        | j                  }t          |dddf         | j        | j                  }|                    |j        d         d|j        d                   }|                    |j        d         d          }|| j        z   }|                     |||          \  }}|                     |          }|                     |          }||fS )Nr@   )ro   r   r   r   .r   )r   r   )r  rf   r)   ry   ro   r   r]  rg   r   expandr   rj   r  r  r  r^  r  r  r  )r_   r  r2   r   r   r"   mask_unit_heightmask_unit_widthr  r|   r  decoder_hidden_statesmask_tokensr   s                 r.   r   zHieraDecoder.forwardA  s#    //0EFF BOATUVUWUWAX>/+>%4%:"
N % '%!
 !
 !
 o**1aAr::)11*naQRTUVV)00R9I?\opp1>1F1F1H1Ho.%%'''!)//114II!J
 '!+.
 

 )C1H%+.
 
 &--m.A!.Db-J]^`Jabb)..}/B1/ErJJ &(HH '+&8&8YBS '9 '
 '
#| ))-88 ))-88o--r-   r   )r%   r&   r'   r   rN   r)   r   r4   r   r{   r   r   r   r   s   @r.   r\  r\    s        %L{ %L %L %L %L %L %LV -1"'>. >.$|>. )>. EL)	>.
  >. 
u|U--	.>. >. >. >. >. >. >. >.r-   r\  c                        e Zd Zdef fdZdej        dej        dej        fdZ	de
ej                 dej        fdZ xZS )	HieraMultiScaleHeadr`   c           	         t                                                       fdt          j        j                  D             | _        fdt          t          j                            D             | _	        j        }t          j                    | _        t          j                  D ]}d t          || j                  D             }d t          |j                  D             }| j                            t          j        | j	        |         | j	        d         ||                     | j                            t          j                               d S )Nc                 0    g | ]\  }}||j         z  z  S r,   r  r  s      r.   rH   z0HieraMultiScaleHead.__init__.<locals>.<listcomp>  r  r-   c                 N    g | ]!}t          j        j        |z  z            "S r,   ru   r\   r%  rE   rF   r`   s     r.   rH   z0HieraMultiScaleHead.__init__.<locals>.<listcomp>  s>     !
 !
 !
GHC 6#>#AABB!
 !
 !
r-   c                     g | ]
\  }}||z  S r,   r,   rD   s      r.   rH   z0HieraMultiScaleHead.__init__.<locals>.<listcomp>  s     kkkAa1fkkkr-   c                     g | ]
\  }}||z  S r,   r,   rD   s      r.   rH   z0HieraMultiScaleHead.__init__.<locals>.<listcomp>  s     'n'n'n41aQ'n'n'nr-   r   )rJ   rK   )rM   rN   rU   rX   r   r  r  rO   r  stage_dimensionsr	   r  multi_scale_fusion_headsr#  r&  r[   r   )r_   r`   current_masked_unit_sizeidxkernelra   s    `   r.   rN   zHieraMultiScaleHead.__init__  s   .
 .
 .
 .
:=f>UW]Wj:k:k.
 .
 .
*!
 !
 !
 !
LQRUV\VcRdRdLeLe!
 !
 !
 $*#: (*%.// 
	 
	Ckk-EtGi)j)jkkkF'n'n3?WY_Yl;m;m'n'n'n$)00	)#.)"- &!	      	%,,R[]];;;;;r-   headr"   rc   c                 \   t          |t          j                  r|S |j        \  }}}}}|                    ||z  |||          }|                    dddd          } ||          }|                    dddd          }|j        dd          \  }}	}|                    ||||	|          }|S )Nr   r   r   r@   )rY  r	   r   rf   r   r   )
r_   r  r"   r|   r  r  r  r   mask_unit_height_finalmask_unit_width_finals
             r.   apply_fusion_headz%HieraMultiScaleHead.apply_fusion_head  s    dBK(( 	!   VcUhR
N$4o{ &--')9?K
 
 &--aAq99]++ &--aAq99EREXYZY[Y[E\B 5{%--(>@UWb
 
 r-   feature_mapsc                 r    d}t          | j        |          D ]\  }}||                     ||          z   }|S )Nr   )rU   r  r  )r_   r  r"   r  feature_maps        r.   r   zHieraMultiScaleHead.forward  sL    !$T%BL!Q!Q 	V 	VD+)D,B,B4,U,UUMMr-   )r%   r&   r'   r   rN   r	   Moduler)   r   r  r   r   r   r   s   @r.   r  r    s        <{ < < < < < <0bi  QVQ]    .D$6 5<        r-   r  a6  The Hiera Model transformer with the decoder on top for self-supervised pre-training.
@add_start_docstrings(
    """The Hiera Model transformer with the decoder on top for self-supervised pre-training.

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    """,
    HIERA_START_DOCSTRING,
)
class HieraForPreTraining(HieraPreTrainedModel):
    def __init__(self, config: HieraConfig) -> None:
        super().__init__(config)
        # Encoder
        self.hiera = HieraModel(config, add_pooling_layer=False, is_mae=True)
        self.encoder_norm = nn.LayerNorm(self.hiera.num_features, eps=config.layer_norm_eps)
        # Multi-scale fusion heads
        self.multiscale_fusion = HieraMultiScaleHead(config)
        # Decoder
        self.decoder = HieraDecoder(config)
        self.pred_stride = self.decoder.pred_stride

        # Initialize weights and apply final processing
        self.post_init()

    def get_pixel_label_2d(self, pixel_values: torch.Tensor, bool_masked_pos: torch.BoolTensor) -> torch.Tensor:
        # bool_masked_pos: True means the position is *masked*.
        pixel_values = pixel_values.permute(0, 2, 3, 1)

        size = self.pred_stride
        label = pixel_values.unfold(1, size, size).unfold(2, size, size)
        label = label.flatten(1, 2).flatten(2)
        label = label[bool_masked_pos]
        if self.config.normalize_pixel_loss:
            mean = label.mean(dim=-1, keepdim=True)
            var = label.var(dim=-1, keepdim=True)
            label = (label - mean) / (var + 1.0e-6) ** 0.5

        return label

    def forward_loss(self, pixel_values: torch.Tensor, logits: torch.Tensor, bool_masked_pos: torch.BoolTensor):
        # Invert bool_masked_pos so that True marks the *masked* positions the loss is computed on.
        bool_masked_pos = ~bool_masked_pos
        label = self.get_pixel_label_2d(pixel_values, bool_masked_pos)

        logits = logits[bool_masked_pos]
        loss = (logits - label) ** 2
        loss = loss.mean()

        return loss

    @add_start_docstrings_to_model_forward(HIERA_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=HieraForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        noise: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, HieraForPreTrainingOutput]:
        r"""
          S )a  
        noise (`torch.FloatTensor` of shape `(batch_size, num_mask_units)`, *optional*) which is
                mainly used for testing purposes to control randomness and maintain the reproducibility
                when is_mae is set to True.

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, HieraForPreTraining
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/hiera-tiny-224-mae-hf")
        >>> model = HieraForPreTraining.from_pretrained("facebook/hiera-tiny-224-mae-hf")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> loss = outputs.loss
        >>> print(list(logits.shape))
        [1, 196, 768]
        ```NT)rm   r   r   r:  r   r;  r   r   r@   )r2   r   r   r   r   )r8   r9   r2   r3   r"   r#   r$   )r`   r  r   r:  rU  r#  r  r  r  r  r;   r"   r#   r$   )r_   rb   rm   r   r   r:  r   r;  outputsr  r2   ids_to_restorefused_hidden_statesr9   r8   r   s                   r.   r   zHieraForPreTraining.forward  s   P &1%<kk$+B]1B1N--TXT_Tq$8$D  $+Jj 	 **/!%%=#  
 
 r{!!* #A
(9(H1(L$LMQ]^`QaPcc"44\BB"//0CDD #',,+/	 #/ #
 #
   vGG 	Fo~>F# 071:-/  071:-/# 172;.0)-)9TGf$$vE(+&3GQ'//T)EY#c7#A#A_c
 
 
 	
r-   r  )r%   r&   r'   r   rN   r)   r   r4   r  r  r   r  r   r;   r  r   r*   r{   r   rB  r   r   r   s   @r.   r  r    s       { t      u| eN^ chco    	 	u| 	^c^n 	 	 	 	 +*+ABB+DSbccc 04-1,0,0/337&*Z
 Z
u|,Z
 )*Z
 EL)	Z

 $D>Z
 'tnZ
 #+4.Z
 d^Z
 
u//	0Z
 Z
 Z
 dc CBZ
 Z
 Z
 Z
 Z
r-   r  a  
@add_start_docstrings(
    """
    Hiera Model transformer with an image classification head on top (a linear layer on top of the final hidden state with
    average pooling) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune Hiera on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    """,
    HIERA_START_DOCSTRING,
)
class HieraForImageClassification(HieraPreTrainedModel):
    def __init__(self, config: HieraConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.hiera = HieraModel(config, add_pooling_layer=True, is_mae=False)

        # Classifier head
        self.classifier = (
            nn.Linear(self.hiera.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(HIERA_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=HieraForImageClassificationOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, HieraForImageClassificationOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.hiera(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return HieraForImageClassificationOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            reshaped_hidden_states=outputs.reshaped_hidden_states,
        )
@add_start_docstrings(
    """
    Hiera backbone, to be used with frameworks like DETR and MaskFormer.
    """,
    HIERA_START_DOCSTRING,
)
class HieraBackbone(HieraPreTrainedModel, BackboneMixin):
    def __init__(self, config: HieraConfig):
        super().__init__(config)
        super()._init_backbone(config)

        self.num_features = [config.embed_dim] + [
            int(config.embed_dim * config.embed_dim_multiplier**idx) for idx in range(len(config.depths))
        ]
        self.embeddings = HieraEmbeddings(config, is_mae=False)
        self.encoder = HieraEncoder(config)

        # Add layer norms to the hidden states of the requested out_features.
        hidden_states_norms = {}
        for stage, num_channels in zip(self._out_features, self.channels):
            hidden_states_norms[stage] = nn.LayerNorm(num_channels)
        self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/hiera-tiny-224-hf")
        >>> model = AutoBackbone.from_pretrained(
        ...     "facebook/hiera-tiny-224-hf", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 7, 7]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        embedding_output, _, _ = self.embeddings(pixel_values)

        outputs = self.encoder(
            embedding_output,
            head_mask=None,
            output_attentions=output_attentions,
            output_hidden_states=True,
            return_dict=True,
        )

        # The reshaped hidden states keep a [batch_size, height, width, num_channels] layout per stage.
        hidden_states = outputs.reshaped_hidden_states

        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                batch_size, height, width, num_channels = hidden_state.shape
                hidden_state = hidden_state.view(batch_size, height * width, num_channels)
                hidden_state = self.hidden_states_norms[stage](hidden_state)
                hidden_state = hidden_state.view(batch_size, height, width, num_channels)
                hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
                feature_maps += (hidden_state,)

        if not return_dict:
            output = (feature_maps,)
            if output_hidden_states:
                output += (outputs.hidden_states,)
            if output_attentions:
                output += (outputs.attentions,)
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions if output_attentions else None,
        )
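# Feature-map shape sketch for `HieraBackbone` (comments only). Assuming the tiny configuration
# from the docstring example (embed_dim 96 doubling per stage, 224x224 input, patch stride 4,
# query stride 2 per pooled stage), requesting all four stages would yield:
#
#   >>> [tuple(fm.shape) for fm in outputs.feature_maps]
#   [(1, 96, 56, 56), (1, 192, 28, 28), (1, 384, 14, 14), (1, 768, 7, 7)]
#
# The last entry matches the `[1, 768, 7, 7]` doctest output above.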