
"""PyTorch VideoMAE (masked autoencoder) model."""

import collections.abc
import math
from copy import deepcopy
from dataclasses import dataclass
from typing import Optional, Set, Tuple, Union

import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ...utils.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .configuration_videomae import VideoMAEConfig


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "VideoMAEConfig"
_CHECKPOINT_FOR_DOC = "MCG-NJU/videomae-base"


@dataclass
class VideoMAEDecoderOutput(ModelOutput):
    """
    Class for VideoMAEDecoder's outputs, with potential hidden states and attentions.

    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
            Pixel reconstruction logits.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    """

    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class VideoMAEForPreTrainingOutput(ModelOutput):
    """
    Class for VideoMAEForPreTraining's outputs, with potential hidden states and attentions.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`):
            Pixel reconstruction loss.
        logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
            Pixel reconstruction logits.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


def get_sinusoid_encoding_table(n_position, d_hid):
    """Sinusoid position encoding table"""

    def get_position_angle_vec(position):
        return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]

    sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

    return torch.FloatTensor(sinusoid_table).unsqueeze(0)


class VideoMAEEmbeddings(nn.Module):
    """
    Construct the patch and position embeddings.

    """

    def __init__(self, config):
        super().__init__()

        self.patch_embeddings = VideoMAEPatchEmbeddings(config)
        self.num_patches = self.patch_embeddings.num_patches
        # fixed sin-cos position embedding
        self.position_embeddings = get_sinusoid_encoding_table(self.num_patches, config.hidden_size)
        self.config = config

    def forward(self, pixel_values, bool_masked_pos):
        # create patch embeddings
        embeddings = self.patch_embeddings(pixel_values)

        # add position embeddings
        embeddings = embeddings + self.position_embeddings.type_as(embeddings).to(embeddings.device).clone().detach()

        # only keep visible patches (~bool_masked_pos means visible)
        if bool_masked_pos is not None:
            batch_size, _, num_channels = embeddings.shape
            embeddings = embeddings[~bool_masked_pos]
            embeddings = embeddings.reshape(batch_size, -1, num_channels)

        return embeddings


class VideoMAEPatchEmbeddings(nn.Module):
    """
    Video to Patch Embedding. This module turns a batch of videos of shape (batch_size, num_frames, num_channels,
    height, width) into a tensor of shape (batch_size, seq_len, hidden_size) to be consumed by a Transformer encoder.

    The seq_len (the number of patches) equals (number of frames // tubelet_size) * (height // patch_size) * (width //
    patch_size).
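
    For example, with the default base configuration (16 frames, tubelet_size=2, 224x224 frames and 16x16 patches),
    seq_len = (16 // 2) * (224 // 16) * (224 // 16) = 8 * 14 * 14 = 1568.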

    """

    def __init__(self, config):
        super().__init__()

        image_size = config.image_size
        patch_size = config.patch_size
        num_channels = config.num_channels
        hidden_size = config.hidden_size
        num_frames = config.num_frames
        tubelet_size = config.tubelet_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        self.image_size = image_size
        self.patch_size = patch_size
        self.tubelet_size = int(tubelet_size)
        num_patches = (
            (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) * (num_frames // self.tubelet_size)
        )
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.projection = nn.Conv3d(
            in_channels=num_channels,
            out_channels=hidden_size,
            kernel_size=(self.tubelet_size, patch_size[0], patch_size[1]),
            stride=(self.tubelet_size, patch_size[0], patch_size[1]),
        )

    def forward(self, pixel_values):
        batch_size, num_frames, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        if height != self.image_size[0] or width != self.image_size[1]:
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
            )
        # permute to (batch_size, num_channels, num_frames, height, width)
        pixel_values = pixel_values.permute(0, 2, 1, 3, 4)
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings


class VideoMAESelfAttention(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=False)

        if config.qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(self.all_head_size))
            self.v_bias = nn.Parameter(torch.zeros(self.all_head_size))
        else:
            self.q_bias = None
            self.v_bias = None

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        k_bias = torch.zeros_like(self.v_bias, requires_grad=False) if self.q_bias is not None else None
        keys = nn.functional.linear(input=hidden_states, weight=self.key.weight, bias=k_bias)
        values = nn.functional.linear(input=hidden_states, weight=self.value.weight, bias=self.v_bias)
        queries = nn.functional.linear(input=hidden_states, weight=self.query.weight, bias=self.q_bias)

        key_layer = self.transpose_for_scores(keys)
        value_layer = self.transpose_for_scores(values)
        query_layer = self.transpose_for_scores(queries)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class VideoMAESdpaSelfAttention(VideoMAESelfAttention):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__(config)
        self.attention_probs_dropout_prob = config.attention_probs_dropout_prob

    def forward(
        self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        k_bias = torch.zeros_like(self.v_bias, requires_grad=False) if self.q_bias is not None else None
        keys = nn.functional.linear(input=hidden_states, weight=self.key.weight, bias=k_bias)
        values = nn.functional.linear(input=hidden_states, weight=self.value.weight, bias=self.v_bias)
        queries = nn.functional.linear(input=hidden_states, weight=self.query.weight, bias=self.q_bias)

        key_layer = self.transpose_for_scores(keys)
        value_layer = self.transpose_for_scores(values)
        query_layer = self.transpose_for_scores(queries)

        context_layer = torch.nn.functional.scaled_dot_product_attention(
            query_layer,
            key_layer,
            value_layer,
            head_mask,
            self.attention_probs_dropout_prob if self.training else 0.0,
            is_causal=False,
            scale=None,
        )

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        return context_layer, None


class VideoMAESelfOutput(nn.Module):
    """
    The residual connection is defined in VideoMAELayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class VideoMAEAttention(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.attention = VideoMAESelfAttention(config)
        self.output = VideoMAESelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: Set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_outputs = self.attention(hidden_states, head_mask, output_attentions)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class VideoMAESdpaAttention(VideoMAEAttention):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__(config)
        self.attention = VideoMAESdpaSelfAttention(config)


class VideoMAEIntermediate(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class VideoMAEOutput(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states


VIDEOMAE_ATTENTION_CLASSES = {
    "eager": VideoMAEAttention,
    "sdpa": VideoMAESdpaAttention,
}


class VideoMAELayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = VIDEOMAE_ATTENTION_CLASSES[config._attn_implementation](config)
        self.intermediate = VideoMAEIntermediate(config)
        self.output = VideoMAEOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in VideoMAE, layernorm is applied before self-attention
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in VideoMAE, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        outputs = (layer_output,) + outputs

        return outputs


class VideoMAEEncoder(nn.Module):
    def __init__(self, config: VideoMAEConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([VideoMAELayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class VideoMAEPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = VideoMAEConfig
    base_model_prefix = "videomae"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv3d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


VIDEOMAE_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`VideoMAEConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

VIDEOMAE_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`VideoMAEImageProcessor.__call__`] for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
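
            For instance, `head_mask=torch.ones(num_layers, num_heads)` keeps every head active, which is also the
            default behavior when no mask is passed.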

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare VideoMAE Model transformer outputting raw hidden-states without any specific head on top.",
    VIDEOMAE_START_DOCSTRING,
)
class VideoMAEModel(VideoMAEPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.embeddings = VideoMAEEmbeddings(config)
        self.encoder = VideoMAEEncoder(config)

        if config.use_mean_pooling:
            self.layernorm = None
        else:
            self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(VIDEOMAE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Each video in the
            batch must have the same number of masked patches. If `None`, then all patches are considered. Sequence
            length is `(num_frames // tubelet_size) * (image_size // patch_size) ** 2`.
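            For the default `videomae-base` configuration this works out to `(16 // 2) * (224 // 16) ** 2 = 1568`.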

        Returns:

        Examples:

        ```python
        >>> import av
        >>> import numpy as np

        >>> from transformers import AutoImageProcessor, VideoMAEModel
        >>> from huggingface_hub import hf_hub_download

        >>> np.random.seed(0)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`List[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`List[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample 16 frames
        >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
        >>> video = read_video_pyav(container, indices)

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
        >>> model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")

        >>> # prepare video for the model
        >>> inputs = image_processor(list(video), return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 1568, 768]
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Prepare head mask if needed (1.0 in head_mask means we keep the head)
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values, bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        if self.layernorm is not None:
            sequence_output = self.layernorm(sequence_output)

        if not return_dict:
            return (sequence_output,) + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class VideoMAEDecoder(nn.Module):
    def __init__(self, config, num_patches):
        super().__init__()

        decoder_num_labels = config.num_channels * config.tubelet_size * config.patch_size**2

        decoder_config = deepcopy(config)
        decoder_config.hidden_size = config.decoder_hidden_size
        decoder_config.num_hidden_layers = config.decoder_num_hidden_layers
        decoder_config.num_attention_heads = config.decoder_num_attention_heads
        decoder_config.intermediate_size = config.decoder_intermediate_size
        self.decoder_layers = nn.ModuleList(
            [VideoMAELayer(decoder_config) for _ in range(config.decoder_num_hidden_layers)]
        )

        self.norm = nn.LayerNorm(config.decoder_hidden_size)
        self.head = (
            nn.Linear(config.decoder_hidden_size, decoder_num_labels) if decoder_num_labels > 0 else nn.Identity()
        )

        self.gradient_checkpointing = False
        self.config = config

    def forward(
        self,
        hidden_states,
        return_token_num,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        # apply Transformer layers (blocks)
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        for i, layer_module in enumerate(self.decoder_layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    None,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, None, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if return_token_num > 0:
            hidden_states = hidden_states[:, -return_token_num:]

        # predictor projection
        hidden_states = self.norm(hidden_states)
        logits = self.head(hidden_states)

        if not return_dict:
            return tuple(v for v in [logits, all_hidden_states, all_self_attentions] if v is not None)
        return VideoMAEDecoderOutput(logits=logits, hidden_states=all_hidden_states, attentions=all_self_attentions)


@add_start_docstrings(
    "The VideoMAE Model transformer with the decoder on top for self-supervised pre-training.",
    VIDEOMAE_START_DOCSTRING,
)
class VideoMAEForPreTraining(VideoMAEPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.videomae = VideoMAEModel(config)

        self.encoder_to_decoder = nn.Linear(config.hidden_size, config.decoder_hidden_size, bias=False)
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
        self.position_embeddings = get_sinusoid_encoding_table(
            self.videomae.embeddings.num_patches, config.decoder_hidden_size
        )

        self.decoder = VideoMAEDecoder(config, num_patches=self.videomae.embeddings.num_patches)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(VIDEOMAE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=VideoMAEForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        bool_masked_pos: torch.BoolTensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, VideoMAEForPreTrainingOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Each video in the
            batch must have the same number of masked patches. Sequence length is `(num_frames // tubelet_size) *
            (image_size // patch_size) ** 2`.

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, VideoMAEForPreTraining
        >>> import numpy as np
        >>> import torch

        >>> num_frames = 16
        >>> video = list(np.random.randint(0, 256, (num_frames, 3, 224, 224)))

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
        >>> model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base")

        >>> pixel_values = image_processor(video, return_tensors="pt").pixel_values

        >>> num_patches_per_frame = (model.config.image_size // model.config.patch_size) ** 2
        >>> seq_length = (num_frames // model.config.tubelet_size) * num_patches_per_frame
        >>> bool_masked_pos = torch.randint(0, 2, (1, seq_length)).bool()
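        >>> # NOTE: the random 0/1 mask above is only for illustration; VideoMAE pre-training normally masks a much
        >>> # larger fraction of the tubelet positions (on the order of 90%)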

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss = outputs.loss
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.videomae(
            pixel_values,
            bool_masked_pos=bool_masked_pos,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        sequence_output = self.encoder_to_decoder(
            sequence_output
        )  # [batch_size, num_visible_patches, decoder_hidden_size]
        batch_size, seq_len, num_channels = sequence_output.shape

        # we don't unshuffle the correct visible token order, but shuffle the position embeddings accordingly.
        if bool_masked_pos is None:
            raise ValueError("One must provide a boolean mask ")
        expanded_position_embeddings = self.position_embeddings.expand(batch_size, -1, -1).type_as(pixel_values)
        expanded_position_embeddings = expanded_position_embeddings.to(pixel_values.device).clone().detach()
        pos_emb_visible = expanded_position_embeddings[~bool_masked_pos].reshape(batch_size, -1, num_channels)
        pos_emb_mask = expanded_position_embeddings[bool_masked_pos].reshape(batch_size, -1, num_channels)

        # [batch_size, num_patches, decoder_hidden_size]
        x_full = torch.cat([sequence_output + pos_emb_visible, self.mask_token + pos_emb_mask], dim=1)

        # [batch_size, num_masked_patches, num_channels * patch_size * patch_size]
        decoder_outputs = self.decoder(x_full, pos_emb_mask.shape[1])
        logits = decoder_outputs.logits

        loss = None
        with torch.no_grad():
            # calculate the labels to be predicted
            if self.config.num_channels != 3:
                # can't unnormalize with default means/stds
                frames = pixel_values
            else:
                # first, unnormalize the frames
                device = pixel_values.device
                dtype = pixel_values.dtype
                mean = torch.as_tensor(IMAGENET_DEFAULT_MEAN).to(device=device, dtype=dtype)[None, None, :, None, None]
                std = torch.as_tensor(IMAGENET_DEFAULT_STD).to(device=device, dtype=dtype)[None, None, :, None, None]
                frames = pixel_values * std + mean  # in [0, 1]

            batch_size, time, num_channels, height, width = frames.shape
            tubelet_size, patch_size = self.config.tubelet_size, self.config.patch_size
            if self.config.norm_pix_loss:
                # step 1: split up dimensions (time by tubelet_size, height by patch_size, width by patch_size)
                frames = frames.view(
                    batch_size,
                    time // tubelet_size,
                    tubelet_size,
                    num_channels,
                    height // patch_size,
                    patch_size,
                    width // patch_size,
                    patch_size,
                )
                # step 2: move dimensions to concatenate
                frames = frames.permute(0, 1, 4, 6, 2, 5, 7, 3).contiguous()
                # step 3: concatenate
                frames = frames.view(
                    batch_size,
                    time // tubelet_size * height // patch_size * width // patch_size,
                    tubelet_size * patch_size * patch_size,
                    num_channels,
                )
                # step 4: normalize. As mentioned in the original VideoMAE paper, we use normalized pixels as targets.
                frames_norm = (frames - frames.mean(dim=-2, keepdim=True)) / (
                    frames.var(dim=-2, unbiased=True, keepdim=True).sqrt() + 1e-6
                )
                # step 5: reshape
                videos_patch = frames_norm.view(
                    batch_size,
                    time // tubelet_size * height // patch_size * width // patch_size,
                    tubelet_size * patch_size * patch_size * num_channels,
                )
            else:
                if self.config.num_channels != 3:
                    raise ValueError(
                        "Can't unnormalize non-RGB images. Consider setting config.norm_pix_loss to False."
                    )
                # step 1: split up dimensions (time by tubelet_size, height by patch_size, width by patch_size)
                frames = frames.view(
                    batch_size,
                    time // tubelet_size,
                    tubelet_size,
                    num_channels,
                    height // patch_size,
                    patch_size,
                    width // patch_size,
                    patch_size,
                )
                # step 2: move dimensions to concatenate
                frames = frames.permute(0, 1, 4, 6, 2, 5, 7, 3).contiguous()
                # step 3: concatenate
                videos_patch = frames.view(
                    batch_size,
                    time // tubelet_size * height // patch_size * width // patch_size,
                    tubelet_size * patch_size * patch_size * num_channels,
                )

            batch_size, _, num_channels = videos_patch.shape
            labels = videos_patch[bool_masked_pos].reshape(batch_size, -1, num_channels)

        loss_fct = MSELoss()
        loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return VideoMAEForPreTrainingOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """VideoMAE Model transformer with a video classification head on top (a linear layer on top of the average pooled hidden
    states of all tokens) e.g. for ImageNet.""",
    VIDEOMAE_START_DOCSTRING,
)
class VideoMAEForVideoClassification(VideoMAEPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.videomae = VideoMAEModel(config)

        # Classifier head
        self.fc_norm = nn.LayerNorm(config.hidden_size) if config.use_mean_pooling else None
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(VIDEOMAE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ImageClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> import av
        >>> import torch
        >>> import numpy as np

        >>> from transformers import AutoImageProcessor, VideoMAEForVideoClassification
        >>> from huggingface_hub import hf_hub_download

        >>> np.random.seed(0)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`List[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`List[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample 16 frames
        >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
        >>> video = read_video_pyav(container, indices)

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
        >>> model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")

        >>> inputs = image_processor(list(video), return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        ...     logits = outputs.logits

        >>> # model predicts one of the 400 Kinetics-400 classes
        >>> predicted_label = logits.argmax(-1).item()
        >>> print(model.config.id2label[predicted_label])
        eating spaghetti
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.videomae(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        if self.fc_norm is not None:
            sequence_output = self.fc_norm(sequence_output.mean(1))
        else:
            sequence_output = sequence_output[:, 0]

        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )