
"""PyTorch TVP Model"""

import math
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ModelOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import prune_linear_layer
from ...utils import logging
from ...utils.backbone_utils import load_backbone
from .configuration_tvp import TvpConfig


logger = logging.get_logger(__name__)


@dataclass
class TvpVideoGroundingOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Temporal-Distance IoU loss for video grounding.
        logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Contains start_time/duration and end_time/duration. It is the time slot of the videos corresponding to the
            input texts.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
            the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


class TvpLoss(nn.Module):
    """
    This class computes the losses for `TvpForVideoGrounding`. Given the logits predicted by the video grounding
    head and the ground-truth time span, it evaluates the requested combination of IoU, mid-point distance and
    duration losses between the predicted and ground-truth time windows.

    Args:
        losses (`List[str]`):
            List of all the losses to be applied.
    """

    def __init__(self, losses):
        super().__init__()
        self.loss_map = {
            "iou": self.loss_iou,
            "distance": self.loss_distance,
            "duration": self.loss_duration,
        }
        for loss in losses:
            if loss not in self.loss_map:
                raise ValueError(f"Loss {loss} not supported")

        self.losses = losses

    def loss_iou(self, start_time, end_time, candidates_start_time, candidates_end_time, duration):
        """
        Measure the intersection over union.
        """
        inter = torch.min(candidates_end_time, end_time) - torch.max(candidates_start_time, start_time)
        union = torch.max(candidates_end_time, end_time) - torch.min(candidates_start_time, start_time)
        iou = 1 - inter.clamp(min=0) / union

        return iou

    def loss_distance(self, start_time, end_time, candidates_start_time, candidates_end_time, duration):
        """
        Measure the distance of mid points.
        """
        mid_candidates = torch.div(torch.add(candidates_start_time, candidates_end_time), 2.0)
        mid_groundtruth = torch.div(torch.add(start_time, end_time), 2.0)
        distance_diff = torch.div(
            torch.max(mid_candidates, mid_groundtruth) - torch.min(mid_candidates, mid_groundtruth), duration
        ).clamp(min=0.2)

        return distance_diff

    def loss_duration(self, start_time, end_time, candidates_start_time, candidates_end_time, duration):
        """
        Measure the difference of duration.
        """
        duration_candidates = torch.sub(candidates_end_time, candidates_start_time)
        duration_groundtruth = torch.sub(end_time, start_time)
        duration_diff = torch.square(torch.div(torch.sub(duration_candidates, duration_groundtruth), duration))
        duration_diff = duration_diff.clamp(min=0.4)

        return duration_diff

    def forward(self, logits, labels):
        """
        This performs the loss computation.

        Args:
            logits (`torch.FloatTensor`):
                The output logits of head module.
            labels (`List[torch.FloatTensor]`):
                List of tensors ([start, end, duration]), which contains the start time and end time of the video
                span corresponding to the text, as well as the total duration of the video.
        """
        duration, start_time, end_time = labels
        candidates = torch.mul(logits, duration)
        candidates_start_time, candidates_end_time = candidates[:, 0].float(), candidates[:, 1].float()

        losses_dict = {}
        for loss in self.losses:
            losses_dict.update(
                {loss: self.loss_map[loss](start_time, end_time, candidates_start_time, candidates_end_time, duration)}
            )

        return losses_dict
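
# Illustrative usage sketch for `TvpLoss` (not part of the upstream API; values are made up).
# The grounding head emits normalized (start, end) fractions and labels are [duration, start, end]:
#
#     criterion = TvpLoss(["iou", "distance", "duration"])
#     logits = torch.tensor([[0.25, 0.60]])                    # (start/duration, end/duration)
#     duration = torch.tensor([100.0])                         # video length in seconds
#     start, end = torch.tensor([20.0]), torch.tensor([70.0])  # ground-truth span
#     losses = criterion(logits, [duration, start, end])       # dict with "iou"/"distance"/"duration"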
Nconfighidden_sizeshidden_sizezBackbone config not foundr   r   F)kernel_sizestridepaddinggroupsbias)r,   r-   r   backbonebackbone_configr\   hasattrr[   r]   r2   r   Conv2dgrid_encoder_conv)r4   r[   in_channelsr5   s      r$   r-   zTvpVisionModel.__init__   s   %f--!- 0=bAKKT]H-- 	:'$-:NP^2_2_ 	:-.;B?KKT]H-- 	:'$-:NP]2^2^ 	:-.:KK8999!#"
 "
 "
r#   c                    |j         \  }}}}}|                    ||z  |||          }|                     |          d         d         }|                     |          }t          j                            |dd          }t          j                            |d          }|j         dd          \  }	}
}|                    |||	|
|          }|                    ddd	d
d          }|S )Nfeature_mapsr      )r^   r_   T)inplacer   r      )	shapeviewrc   rg   r   
functional
max_pool2drelupermute)r4   pixel_values
batch_size
num_framesnum_channelsheightwidthgrid_feat_outputsgridnew_channel
new_height	new_widths               r$   rT   zTvpVisionModel.forward   s    >J>P;
Jfe#((j)@,PVX]^^ MM,77GJ%%&788}''!A'FF}!!$!55-1Z_*ZyyZj)TT||Aq!Q**r#   r   r   r   r-   rT   rU   rV   s   @r$   rX   rX      sG        
 
 
 
 
.      r#   rX   c                   j     e Zd ZdZ fdZdej        dededej        fdZdd	e	fd
Z
dd	e	fdZ xZS )TvpVisualInputEmbeddingz;


class TvpVisualInputEmbedding(nn.Module):
    """
    Takes input of both image and video (multi-frame)
    """

    def __init__(self, config):
        super().__init__()
        # sequence embedding
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.row_position_embeddings = nn.Embedding(config.max_grid_row_position_embeddings, config.hidden_size)
        self.col_position_embeddings = nn.Embedding(config.max_grid_col_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(1, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.max_grid_row_position_embeddings = config.max_grid_row_position_embeddings
        self.max_grid_col_position_embeddings = config.max_grid_col_position_embeddings

    def interpolate_pos_encoding(self, embedding: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on
        collections of high resolution images (high resolution videos).
        """
        h0 = w0 = 1
        # interpolate only along dimensions that exceed the pre-trained embedding table
        if height > self.max_grid_row_position_embeddings:
            h0 = height / self.max_grid_row_position_embeddings
        if width > self.max_grid_col_position_embeddings:
            w0 = width / self.max_grid_col_position_embeddings
        embedding = embedding.permute(0, 3, 1, 2)  # (batch_size, hidden_dim, height, width)
        embedding = nn.functional.interpolate(
            embedding,
            scale_factor=(h0, w0),
            mode="bicubic",
            align_corners=False,
        )
        embedding = embedding.permute(0, 2, 3, 1)  # (batch_size, height, width, hidden_dim)
        return embedding

    def add_2d_positional_embeddings(self, grid, interpolate_pos_encoding: bool = False):
        """
        Args:
            grid: (batch_size, height, width, hidden_dim)
            interpolate_pos_encoding: (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the pre-trained position encodings.
        Returns:
            grid + col_position_embeddings.view(*col_shape): (batch_size, *, height, width, hidden_dim)
        """
        batch_size, height, width, hidden_dim = grid.shape

        # add row-wise position embeddings, (height, hidden_dim)
        row_height = min(self.max_grid_row_position_embeddings, height)
        row_position_ids = torch.arange(row_height, dtype=torch.long, device=grid.device)
        row_position_embeddings = self.row_position_embeddings(row_position_ids)
        row_shape = (1,) * (len(grid.shape) - 3) + (row_height, 1, hidden_dim)
        # (batch_size, height, 1, hidden_dim)
        row_position_embeddings = row_position_embeddings.view(*row_shape)

        # add column-wise position embeddings, (width, hidden_dim)
        row_width = min(self.max_grid_col_position_embeddings, width)
        col_position_ids = torch.arange(row_width, dtype=torch.long, device=grid.device)
        col_position_embeddings = self.col_position_embeddings(col_position_ids)
        col_shape = (batch_size, 1, row_width, hidden_dim)
        # (batch_size, 1, width, hidden_dim)
        col_position_embeddings = col_position_embeddings.view(*col_shape)
        positional_embeddings = row_position_embeddings + col_position_embeddings

        # interpolation is triggered only when the input grid is larger than the pre-trained
        # position embedding table in at least one dimension
        if interpolate_pos_encoding and (
            height > self.max_grid_row_position_embeddings or width > self.max_grid_col_position_embeddings
        ):
            grid = grid + self.interpolate_pos_encoding(positional_embeddings, height, width)
        else:
            grid = grid + positional_embeddings
        return grid

    def forward(self, grid, interpolate_pos_encoding: bool = False):
        """
        Args:
            grid: Array of shape (batch_size, num_frames, height, width, num_channels).
                It contains processed frames extracted from videos, and is generated by Tvp image preprocessor. Note,
                num_frames can be 1
            interpolate_pos_encoding: (bool, *optional*, defaults to `False`):
                Whether to interpolate the pre-trained position encodings.

        Returns:
            embeddings: The embedding of grid with size (batch_size, height*width, num_channels)

        """
        batch_size, num_frames, height, width, num_channels = grid.shape
        # temporal mean pooling, (batch_size, height, width, hidden_size)
        grid = grid.mean(1)
        grid = self.add_2d_positional_embeddings(grid, interpolate_pos_encoding=interpolate_pos_encoding)
        # image token sequence, (batch_size, height*width, num_channels)
        visual_tokens = grid.view(batch_size, -1, num_channels)
        visual_tokens_shape = visual_tokens.shape[:-1]
        device = visual_tokens.device

        # image token type embeddings
        token_type_ids = torch.zeros(visual_tokens_shape, dtype=torch.long, device=device)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = visual_tokens + token_type_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)

        return embeddings


class TvpTextInputEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand(input_shape)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)
        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class TvpAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention heads {config.num_attention_heads}"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        self.attn_dropout = nn.Dropout(config.attention_probs_dropout_prob)

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        mask = torch.ones(self.num_attention_heads, self.attention_head_size)
        heads = set(heads) - self.pruned_heads  # convert to set and remove already pruned heads
        for head in heads:
            # compute how many pruned heads are before this head and shift the index accordingly
            head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
            mask[head] = 0
        mask = mask.view(-1).contiguous().eq(1)
        index = torch.arange(len(mask))[mask].long()

        # prune linear layers
        self.query = prune_linear_layer(self.query, index)
        self.key = prune_linear_layer(self.key, index)
        self.value = prune_linear_layer(self.value, index)
        self.dense = prune_linear_layer(self.dense, index, dim=1)

        # update hyper params and store pruned heads
        self.num_attention_heads = self.num_attention_heads - len(heads)
        self.all_head_size = self.attention_head_size * self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def _reshape(self, tensor: torch.Tensor, sequence_length: int, batch_size: int):
        return (
            tensor.view(batch_size, sequence_length, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
            .contiguous()
        )

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions: Optional[bool] = None,
    ):
        batch_size, sequence_length = hidden_states.shape[:2]
        mixed_query_layer = self.query(hidden_states)

        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self._reshape(mixed_query_layer, sequence_length, batch_size)
        key_layer = self._reshape(mixed_key_layer, sequence_length, batch_size)
        value_layer = self._reshape(mixed_value_layer, sequence_length, batch_size)

        # take the dot product between "query" and "key" to get the raw attention scores
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask

        # normalize the attention scores to probabilities
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # this drops out entire tokens to attend to, which might seem unusual,
        # but is taken from the original Transformer paper
        attention_probs = self.attn_dropout(attention_probs)

        # mask heads if requested
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        attn_output = torch.matmul(attention_probs, value_layer)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(batch_size, sequence_length, self.all_head_size)

        attn_output = self.dense(attn_output)
        attn_output = self.dropout(attn_output)
        attn_output = self.layer_norm(attn_output + hidden_states)
        # add attentions if we output them
        outputs = (attn_output, attention_probs) if output_attentions else (attn_output,)
        return outputs


class TvpIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class TvpOutputLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.layer_norm(hidden_states + input_tensor)
        return hidden_states


class TvpEncodeLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention = TvpAttention(config)
        self.intermediate = TvpIntermediate(config)
        self.output = TvpOutputLayer(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions: Optional[bool] = None,
    ):
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        outputs = (layer_output,) + outputs
        return outputs


class TvpEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([TvpEncodeLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        all_hidden_states = ()
        all_attentions = ()

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    (head_mask[i] if head_mask is not None else None),
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    (head_mask[i] if head_mask is not None else None),
                    output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        # add the last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            outputs = (hidden_states,)
            if output_hidden_states:
                outputs = outputs + (all_hidden_states,)
            if output_attentions:
                outputs = outputs + (all_attentions,)
            return outputs  # last-layer hidden state, (all hidden states), (all attentions)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states if output_hidden_states else None,
            attentions=all_attentions if output_attentions else None,
        )


class TvpPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # "pool" the model by simply taking the hidden state corresponding to the first token
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class TvpPreTrainedModel(PreTrainedModel):
    """An abstract class to handle weights initialization and
    a simple interface for downloading and loading pretrained models.
    """

    config_class = TvpConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
        if isinstance(module, nn.Conv2d):
            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)


TVP_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`TvpConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

TVP_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)

        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`TvpImageProcessor`]. See [`TvpImageProcessor.__call__`]
            for details.

        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
            Whether to interpolate the pre-trained image pad prompter encodings and positional encodings.
"""
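
# The two modules below implement TVP's learnable visual prompts: trainable pixel pads that are
# written onto the border of every frame before it reaches the vision backbone. A rough sketch of
# the "replace"-style update performed on each frame (names here are illustrative only):
#
#     frame[..., -prompt_size:, :] = 0    # zero out the band covered by the prompt
#     frame = frame + learned_pad         # add the trainable pad values back in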


class TvpFrameDownPadPrompter(nn.Module):
    """
    Pad frames extracted from videos only at the bottom.
    """

    def __init__(self, config):
        if config.visual_prompter_apply not in ("add", "replace", "remove"):
            raise ValueError("`visual_prompter_apply` must be in (add, replace, remove)")

        super().__init__()
        self.visual_prompt_size = config.visual_prompt_size
        self.frame_num = config.frame_num
        self.max_img_size = config.max_img_size
        self.visual_prompter_apply = config.visual_prompter_apply

        self.pad_down = nn.Parameter(
            torch.randn([1, config.frame_num, 3, config.visual_prompt_size, config.max_img_size])
        )

    def forward(self, pixel_values):
        if self.visual_prompter_apply != "add":
            visual_prompt_mask = torch.ones(
                [self.max_img_size, self.max_img_size], dtype=pixel_values.dtype, device=pixel_values.device
            )
            visual_prompt_mask[self.max_img_size - self.visual_prompt_size : self.max_img_size, :] = 0.0
            pixel_values *= visual_prompt_mask
        if self.visual_prompter_apply != "remove":
            prompt = torch.zeros(
                [pixel_values.shape[0], pixel_values.shape[1], 3, self.max_img_size, self.max_img_size],
                device=pixel_values.device,
            )
            start_point = self.max_img_size - self.visual_prompt_size
            prompt[:, :, :, start_point : self.max_img_size, :] = self.pad_down
            pixel_values += prompt.to(pixel_values.dtype)
        return pixel_values


class TvpFramePadPrompter(nn.Module):
    """
    Pad frames extracted from videos in the surroundings.
    """

    def __init__(self, config):
        if config.visual_prompter_apply not in ("add", "replace", "remove"):
            raise ValueError("`visual_prompter_apply` must be in (add, replace, remove)")

        super().__init__()
        self.num_frames = config.num_frames
        self.max_img_size = config.max_img_size
        self.visual_prompter_apply = config.visual_prompter_apply
        self.base_size = config.max_img_size - config.visual_prompt_size * 2

        self.pad_up = nn.Parameter(
            torch.randn([1, config.num_frames, 3, config.visual_prompt_size, config.max_img_size])
        )
        self.pad_down = nn.Parameter(
            torch.randn([1, config.num_frames, 3, config.visual_prompt_size, config.max_img_size])
        )
        self.pad_left = nn.Parameter(
            torch.randn(
                [
                    1,
                    config.num_frames,
                    3,
                    config.max_img_size - config.visual_prompt_size * 2,
                    config.visual_prompt_size,
                ]
            )
        )
        self.pad_right = nn.Parameter(
            torch.randn(
                [
                    1,
                    config.num_frames,
                    3,
                    config.max_img_size - config.visual_prompt_size * 2,
                    config.visual_prompt_size,
                ]
            )
        )

    def interpolate_pad_encoding(self, prompt: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained pad weights, to be able to use the model on collections of
        high resolution images (high resolution videos).
        """
        h0, w0 = height / self.max_img_size, width / self.max_img_size

        batch, num_frames, channels, prompt_height, prompt_width = prompt.shape

        # fold batch and frame dimensions together, (b, frames, c, h, w) -> (b * frames, c, h, w),
        # so that bicubic interpolation can be applied
        prompt = prompt.reshape(batch * num_frames, channels, prompt_height, prompt_width)
        prompt = nn.functional.interpolate(
            prompt,
            scale_factor=(h0, w0),
            mode="bicubic",
            align_corners=False,
        )
        # unfold back to (batch, frames, channels, height, width) with the new interpolated size
        prompt = prompt.reshape(batch, num_frames, channels, height, width)
        return prompt

    def forward(self, pixel_values, interpolate_pad_encoding: bool = False):
        height, width = (
            (pixel_values.shape[-2], pixel_values.shape[-1])
            if interpolate_pad_encoding
            else (self.max_img_size, self.max_img_size)
        )
        if self.visual_prompter_apply not in ("add", "remove", "replace"):
            raise ValueError(f"Invalid visual_prompter_apply value {self.visual_prompter_apply}")
        if self.visual_prompter_apply in ("replace", "remove"):
            visual_prompt_mask = torch.ones([height, width], dtype=pixel_values.dtype, device=pixel_values.device)
            pixel_values *= visual_prompt_mask
        if self.visual_prompter_apply in ("replace", "add"):
            base = torch.zeros(1, self.num_frames, 3, self.base_size, self.base_size, device=pixel_values.device)
            prompt = torch.cat([self.pad_left, base, self.pad_right], dim=4)
            prompt = torch.cat([self.pad_up, prompt, self.pad_down], dim=3)
            prompt = torch.cat(pixel_values.size(0) * [prompt])
            if interpolate_pad_encoding:
                prompt = self.interpolate_pad_encoding(prompt, height, width)
            pixel_values = pixel_values + prompt.to(pixel_values.dtype)
        return pixel_values
Lu| S QT Y^Ye    0 d        r#   ro  )framedownpadframepadzmThe bare Tvp Model transformer outputting BaseModelOutputWithPooling object without any specific head on top.c                   .    e Zd Z fdZd Zd Zd Z ee           e	e
e          	 	 	 	 	 	 	 	 ddeej                 d	eej                 d
eej                 deej                 dee         dee         dee         defd                        Z xZS )TvpModelc                 b   t                                          |           || _        t          |          | _        t          |          | _        t          |          | _        t          |          | _
        t          |          | _        t          j        t          j        dd|j        g                    | _        t          j        |j                  | _        |j        t.          vrt1          d          t/          |j                 |          | _        |                                  d S )Nr   
   z:`visual_prompter_type` must be in (framedownpad, framepad))r,   r-   r[   rX   vision_modelr   r   r   visual_embeddingsr%  encoderr=  poolerr   re  r   rf  r]   text_promptr   r   r   visual_prompter_typeTVP_PROMPTER_CLASSES_MAPPINGr2   visual_prompter	post_initr   s     r$   r-   zTvpModel.__init__  s       *622088!8!@!@!&))''<QF<N4O(P(PQQz&"<==&.JJJYZZZ;F<WXY_``r#   c                     | j         j        S r  r   r   )r4   s    r$   get_input_embeddingszTvpModel.get_input_embeddings  s    ..r#   c                     || j         _        d S r  r  )r4   r   s     r$   set_input_embeddingszTvpModel.set_input_embeddings  s    */'''r#   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )zPrunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        N)itemsr  r-  r  r   )r4   heads_to_pruner-  r   s       r$   _prune_headszTvpModel._prune_heads"  sU     +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr#   output_typerW  NFr   ru   r   r   r   r/  r0  r   c	                 6   ||n| j         j        }|                     |                     ||                    }|                     |          }	|                     ||          }
||                    |
j        dd                   }t          j	        |j        d         d          
                    |j        |j                  }t          j        |||gd	
          }|                     ||                                          
                    |j                  }| j                            |	j        d         d	d	          }t          j        ||	|
gd
          }|                     |||                     || j         j                  |||          }|r|j        n|d         }|                     |          }|                     |          }|                     |          }|s||f|dd         z   S t1          |||j        |j                  S )a(  
        Returns:

        Examples:
        ```python
        >>> import torch
        >>> from transformers import AutoConfig, AutoTokenizer, TvpModel

        >>> model = TvpModel.from_pretrained("Jiqing/tiny-random-tvp")

        >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

        >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
        >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
        >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        pixel_values = self.vision_model(
            self.visual_prompter(pixel_values, interpolate_pad_encoding=interpolate_pos_encoding)
        )
        text_embedding_output = self.embeddings(input_ids=input_ids)
        visual_embedding_output = self.visual_embeddings(
            pixel_values, interpolate_pos_encoding=interpolate_pos_encoding
        )
        if attention_mask is not None:
            # (batch_size, visual_sequence_length)
            visual_attention_mask = attention_mask.new_ones(visual_embedding_output.shape[:2])
            pt_mask = torch.ones(attention_mask.shape[0], 1).to(
                device=attention_mask.device, dtype=attention_mask.dtype
            )
            attention_mask = torch.cat([pt_mask, attention_mask, visual_attention_mask], dim=-1)
            # make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
            attention_mask = self.get_extended_attention_mask(attention_mask, input_ids.size()).to(input_ids.device)
        text_prompt = self.text_prompt.expand(text_embedding_output.shape[0], -1, -1)
        # (batch_size, sequence_length, hidden_size)
        embedding_output = torch.cat([text_prompt, text_embedding_output, visual_embedding_output], dim=1)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=attention_mask,
            head_mask=self.get_head_mask(head_mask, self.config.num_hidden_layers),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = encoder_outputs.last_hidden_state if return_dict else encoder_outputs[0]
        pooled_output = self.pooler(last_hidden_state)
        last_hidden_state = self.dropout(last_hidden_state)
        pooled_output = self.dropout(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class TvpVideoGroundingHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer_0 = nn.Linear(config.hidden_size, config.hidden_size * 2)
        self.layer_1 = nn.Linear(config.hidden_size * 2, 2)
        self.activation_0 = nn.ReLU()
        self.activation_1 = nn.Sigmoid()

    def forward(self, pooler_output):
        logits = self.activation_0(self.layer_0(pooler_output))
        logits = self.activation_1(self.layer_1(logits))
        return logits


@add_start_docstrings(
    """
    Tvp Model with a video grounding head on top computing IoU, distance, and duration loss.
    """,
    TVP_START_DOCSTRING,
)
class TvpForVideoGrounding(TvpPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.model = TvpModel(config)
        self.video_grounding_head = TvpVideoGroundingHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(TVP_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TvpVideoGroundingOutput, config_class="TvpConfig")
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        labels: Optional[Tuple[torch.Tensor]] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ):
        """
        labels (`torch.FloatTensor` of shape `(batch_size, 3)`, *optional*):
            The labels contains duration, start time, and end time of the video corresponding to the text.
        Returns:

        Examples:
        ```python
        >>> import torch
        >>> from transformers import AutoConfig, AutoTokenizer, TvpForVideoGrounding

        >>> model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp")

        >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

        >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
        >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
        >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        outputs = self.model(
            input_ids,
            pixel_values,
            attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        pooler_output = outputs[1]
        logits = self.video_grounding_head(pooler_output)

        loss = None
        if labels is not None:
            criterion = TvpLoss(["iou", "distance", "duration"])
            criterion.to(self.device)
            loss_dict = criterion(logits, labels)
            loss = (
                loss_dict["iou"]
                + self.config.distance_loss_weight * loss_dict["distance"]
                + self.config.duration_loss_weight * loss_dict["duration"]
            )
        if not return_dict:
            outputs = (logits,) + outputs[2:]
            if loss is not None:
                outputs = (loss,) + outputs
            return outputs

        return TvpVideoGroundingOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
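

# Minimal end-to-end sketch mirroring the doctest examples above. It downloads the
# "Jiqing/tiny-random-tvp" checkpoint, so it is guarded and only runs when this file is executed
# directly rather than imported.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp")
    tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

    pixel_values = torch.rand(1, 1, 3, 448, 448)  # (batch, frames, channels, height, width)
    text_inputs = tokenizer("This is an example input", return_tensors="pt")
    output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
    print(output.logits)  # (batch_size, 2): normalized start/end times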