
    g y                    f   d Z ddlZddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
mZmZmZ ddlZddlZddlmZ ddlmZ dd	lmZmZ dd
lmZmZmZ ddlmZmZmZmZmZm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'  ej(        e)          Z*dZ+dZ,dZ-dZ.dZ/g dZ0dZ1dZ2ee'e%e&f         Z3e G d de                      Z4e G d de                      Z5e G d de                      Z6 G d dej7                  Z8 G d dej7                  Z9 G d d ej7                  Z: G d! d"ej7                  Z; G d# d$ej7                  Z< G d% d&ej7                  Z= G d' d(ej7                  Z> G d) d*ej7                  Z? G d+ d,ej7                  Z@ G d- d.ej7                  ZA G d/ d0ej7                  ZBd1ZCd2ZDd3ZEeEeDz   ZFd4ZGeGeDz   ZHd5eDz   ZId6ZJeEeGz   eDz   eJz   ZKd7eGz   eEz   d8z   eDz   ZLd9ZM G d: d;e          ZN ed<eCO                    d=                     G d> d?eN                      ZP ed@eCO                    d=                     G dA dBeN                      ZQ edCeCO                    d=                     G dD dEeN                      ZR edFeCO                    dG=                     G dH dIeN                      ZS G dJ dKej7                  ZT G dL dMej7                  ZU G dN dOej7                  ZV edPeCO                    dQ=                     G dR dSeN                      ZW G dT dUej7                  ZX G dV dWej7                  ZY G dX dYej7                  ZZ G dZ d[ej7                  Z[ ed\eCO                    dG=          eMz              G d] d^eN                      Z\dS )_zPyTorch FLAVA model.    N)OrderedDict)	dataclass)AnyDictListOptionalSetTupleUnion)nn   )ACT2FN)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputadd_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings	torch_int   )FlavaConfigFlavaImageCodebookConfigFlavaImageConfigFlavaMultimodalConfigFlavaTextConfigzfacebook/flava-fullzfacebook/flava-image-codebookr   r    r   )r         g$(~k@c                       e Zd ZU dZdZeej                 ed<   dZ	ee
         ed<   dZeej                 ed<   dZee
         ed<   dZeej                 ed<   dZee
         ed<   d	ee         fd
ZdS )FlavaModelOutputa  
    Output from FlavaModel containing embeddings and outputs from individual encoders.

    Note that `image_embeddings` and `text_embeddings` returned are similar to pooled output returned from a
    transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
    `text_projection` layers on `image_embeddings` and `text_embeddings` respectively.

    Args:
        image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
            The image embeddings which are basically the pooled output of [`FlavaImageModel`].
        image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
            The output of the [`FlavaImageModel`].
        text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
            The text embeddings which are basically the pooled output of [`FlavaTextModel`].
        text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
            The output of the [`FlavaTextModel`].
        multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
            The multimodal embeddings which are basically the pooled output of [`FlavaMultimodalModel`].
        multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
            The output of the [`FlavaMultimodalModel`].
    """

    image_embeddings: Optional[torch.FloatTensor] = None
    image_output: Optional[BaseModelOutputWithPooling] = None
    text_embeddings: Optional[torch.FloatTensor] = None
    text_output: Optional[BaseModelOutputWithPooling] = None
    multimodal_embeddings: Optional[torch.FloatTensor] = None
    multimodal_output: Optional[BaseModelOutputWithPooling] = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_output", "image_output", "multimodal_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


@dataclass
class FlavaLosses(ModelOutput):
    """Class representing pretraining losses from FLAVA model

    Args:
        mim (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels` and `pixel_values` are present, `input_ids_masked` is absent and `mim_weight` > 0.:
            Masked Image Modeling loss as used in BeIT calculated only for unimodal image data.
        mlm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels` and `input_ids_masked` are present, `pixel_values` is absent and `mlm_weight` > 0.:
            Masked Language Modeling loss as used in BERT calculated only for unimodal text data.
        itm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `itm_labels`, `input_ids_masked`, `pixel_values` are present and `itm_weight` > 0.:
            Image Text Matching (ITM) loss calculated for paired image-text data. Note that ITM loss is calculated on
            masked pairs in FLAVA.
        global_contrastive (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `input_ids` and `pixel_values` are present and `global_contrastive_weight` > 0.:
            Contrastive loss for image-text similarity similar to CLIP but calculated globally for paired image-text
            data. This is calculated on unmasked images and texts.
        mmm_image (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_image_weight` > 0.:
            Masked Multimodal Modeling loss's image component calculated on paired image-text data.
        mmm_text (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_text_weight` > 0.:
            Masked Multimodal Modeling loss's text component calculated on paired image-text data.
    """

    mim: Optional[torch.FloatTensor] = None
    mlm: Optional[torch.FloatTensor] = None
    itm: Optional[torch.FloatTensor] = None
    global_contrastive: Optional[torch.FloatTensor] = None
    mmm_image: Optional[torch.FloatTensor] = None
    mmm_text: Optional[torch.FloatTensor] = None

    def all_none(self) -> bool:
        all_none = True
        for v in self.values():
            if v is not None:
                all_none = False
                break
        return all_none


@dataclass
class FlavaForPreTrainingOutput(ModelOutput):
    """
    Output from FlavaForPreTraining containing embeddings, and outputs from individual encoders.

    Note that `image_embeddings` and `text_embeddings` returned are similar to pooled output returned from a
    transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
    `text_projection` layers on `image_embeddings` and `text_embeddings` respectively.

    Args:
        loss (`torch.FloatTensor`, *optional*, returned when `return_loss` is True):
            Total loss calculated for this model.
        loss_info (`FlavaLosses`):
            Detailed info for FLAVA Pretraining losses. Check `FlavaLosses` class description for the information on
            the keys.
        image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
            The image embeddings which are basically the pooled output of [`FlavaImageModel`].
        image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
            The output of the [`FlavaImageModel`].
        text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
            The text embeddings which are basically the pooled output of [`FlavaTextModel`].
        text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
            The output of the [`FlavaTextModel`].
        multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
            The multimodal embeddings which are basically the pooled output of [`FlavaMultimodalModel`].
        multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
            The output of the [`FlavaMultimodalModel`].

        image_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
            The image embeddings which are basically the pooled output of [`FlavaImageModel`]. Uses `bool_masked_pos`
            to create masked images.
        image_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
            The output of the [`FlavaImageModel`]. Uses `bool_masked_pos` to create masked images.
        text_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids_masked` are present):
            The text embeddings which are basically the pooled output of [`FlavaTextModel`].
        text_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` are present):
            The output of the [`FlavaTextModel`].
        multimodal_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present):
            The multimodal embeddings which are basically the pooled output of [`FlavaMultimodalModel`].
        multimodal_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
            The output of the [`FlavaMultimodalModel`].

        mim_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)` , *optional*, returned when `pixel_values` are present and `input_ids_masked` are not):
                The logits for MIM unimodal loss. Uses `bool_masked_pos` to get masked patches. The flattened output is
                returned when `bool_masked_pos` has some of the patches masked.
        mlm_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(total_masked_seq_length, text_vocab_size)`, *optional*, returned when `input_ids_masked` are present and `pixel_values` are not):
                The logits for MLM unimodal loss. The flattened output is returned when `input_ids_masked` has some of
                the tokens masked.
        itm_logits (`torch.FloatTensor` of shape `(batch_size, 2)`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
                The logits for ITM loss. Note that ITM loss is calculated on masked pairs in FLAVA.
        mmm_image_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape`(total_masked_patches, image_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
                The logits for MMM image multimodal loss. Uses `bool_masked_pos` to get masked patches. The flattened
                output is returned when `bool_masked_pos` has some of the patches masked.
        mmm_text_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(total_masked_seq_length, text_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
                The logits for MMM text multimodal loss. The flattened output is returned when `input_ids_masked` has
                some of the tokens masked.
        contrastive_logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeddings` and `text_embeddings` but passed through FLAVA's
            `image_projection` and `text_projection` layers respectively. This represents the image-text similarity
            scores. This is calculated on unmasked images and texts.
        contrastive_logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeddings` and `image_embeddings` but passed through FLAVA's
            `text_projection` and `image_projection` layers respectively. This is calculated on unmasked images and
            texts.
    """

    loss: Optional[torch.FloatTensor] = None
    loss_info: FlavaLosses = None
    image_embeddings: Optional[torch.FloatTensor] = None
    image_output: Optional[BaseModelOutputWithPooling] = None
    text_embeddings: Optional[torch.FloatTensor] = None
    text_output: Optional[BaseModelOutputWithPooling] = None
    multimodal_embeddings: Optional[torch.FloatTensor] = None
    multimodal_output: Optional[BaseModelOutputWithPooling] = None
    image_masked_embeddings: Optional[torch.FloatTensor] = None
    image_masked_output: Optional[BaseModelOutputWithPooling] = None
    text_masked_embeddings: Optional[torch.FloatTensor] = None
    text_masked_output: Optional[BaseModelOutputWithPooling] = None
    multimodal_masked_embeddings: Optional[torch.FloatTensor] = None
    multimodal_masked_output: Optional[BaseModelOutputWithPooling] = None
    mim_logits: Optional[torch.FloatTensor] = None
    mlm_logits: Optional[torch.FloatTensor] = None
    itm_logits: Optional[torch.FloatTensor] = None
    contrastive_logits_per_image: Optional[torch.FloatTensor] = None
    contrastive_logits_per_text: Optional[torch.FloatTensor] = None
    mmm_image_logits: Optional[torch.FloatTensor] = None
    mmm_text_logits: Optional[torch.FloatTensor] = None

    def to_tuple(self) -> Tuple[Any]:
        transformer_outputs = [
            "text_output",
            "image_output",
            "multimodal_output",
            "text_masked_output",
            "image_masked_output",
            "multimodal_masked_output",
        ]
        return tuple(self[k] if k not in transformer_outputs else getattr(self, k).to_tuple() for k in self.keys())

   r   r0   rB   r6   r4   rQ   rQ      s9        > >@ )-D(5$
%,,,!I{!!!48hu018889=L(56===37OXe/07778<K45<<<9=8E$56===>Bx :;BBB;?Xe&78???@D"<=DDD:>HU%67>>>?C!;<CCC@D (5+<"=DDDEIh'ABIII.2J*+222.2J*+222.2J*+222@D (5+<"=DDD?C%*;!<CCC48hu0188837OXe/0777	t%* 	t 	t 	t 	t 	t 	tr6   rQ   c            	            e Zd ZdZddededdf fdZdej        d	e	d
e	dej        fdZ
	 	 ddej        deej                 dedej        fdZ xZS )FlavaImageEmbeddingszb

class FlavaImageEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    """

    def __init__(self, config: FlavaImageConfig, use_mask_token: bool = False) -> None:
        super().__init__()

        use_mask_token = use_mask_token or config.mask_token
        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None
        self.patch_embeddings = PatchEmbeddings(
            image_size=config.image_size,
            patch_size=config.patch_size,
            num_channels=config.num_channels,
            embed_dim=config.hidden_size,
        )
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.patch_size = config.patch_size
        self.config = config

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows interpolating the pre-trained position encodings so that the model can be used on higher-resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """

        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        interpolate_pos_encoding: bool = False,
    ) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        batch_size, seq_len, _ = embeddings.size()
        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # B X H X W = B X HW
            if bool_masked_pos.dim() == 3:
                bool_masked_pos = bool_masked_pos.view(bool_masked_pos.size(0), -1)
            # replace the masked visual tokens by mask_tokens
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        # add the [CLS] token to the embedded patch tokens
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embeddings

        embeddings = self.dropout(embeddings)

        return embeddings


class PatchEmbeddings(nn.Module):
    """
    Image to Patch Embedding.
    """

    def __init__(
        self,
        image_size: int = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        num_channels: int = 3,
        embed_dim: int = 768,
    ):
        super().__init__()
        if not isinstance(image_size, collections.abc.Iterable):
            image_size = (image_size, image_size)
        if not isinstance(patch_size, collections.abc.Iterable):
            patch_size = (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if not interpolate_pos_encoding:
            if height != self.image_size[0] or width != self.image_size[1]:
                raise ValueError(
                    f"Input image size ({height}*{width}) doesn't match model"
                    f" ({self.image_size[0]}*{self.image_size[1]})."
                )
        x = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return x


class FlavaTextEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
    ):
        input_shape = input_ids.size()
        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        # Setting the token_type_ids to the registered buffer of zeros helps users trace the model
        # without passing token_type_ids.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class FlavaSelfAttention(nn.Module):
    def __init__(self, config: FlavaPossibleConfigs) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the model's forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class FlavaSelfOutput(nn.Module):
    """
    The residual connection is defined in FlavaLayer (same as ViTLayer) instead of here (as is the case with other
    models), due to the layernorm applied before each block.
    """

    def __init__(self, config: FlavaPossibleConfigs) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class FlavaAttention(nn.Module):
    def __init__(self, config: FlavaPossibleConfigs) -> None:
        super().__init__()
        self.attention = FlavaSelfAttention(config)
        self.output = FlavaSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: Set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_outputs = self.attention(
            hidden_states, attention_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions
        )

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class FlavaIntermediate(nn.Module):
    def __init__(self, config: FlavaPossibleConfigs) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class FlavaOutput(nn.Module):
    def __init__(self, config: FlavaPossibleConfigs) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states


class FlavaLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: FlavaPossibleConfigs) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = FlavaAttention(config)
        self.intermediate = FlavaIntermediate(config)
        self.output = FlavaOutput(config)

        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in ViT, layernorm is applied before self-attention
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in ViT, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        outputs = (layer_output,) + outputs

        return outputs


class FlavaEncoder(nn.Module):
    def __init__(self, config: FlavaConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([FlavaLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
        )


class FlavaPooler(nn.Module):
    def __init__(self, config: FlavaPossibleConfigs):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor):
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


FLAVA_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`{config}`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

FLAVA_INPUTS_DOCSTRING_COMMON = r"""
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
a;  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`FlavaImageProcessor.__call__`] for details.

        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        interpolate_pos_encoding (`bool`, *optional*):
            Whether to interpolate the pre-trained position encodings.
"""

FLAVA_IMAGE_INPUTS_DOCSTRING = FLAVA_IMAGE_INPUTS_DOCSTRING_BASE + FLAVA_INPUTS_DOCSTRING_COMMON

FLAVA_TEXT_INPUTS_DOCSTRING_BASE = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)

        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)
"""

FLAVA_TEXT_INPUTS_DOCSTRING = FLAVA_TEXT_INPUTS_DOCSTRING_BASE + FLAVA_INPUTS_DOCSTRING_COMMON

FLAVA_MULTIMODAL_INPUTS_DOCSTRING_BASE = r"""
    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch_size, image_num_patches + text_seq_len, hidden_size)`):
            The concatenated hidden states of unimodal encoders.
"""

FLAVA_MULTIMODAL_INPUTS_DOCSTRING = FLAVA_MULTIMODAL_INPUTS_DOCSTRING_BASE + FLAVA_INPUTS_DOCSTRING_COMMON

FLAVA_MODEL_INPUTS_DOCSTRING_BASE = r"""
    Args:
        skip_multimodal_encoder (*bool*, *optional*):
            Skip any calculations for multimodal encoder. Useful if multimodal encoding is not going to be used.
"""

FLAVA_MODEL_INPUTS_DOCSTRING = (
    FLAVA_IMAGE_INPUTS_DOCSTRING_BASE
    + FLAVA_TEXT_INPUTS_DOCSTRING_BASE
    + FLAVA_INPUTS_DOCSTRING_COMMON
    + FLAVA_MODEL_INPUTS_DOCSTRING_BASE
)

FLAVA_PRETRAINING_INPUTS_DOCSTRING = (
    r"""
    Args:
        input_ids_masked (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary. These ones are the masked version of the original task
            to be used with MLM. Indices can be obtained using [`AutoTokenizer`] along with
            [`DataCollatorForMaskedLanguageModeling`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)

"""
    + FLAVA_TEXT_INPUTS_DOCSTRING_BASE
    + FLAVA_IMAGE_INPUTS_DOCSTRING_BASE
    + r"""
        image_attention_mask (`torch.FloatTensor` of shape `({1})`, *optional*):
            Mask to avoid performing attention on padding token indices specifically for images. Mask values selected
            in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)

        skip_unmasked_multimodal_encoder (*bool*, *optional*):
            Skip any calculations for multimodal encoder for unmasked inputs. FLAVA pretraining doesn't need unmasked
            multimodal embeddings or outputs as of now.

        mlm_labels (`torch.LongTensor` of shape `(batch_size, text_seq_len)`, *optional*):
            Labels for computing the left-to-right language and multimodal masked modeling loss (next word prediction).
            Indices should be in `[-100, 0, ..., text_config.vocab_size - 1]` (see `input_ids` docstring). Tokens with
            indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0,
            ..., text_config.vocab_size - 1]`.

        mim_labels (`torch.LongTensor` of shape `(batch_size, image_num_patches)`, *optional*):
            Labels for computing the image and multimodal masked modeling loss. Indices should be in `[-100, 0, ...,
            image_config.vocab_size - 1]`. Tokens with indices set to `-100` are ignored (masked), the loss is only
            computed for the tokens with labels in `[0, ..., image_config.vocab_size - 1]`. If not passed, they are
            generated automatically using the image codebook assigned to the model. By default, it uses
            [`FlavaImageCodebook`]. See [`FlavaImageCodebook`] to understand how to generate mim_labels.

        itm_labels (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*):
            Labels for computing the image-text matching loss. 0 means the pairs don't match and 1 means they match.
            The pairs with 0 will be skipped for calculation of MMM and global contrastive losses as well.

        return_loss (`bool`, *optional*, defaults to `None`):
            Whether to return calculated loss or not.
"""
    + FLAVA_INPUTS_DOCSTRING_COMMON
)
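
# Illustrative note (a sketch, not part of the original file) of how `mim_labels`
# can be built from the image codebook referenced in the docstring above
# (variable names are assumptions):
#
#     codebook_ids = image_codebook.get_codebook_indices(codebook_pixel_values)
#     mim_labels = codebook_ids.clone()
#     mim_labels[~bool_masked_pos.bool()] = -100  # unmasked patches are ignored, per the docstring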

FLAVA_PRETRAINING_START_DOCSTRING_EXTRA = r"""
    Parameters:
        image_codebook ([`nn.Module`]): If passed, the image codebook will be set to this. Otherwise, it will
            be initialized using the image_codebook_config defined in the config first as the first parameter.
c                   ^    e Zd ZdZeZdZdZdee	j
        e	j        e	j        f         ddfdZdS )FlavaPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    flavaTmoduler+   Nc                    t          |t          j        t          j        f          rT|j        j                            d| j        j                   |j	         |j	        j        
                                 dS dS t          |t          j                  r_|j        j                            d| j        j                   |j        +|j        j        |j                 
                                 dS dS t          |t          j                  r?|j	        j        
                                 |j        j                            d           dS dS )zInitialize the weightsg        )meanstdNr   )r   r   r   r   weightdatanormal_rg   initializer_ranger   zero_r   r   r   fill_)r3   rc  s     r4   _init_weightsz"FlavaPreTrainedModel._init_weights^  s0   fry")455 	* M&&CT[5R&SSS{& &&((((( '&-- 	*M&&CT[5R&SSS!-"6#56<<>>>>> .--- 	*K""$$$M$$S)))))	* 	*r6   )r;   r<   r=   r>   r   config_classbase_model_prefixsupports_gradient_checkpointingr   r   r   r   r   rm  rB   r6   r4   ra  ra  T  sf         
 L&*#*E")RY*L$M *RV * * * * * *r6   ra  zeThe bare FLAVA Image Model transformer outputting raw hidden-states without any specific head on top.)rg   c                       e Zd ZeZdZdZddedef fdZde	j
        fdZd	e	j
        fd
Zdeeee         f         ddfdZ ee                    d                     eeeede          	 	 	 	 	 	 	 	 ddeej                 deej                 dee         deej                 deej                 dee         dee         dee         deeef         fd                        Z xZ S )FlavaImageModelzflava.image_modelr   Trg   add_pooling_layerc                 J   t                                          |           || _        t          |          | _        t          |          | _        t          j        |j	        |j
                  | _        |rt          |          nd | _        |                                  d S Nr   )rn   ro   rg   rf   r}   r>  encoderr   r   rs   r   	layernormrX  pooler	post_initr3   rg   rs  r|   s      r4   ro   zFlavaImageModel.__init__y  s       .v66#F++f&8f>STTT->Hk&)))Dr6   r+   c                     | j         j        S rc   r}   rv   r:   s    r4   get_input_embeddingsz$FlavaImageModel.get_input_embeddings  s    //r6   r   c                     || j         _        d S rc   r|  r3   r   s     r4   set_input_embeddingsz$FlavaImageModel.set_input_embeddings  s    +0(((r6   heads_to_pruneNc                     |                                 D ]/\  }}| j        j        |         j                            |           0dS z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        Nitemsrv  rE  r  r  r3   r  rE  r  s       r4   _prune_headszFlavaImageModel._prune_heads  U    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr6   batch_size, image_num_patchesvision)
checkpointoutput_typern  modalityexpected_outputr   r   r   r   r   rG  rH  c	                    ||n| j         j        }||n| j         j        }||n| j         j        }|t	          d          |                     || j         j                  }|                     |||          }	|                     |	|||||          }
|
d         }| 	                    |          }| j
        | 
                    |          nd }|s||f|
dd          z   S t          |||
j        |
j                  S )Nz You have to specify pixel_values)r   r   r   r   r   rG  rH  r   r   rK  pooler_outputr   rL  )rg   r   rG  use_return_dictr   get_head_maskrD  r}   rv  rw  rx  r   r   rL  )r3   r   r   r   r   r   r   rG  rH  embedding_outputencoder_outputssequence_outputr_  s                r4   r   zFlavaImageModel.forward  sN   & 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]?@@@ &&y$+2OPP	??/Tl + 
 
 ,,)/!5# ' 
 
 *!,..998<8OO444UY 	J#]3oabb6III)-')7&1	
 
 
 	
r6   TNNNNNNNN)!r;   r<   r=   r   rn  ro  main_input_namerO   ro   r   Moduler}  r  r   r   r   r  r   FLAVA_IMAGE_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   !_CONFIG_CLASS_FOR_IMAGE_MODEL_DOC_EXPECTED_IMAGE_OUTPUT_SHAPEr   r?   r   r   r   r8   r   r   r   s   @r4   rr  rr  o  s       
 $L+$O / D      0bi 0 0 0 01") 1 1 1 1C4T#Y+? CD C C C C +*+G+N+NOn+o+opp&.64   046:3715,0,0/3&*3
 3
u|,3
 "%"233
 #+4.	3

 !.3
 EL)3
 $D>3
 'tn3
 d^3
 
u00	13
 3
 3
  qp3
 3
 3
 3
 3
r6   rr  zdThe bare FLAVA Text Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd ZeZdZddedef fdZdefdZ	de
j        fd	Zd
eeee         f         ddfdZ ee                    d                     eeee          	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej                 dee         dee         dee         deeef         fd                        Z xZS )FlavaTextModelzflava.text_modelTrg   rs  c                 J   t                                          |           || _        t          |          | _        t          |          | _        t          j        |j	        |j
                  | _        |rt          |          nd | _        |                                  d S ru  )rn   ro   rg   r   r}   r>  rv  r   r   rs   r   rw  rX  rx  ry  rz  s      r4   ro   zFlavaTextModel.__init__  s       -f55#F++f&8f>STTT->Hk&)))Dr6   r+   c                     | j         j        S rc   r}   r   r:   s    r4   r}  z#FlavaTextModel.get_input_embeddings  s    ..r6   r   c                     || j         _        d S rc   r  r  s     r4   r  z#FlavaTextModel.set_input_embeddings  s    */'''r6   r  Nc                     |                                 D ]/\  }}| j        j        |         j                            |           0dS r  r  r  s       r4   r  zFlavaTextModel._prune_heads  r  r6   batch_size, text_seq_lengthr  r  rn  r   r   r   r   r   r   rG  rH  c	                    ||n| j         j        }||n| j         j        }||n| j         j        }|t	          d          |                                }	|t          j        |	|j                  }| 	                    || j         j
                  }|                     ||	|j                  }
|                     |||          }|                     ||
||||          }|d         }|                     |          }| j        |                     |          nd }|s||f|dd          z   S t!          |||j        |j                  S )NzYou have to specify input_idsr   )r   r   r   r  r   r   r  )rg   r   rG  r  r   r   r?   onesr   r  rD  get_extended_attention_maskr}   rv  rw  rx  r   r   rL  )r3   r   r   r   r   r   r   rG  rH  r   extended_attention_maskr  r  r  r_  s                  r4   r   zFlavaTextModel.forward  s   " 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]<===nn&&!"ZI<LMMMN &&y$+2OPP	040P0PK)91
 1
  ??)% + 
 
 ,,2/!5# ' 
 
 *!,..998<8OO444UY 	J#]3oabb6III)-')7&1	
 
 
 	
r6   r  r  )r;   r<   r=   r    rn  ro  rO   ro   ru   r}  r   r  r  r   r   r   r  r   FLAVA_TEXT_INPUTS_DOCSTRINGr  r   r  r    _CONFIG_CLASS_FOR_TEXT_MODEL_DOCr   r?   r   r   r8   r   r   r   s   @r4   r  r    s       
 #L*
 
 
4 
 
 
 
 
 
/o / / / /0") 0 0 0 0C4T#Y+? CD C C C C +*+F+M+MNk+l+lmm&.5   -11515/3,0,0/3&*=
 =
EL)=
 !.=
 !.	=

 u|,=
 EL)=
 $D>=
 'tn=
 d^=
 
u00	1=
 =
 =
  nm=
 =
 =
 =
 =
r6   r  zjThe bare FLAVA Multimodal Model transformer outputting raw hidden-states without any specific head on top.c                   h    e Zd ZeZdZdZddef fdZdee	e
@add_start_docstrings(
    "The bare FLAVA Multimodal Model transformer outputting raw hidden-states without any specific head on top.",
    FLAVA_START_DOCSTRING.format(config="FlavaMultimodalConfig"),
)
class FlavaMultimodalModel(FlavaPreTrainedModel):
    config_class = FlavaMultimodalConfig
    base_model_prefix = "flava.multimodal_model"
    main_input_name = "hidden_states"

    def __init__(self, config: FlavaMultimodalConfig, add_pooling_layer=True):
        super().__init__(config)
        self.config = config
        self.use_cls_token = self.config.use_cls_token
        if self.use_cls_token:
            self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))

        self.encoder = FlavaEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = FlavaPooler(config) if add_pooling_layer else None

        self.post_init()

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(
        FLAVA_MULTIMODAL_INPUTS_DOCSTRING.format("batch_size, image_num_patches + text_seq_len")
    )
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPooling,
        config_class=_CONFIG_CLASS_FOR_MULTIMODAL_MODEL_DOC,
    )
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size, seq_length, _ = hidden_states.size()

        if self.use_cls_token:
            cls_tokens = self.cls_token.expand(batch_size, -1, -1)
            hidden_states = torch.cat((cls_tokens, hidden_states), dim=1)
            seq_length += 1

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=hidden_states.device)

        # Prepare head mask if needed; 1.0 in head_mask indicates we keep the head.
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
            attention_mask, (batch_size, seq_length), hidden_states.device
        )

        encoder_outputs = self.encoder(
            hidden_states,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
@add_start_docstrings(
    "The bare FLAVA Model transformer outputting raw hidden-states without any specific head on top.",
    FLAVA_START_DOCSTRING.format(config="FlavaConfig"),
)
class FlavaModel(FlavaPreTrainedModel):
    config_class = FlavaConfig

    def __init__(self, config: FlavaConfig):
        super().__init__(config)

        if not isinstance(config.text_config, FlavaTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type FlavaTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.image_config, FlavaImageConfig):
            raise TypeError(
                "config.image_config is expected to be of type FlavaImageConfig but is of type"
                f" {type(config.image_config)}."
            )

        if not isinstance(config.multimodal_config, FlavaMultimodalConfig):
            raise TypeError(
                "config.multimodal_config is expected to be of type FlavaMultimodalConfig but "
                + f"is of type {type(config.multimodal_config)}."
            )

        text_config = config.text_config
        image_config = config.image_config
        multimodal_config = config.multimodal_config

        self.projection_dim = config.projection_dim
        self.text_hidden_size = text_config.hidden_size
        self.image_hidden_size = image_config.hidden_size
        self.mm_hidden_size = multimodal_config.hidden_size

        self.text_model = FlavaTextModel(text_config)
        self.image_model = FlavaImageModel(image_config)
        self.multimodal_model = FlavaMultimodalModel(multimodal_config)

        self.image_projection = nn.Linear(self.image_hidden_size, self.projection_dim)
        self.text_projection = nn.Linear(self.text_hidden_size, self.projection_dim)
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        self.image_to_mm_projection = nn.Linear(self.image_hidden_size, self.mm_hidden_size)
        self.text_to_mm_projection = nn.Linear(self.text_hidden_size, self.mm_hidden_size)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(FLAVA_TEXT_INPUTS_DOCSTRING.format("batch_size, text_seq_length"))
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`FlavaTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoProcessor, FlavaModel

        >>> model = FlavaModel.from_pretrained("facebook/flava-full")
        >>> processor = AutoProcessor.from_pretrained("facebook/flava-full")

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], max_length=77, padding="max_length", return_tensors="pt"
        ... )
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = text_outputs[0]  # last_hidden_state
        text_features = self.text_projection(pooled_output)

        return text_features

    @add_start_docstrings_to_model_forward(FLAVA_IMAGE_INPUTS_DOCSTRING.format("batch_size, image_num_patches"))
    def get_image_features(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`FlavaImageModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, FlavaModel

        >>> model = FlavaModel.from_pretrained("facebook/flava-full")
        >>> processor = AutoProcessor.from_pretrained("facebook/flava-full")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        image_outputs = self.image_model(
            pixel_values=pixel_values,
            bool_masked_pos=bool_masked_pos,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        pooled_output = image_outputs[0]  # last_hidden_state
        image_features = self.image_projection(pooled_output)

        return image_features

    @add_start_docstrings_to_model_forward(
        FLAVA_MODEL_INPUTS_DOCSTRING.format("batch_size, image_num_patches + text_seq_len")
    )
    @replace_return_docstrings(output_type=FlavaModelOutput, config_class=FlavaConfig)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        image_attention_mask: Optional[torch.Tensor] = None,
        skip_multimodal_encoder: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: bool = True,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, FlavaModelOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, FlavaModel

        >>> model = FlavaModel.from_pretrained("facebook/flava-full")
        >>> processor = AutoProcessor.from_pretrained("facebook/flava-full")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=["a photo of a cat"], images=image, return_tensors="pt", padding=True)

        >>> outputs = model(**inputs)

        >>> image_embeddings = outputs.image_embeddings
        >>> text_embeddings = outputs.text_embeddings
        >>> multimodal_embeddings = outputs.multimodal_embeddings

        >>> outputs.image_embeddings.shape
        torch.Size([1, 197, 768])

        >>> text_embeddings.shape
        torch.Size([1, 7, 768])

        >>> multimodal_embeddings.shape
        torch.Size([1, 205, 768])
        ```
        """

        return_dict = return_dict if return_dict is not None else self.config.return_dict
        if not output_hidden_states:
            raise ValueError("FLAVA model requires hidden states to work. Please set `output_hidden_states=True`")

        image_embeddings = None
        image_states = None
        image_mm_projection = None
        image_output = None
        if pixel_values is not None:
            image_output = self.image_model(
                pixel_values=pixel_values,
                bool_masked_pos=bool_masked_pos,
                attention_mask=image_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            image_embeddings, image_states = image_output[0], image_output[2]
            # Note that these states don't use final layernorm in the transformer model
            image_mm_projection = self.image_to_mm_projection(image_states[-1])

        text_embeddings = None
        text_states = None
        text_mm_projection = None
        text_output = None
        if input_ids is not None:
            text_output = self.text_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                token_type_ids=token_type_ids,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            text_embeddings, text_states = text_output[0], text_output[2]
            # Note that these states don't use final layernorm in the transformer model
            text_mm_projection = self.text_to_mm_projection(text_states[-1])

        multimodal_embeddings = None
        multimodal_output = None
        if image_mm_projection is not None and text_mm_projection is not None and not skip_multimodal_encoder:
            if attention_mask is not None:
                batch_size, seq_len, _ = image_mm_projection.shape
                if self.multimodal_model.use_cls_token:
                    seq_len += 1
                attention_mask_image = torch.ones(batch_size, seq_len, device=image_mm_projection.device)
                attention_multimodal = torch.cat([attention_mask_image, attention_mask], dim=1)
            else:
                attention_multimodal = None
            multimodal_input = torch.cat([image_mm_projection, text_mm_projection], dim=1)
            multimodal_output = self.multimodal_model(
                multimodal_input, attention_mask=attention_multimodal, return_dict=return_dict
            )
            multimodal_embeddings = multimodal_output[0]

        if not return_dict:
            return (
                image_embeddings,
                image_output,
                text_embeddings,
                text_output,
                multimodal_embeddings,
                multimodal_output,
            )

        return FlavaModelOutput(
            image_embeddings=image_embeddings,
            image_output=image_output,
            text_embeddings=text_embeddings,
            text_output=text_output,
            multimodal_embeddings=multimodal_embeddings,
            multimodal_output=multimodal_output,
        )
class FlavaImageCodebookResPath(nn.Module):
    def __init__(self, in_size: int, out_size: int, **kwargs):
        super().__init__()
        hid_size = out_size // 4

        path = OrderedDict()
        path["relu_1"] = nn.ReLU()
        path["conv_1"] = nn.Conv2d(in_size, hid_size, kernel_size=3, padding=1)
        path["relu_2"] = nn.ReLU()
        path["conv_2"] = nn.Conv2d(hid_size, hid_size, kernel_size=3, padding=1)
        path["relu_3"] = nn.ReLU()
        path["conv_3"] = nn.Conv2d(hid_size, hid_size, kernel_size=3, padding=1)
        path["relu_4"] = nn.ReLU()
        path["conv_4"] = nn.Conv2d(hid_size, out_size, kernel_size=1, padding=0)

        self.path = nn.Sequential(path)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.path(x)


class FlavaImageCodebookBlock(nn.Module):
    def __init__(self, in_size: int, out_size: int, num_layers: int, **kwargs):
        super().__init__()
        self.post_gain = 1 / (num_layers**2)

        if in_size != out_size:
            self.id_path = nn.Conv2d(in_size, out_size, kernel_size=1, padding=0)
        else:
            self.id_path = nn.Identity()

        self.res_path = FlavaImageCodebookResPath(in_size, out_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.id_path(x) + self.post_gain * self.res_path(x)


class FlavaImageCodebookLayerGroup(nn.Module):
    def __init__(self, num_blocks: int, num_layers: int, in_size: int, out_size: int, use_pool: bool = True):
        super().__init__()
        blocks = OrderedDict()
        for i in range(num_blocks):
            if i == 0:
                blocks[f"block_{i + 1}"] = FlavaImageCodebookBlock(in_size, out_size, num_layers)
            else:
                blocks[f"block_{i + 1}"] = FlavaImageCodebookBlock(out_size, out_size, num_layers)

        if use_pool:
            blocks["pool"] = nn.MaxPool2d(kernel_size=2)

        self.group = nn.Sequential(blocks)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.group(x)


@add_start_docstrings(
    """
    The FLAVA's image codebook model inspired from DALL-E's original encoder. Outputs raw hidden states and can be used
    to generate image tokens for an image based on DALL-E's vocab. Used to generate labels for MIM. Use
    `get_codebook_indices` to get image tokens for an image.
    """,
    FLAVA_START_DOCSTRING.format(config="FlavaImageCodebookConfig"),
)
class FlavaImageCodebook(FlavaPreTrainedModel):
    base_model_prefix = ""
    config_class = FlavaImageCodebookConfig
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = False
    def __init__(self, config: FlavaImageCodebookConfig, **kwargs: Any):
        super().__init__(config)

        self.config = config
        self.num_groups = config.num_groups
        self.input_channels = config.input_channels
        self.num_blocks_per_group = config.num_blocks_per_group
        self.hidden_size = config.hidden_size
        self.vocab_size = config.vocab_size

        num_layers = self.num_groups * self.num_blocks_per_group

        output_blocks = OrderedDict()
        output_blocks["relu"] = nn.ReLU()
        output_blocks["conv"] = nn.Conv2d(8 * self.hidden_size, self.vocab_size, kernel_size=1, padding=0)

        blocks = OrderedDict()
        blocks["input"] = nn.Conv2d(self.input_channels, 1 * self.hidden_size, kernel_size=7, padding=3)
        blocks["group_1"] = FlavaImageCodebookLayerGroup(
            self.num_blocks_per_group, num_layers, 1 * self.hidden_size, 1 * self.hidden_size
        )
        blocks["group_2"] = FlavaImageCodebookLayerGroup(
            self.num_blocks_per_group, num_layers, 1 * self.hidden_size, 2 * self.hidden_size
        )
        blocks["group_3"] = FlavaImageCodebookLayerGroup(
            self.num_blocks_per_group, num_layers, 2 * self.hidden_size, 4 * self.hidden_size
        )
        blocks["group_4"] = FlavaImageCodebookLayerGroup(
            self.num_blocks_per_group, num_layers, 4 * self.hidden_size, 8 * self.hidden_size, use_pool=False
        )
        blocks["output"] = nn.Sequential(output_blocks)

        self.blocks = nn.Sequential(blocks)

        self.post_init()

        if self.config.freeze:
            for param in self.parameters():
                param.requires_grad = False

    def get_codebook_indices(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """
        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing
                `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.

        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoImageProcessor, FlavaImageCodebook

        >>> model = FlavaImageCodebook.from_pretrained("facebook/flava-image-codebook")
        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/flava-image-codebook")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
        >>> inputs = dict(pixel_values=inputs.codebook_pixel_values)

        >>> outputs = model.get_codebook_indices(**inputs)
        ```
        """
        z_logits = self.blocks(pixel_values)
        return torch.argmax(z_logits, axis=1)

    def get_codebook_probs(self, pixel_values: torch.Tensor) -> torch.Tensor:
        z_logits = self.blocks(pixel_values)
        return nn.Softmax(dim=1)(z_logits)

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        """
        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing
                `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoImageProcessor, FlavaImageCodebook

        >>> model = FlavaImageCodebook.from_pretrained("facebook/flava-image-codebook")
        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/flava-image-codebook")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
        >>> inputs = dict(pixel_values=inputs.codebook_pixel_values)

        >>> outputs = model(**inputs)
        >>> print(outputs.shape)
        (1, 196)
        ```
        """
        if len(pixel_values.shape) != 4:
            raise ValueError(f"input shape {pixel_values.shape} is not 4d")
        if pixel_values.shape[1] != self.input_channels:
            raise ValueError(f"input has {pixel_values.shape[1]} channels but model built for {self.input_channels}")
        return self.blocks(pixel_values)


class FlavaPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class FlavaMaskedPredictionHead(nn.Module):
    def __init__(self, config, weight=None):
        super().__init__()
        self.config = config
        self.transform = FlavaPredictionHeadTransform(config)
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        if weight is not None:
            self.decoder.weight = weight

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def _tie_weights(self):
        self.decoder.bias = self.bias

    def forward(self, x):
        x = self.transform(x)
        x = self.decoder(x)
        return x


class FlavaITMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.pooler = FlavaPooler(config)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, x):
        x = self.pooler(x)
        x = self.seq_relationship(x)
        return x


class FlavaGlobalContrastiveHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.global_backprop_contrastive = config.global_backprop_contrastive

    def forward(self, image_embeddings, text_embeddings, logit_scale):
        temperature = torch.exp(logit_scale)
        if not torch.distributed.is_available() or not torch.distributed.is_initialized():
            labels = torch.arange(image_embeddings.size(0), device=image_embeddings.device)
            image_embeddings_all = [image_embeddings]
            text_embeddings_all = [text_embeddings]
        else:
            local_batch_size = image_embeddings.size(0)
            world_size = torch.distributed.get_world_size()

            if self.global_backprop_contrastive:
                # `all_gather` with gradients flowing back to the local shard
                image_embeddings_all = torch.distributed.nn.functional.all_gather(image_embeddings)
                text_embeddings_all = torch.distributed.nn.functional.all_gather(text_embeddings)
            else:
                image_embeddings_all = [torch.zeros_like(text_embeddings) for _ in range(world_size)]
                text_embeddings_all = [torch.zeros_like(image_embeddings) for _ in range(world_size)]
                torch.distributed.all_gather(image_embeddings_all, image_embeddings)
                torch.distributed.all_gather(text_embeddings_all, text_embeddings)

            labels = local_batch_size * torch.distributed.get_rank() + torch.arange(
                local_batch_size, device=image_embeddings.device
            )

        image_embeddings_all = torch.cat(image_embeddings_all)
        text_embeddings_all = torch.cat(text_embeddings_all)

        logits_per_image = torch.matmul(image_embeddings, text_embeddings_all.transpose(0, 1)) * temperature
        logits_per_text = torch.matmul(text_embeddings, image_embeddings_all.transpose(0, 1)) * temperature

        return logits_per_image, logits_per_text, labels
@add_start_docstrings(
    """
    The FLAVA model for pretraining which outputs losses, embeddings, logits and transformer outputs.
    """,
    FLAVA_START_DOCSTRING.format(config="FlavaConfig") + FLAVA_PRETRAINING_START_DOCSTRING_EXTRA,
)
class FlavaForPreTraining(FlavaPreTrainedModel):
    # Those are linked to xxx.bias
    _tied_weights_keys = [
        "mmm_text_head.decoder.bias",
        "mmm_image_head.decoder.bias",
        "mlm_head.decoder.bias",
        "mim_head.decoder.bias",
    ]

    def __init__(self, config: FlavaConfig, image_codebook: Optional[nn.Module] = None):
        super().__init__(config)
        self.flava = FlavaModel(config)

        self.image_codebook = image_codebook
        if self.image_codebook is None and config.init_codebook:
            self.image_codebook = FlavaImageCodebook(config.image_codebook_config)

        # Leverage text and image encoder configs to create the masked
        # heads since they have the right vocab
        self.mim_head = FlavaMaskedPredictionHead(config.image_config)
        self.mlm_head = FlavaMaskedPredictionHead(config.text_config)
        self.itm_head = FlavaITMHead(config)
        self.mmm_image_head = FlavaMaskedPredictionHead(config.image_config)
        self.mmm_text_head = FlavaMaskedPredictionHead(config.text_config)
        self.global_contrastive_head = FlavaGlobalContrastiveHead(config)

        self.image_vocab_size = config.image_config.vocab_size
        self.text_vocab_size = config.text_config.vocab_size
        self.mlm_weight = config.mlm_weight
        self.mim_weight = config.mim_weight
        self.global_contrastive_weight = config.global_contrastive_weight
        self.ce_ignore_index = config.ce_ignore_index
        self.itm_weight = config.itm_weight
        self.mmm_image_weight = config.mmm_image_weight
        self.mmm_text_weight = config.mmm_text_weight
        self.skip_unmasked_multimodal_encoder = config.skip_unmasked_multimodal_encoder

        self.post_init()

    def _resize_to_2d(self, x: torch.Tensor):
        if x.dim() > 2:
            x = x.view(x.size(0), -1)
        return x

    @add_start_docstrings_to_model_forward(
        FLAVA_PRETRAINING_INPUTS_DOCSTRING.format("batch_size, text_seq_len", "batch_size, image_num_patches")
    )
    @replace_return_docstrings(output_type=FlavaForPreTrainingOutput, config_class=FlavaConfig)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        input_ids_masked: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        codebook_pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        image_attention_mask: Optional[torch.Tensor] = None,
        skip_unmasked_multimodal_encoder: bool = None,
        mlm_labels: Optional[torch.Tensor] = None,
        mim_labels: Optional[torch.Tensor] = None,
        itm_labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: bool = True,
        return_dict: Optional[bool] = None,
        return_loss: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], FlavaForPreTrainingOutput]:
        """
        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import FlavaForPreTraining, AutoProcessor

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> model = FlavaForPreTraining.from_pretrained("facebook/flava-full")
        >>> processor = AutoProcessor.from_pretrained("facebook/flava-full")

        >>> text = ["a photo of a cat"]

        >>> inputs = processor(
        ...     images=[image],
        ...     text=text,
        ...     return_masks=True,
        ...     return_codebook_pixels=True,
        ...     padding=True,
        ...     max_length=77,
        ...     return_tensors="pt",
        ... )


        >>> output = model(**inputs)
        ```

        Return:

        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        return_loss = return_loss if return_loss is not None else self.config.return_loss

        skip_unmasked_multimodal_encoder = (
            skip_unmasked_multimodal_encoder
            if skip_unmasked_multimodal_encoder is not None
            else self.skip_unmasked_multimodal_encoder
        )

        if input_ids_masked is None and input_ids is not None:
            logger.warning(
                "`input_ids_masked` isn't passed which means MLM loss won't be calculated correctly. Setting it to"
                " `input_ids` so that model can work. Please pass it if this is unintentional. This is usually OKAY if"
                " you are doing inference on unmasked text..."
            )
            input_ids_masked = input_ids

        flava_output = self.flava(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            image_attention_mask=image_attention_mask,
            # Don't need unmasked multimodal embedding for anything so skip it
            # NOTE: ITM uses masked version
            skip_multimodal_encoder=skip_unmasked_multimodal_encoder,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            # Pass true to have deterministic outputs
            return_dict=True,
        )

        flava_masked_output = self.flava(
            input_ids=input_ids_masked,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            image_attention_mask=image_attention_mask,
            bool_masked_pos=bool_masked_pos,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        pos_mask = None

        image_embeddings = flava_output.image_embeddings
        text_embeddings = flava_output.text_embeddings
        image_masked_embeddings = flava_masked_output.image_embeddings
        text_masked_embeddings = flava_masked_output.text_embeddings
        multimodal_masked_embeddings = flava_masked_output.multimodal_embeddings

        total_loss = mim_loss = mlm_loss = mmm_text_loss = mmm_image_loss = gc_loss = itm_loss = None
        mim_logits = mlm_logits = mmm_text_logits = mmm_image_logits = None
        itm_logits = logits_per_image = logits_per_text = None

        # Calculate mim_labels if necessary from the image_codebook
        if image_masked_embeddings is not None or multimodal_masked_embeddings is not None:
            if mim_labels is None and return_loss:
                if self.image_codebook is None:
                    raise RuntimeError(
                        "`return_loss` is set to True but the image codebook is not initialized and no `mim_labels` "
                        " have been passed. Reinstantiate the model with `init_codebook` set to True or "
                        "pass in your custom `mim_labels`"
                    )
                if codebook_pixel_values is None:
                    raise ValueError(
                        "`codebook_pixel_value` are required to generate `mim_labels` if loss is expected. "
                        "Call `AutoProcessor` with `return_codebook_pixels` set to True"
                    )
                mim_labels = self.image_codebook.get_codebook_indices(codebook_pixel_values)

        # Unimodal MIM Loss
        # If multimodal embeddings are present, we will calculate MMM loss instead
        if self.mim_weight > 0 and image_masked_embeddings is not None and multimodal_masked_embeddings is None:
            sequence_for_image = image_masked_embeddings

            if mim_labels is not None:
                mim_labels = self._resize_to_2d(mim_labels)
                bool_masked_pos = self._resize_to_2d(bool_masked_pos)
                mim_labels[bool_masked_pos.ne(True)] = self.ce_ignore_index

                sequence_for_image = sequence_for_image[:, -mim_labels.size(1) :, :]
                masked_tokens = mim_labels.ne(self.ce_ignore_index)
                mim_labels_filtered = mim_labels[masked_tokens]
                sequence_for_image = sequence_for_image[masked_tokens, :]
                mim_logits = self.mim_head(sequence_for_image)
                if return_loss:
                    mim_loss = nn.functional.cross_entropy(
                        mim_logits.view(-1, self.image_vocab_size), mim_labels_filtered.view(-1)
                    )
                    mim_loss *= self.mim_weight
            else:
                mim_logits = self.mim_head(sequence_for_image)

        # Unimodal MLM Loss
        if self.mlm_weight > 0 and text_masked_embeddings is not None and multimodal_masked_embeddings is None:
            sequence_for_text = text_masked_embeddings
            if mlm_labels is not None:
                mlm_labels = self._resize_to_2d(mlm_labels)
                sequence_for_text = sequence_for_text[:, -mlm_labels.size(1) :, :]
                masked_tokens = mlm_labels.ne(self.ce_ignore_index)
                mlm_labels_filtered = mlm_labels[masked_tokens]
                sequence_for_text = sequence_for_text[masked_tokens, :]
                mlm_logits = self.mlm_head(sequence_for_text)
                if return_loss:
                    mlm_loss = nn.functional.cross_entropy(
                        mlm_logits.view(-1, self.text_vocab_size), mlm_labels_filtered.view(-1)
                    )
                    mlm_loss *= self.mlm_weight
            else:
                mlm_logits = self.mlm_head(sequence_for_text)

        # ITM Loss
        if self.itm_weight > 0 and multimodal_masked_embeddings is not None:
            itm_logits = self.itm_head(multimodal_masked_embeddings)

            if itm_labels is not None:
                pos_pairs = itm_labels.ne(0)
                pos_mask = torch.where(pos_pairs.any(), pos_pairs, pos_pairs.new([True]))
                if return_loss:
                    itm_loss = nn.functional.cross_entropy(itm_logits, itm_labels)
                    itm_loss *= self.itm_weight

                if multimodal_masked_embeddings is not None:
                    multimodal_masked_embeddings = multimodal_masked_embeddings[pos_mask]

                if mlm_labels is not None:
                    mlm_labels = mlm_labels[pos_mask]

                if mim_labels is not None:
                    mim_labels = mim_labels[pos_mask]
                    bool_masked_pos = bool_masked_pos[pos_mask]

        # MMM Image Loss
        if multimodal_masked_embeddings is not None and self.mmm_image_weight > 0:
            sequence_for_image = multimodal_masked_embeddings
            end_index = image_masked_embeddings.size(1) - 1
            sequence_for_image = sequence_for_image[:, 2 : 2 + end_index, :]

            if mim_labels is not None:
                mim_labels = self._resize_to_2d(mim_labels)
                bool_masked_pos = self._resize_to_2d(bool_masked_pos)
                mim_labels[bool_masked_pos.ne(True)] = self.ce_ignore_index

                masked_tokens = mim_labels.ne(self.ce_ignore_index)
                mim_labels_filtered = mim_labels[masked_tokens]
                sequence_for_image = sequence_for_image[masked_tokens, :]
                mmm_image_logits = self.mmm_image_head(sequence_for_image)
                if return_loss:
                    mmm_image_loss = nn.functional.cross_entropy(
                        mmm_image_logits.view(-1, self.image_vocab_size), mim_labels_filtered.view(-1)
                    )
                    mmm_image_loss *= self.mmm_image_weight
            else:
                mmm_image_logits = self.mmm_image_head(sequence_for_image)

        # MMM Text Loss
        if multimodal_masked_embeddings is not None and self.mmm_text_weight > 0:
            sequence_for_text = multimodal_masked_embeddings
            sequence_for_text = sequence_for_text[:, -text_masked_embeddings.size(1) :, :]

            if mlm_labels is not None:
                mlm_labels = self._resize_to_2d(mlm_labels)
                masked_tokens = mlm_labels.ne(self.ce_ignore_index)
                mlm_labels_filtered = mlm_labels[masked_tokens]
                sequence_for_text = sequence_for_text[masked_tokens, :]
                mmm_text_logits = self.mmm_text_head(sequence_for_text)
                if return_loss:
                    mmm_text_loss = nn.functional.cross_entropy(
                        mmm_text_logits.view(-1, self.text_vocab_size), mlm_labels_filtered.view(-1)
                    )
                    mmm_text_loss *= self.mmm_text_weight
            else:
                mmm_text_logits = self.mmm_text_head(sequence_for_text)

        # Global Contrastive Loss
        if image_embeddings is not None and text_embeddings is not None and self.global_contrastive_weight > 0:
            text_embedding = self.flava.text_projection(text_embeddings[:, 0, :])
            text_embedding = nn.functional.normalize(text_embedding, dim=-1)

            image_embedding = self.flava.image_projection(image_embeddings[:, 0, :])
            image_embedding = nn.functional.normalize(image_embedding, dim=-1)

            self.flava.logit_scale.data.clamp_(LOGIT_SCALE_CLAMP_MIN, LOGIT_SCALE_CLAMP_MAX)

            logits_per_image, logits_per_text, gc_labels = self.global_contrastive_head(
                image_embedding, text_embedding, self.flava.logit_scale
            )

            # Apply ITM negative mask if any
            if pos_mask is not None:
                logits_per_image = logits_per_image[pos_mask]
                logits_per_text = logits_per_text[pos_mask]
                gc_labels = gc_labels[pos_mask]

            if return_loss:
                gc_loss_image = nn.functional.cross_entropy(logits_per_image, gc_labels)
                gc_loss_text = nn.functional.cross_entropy(logits_per_text, gc_labels)
                gc_loss = (gc_loss_image + gc_loss_text) / 2
                gc_loss *= self.global_contrastive_weight

        flava_losses = FlavaLosses(
            mim=mim_loss,
            mlm=mlm_loss,
            itm=itm_loss,
            global_contrastive=gc_loss,
            mmm_image=mmm_image_loss,
            mmm_text=mmm_text_loss,
        )

        if return_loss and not flava_losses.all_none():
            total_loss = sum(loss if loss is not None else 0 for loss in flava_losses.values())

        if not return_dict:
            output = (
                image_embeddings,
                flava_output.image_output.to_tuple() if flava_output.image_output is not None else None,
                text_embeddings,
                flava_output.text_output.to_tuple() if flava_output.text_output is not None else None,
                flava_output.multimodal_embeddings,
                flava_output.multimodal_output.to_tuple() if flava_output.multimodal_output is not None else None,
                image_masked_embeddings,
                flava_masked_output.image_output.to_tuple() if flava_masked_output.image_output is not None else None,
                text_masked_embeddings,
                flava_masked_output.text_output.to_tuple() if flava_masked_output.text_output is not None else None,
                multimodal_masked_embeddings,
                flava_masked_output.multimodal_output.to_tuple()
                if flava_masked_output.multimodal_output is not None
                else None,
                mim_logits,
                mlm_logits,
                itm_logits,
                logits_per_image,
                logits_per_text,
                mmm_image_logits,
                mmm_text_logits,
            )

            if return_loss and not flava_losses.all_none():
                output = (
                    total_loss,
                    flava_losses,
                ) + output

            # Filter None as transformers by default won't handle it
            return tuple(x for x in output if x is not None)

        return FlavaForPreTrainingOutput(
            loss=total_loss,
            loss_info=flava_losses,
            image_embeddings=image_embeddings,
            image_output=flava_output.image_output,
            text_embeddings=text_embeddings,
            text_output=flava_output.text_output,
            multimodal_embeddings=flava_output.multimodal_embeddings,
            multimodal_output=flava_output.multimodal_output,
            image_masked_embeddings=image_masked_embeddings,
            image_masked_output=flava_masked_output.image_output,
            text_masked_embeddings=text_masked_embeddings,
            text_masked_output=flava_masked_output.text_output,
            multimodal_masked_embeddings=multimodal_masked_embeddings,
            multimodal_masked_output=flava_masked_output.multimodal_output,
            mim_logits=mim_logits,
            mlm_logits=mlm_logits,
            itm_logits=itm_logits,
            contrastive_logits_per_image=logits_per_image,
            contrastive_logits_per_text=logits_per_text,
            mmm_image_logits=mmm_image_logits,
            mmm_text_logits=mmm_text_logits,
        )