
"""PyTorch ViLT model."""

import collections.abc
import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    MaskedLMOutput,
    ModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
from ...utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_vilt import ViltConfig


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "ViltConfig"
_CHECKPOINT_FOR_DOC = "dandelin/vilt-b32-mlm"


@dataclass
class ViltForImagesAndTextClassificationOutput(ModelOutput):
    """
    Class for outputs of [`ViltForImagesAndTextClassification`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`List[tuple(torch.FloatTensor)]`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            List of tuples of `torch.FloatTensor` (one for each image-text pair, each tuple containing the output of
            the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`List[tuple(torch.FloatTensor)]`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            List of tuples of `torch.FloatTensor` (one for each image-text pair, each tuple containing the attention
            weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the
            attention softmax, used to compute the weighted average in the self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[List[Tuple[torch.FloatTensor]]] = None
    attentions: Optional[List[Tuple[torch.FloatTensor]]] = None


class ViltEmbeddings(nn.Module):
    """
    Construct the text and patch embeddings.

    Text embeddings are equivalent to BERT embeddings.

    Patch embeddings are equivalent to ViT embeddings.
    """

    def __init__(self, config):
        super().__init__()

        # text embeddings
        self.text_embeddings = TextEmbeddings(config)
        # patch embeddings
        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.patch_embeddings = ViltPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        # modality type (text/patch) embeddings
        self.token_type_embeddings = nn.Embedding(config.modality_type_vocab_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.config = config

    def visual_embed(self, pixel_values, pixel_mask, max_image_length=200):
        _, _, ph, pw = self.patch_embeddings.projection.weight.shape

        x = self.patch_embeddings(pixel_values)
        x_mask = pixel_mask[:, None, :, :].float()
        x_mask = nn.functional.interpolate(x_mask, size=(x.shape[2], x.shape[3])).long()
        x_h = x_mask[:, 0].sum(dim=1)[:, 0]
        x_w = x_mask[:, 0].sum(dim=2)[:, 0]

        batch_size, num_channels, height, width = x.shape
        patch_dim = self.config.image_size // self.config.patch_size
        spatial_pos = self.position_embeddings[:, 1:, :].transpose(1, 2).view(1, num_channels, patch_dim, patch_dim)
        pos_embed = torch.cat(
            [
                nn.functional.pad(
                    nn.functional.interpolate(
                        spatial_pos,
                        size=(h, w),
                        mode="bilinear",
                        align_corners=True,
                    ),
                    (0, width - w, 0, height - h),
                )
                for h, w in zip(x_h, x_w)
            ],
            dim=0,
        )

        pos_embed = pos_embed.flatten(2).transpose(1, 2)
        x = x.flatten(2).transpose(1, 2)
        # set `device` here, otherwise `patch_index` will always be on `CPU` and will fail near the end for torch>=1.13
        patch_index = torch.stack(
            meshgrid(torch.arange(x_mask.shape[-2]), torch.arange(x_mask.shape[-1]), indexing="ij"), dim=-1
        ).to(device=x_mask.device)
        patch_index = patch_index[None, None, :, :, :]
        patch_index = patch_index.expand(x_mask.shape[0], x_mask.shape[1], -1, -1, -1)
        patch_index = patch_index.flatten(1, 3)
        x_mask = x_mask.flatten(1)

        if max_image_length < 0 or max_image_length is None or not isinstance(max_image_length, int):
            # suppose aug is 800 x 1333, then, maximum effective res is 800 x 1333 (if one side gets bigger, the other will be constrained and be shrinked)
            # (800 // self.patch_size) * (1333 // self.patch_size) is the maximum number of patches that a single image can get.
            # if self.patch_size = 32, 25 * 41 = 1025
            # if res is 384 x 640, 12 * 20 = 240
            effective_resolution = x_h * x_w
            max_image_length = effective_resolution.max()
        else:
            effective_resolution = x_h * x_w
            max_image_length = min(effective_resolution.max(), max_image_length)

        valid_idx = x_mask.nonzero(as_tuple=False)
        non_valid_idx = (1 - x_mask).nonzero(as_tuple=False)
        unique_rows = valid_idx[:, 0].unique()
        valid_row_idx = [valid_idx[valid_idx[:, 0] == u] for u in unique_rows]
        non_valid_row_idx = [non_valid_idx[non_valid_idx[:, 0] == u] for u in unique_rows]

        valid_nums = [v.size(0) for v in valid_row_idx]
        non_valid_nums = [v.size(0) for v in non_valid_row_idx]
        pad_nums = [max_image_length - v for v in valid_nums]

        select = []
        for i, (v, nv, p) in enumerate(zip(valid_nums, non_valid_nums, pad_nums)):
            if p <= 0:
                # the image has more valid patches than max_image_length: subsample them
                valid_choice = torch.multinomial(torch.ones(v).float(), max_image_length)
                select.append(valid_row_idx[i][valid_choice])
            else:
                # pad with (repeated) non-valid patch positions up to max_image_length
                pad_choice = torch.multinomial(torch.ones(nv).float(), p, replacement=True)
                select.append(torch.cat([valid_row_idx[i], non_valid_row_idx[i][pad_choice]], dim=0))

        select = torch.cat(select, dim=0)
        x = x[select[:, 0], select[:, 1]].view(batch_size, -1, num_channels)
        x_mask = x_mask[select[:, 0], select[:, 1]].view(batch_size, -1)
        patch_index = patch_index[select[:, 0], select[:, 1]].view(batch_size, -1, 2)
        pos_embed = pos_embed[select[:, 0], select[:, 1]].view(batch_size, -1, num_channels)

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        pos_embed = torch.cat(
            (self.position_embeddings[:, 0, :][:, None, :].expand(batch_size, -1, -1), pos_embed), dim=1
        )
        x = x + pos_embed
        x = self.dropout(x)

        x_mask = torch.cat([torch.ones(x_mask.shape[0], 1).to(x_mask), x_mask], dim=1)

        return x, x_mask, (patch_index, (height, width))

    def forward(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        pixel_values,
        pixel_mask,
        inputs_embeds,
        image_embeds,
        image_token_type_idx=1,
    ):
        # PART 1: text embeddings
        text_embeds = self.text_embeddings(
            input_ids=input_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
        )

        # PART 2: patch embeddings (with interpolated position encodings)
        if image_embeds is None:
            image_embeds, image_masks, patch_index = self.visual_embed(
                pixel_values, pixel_mask, max_image_length=self.config.max_image_length
            )
        else:
            image_masks = pixel_mask.flatten(1)

        # PART 3: add modality type embeddings
        # 0 indicates text, 1 indicates image, 2 is optionally used when a second image is provided (NLVR2)
        if image_token_type_idx is None:
            image_token_type_idx = 1
        text_embeds = text_embeds + self.token_type_embeddings(
            torch.zeros_like(attention_mask, dtype=torch.long, device=text_embeds.device)
        )
        image_embeds = image_embeds + self.token_type_embeddings(
            torch.full_like(image_masks, image_token_type_idx, dtype=torch.long, device=text_embeds.device)
        )

        # PART 4: concatenate text and image embeddings into a single sequence
        embeddings = torch.cat([text_embeds, image_embeds], dim=1)
        masks = torch.cat([attention_mask, image_masks], dim=1)

        return embeddings, masks


class TextEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        # Setting the token_type_ids to the registered buffer (all zeros) when it is not passed helps users
        # trace the model without providing token_type_ids.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class ViltPatchEmbeddings(nn.Module):
    """
    Image to Patch Embedding.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values):
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        target_dtype = self.projection.weight.dtype
        x = self.projection(pixel_values.to(dtype=target_dtype))
        return x


class ViltSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the model forward)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class ViltSelfOutput(nn.Module):
    """
    The residual connection is defined in ViltLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: ViltConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class ViltAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention = ViltSelfAttention(config)
        self.output = ViltSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
        self_outputs = self.attention(hidden_states, attention_mask, head_mask, output_attentions)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class ViltIntermediate(nn.Module):
    def __init__(self, config: ViltConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class ViltOutput(nn.Module):
    def __init__(self, config: ViltConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states


class ViltLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = ViltAttention(config)
        self.intermediate = ViltIntermediate(config)
        self.output = ViltOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in ViLT, layernorm is applied before self-attention
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection
        hidden_states = attention_output + hidden_states.to(attention_output.device)

        # in ViLT, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        outputs = (layer_output,) + outputs

        return outputs


class ViltEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([ViltLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class ViltPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = ViltConfig
    base_model_prefix = "vilt"
    supports_gradient_checkpointing = True
    _no_split_modules = ["ViltEmbeddings", "ViltSelfAttention"]

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


VILT_START_DOCSTRING = r"""
    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ subclass. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`ViltConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

VILT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)

        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)

        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)

        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`ViltImageProcessor.__call__`] for details.

        pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:

            - 1 for pixels that are real (i.e. **not masked**),
            - 0 for pixels that are padding (i.e. **masked**).
            `What are attention masks? <../glossary.html#attention-mask>`__

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.

        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

VILT_IMAGES_AND_TEXT_CLASSIFICATION_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)

        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)

        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)

        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_images, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`ViltImageProcessor.__call__`] for details.

        pixel_mask (`torch.LongTensor` of shape `(batch_size, num_images, height, width)`, *optional*):
            Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:

            - 1 for pixels that are real (i.e. **not masked**),
            - 0 for pixels that are padding (i.e. **masked**).
            `What are attention masks? <../glossary.html#attention-mask>`__

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.

        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_images, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare ViLT Model transformer outputting raw hidden-states without any specific head on top.",
    VILT_START_DOCSTRING,
)
class ViltModel(ViltPreTrainedModel):
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        self.embeddings = ViltEmbeddings(config)
        self.encoder = ViltEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = ViltPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.text_embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.text_embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(VILT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        image_token_type_idx: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[BaseModelOutputWithPooling, Tuple[torch.FloatTensor]]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltModel
        >>> from PIL import Image
        >>> import requests

        >>> # prepare image and text
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "hello world"

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
        >>> model = ViltModel.from_pretrained("dandelin/vilt-b32-mlm")

        >>> inputs = processor(image, text, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
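        >>> # Note (illustrative): the encoder runs over the concatenated [text tokens; image patches]
        >>> # sequence, so `last_hidden_states` covers both modalities and `outputs.pooler_output`
        >>> # is derived from the first text token ([CLS]).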
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        text_batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((text_batch_size, seq_length), device=device)

        if pixel_values is not None and image_embeds is not None:
            raise ValueError("You cannot specify both pixel_values and image_embeds at the same time")
        elif pixel_values is None and image_embeds is None:
            raise ValueError("You have to specify either pixel_values or image_embeds")

        image_batch_size = pixel_values.shape[0] if pixel_values is not None else image_embeds.shape[0]
        if image_batch_size != text_batch_size:
            raise ValueError("The text inputs and image inputs need to have the same batch size")
        if pixel_mask is None:
            pixel_mask = torch.ones((image_batch_size, self.config.image_size, self.config.image_size), device=device)

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output, attention_mask = self.embeddings(
            input_ids,
            attention_mask,
            token_type_ids,
            pixel_values,
            pixel_mask,
            inputs_embeds,
            image_embeds,
            image_token_type_idx=image_token_type_idx,
        )

        # We can provide a self-attention mask of dimensions
        # [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class ViltPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # "Pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@add_start_docstrings(
    """
    ViLT Model with a language modeling head on top as done during pretraining.
    """,
    VILT_START_DOCSTRING,
)
class ViltForMaskedLM(ViltPreTrainedModel):
    _tied_weights_keys = ["mlm_score.decoder.weight", "mlm_score.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        self.vilt = ViltModel(config)
        self.mlm_score = ViltMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.mlm_score.decoder

    def set_output_embeddings(self, new_embeddings):
        self.mlm_score.decoder = new_embeddings
        self.mlm_score.bias = new_embeddings.bias

    @add_start_docstrings_to_model_forward(VILT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MaskedLMOutput, Tuple[torch.FloatTensor]]:
        r"""
        labels (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in *[-100, 0, ...,
            config.vocab_size]* (see *input_ids* docstring) Tokens with indices set to *-100* are ignored (masked), the
            loss is only computed for the tokens with labels in *[0, ..., config.vocab_size]*

        Returns:

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForMaskedLM
        >>> import requests
        >>> from PIL import Image
        >>> import re
        >>> import torch

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "a bunch of [MASK] laying on a [MASK]."

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
        >>> model = ViltForMaskedLM.from_pretrained("dandelin/vilt-b32-mlm")

        >>> # prepare inputs
        >>> encoding = processor(image, text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**encoding)

        >>> tl = len(re.findall("\[MASK\]", text))
        >>> inferred_token = [text]

        >>> # gradually fill in the MASK tokens, one by one
        >>> with torch.no_grad():
        ...     for i in range(tl):
        ...         encoded = processor.tokenizer(inferred_token)
        ...         input_ids = torch.tensor(encoded.input_ids)
        ...         encoded = encoded["input_ids"][0][1:-1]
        ...         outputs = model(input_ids=input_ids, pixel_values=encoding.pixel_values)
        ...         mlm_logits = outputs.logits[0]  # shape (seq_len, vocab_size)
        ...         # only take into account text features (minus CLS and SEP token)
        ...         mlm_logits = mlm_logits[1 : input_ids.shape[1] - 1, :]
        ...         mlm_values, mlm_ids = mlm_logits.softmax(dim=-1).max(dim=-1)
        ...         # only take into account text
        ...         mlm_values[torch.tensor(encoded) != 103] = 0
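        ...         # Note (illustrative): 103 is the [MASK] token id of the BERT-based tokenizer used by
        ...         # this checkpoint, so only still-masked positions are kept as fill-in candidates.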
        ...         select = mlm_values.argmax().item()
        ...         encoded[select] = mlm_ids[select].item()
        ...         inferred_token = [processor.decode(encoded)]

        >>> selected_token = ""
        >>> encoded = processor.tokenizer(inferred_token)
        >>> output = processor.decode(encoded.input_ids[0], skip_special_tokens=True)
        >>> print(output)
        a bunch of cats laying on a couch.
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        # split up final hidden states into text and image features
        text_seq_len = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
        text_features, _ = (sequence_output[:, :text_seq_len], sequence_output[:, text_seq_len:])

        mlm_logits = self.mlm_score(text_features)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            # move labels to the correct device to enable PP
            labels = labels.to(mlm_logits.device)
            masked_lm_loss = loss_fct(mlm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (mlm_logits,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=mlm_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class ViltPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class ViltMLMHead(nn.Module):
    def __init__(self, config, weight=None):
        super().__init__()
        self.config = config
        self.transform = ViltPredictionHeadTransform(config)
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        if weight is not None:
            self.decoder.weight = weight

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def _tie_weights(self):
        self.decoder.bias = self.bias

    def forward(self, x):
        x = self.transform(x)
        x = self.decoder(x)
        return x


@add_start_docstrings(
    """
    Vilt Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the [CLS]
    token) for visual question answering, e.g. for VQAv2.
    """,
    VILT_START_DOCSTRING,
)
class ViltForQuestionAnswering(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vilt = ViltModel(config)

        # Classifier head
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size * 2),
            nn.LayerNorm(config.hidden_size * 2),
            nn.GELU(),
            nn.Linear(config.hidden_size * 2, config.num_labels),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(VILT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.FloatTensor` of shape `(batch_size, num_labels)`, *optional*):
            Labels for computing the visual question answering loss. This tensor must be either a one-hot encoding of
            all answers that are applicable for a given example in the batch, or a soft encoding indicating which
            answers are applicable, where 1.0 is the highest score.

        Returns:

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForQuestionAnswering
        >>> import requests
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "How many cats are there?"

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
        >>> model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

        >>> # prepare inputs
        >>> encoding = processor(image, text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**encoding)
        >>> logits = outputs.logits
        >>> idx = logits.argmax(-1).item()
        >>> print("Predicted answer:", model.config.id2label[idx])
        Predicted answer: 2
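        >>> # Note (illustrative): VQAv2 is treated as multi-label classification here; each of the
        >>> # `config.num_labels` answers is scored independently (sigmoid + binary cross-entropy),
        >>> # so the predicted answer is simply the argmax over the logits.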
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooler_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(pooler_output)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable PP
            labels = labels.to(logits.device)
            loss = nn.functional.binary_cross_entropy_with_logits(logits, labels) * labels.shape[1]
            # see https://github.com/jnhwkim/ban-vqa/blob/master/train.py#L19

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Vilt Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the [CLS]
    token) for image-to-text or text-to-image retrieval, e.g. MSCOCO and F30K.
    """,
    VILT_START_DOCSTRING,
)
class ViltForImageAndTextRetrieval(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.vilt = ViltModel(config)

        # Classifier head
        self.rank_output = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(VILT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels are currently not supported.
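
            Since training is not supported, this head is used at inference time to score image-text pairs. For
            illustration only (this note is not part of the original docstring), the raw pair scores can be
            compared across candidate texts, e.g. by taking a softmax over them:

            ```python
            >>> import torch

            >>> pair_scores = [2.1, -0.4]  # hypothetical logits, one per candidate text
            >>> probs = torch.tensor(pair_scores).softmax(dim=0)
            ```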

        Returns:

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForImageAndTextRetrieval
        >>> import requests
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"]

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-coco")
        >>> model = ViltForImageAndTextRetrieval.from_pretrained("dandelin/vilt-b32-finetuned-coco")

        >>> # forward pass
        >>> scores = dict()
        >>> for text in texts:
        ...     # prepare inputs
        ...     encoding = processor(image, text, return_tensors="pt")
        ...     outputs = model(**encoding)
        ...     scores[text] = outputs.logits[0, :].item()
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        loss = None
        if labels is not None:
            # training of the ranking head is not implemented
            raise NotImplementedError("Training is not yet supported.")

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooler_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.rank_output(pooler_output)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@add_start_docstrings(
    """
    Vilt Model transformer with a classifier head on top for natural language visual reasoning, e.g. NLVR2.
    """,
    VILT_START_DOCSTRING,
)
class ViltForImagesAndTextClassification(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vilt = ViltModel(config)

        # Classifier head: the pooled outputs of all images are concatenated before classification
        num_images = config.num_images
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size * num_images, config.hidden_size * num_images),
            nn.LayerNorm(config.hidden_size * num_images),
            nn.GELU(),
            nn.Linear(config.hidden_size * num_images, config.num_labels),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(VILT_IMAGES_AND_TEXT_CLASSIFICATION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ViltForImagesAndTextClassificationOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[ViltForImagesAndTextClassificationOutput, Tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Binary classification labels.
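
            The model compares a *pair* of images against the text, so `pixel_values` is expected to carry an
            extra `num_images` dimension, i.e. `(batch_size, num_images, num_channels, height, width)`. For
            illustration only (this note is not part of the original docstring), a batch with two preprocessed
            images per example could be assembled like this, assuming hypothetical 384x384 inputs:

            ```python
            >>> import torch

            >>> image1_values = torch.randn(1, 3, 384, 384)  # hypothetical, already preprocessed
            >>> image2_values = torch.randn(1, 3, 384, 384)
            >>> pixel_values = torch.stack([image1_values, image2_values], dim=1)  # (1, 2, 3, 384, 384)
            ```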

        Returns:

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForImagesAndTextClassification
        >>> import requests
        >>> from PIL import Image

        >>> image1 = Image.open(requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg", stream=True).raw)
        >>> image2 = Image.open(requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_1.jpg", stream=True).raw)
        >>> text = "The left image contains twice the number of dogs as the right image."

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")
        >>> model = ViltForImagesAndTextClassification.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")

        >>> # prepare inputs
        >>> encoding = processor([image1, image2], text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(input_ids=encoding.input_ids, pixel_values=encoding.pixel_values.unsqueeze(0))
        >>> logits = outputs.logits
        >>> idx = logits.argmax(-1).item()
        >>> print("Predicted answer:", model.config.id2label[idx])
        Predicted answer: True
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is not None and pixel_values.ndim == 4:
            # add dummy num_images dimension
            pixel_values = pixel_values.unsqueeze(1)

        if image_embeds is not None and image_embeds.ndim == 3:
            # add dummy num_images dimension
            image_embeds = image_embeds.unsqueeze(1)

        num_images = pixel_values.shape[1] if pixel_values is not None else None
        if num_images is None:
            num_images = image_embeds.shape[1] if image_embeds is not None else None
        if num_images != self.config.num_images:
            raise ValueError(
                "Make sure to match the number of images in the model with the number of images in the input."
            )
        pooler_outputs = []
        hidden_states = [] if output_hidden_states else None
        attentions = [] if output_attentions else None
        for i in range(num_images):
            # forward every image through the model
            outputs = self.vilt(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                pixel_values=pixel_values[:, i, :, :, :] if pixel_values is not None else None,
                pixel_mask=pixel_mask[:, i, :, :] if pixel_mask is not None else None,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                image_embeds=image_embeds[:, i, :, :] if image_embeds is not None else None,
                image_token_type_idx=i + 1,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            pooler_output = outputs.pooler_output if return_dict else outputs[1]
            pooler_outputs.append(pooler_output)
            if output_hidden_states:
                hidden_states.append(outputs.hidden_states)
            if output_attentions:
                attentions.append(outputs.attentions)

        pooled_output = torch.cat(pooler_outputs, dim=-1)

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            labels = labels.to(logits.device)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits, hidden_states, attentions)
            return ((loss,) + output) if loss is not None else output

        return ViltForImagesAndTextClassificationOutput(
            loss=loss,
            logits=logits,
            hidden_states=hidden_states,
            attentions=attentions,
        )


@add_start_docstrings(
    """
    ViLT Model with a token classification head on top (a linear layer on top of the final hidden-states of the text
    tokens) e.g. for Named-Entity-Recognition (NER) tasks.
    """,
    VILT_START_DOCSTRING,
)
class ViltForTokenClassification(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vilt = ViltModel(config, add_pooling_layer=False)

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(VILT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[TokenClassifierOutput, Tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
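
            The classification head is only applied to the text part of the fused sequence, so `labels` should
            line up with `input_ids`. For illustration only (this note is not part of the original docstring),
            assuming an `encoding` produced by `ViltProcessor`, a dummy label tensor could look like this:

            ```python
            >>> import torch

            >>> num_text_tokens = encoding.input_ids.shape[1]
            >>> labels = torch.zeros(1, num_text_tokens, dtype=torch.long)  # hypothetical all-"O" tags
            ```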

        Returns:
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        text_input_size = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        sequence_output = self.dropout(sequence_output)
        # only classify the text tokens of the fused text-image sequence
        logits = self.classifier(sequence_output[:, :text_input_size])

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            labels = labels.to(logits.device)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )