
"""PyTorch Idefics2 model."""

import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, ModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
)
from ..auto import AutoModel
from .configuration_idefics2 import Idefics2Config, Idefics2PerceiverConfig, Idefics2VisionConfig


if is_flash_attn_2_available():
    from ...modeling_flash_attention_utils import _flash_attention_forward


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "Idefics2Config"


@dataclass
class Idefics2BaseModelOutputWithPast(ModelOutput):
    """
    Base class for Idefics2 model's outputs that may also contain a past key/values (to speed up sequential decoding).
    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.
            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings) of shape `(batch_size, num_images,
            sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver resampler.
    Nlast_hidden_statepast_key_valueshidden_states
attentionsimage_hidden_states)__name__
__module____qualname____doc__r"   torchFloatTensor__annotations__r#   r   r   r$   r%   r&        j/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/idefics2/modeling_idefics2.pyr!   r!   5   s          < ,0u(///AEOXeE%*;$<=>EEE8<M8E%"345<<<59Ju012999>B%(9":;BBBBBr/   r!   c                      e Zd ZU dZdZeej                 ed<   dZ	ej        ed<   dZ
eeej                          ed<   dZeeej                          ed<   dZeeej                          ed<   dZeeej                          ed<   dS )	Idefics2CausalLMOutputWithPasta  
    Base class for Idefics2 causal language model (or autoregressive) outputs.
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings) of shape `(batch_size, num_images,
            sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver resampler.
    Nlosslogitsr#   r$   r%   r&   )r'   r(   r)   r*   r3   r   r+   r,   r-   r4   r#   r   r$   r   r%   r&   r.   r/   r0   r2   r2   \   s          6 )-D(5$
%,,, $FE$$$9=OXd5#456===8<M8E%"345<<<59Ju012999>B%(9":;BBBBBr/   r2   c                   Z     e Zd ZdZdef fdZdej        dej        dej	        fdZ
 xZS )Idefics2VisionEmbeddingsaP  
    This is a modified version of `siglip.modeling_siglip.SiglipVisionEmbeddings` to enable images of variable
    resolution.

    The modifications are adapted from [Patch n' Pack: NaViT, a Vision Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304)
    which allows treating images in their native aspect ratio and without the need to resize them to the same
    fixed size. In particular, we start from the original pre-trained SigLIP model
    (which uses fixed-size square images) and adapt it by training on images of variable resolutions.
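
    Example (an illustrative sketch, not taken from the original file; `image_size=980` and `patch_size=14` are
    assumed values rather than ones read from a released checkpoint):

    ```python
    config = Idefics2VisionConfig(image_size=980, patch_size=14)
    embeddings_module = Idefics2VisionEmbeddings(config)

    pixel_values = torch.rand(1, config.num_channels, 980, 980)
    # One boolean per 14x14 patch; mark the patches that contain real (non-padding) pixels.
    patch_attention_mask = torch.ones(1, 70, 70, dtype=torch.bool)

    patch_embeddings = embeddings_module(pixel_values, patch_attention_mask)
    # patch_embeddings has shape (1, 70 * 70, config.hidden_size)
    ```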
    """

    def __init__(self, config: Idefics2VisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )

        self.num_patches_per_side = self.image_size // self.patch_size
        self.num_patches = self.num_patches_per_side**2
        self.num_positions = self.num_patches
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)

    def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor) -> torch.Tensor:
        batch_size, _, max_im_h, max_im_w = pixel_values.shape

        patch_embeds = self.patch_embedding(pixel_values)
        embeddings = patch_embeds.flatten(2).transpose(1, 2)

        max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size
        boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side)
        position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0)

        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
            nb_patches_h = p_attn_mask[:, 0].sum()
            nb_patches_w = p_attn_mask[0].sum()

            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)

            bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
            bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)

            pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
            position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids

        position_ids = position_ids.to(self.position_embedding.weight.device)
        embeddings = embeddings + self.position_embedding(position_ids)
        return embeddings


class Idefics2VisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

        self.is_causal = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)

        k_v_seq_len = key_states.shape[-2]
        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale

        if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights


class Idefics2VisionFlashAttention2(Idefics2VisionAttention):
    """
    Idefics2Vision flash attention module. This module inherits from `Idefics2VisionAttention` as the weights of the
    module stay untouched. The only required change is in the forward pass, which needs to correctly call the public
    API of flash attention and handle padding tokens if the input contains any.
    """
    c                 b     t                      j        |i | t                       | _        d S Nr?   r@   r   _flash_attn_uses_top_left_maskrN   argskwargsrO   s      r0   r@   z&Idefics2VisionFlashAttention2.__init__  9    $)&)))
 3V2W2W.W+++r/   NFr$   r   rn   past_key_valuer   	use_cacherR   c           
         d}|                                 \  }}	}
|                     |          }|                     |          }|                     |          }|                    ||	| j        | j                  }|                    ||	| j        | j                                      dd          }|                    ||	| j        | j                                      dd          }|j        d         }|||	                    || j
                  z  }|                    dd          }|                    dd          }| j        r| j        nd}|j        }|t          j        k    rt          j                    rt          j                    }n3t%          | j        d          r| j        j        }n| j        j        j        }t,                              d| d           |                    |          }|                    |          }|                    |          }t3          |||||	|| j        | j        	          }|                    ||	| j                                                  }|                     |          }|sd }||fS )
NFr   r   r           _pre_quantization_dtypeThe input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in .)r   r   use_top_left_mask) rT   r   r   r   r`   r   r   rZ   rX   get_usable_length	layer_idxr   r   r   r+   r   is_autocast_enabledget_autocast_gpu_dtypehasattrr7   r   rc   loggerwarning_oncerb   r   r   r   r   rB   r   r   )rN   r$   r   rn   r   r   r   r   bszr   rf   r   r   r   
kv_seq_lendropout_rateinput_dtypetarget_dtyper   r   s                       r0   rx   z%Idefics2VisionFlashAttention2.forward  s    "%**,,UA{{=11[[//
{{=11
 $((eT^T]SS__S%OOYYZ[]^__
#((eT^T]SS]]^_abcc%b)
%.:::t~VVVJ  ))!Q//
#--a33'+}=t||# #(%-''(** 8$;==&?@@ 8#{B#{17$ $ $ $   (??<88L#|44J'??<88L. n"A	
 	
 	
 "))#udnEEPPRRmmK00  	 LL((r/   NNNFFr'   r(   r)   r*   r@   r+   rz   r   
LongTensorr   r   r   rx   r{   r|   s   @r0   r   r     s         X X X X X 6:37*."'O) O)|O) !!12O) u/0	O)
 !O)  O) O) 
u|Xel3XeEL>Q5RR	SO) O) O) O) O) O) O) O)r/   r   )eagerflash_attention_2c                   B     e Zd Z fdZdej        dej        fdZ xZS )Idefics2VisionMLPc                    t                                                       || _        t          |j                 | _        t          j        |j        |j	                  | _
        t          j        |j	        |j                  | _        d S r   )r?   r@   r7   r   
hidden_actactivation_fnr   r   rA   intermediate_sizefc1fc2rM   s     r0   r@   zIdefics2VisionMLP.__init__v  sf    #F$569V/1IJJ9V5v7IJJr/   r$   rR   c                     |                      |          }|                     |          }|                     |          }|S r   )r   r   r   )rN   r$   s     r0   rx   zIdefics2VisionMLP.forward}  s=    //**=99//r/   )r'   r(   r)   r@   r+   rz   rx   r{   r|   s   @r0   r   r   u  sc        K K K K KU\ el        r/   r   c                   6     e Zd Zdedededef fdZd Z xZS )Idefics2MLPrA   r   output_sizer   c                    t                                                       t          j        ||d          | _        t          j        ||d          | _        t          j        ||d          | _        t          |         | _        d S NFbias)	r?   r@   r   r   	gate_projup_proj	down_projr   act_fn)rN   rA   r   r   r   rO   s        r0   r@   zIdefics2MLP.__init__  sv     	;0ANNNy.?eLLL#4kNNNZ(r/   c                     |                      |                     |                     |                    |                     |          z            S r   )r   r   r   r   )rN   xs     r0   rx   zIdefics2MLP.forward  s;    ~~dkk$..*;*;<<t||ANOOOr/   )r'   r(   r)   intstrr@   rx   r{   r|   s   @r0   r   r     sy        )) ) 	)
 ) ) ) ) ) )P P P P P P Pr/   r   c                   .     e Zd ZdZdef fdZd Z xZS )%Idefics2MultiheadAttentionPoolingHeadzMultihead Attention Pooling.r7   c                    t                                                       t          j        t	          j        dd|j                            | _        t          j                            |j        |j	        d          | _
        t          j        |j        |j                  | _        t          |j        |j        |j        |j                  | _        d S )Nr   T)batch_firsteps)rA   r   r   r   )r?   r@   r   	Parameterr+   randnrA   probeMultiheadAttentionr   	attention	LayerNormlayer_norm_eps	layernormr   r   r   mlprM   s     r0   r@   z.Idefics2MultiheadAttentionPoolingHead.__init__  s    \%+aF4F"G"GHH
44V5GIcqu4vvf&8f>STTT*$6(*	
 
 
r/   c                    |j         d         }| j                            |dd          }|                     |||          d         }|}|                     |          }||                     |          z   }|d d df         S )Nr   r   )rX   r   repeatr   r   r   )rN   hidden_statere   r   residuals        r0   rx   z-Idefics2MultiheadAttentionPoolingHead.forward  s    !'*

!!*a33~~e\<HHK~~l33$((<"8"88AAAqD!!r/   )r'   r(   r)   r*   r   r@   rx   r{   r|   s   @r0   r   r     sZ        &&
3 
 
 
 
 
 

" 
" 
" 
" 
" 
" 
"r/   r   c            
       v     e Zd Zdef fdZ	 d	dej        dej        dee         de	ej
                 fdZ xZS )
Idefics2EncoderLayerr7   c                 Z   t                                                       |j        | _        t	          |j                 |          | _        t          j        | j        |j	                  | _
        t          |          | _        t          j        | j        |j	                  | _        d S )Nr   )r?   r@   rA   rB    IDEFICS_VISION_ATTENTION_CLASSES_attn_implementation	self_attnr   r   r   layer_norm1r   r   layer_norm2rM   s     r0   r@   zIdefics2EncoderLayer.__init__  s    +9&:UVW]^^<F<QRRR$V,,<F<QRRRr/   Fr$   r   r   rR   c                     |}|                      |          }|                     |||          \  }}||z   }|}|                     |          }|                     |          }||z   }|f}|r||fz  }|S )a=  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r$   r   r   )r   r   r   r   )rN   r$   r   r   r   r   outputss          r0   rx   zIdefics2EncoderLayer.forward  s      !((77&*nn')/ '5 '
 '
#|
 !=0 ((77// =0 " 	'&Gr/   )F)r'   r(   r)   r   r@   r+   rz   r   r   r   r,   rx   r{   r|   s   @r0   r   r     s        S3 S S S S S S -2	$ $|$ $ $D>	$
 
u 	!$ $ $ $ $ $ $ $r/   r   c                        e Zd ZdZdef fdZ	 	 	 	 ddeej                 dee	         dee	         dee	         d	e
eef         f
d
Z xZS )Idefics2Encoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Idefics2EncoderLayer`].

    Args:
        config: Idefics2Config
    r7   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S r.   )r   ).0rf   r7   s     r0   
<listcomp>z,Idefics2Encoder.__init__.<locals>.<listcomp>  s"    $k$k$ka%9&%A%A$k$k$kr/   F)	r?   r@   r7   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingrM   s    `r0   r@   zIdefics2Encoder.__init__  sa    m$k$k$k$k5QWQiKjKj$k$k$kll&+###r/   Nr   r   output_hidden_statesreturn_dictrR   c                    ||n| j         j        }||n| j         j        }||n| j         j        }|rdnd}|rdnd}|}| j        D ]Z}	|r||fz   }| j        r%| j        r|                     |	j        |||          }
n |	|||          }
|
d         }|r||
d         fz   }[|r||fz   }|st          d |||fD                       S t          |||          S )ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr.   )r   r   r   c              3      K   | ]}||V  	d S r   r.   r  vs     r0   	<genexpr>z*Idefics2Encoder.forward.<locals>.<genexpr>9  s(      eeqWXWdWdWdWdWdeer/   r"   r$   r%   )r7   r   r  use_return_dictr	  r
  r   _gradient_checkpointing_func__call__tupler   )rN   inputs_embedsr   r   r  r  encoder_statesall_attentionsr$   encoder_layerlayer_outputss              r0   rx   zIdefics2Encoder.forward  sx   < 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]3=0:d%![ 	F 	FM# C!/=2B!B* t}  $ A A!*!"%	! ! !.!"&7! ! ! *!,M  F!/=3C2E!E 	?+}.>>N 	fee]NN$Seeeeee+>Vd
 
 
 	
r/   NNNN)r'   r(   r)   r*   r   r@   r   r+   rz   r   r   r   r   rx   r{   r|   s   @r0   r  r    s         ,~ , , , , , , 26,0/3&*E
 E
 !.E
 $D>	E

 'tnE
 d^E
 
uo%	&E
 E
 E
 E
 E
 E
 E
 E
r/   r  ai  
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`Idefics2Config`] or [`Idefics2VisionConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
zVThe bare Idefics2 Model outputting raw hidden-states without any specific head on top.c                   8    e Zd ZeZdZdZg dZdZdZ	dZ
dZd ZdS )Idefics2PreTrainedModelmodelT)r~   r   Idefics2PerceiverLayerIdefics2DecoderLayerr#   c                    t          | j        d          r| j        j        j        n| j        j        j        }t          |d          r!|j        j                            d|           t          |t          j	        t          j
        f          rJ|j        j                            d|           |j         |j        j                                         d S d S t          |t          j                  rS|j        j                            d|           |j        -|j        j        |j                                                  d S d S d S )Ninitializer_rangeclass_embeddingr   )meanstd)r   r7   text_configr#  r$  datanormal_
isinstancer   r   rE   rc   r   zero_rK   padding_idx)rN   moduler&  s      r0   _init_weightsz%Idefics2PreTrainedModel._init_weights^  sA    t{$788;DK#55(: 	 6,-- 	C"'//Sc/BBBfry")455 	?M&&CS&999{& &&((((( '&-- 	?M&&CS&999!-"6#56<<>>>>>	? 	?--r/   N)r'   r(   r)   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_2_supports_sdpa_supports_cache_classr.  r.   r/   r0   r  r  P  sY        
 "L&*#ttt"3!N ? ? ? ? ?r/   r  a<  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input images. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details ([`LlavaProcessor`] uses
            [`CLIPImageProcessor`] for processing images).
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zAIdefics2 vision encoder model that returnss raw image embeddings.c                        e Zd ZdZeZdef fdZd Zd Z	 	 	 	 dde	e
j                 de	e         d	e	e         d
e	e         deeef         f
dZ xZS )Idefics2VisionTransformerFr7   c                    t                                          |           |j        }|| _        t	          |          | _        t          |          | _        t          j	        ||j
                  | _        |j        dk    | _        d S )Nr   r   )r?   r@   rA   r7   r6   rj   r  encoderr   r   r   post_layernormr   _use_flash_attention_2)rN   r7   rB   rO   s      r0   r@   z"Idefics2VisionTransformer.__init__  sy       &	26::&v.. l9&:OPPP&,&AEX&X###r/   c                     | j         S r   rj   rN   s    r0   get_input_embeddingsz.Idefics2VisionTransformer.get_input_embeddings  s
    r/   c                     || _         d S r   r>  rN   values     r0   set_input_embeddingsz.Idefics2VisionTransformer.set_input_embeddings  s    r/   NrQ   r   r  r  rR   c                    ||n| j         j        }||n| j         j        }||n| j         j        }|                    d          }|u| j         j        }t          j        ||                    d          |z  |                    d          |z  f          }|                    t          j	        |j
                  }|                     ||          }|                    |d          }t          j        |           sd }n| j        st          ||j                  }|                     |||||          }	|	d         }
|                     |
          }
|s|
f|	dd          z   S t'          |
|	j        |	j        	          S )
Nr   r   r
   r   rd   rP   rQ   rW   )r  r   r   r  r  r   r  )r7   r   r  r  rT   rD   r+   onesrb   r   rd   rj   r`   anyr<  r   r   r:  r;  r   r$   r%   )rN   rP   rQ   r   r  r  re   rD   r$   encoder_outputsr"   s              r0   rx   z!Idefics2VisionTransformer.forward  s    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]!&&q))
'/J#(: %%a((J6 %%a((J6$ $  $8#:#:T`Tg#:#h#h \Xlmm388RHH y..// 	i#'  , 	i#=>RTaTg#h#h ,,'//!5# ' 
 
 ,A. //0ABB 	>%'/!""*===/)7&1
 
 
 	
r/   r  )r'   r(   r)   r5  r   r/  r@   r@  rD  r   r+   ry   r   r   r   r   rx   r{   r|   s   @r0   r8  r8    s        
 N'LY3 Y Y Y Y Y Y        <@,0/3&*7
 7
 'u'787
 $D>	7

 'tn7
 d^7
 
uo%	&7
 7
 7
 7
 7
 7
 7
 7
r/   r8  r$   n_reprR   c                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
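    For example, with `n_rep=3` an input of shape `(2, 4, 7, 64)` becomes `(2, 12, 7, 64)`; with `n_rep=1` the input
    is returned unchanged.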
    r   N)rX   expandr   )r$   rK  batchnum_key_value_headsslenr   s         r0   	repeat_kvrQ    s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr/   c                   ,     e Zd Zd fd	Zd Zd Z xZS )Idefics2RMSNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )z>
        Idefics2RMSNorm is equivalent to T5LayerNorm
        N)r?   r@   r   r   r+   rH  rc   variance_epsilon)rN   rA   r   rO   s      r0   r@   zIdefics2RMSNorm.__init__  sD     	l5:k#:#:;; #r/   c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S )Nr   rW   T)keepdim)	r   rb   r+   r   powr%  rsqrtrV  rc   )rN   r$   r   variances       r0   rx   zIdefics2RMSNorm.forward  s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::r/   c                 H    t          | j        j                   d| j         S )Nz, eps=)r  rc   rX   rV  r?  s    r0   
extra_reprzIdefics2RMSNorm.extra_repr  s&    )**II$2GIIIr/   )rT  )r'   r(   r)   r@   rx   r]  r{   r|   s   @r0   rS  rS    sb        $ $ $ $ $ $; ; ;J J J J J J Jr/   rS  c                        e Zd Zddee         ddf fdZ	 	 	 	 	 ddej        dej        deej                 d	eej                 d
ee	ej                          de
de
de	ej        eej                 ee	ej                          f         fdZ xZS )Idefics2PerceiverAttentionNr   rR   c                 t   t                                                       d| _        |j        | _        |j        | _        |j        | _        |j        | _        | j        | j        z  | _	        |j
        | _
        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        z  | j        d          | _        d| _        dS )ziPerceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`NFr   )r?   r@   r   rA   resampler_n_headsr   resampler_head_dimr   rO  num_key_value_groupsr   r   r   r   r   r   o_projr   rN   r7   r   rO   s      r0   r@   z#Idefics2PerceiverAttention.__init__  s   !-11#)#= $(Nd6N$N!!'!9i 0$.4=2PW\]]]i 0$2JT]2Zafgggi 0$2JT]2Zafgggi >@PW\]]]r/   Flatentscontextr   rn   r   r   r   c                    |                                 \  }}	}
|	|                                 d         z   }t          j        ||gd          }|                     |          }|                     |          }|                     |          }|                    ||	| j        | j                  	                    dd          }|                    ||| j
        | j                  	                    dd          }|                    ||| j
        | j                  	                    dd          }t          | d|          }||                    ||| j                  \  }}t          || j                  }t          || j                  }t          j        ||	                    dd                    t#          j        | j                  z  }|                                 || j        |	|fk    r0t'          d|| j        |	|f d	|                                            |L|                                 |d|	|fk    r+t'          d
|d|	|f d	|                                            ||z   }t(          j                            |dt          j                                      |j                  }t          j        ||          }|                                 || j        |	| j        fk    r5t'          d|| j        |	| j        f d	|                                            |	                    dd                                          }|                    ||	| j        | j        z            }|                     |          }|sd}|||fS )a  
        Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!

        Args:
            latents (`torch.Tensor`): Tensor of shape [bsz, n_latents, embed_dim] representing fixed length latents to compress to.
            context (`torch.Tensor`): Tensor of shape [bsz, seq, embed_dim] representing long-form context to resample.
            attention_mask (`torch.Tensor`, *optional*): Tensor of shape [bsz, 1, seq, n_latents] representing attention mask.
            position_ids (`torch.LongTensor`, *optional*): Tensor of shape [bsz, seq] representing position indices of each input token.
            past_key_value (`Tuple[torch.Tensor]`, *optional*): Tuple of tensors containing cached key and value states.
            output_attentions (`bool`, *optional*, defaults to `False`): Whether to return attention weights.
            use_cache (`bool`, *optional*, defaults to `False`): Whether to use past_key_value for caching.
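
        Note that keys and values are computed over the concatenation of `context` and `latents` along the sequence
        dimension, while queries come only from `latents`, so the output keeps the `[bsz, n_latents, embed_dim]`
        shape of `latents`.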
        r   r   r   r   r   Nr
   r   r   r   rW   r   r   )rT   r+   concatr   r   r   r`   r   r   rZ   rO  getattrupdater   rQ  rc  r   mathsqrtr   r   r   r   r   rb   r   r   r   rd  )rN   rf  rg  r   rn   r   r   r   r   r   rf   r   r$   r   r   r   r   r   s                     r0   rx   z"Idefics2PerceiverAttention.forward  s   ,  UAW\\^^A..
gw%7R@@@{{7++[[//
{{=11#((eT^T]SS]]^_abcc__S*d6NPTP]^^hhijlmnn
#((j$:RTXTabbllmnpqrr '7HH%'5'<'<ZW[We'f'f$J z4+DEE
 t/HII|L*2F2Fq!2L2LMMPTPYZ^ZgPhPhh3z"JJJ*T^UT^7_ * * %%''* *  
 %""$$a
(CCC ua
8Suu^l^q^q^s^suu   (.8L },,\r,WWZZ[g[mnnl<>>#t~udm!LLL)CPTP]3^ ) )$$&&) )  
 "++Aq11<<>>!))#udnt}6TUUkk+..  	 LL.88r/   r   r   )r'   r(   r)   r   r   r@   r+   rz   r   r   r   rx   r{   r|   s   @r0   r_  r_    s        (3- 4      . 26378<"'N9 N9N9 N9 !.	N9
 u/0N9 !u|!45N9  N9 N9 
u|Xel3XeEL>Q5RR	SN9 N9 N9 N9 N9 N9 N9 N9r/   r_  c                        e Zd ZdZ fdZ	 	 	 	 	 ddej        dej        deej                 deej                 d	ee	         d
e
de
deej        eej                 eeej                          f         fdZ xZS ) Idefics2PerceiverFlashAttention2aU  
    Idefics2 flash attention module. This module inherits from `Idefics2PerceiverAttention` as the weights of the
    module stay untouched. The only required change is in the forward pass, which needs to correctly call the public
    API of flash attention and handle padding tokens if the input contains any.
    """
    c                 b     t                      j        |i | t                       | _        d S r   r   r   s      r0   r@   z)Idefics2PerceiverFlashAttention2.__init__g  r   r/   NFrf  rg  r   rn   r   r   r   rR   c                    |                                 \  }	}
}|
|                                 d         z   }|                     |          }|                     t          j        ||gd                    }|                     t          j        ||gd                    }|                    |	|
| j        | j                  }|                    |	|| j	        | j                  
                    dd          }|                    |	|| j	        | j                  
                    dd          }|j        d         }|||d         j        d         z  }|Kt          | j        d          r|| j        j        k    r|| j        j        z
  }|d         }|d         }|d d d d |d d d f                                         }|d d d d |d d d f                                         }|j        d         | j        j        dz
  k    rt!          d|j                   ||f}|D|d d |d f         }t          j        |t          j        |d d dd f                   gd          }t          j        |d         |gd          }t          j        |d         |gd          }|r||fnd }t%          || j                  }t%          || j                  }| j        sd	n| j        }|j        }|t          j        k    rt          j                    rt          j                    }n3t          | j        d
          r| j        j        }n| j        j        j        }t8                              d| d           |                    |          }|                    |          }|                    |          }|
                    dd          }|
                    dd          }t?          |||||
|d | j         | j!        	  	        }|"                    |	|
| j        | j        z                                            }| #                    |          }|sd }|||fS )Nr   r   ri  r   r   sliding_windowzepast key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got rW   r   r   r   r   )r   rs  r   r   )$rT   r   r   r+   catr   r`   r   r   rO  rZ   rX   r   r7   rs  r   r   	ones_likerQ  rc  r   r   r   r   r   r   r   rc   r   r   rb   r   r   r   r   rd  )rN   rf  rg  r   rn   r   r   r   r   r   r   rf   r   r   r   r   slicing_tokenspast_key
past_valuer   r   r   r   r   s                           r0   rx   z(Idefics2PerceiverFlashAttention2.forwardp  s     UAW\\^^A..
 {{7++[[GW+=2!F!F!FGG
{{59gw-?R#H#H#HII#((eT^T]SS__S*d6NPTP]^^hhijlmnn
#((j$:RTXTabbllmnpqrr%b)
%.+1"55J%t{$455 r*t{Ga:a:a!+dk.H!H)!,+A.
#AAAqqq.//111$<=HHJJ'111nooqqq(@ALLNN
>"%)Ca)GGG$=,4N= =  
 #+J!7!-%3AAA~4F%GN%*YP^_`_`_`bdbebe_ePf@g@g/hnp%q%q%qNN1$5z#BJJJJ 9nQ&7%FANNNL7@J*l33d z4+DEE
 t/HII"&-KssT5K
 #(%-''(** 8$;==&?@@ 8#{B#{17$ $ $ $   (??<88L#|44J'??<88L  ))!Q//
#--a33. n"A

 

 

 "))#udnt}6TUU``bbkk+..  	 LL.88r/   r   r   r|   s   @r0   rp  rp  _  s	        X X X X X 6:37*."'j9 j9j9 j9 !!12	j9
 u/0j9 !j9  j9 j9 
u|Xel3XeEL>Q5RR	Sj9 j9 j9 j9 j9 j9 j9 j9r/   rp  c                       e Zd Zdef fdZ	 	 	 	 	 ddej        dej        deej                 deej                 d	ee	ej                          d
ee
         dee
         de	ej        ee	ej        ej        f                  f         fdZ xZS )r   r   c                    t                                                       |j        | _        |j        | _        |j        | _        |j        | _        t          | j        | j                  | _	        t          | j        | j                  | _
        t          |j                 ||          | _        t          | j        | j                  | _        t          |j        |j        dz  |j        |j                  | _        d S )Nr   )r      rA   r   r   r   )r?   r@   rA   resampler_n_latents	n_latentsresampler_depthdepthrms_norm_epsrS  input_latents_norminput_context_norm$IDEFICS2_PERCEIVER_ATTENTION_CLASSESr   r   post_attention_layernormr   r   r   re  s      r0   r@   zIdefics2PerceiverLayer.__init__  s    !-3+
"/"1$2BHY"Z"Z"Z"1$2BHY"Z"Z"Z=f>YZ[amvwww(78HdN_(`(`(`%*$014*(	
 
 
r/   NFrf  rg  r   rn   r   r   r   rR   c                 (   |}	|                      |          }|                     |          }|                     |||          \  }}
}|	|z   }|}	|                     |          }|                     |          }|	|z   }|f}|r||
fz  }|r||fz  }|S )a  
        Args:
            latents (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            context (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        )rf  rg  r   )r  r  r   r  r   )rN   rf  rg  r   rn   r   r   r   r   r   self_attn_weightspresent_key_valuer   s                r0   rx   zIdefics2PerceiverLayer.forward  s    2 ))'22))'228<) 9G 9
 9
5"$5
 W$//88((7##W$* 	,)++G 	,)++Gr/   r   )r'   r(   r)   r   r@   r+   rz   r   r   r   r   r,   rx   r{   r|   s   @r0   r   r     s       
# 
 
 
 
 
 
, 26378<,1$)2 22 2 !.	2
 u/02 !u|!452 $D>2 D>2 
u (51BEDU1U+V"WW	X2 2 2 2 2 2 2 2r/   r   a3  
    Args:
        context (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`):
            The hidden states of the image after vision encoder and modality projection.
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
z`Idefics2 perceiver resampler model that performs `depth` blocks of cross-attention with a fixed zn`n_latents` inputs to decrease embedding sequence length. The Resampler acts as a form of learned pooling and zjis derived from [Perceiver: General Perception with Iterative Attention](https://arxiv.org/abs/2103.03206)c                   Z     e Zd ZdZeZd fdZdej        dej        dej        fdZ	 xZ
S )	Idefics2PerceiverResamplerFrR   Nc                    t                                                     j        | _        j        | _        j        | _        j        | _        j        | _        t          j
        t          j        | j        | j                            | _        t          j        fdt          | j                  D                       | _        t#          | j        | j                  | _        j        dk    | _        d S )Nc                 0    g | ]}t          |          S r.   )r   )r  idxr7   s     r0   r  z7Idefics2PerceiverResampler.__init__.<locals>.<listcomp>O  s$    $f$f$fS%;FC%H%H$f$f$fr/   r   r   )r?   r@   rA   r   r}  r~  r  r  r  r   r   r+   rH  rf  r  r  r	  rS  normr   r<  rM   s    `r0   r@   z#Idefics2PerceiverResampler.__init__C  s       !- +3+
"/ |EJt~t?O$P$PQQ m$f$f$f$fTYZ^ZdTeTe$f$f$fgg#D$4$:KLLL	&,&AEX&X###r/   rg  r   c           
      .   | j                             d                              |j        d         g| j                                         R           }t          j        |                    d          |                    d          f|j        |j                  }t          j	        ||gd          }| j
        st          ||j        | j                  n|}|}| j        D ]} ||||d d dd          }|d         }|                     |          }|S )	Nr   r   rF  rW   ri  )tgt_lenF)r   rn   r   r   r   )rf  	unsqueezerM  rX   rT   r+   rH  r   rd   rt  r<  r   r~  r	  r  )rN   rg  r   rf  latent_attention_maskcompressed_contextperceiver_layerr  s           r0   rx   z"Idefics2PerceiverResampler.forwardT  sA    ,((++22GM!4D3[t|GXGXGZGZ3[3[\\ %
  ##W\\!__5^=QZhZo!
 !
 !
 N4I#JPRSSS . &~w}dn]]]] 	 %#{ 	2 	2O+O"-!#"'  M "/q!1!YY'9::!!r/   )rR   N)r'   r(   r)   r5  r   r/  r@   r+   rz   rx   r{   r|   s   @r0   r  r  9  s         N*LY Y Y Y Y Y""""" "" 
	"" "" "" "" "" "" "" ""r/   r  c                   $     e Zd Z fdZd Z xZS )Idefics2Connectorc                    t                                                       t          |j        j        |j        j        |j        j        |j        j                  | _        t          
                    |j                  | _        d S )Nr|  )r?   r@   r   vision_configrA   r'  r   r   modality_projectionr  _from_configperceiver_configperceiver_resamplerrM   s     r0   r@   zIdefics2Connector.__init__z  su    #.,8$0B*6)4	$
 $
 $
  $>#J#J6Kb#c#c   r/   c                 ^    |                      |          }|                     ||          }|S )N)rg  r   )r  r  )rN   r&   r   s      r0   rx   zIdefics2Connector.forward  s8    "667JKK"66?Rcq6rr""r/   )r'   r(   r)   r@   rx   r{   r|   s   @r0   r  r  y  sL        d d d d d# # # # # # #r/   r  aE  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input images. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details ([`LlavaProcessor`] uses
            [`CLIPImageProcessor`] for processing images).
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The hidden states of the image encoder after modality projection and perceiver resampling.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
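
    For illustration only (the checkpoint, image URL and prompt below are just examples), inputs with the shapes
    described above are typically produced by the processor rather than assembled by hand:

    ```python
    >>> from transformers import AutoProcessor
    >>> from transformers.image_utils import load_image

    >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-base")
    >>> image = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
    >>> inputs = processor(images=[[image]], text=["<image>Describe this image."], return_tensors="pt")
    >>> # `inputs` now holds `input_ids`, `attention_mask`, `pixel_values` and `pixel_attention_mask`
    ```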
"""


@add_start_docstrings(
    """Idefics2 model consisting of a SIGLIP vision encoder and Mistral language decoder""",
    IDEFICS2_START_DOCSTRING,
)
class Idefics2Model(Idefics2PreTrainedModel):
    def __init__(self, config: Idefics2Config):
        super().__init__(config)
        self.padding_idx = self.config.text_config.pad_token_id
        self.vocab_size = self.config.text_config.vocab_size

        self.vision_model = Idefics2VisionTransformer._from_config(config.vision_config)
        self.connector = Idefics2Connector(config)
        self.text_model = AutoModel.from_config(config.text_config)

        self.image_seq_len = config.perceiver_config.resampler_n_latents
        self.image_token_id = self.config.image_token_id

        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

        self.post_init()

    def enable_input_require_grads(self):
        """
        Enables the gradients for the input embeddings.

        This is useful for lora when using gradient checkpointing.
        c.f. https://github.com/huggingface/peft/issues/1402#issuecomment-1913675032

        Override to set output.requires_grad = True for both the decoder's and vision model's embeddings.
        """

        def get_lowest_module(module):
            if len(list(module.children())) == 0:
                # If the module has no children, it is a leaf module (e.g. Linear, Conv2d, etc.)
                return module
            else:
                # Recursively call the function on each child module
                return get_lowest_module(list(module.children())[0])

        def make_inputs_require_grads(module, input, output):
            output.requires_grad_(True)

        self._text_require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads)
        self._vision_require_grads_hook = get_lowest_module(self.vision_model).register_forward_hook(
            make_inputs_require_grads
        )

    def disable_input_require_grads(self):
        self._text_require_grads_hook.remove()
        self._vision_require_grads_hook.remove()

    def get_input_embeddings(self):
        return self.text_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.text_model.set_input_embeddings(value)

    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
        model_embeds = self.text_model.resize_token_embeddings(
            new_num_tokens=new_num_tokens, pad_to_multiple_of=pad_to_multiple_of
        )
        self.config.text_config.vocab_size = model_embeds.num_embeddings
        return model_embeds

    def inputs_merger(
        self,
        input_ids: torch.LongTensor,
        inputs_embeds: Optional[torch.Tensor],
        image_hidden_states: Optional[torch.Tensor],
    ):
        """
        This method aims at merging the token embeddings with the image hidden states into one single sequence of vectors that are fed to the transformer LM.
        The merging happens as follows:
        - The text token sequence is: `tok_1 tok_2 tok_3 <fake_token_around_image> <image> <image> ... <image> <fake_token_around_image> tok_4`.
        - We get the image hidden states for the image through the vision encoder (and potentially the perceiver), and that hidden state is then projected into the text embedding space.
        We thus have a sequence of image hidden states of size (1, image_seq_len, hidden_dim), where 1 is for batch_size of 1 image and hidden_dim is the hidden_dim of the LM transformer.
        - The merging happens so that we obtain the following sequence: `vector_tok_1 vector_tok_2 vector_tok_3 vector_fake_tok_around_image {sequence of image_seq_len image hidden states} vector_fake_tok_around_image vector_tok_4`. That sequence is fed to the LM.
        - To fit the format of that sequence, `input_ids`, `inputs_embeds`, `attention_mask` are all 3 adapted to insert the image hidden states.
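
        A rough sketch of that merging (for illustration only; the placeholder positions marked by
        `image_token_id` receive the image hidden states):

        ```python
        special_image_token_mask = input_ids == self.image_token_id
        new_inputs_embeds = inputs_embeds.clone()
        new_inputs_embeds[special_image_token_mask] = image_hidden_states.view(-1, image_hidden_states.shape[-1])
        ```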
        rW   )rX   r  cloner`   )
rN   r  r  r&   
num_imagesrf   vision_hidden_sizespecial_image_token_masknew_inputs_embedsreshaped_image_hidden_statess
             r0   inputs_mergerzIdefics2Model.inputs_merger  s^     -@,E)
A)#,0C#C )//11':'?'?DV'W'W$6R23  r/   a  
        Inputs fed to the model can have an arbitrary number of images. To account for this, pixel_values fed to
        the model have image padding -> (batch_size, max_num_images, 3, max_heights, max_widths) where
        max_num_images is the maximum number of images among the batch_size samples in the batch.

        Padding images are not needed beyond padding the pixel_values at the entrance of the model.
        For efficiency, we only pass through the vision_model's forward the real images by
        discarding the padding images i.e. pixel_values of size (image_batch_size, 3, height, width) where
        image_batch_size would be 7 when num_images_per_sample=[1, 3, 1, 2] and max_num_images would be 3.
        r   rn   r#   rP   pixel_attention_maskr   r   r  r  c           
      b   |
|
n| j         j        }
||n| j         j        }|	|	n| j         j        }	||n| j         j        }| j        r*| j        j        r|	rt          	                    d           d}	||j
        \  }}n||j
        \  }}}nt          d          d}d}|	rjt          |t                    sAd}|t                      }n.t          j        |          }t          	                    d           |                                }|||dk    rt          d          |" | j                                        |          }||t          d          ||j
        \  }}}}}|                    | j        	          } |j        ||z  g|j
        d
d          R  }|j
        dd                                          }|dk                        d          |k    }||                                         }|ct1          j        |                    d          |                    d
          |                    d          ft0          j        |j                  }n8 |j        ||z  g|j
        d
d          R  }||                                         }| j         j        j        }|                    d||          }|                    d
||          }|                    d          ||z  k                                    }|                      ||          j!        }| "                    ||                    |                    d          d                    }n#|!|                    | j        |j                  }|dk    r||| #                    |||          }|                     |||||	|
||          }|r |	r|j$        %                                |_$        |stM          d g ||D                       S tO          |j!        |j$        |j(        |j)        |          S )NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz5You have to specify either input_ids or inputs_embedsr   TzWe detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)zWWhen first calling the model, if input_embeds are passed, input_ids should not be None.zMYou cannot specify both pixel_values and image_hidden_states at the same time)r   r   r   r   )rW   r   ri  r
   )rT   r   rd   )	dimensionrT   step)rW   r   rG  rW   )r   rF  )r  r  r&   )r  r   rn   r#   r   r   r  r  c              3      K   | ]}||V  	d S r   r.   r  s     r0   r  z(Idefics2Model.forward.<locals>.<genexpr>  s"      UUqq}}}}}UUr/   )r"   r#   r$   r%   r&   )*r7   r   r  r   r  r   r  r
  r   r   rX   r   r*  r   r   from_legacy_cacheget_seq_lengthr@  rb   r   r`   numelr^   r   r+   rH  rT   r   rd   r  rD   unfoldr  r"   r  r  r#   to_legacy_cacher  r!   r$   r%   )rN   r  r   rn   r#   r  rP   r  r&   r   r   r  r  re   
seq_lengthrf   past_seen_tokensreturn_legacy_cacher  rF   heightwidthnb_values_per_imagereal_images_indsrD   patches_subgridrQ   r   s                               r0   rx   zIdefics2Model.forward&  s*   8 2C1N--TXT_Tq$8$D  $+Jj 	 "+!6IIDK<Q	%0%<kk$+B]= 	T_C 		 	l   I  %._"J

&(5(;%J
AATUUU# 	@ou55 
&*#"*&2nnOO&2&D_&U&UO''b  
  /==??$):?OST?T?Tvwww BDO@@BB9MMM #(;(Glmmm%BNBT?J
L&%'???<<L,<,Z*-D^|GYZ[Z\Z\G]^^^L #/"4QRR"8">">"@"@ , 388\8JJNaa'(89DDFFL $+',z&++A..0A0A!0D0DlFWFWXYFZFZ[*'.( ( ($$ (A';'@+(.B.H.L( ( ($ (<<L'M'X'X'Z'Z$2=J299AJ]g9hhO-44qzXb4ccO$3$7$7H$7$E$EV`I`$`#f#f#h#h  #'"3"3)%9 #4 # #     #'..#4H4M4MlN_N_`aNbNbdf4g4g #1 # # !,"5"8"8tzR[Rb"8"c"cq  ]%>CVCb !..#+$7 /  M //')%+/!5# " 	
 	
  	P9 	P&-&=&M&M&O&OG# 	VUU$Cg$C/B$CUUUUUU.%7#3!/) 3
 
 
 	
r/   NN)NNNNNNNNNNNN)r'   r(   r)   r   r@   r  r  r@  rD  r   r   r   rK   r  r+   r   rz   r  r   IDEFICS2_INPUTS_DOCSTRINGr   r,   ry   r   r   r   r!   rx   r{   r|   s   @r0   r  r    sE       
~       
 
 
41 1 16 6 64 4 4 hsm hjht    !#!  -! &el3	! ! ! !, +*		 	"  '+1537=A5948;?;?$(,0/3&*I
 I
#I
 !.I
 u/0	I

 "$u'8"9:I
   12I
 u01I
 'u'78I
 &e&78I
 D>I
 $D>I
 'tnI
 d^I
 
u55	6I
 I
 I
 I
 I
 I
 I
 I
r/   r  zThe Idefics2 Model with a language modeling head. It is made up a SigLIP vision encoder, with a language modeling head on top. c            #       ^    e Zd ZdgZ fdZd Zd Zd Zd Zd Z	d Z
d"d
ee         dej        fdZd Z ee           eee          	 	 	 	 	 	 	 	 	 	 	 	 	 	 d#dej        deej                 deej                 deeej                          deej                 deej                 deej                 deej                 deej                 dee         dee         dee         dee         dedeeef         fd                        Z	 	 	 	 	 	 	 	 d$dZ  fd Z!e"d!             Z# xZ$S )% Idefics2ForConditionalGenerationzlm_head.weightc                 >   t                                          |           t          |          | _        | j        j        | _        t          j        |j        j	        |j        j
        d          | _        |j        j
        | _
        |                                  d S r   )r?   r@   r  r  r7   r  r   r   r'  rA   r  lm_headr  rM   s     r0   r@   z)Idefics2ForConditionalGeneration.__init__  s       "6**
"k8y!3!?ASA^ejkkk ,7 	r/   c                     d }|                                                      |          | _        | j        j                                                             |          | _        dS )z
        Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping
        the model weights fixed.
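
        Roughly (a sketch, not the verbatim implementation), this registers forward hooks on the text and vision
        input embeddings that mark their outputs as requiring gradients:

        ```python
        def make_inputs_require_grads(module, input, output):
            output.requires_grad_(True)

        self._text_require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads)
        ```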
        c                 0    |                     d           d S r  r  r  s      r0   r  z^Idefics2ForConditionalGeneration.enable_input_require_grads.<locals>.make_inputs_require_grads  r  r/   N)r@  r  r  r  r  r  )rN   r  s     r0   r  z;Idefics2ForConditionalGeneration.enable_input_require_grads  sh    	( 	( 	( )-(A(A(C(C(Y(YZs(t(t%*.**A*V*V*X*X*n*n%+
 +
'''r/   c                 j    | j                                          | j                                         d S r   r  r?  s    r0   r  z<Idefics2ForConditionalGeneration.disable_input_require_grads  r  r/   c                 >    | j         j                                        S r   )r  r  r@  r?  s    r0   r@  z5Idefics2ForConditionalGeneration.get_input_embeddings  s    z$99;;;r/   c                 D    | j         j                            |           d S r   )r  r  rD  rB  s     r0   rD  z5Idefics2ForConditionalGeneration.set_input_embeddings  s!    
22599999r/   c                     | j         S r   r  r?  s    r0   get_output_embeddingsz6Idefics2ForConditionalGeneration.get_output_embeddings  s
    |r/   c                     || _         d S r   r  )rN   new_embeddingss     r0   set_output_embeddingsz6Idefics2ForConditionalGeneration.set_output_embeddings  s    %r/   Nr  rR   c                     |                      ||          }|||S |j        j        d         | j        j        _        | j        j        j        | _        |                                  |S r  )_resize_token_embeddingsrc   rX   r7   r'  r  tie_weightsr  s       r0   r  z8Idefics2ForConditionalGeneration.resize_token_embeddings  sm    44^EWXX!&8&@ .:-@-Fq-I*+1< 	r/   c                     |                                  }|                                 }t          | j        dd          r|j        |_        dS dS )z
        Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of DecoupledLinear and DecoupledEmbedding.
        tie_word_embeddingsTN)r  r@  rk  r7   rc   )rN   output_embeddingsinput_embeddingss      r0   r  z,Idefics2ForConditionalGeneration.tie_weights  s[     !668844664; 5t<< 	?'7'>$$$	? 	?r/   )output_typer/  r   r  r   rn   r#   r  rP   r  r&   labelsr   r   r  r  num_logits_to_keepc                 &   ||n| j         j        }||n| j         j        }||n| j         j        }|                     |||||||||
|||          }|d         }|                     |dd| dddf                   }d}|	L|                                }|	                    |j                  }	||dd|j	        d         dz
   df                             |j                  }|dddddf         |dk             
                                }|	dddf         |dk             
                                }n?|dddddf         
                                }|	dddf         
                                }t                      } ||                    d|                    d                    |                    d                    }|s|f|dd         z   }||f|z   n|S t          |||j        |j        |j        |j                  S )aE  
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `Idefics2ForConditionalGeneration`).
                Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only
                computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            num_logits_to_keep (`int`, *optional*):
                Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.

        Returns:

        Example:

        ```python
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from io import BytesIO

        >>> from transformers import AutoProcessor, AutoModelForVision2Seq
        >>> from transformers.image_utils import load_image

        >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
        >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
        >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
        >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")

        >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-base")
        >>> model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/idefics2-8b-base", device_map="auto")

        >>> BAD_WORDS_IDS = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
        >>> EOS_WORDS_IDS = [processor.tokenizer.eos_token_id]

        >>> # Create inputs
        >>> prompts = [
        ...   "<image>In this image, we can see the city of New York, and more specifically the Statue of Liberty.<image>In this image,",
        ...   "In which city is that bridge located?<image>",
        ... ]
        >>> images = [[image1, image2], [image3]]
        >>> inputs = processor(images=images, text=prompts, padding=True, return_tensors="pt").to("cuda")

        >>> # Generate
        >>> generated_ids = model.generate(**inputs, bad_words_ids=BAD_WORDS_IDS, max_new_tokens=20)
        >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

        >>> print(generated_texts)
        ['In this image, we can see the city of New York, and more specifically the Statue of Liberty. In this image, we can see the city of New York, and more specifically the Statue of Liberty.\n\n', 'In which city is that bridge located?\n\nThe bridge is located in the city of Pittsburgh, Pennsylvania.\n\n\nThe bridge is']
        ```N)r  r   rn   r#   r  rP   r  r&   r   r   r  r  r   r   .rW   )r3   r4   r#   r$   r%   r&   )r7   r   r  r  r  r  floatrb   rd   rX   r   r	   r`   rT   r2   r#   r$   r%   r&   )rN   r  r   rn   r#   r  rP   r  r&   r  r   r   r  r  r  r   r$   r4   r3   shift_attention_maskshift_logitsshift_labelsloss_fctr  s                           r0   rx   z(Idefics2ForConditionalGeneration.forward	  s|   P 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] **)%+'%!5 3/!5#  
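
        For training, `labels` can be derived from `input_ids` by masking out tokens that should not contribute to
        the loss; the snippet below is only an illustrative sketch (the exact masking strategy is up to the user):

        ```python
        >>> labels = inputs["input_ids"].clone()
        >>> labels[labels == processor.tokenizer.pad_token_id] = -100
        >>> labels[labels == model.config.image_token_id] = -100
        >>> outputs = model(**inputs, labels=labels)
        >>> outputs.loss.backward()
        ```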
 
  
mAAA0B/B/C/CQQQ,FGHH\\^^FYYv}--F) (6aaa6<?Q;N9O9Q9Q6Q'R'U'UV\Vc'd'd$%c3B3k23G13LMXXZZ%c122g/Cq/HITTVV%c3B3k2==??%c122g99;;'))H8L--b,2C2CB2G2GHH,J[J[\^J_J_``D 	DY,F'+'7D7V##VC-#3!/) ' ;
 
 
 	
r/   c
           
      z   |E||d d |j         d          d f         }n(|j         d         |j         d         k    r|d d |f         }|
                    dd           }|b|`|                                                    d          dz
  }|                    |dk    d           |r|d d |j         d          d f         }||d         dk    r||d}n#|                    t          j                  d d}|	|	|d<   |d }d }n|}|}|                    |||
                    d	          ||||d
           |S )Nr   r   rn   rW   )r  r  )memory_format)r  r  r  r   )rn   r#   r   r   rP   r  r&   )	rX   getlongcumsummasked_fill_r  r+   contiguous_formatrl  )rN   r  r#   r   r  cache_positionrP   r  r&   r  r   rn   model_inputss                r0   prepare_inputs_for_generationz>Idefics2ForConditionalGeneration.prepare_inputs_for_generation  s   " &(%aaa.*>q*A)A)C)C&CD		#~';A'>>>%aaa&78	zz.$77%,*>)..0077;;a?L%%n&91=== F+AAA	0B/B/D/D,DE $):a)?)?-:SSLL *3uG^)_)_rvwwL)1CL-.*L#'  'L#7  ,#2#ZZ44"0 ,(<': 
	
 
	
 
	
 r/   c                 V     t                      j        d|||d|}|j        |d<   |S )N)r   model_kwargsis_encoder_decoderr&   r.   )r?   #_update_model_kwargs_for_generationr&   )rN   r   r  r  r   rO   s        r0   r  zDIdefics2ForConditionalGeneration._update_model_kwargs_for_generation  sN    BuwwB 
%1
 
 	
 
 /6.I*+r/   c                 T    d}| D ]!}|t          fd|D                       fz  }"|S )Nr.   c              3   t   K   | ]2}|                     d                     |j                            V  3dS )r   N)index_selectrb   rd   )r  
past_statebeam_idxs     r0   r  zBIdefics2ForConditionalGeneration._reorder_cache.<locals>.<genexpr>  sC      nnU_j--aZ=N1O1OPPnnnnnnr/   )r  )r#   r  reordered_past
layer_pasts    `  r0   _reorder_cachez/Idefics2ForConditionalGeneration._reorder_cache  sS     ) 	 	Jnnnncmnnnnn NN r/   r  )NNNNNNNNNNNNNr   )NNNNNNNN)%r'   r(   r)   _tied_weights_keysr@   r  r  r@  rD  r  r  r   r   r   rK   r  r  r   r  r   r2   _CONFIG_FOR_DOCr+   r   rz   r   r,   ry   r   r   r   rx   r  r  staticmethodr  r{   r|   s   @r0   r  r    s       
 ++	 	 	 	 	
 
 
1 1 1< < <: : :  & & & hsm hjht     ? ? ? +*+DEE+IXghhh '+1537=A5948;?;?-1$(,0/3&*"#~
 ~
#~
 !.~
 u/0	~

 "$u'8"9:~
   12~
 u01~
 'u'78~
 &e&78~
 )*~
 D>~
 $D>~
 'tn~
 d^~
  ~
  
u44	5!~
 ~
 ~
 ih FE~
F ! ; ; ; ;z	 	 	 	 	   \    r/   r  )Kr*   rm  dataclassesr   typingr   r   r   r   r+   torch.utils.checkpointr   torch.nnr	   activationsr   cache_utilsr   r   
generationr   modeling_attn_mask_utilsr   modeling_outputsr   r   modeling_utilsr   utilsr   r   r   r   r   r   autor   configuration_idefics2r   r   r   modeling_flash_attention_utilsr   
get_loggerr'   r   r   r!   r2   Moduler6   r~   r   r   r   r   r   r   r  IDEFICS2_START_DOCSTRINGr  r  r8  rz   r   rQ  rS  r_  rp  r  r   r  r  r  r  r.   r/   r0   <module>r3     s      ! ! ! ! ! ! / / / / / / / / / / / /            % % % % % % ! ! ! ! ! ! . . . . . . . . ) ) ) ) ) ) B B B B B B < < < < < < < < - - - - - -                      a a a a a a a a a a  KJJJJJJ 
	H	%	%" #C #C #C #C #Ck #C #C #CL !C !C !C !C !C[ !C !C !CH7 7 7 7 7ry 7 7 7vL) L) L) L) L)bi L) L) L)^_) _) _) _) _)$; _) _) _)F %6$ $      	   P P P P P") P P P&" " " " "BI " " "<. . . . .29 . . .dU
 U
 U
 U
 U
bi U
 U
 U
p " \ ? ? ? ? ?o ? ?	 ?< & K K
 K
 K
 K
 K
 7 K
 K
	 K
^	UU\ 	U# 	U%, 	U 	U 	U 	UJ J J J Jbi J J J(b9 b9 b9 b9 b9 b9 b9 b9L{9 {9 {9 {9 {9'A {9 {9 {9~ (9( ( $E E E E ERY E E EP  ftp	 7" 7" 7" 7" 7"!8 7" 7" 7"t# # # # #	 # # #"C L [ h
 h
 h
 h
 h
+ h
 h
	 h
V  J X X X X X'> X X	 X X Xr/   