"""PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object"""

import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...utils import ModelOutput, logging
from .configuration_idefics import IdeficsVisionConfig


logger = logging.get_logger(__name__)


@dataclass
class IdeficsVisionModelOutput(ModelOutput):
    """
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


# Adapted from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings
class IdeficsVisionEmbeddings(nn.Module):
    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows interpolating the pre-trained position encodings so that the model can be used on higher
        resolution images.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
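
        Example (an illustrative sketch, not part of the original docstring; shapes assume a
        224px/14px configuration, i.e. a pretrained 16x16 position grid):

        ```python
        # `embeddings` for a 448x448 image: 1 CLS token + 32*32 patch tokens.
        # The stored 16x16 grid of patch position encodings is bicubically
        # resized to 32x32 before being added to the patch embeddings.
        pos_embed = self.interpolate_pos_encoding(embeddings, height=448, width=448)
        ```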
        """
        num_patches = embeddings.shape[1] - 1
        pos_embed = self.position_embedding(self.position_ids)
        num_positions = pos_embed.shape[1] - 1
        if num_patches == num_positions and height == width:
            return pos_embed
        class_pos_embed = pos_embed[:, 0]
        patch_pos_embed = pos_embed[:, 1:]

        embed_dim = embeddings.shape[-1]
        num_h_patches = height // self.config.patch_size
        num_w_patches = width // self.config.patch_size
        # we add a small number to avoid floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        num_h_patches, num_w_patches = num_h_patches + 0.1, num_w_patches + 0.1
        sqrt_num_positions = math.sqrt(num_positions)
        patch_pos_embed = patch_pos_embed.reshape(1, int(sqrt_num_positions), int(sqrt_num_positions), embed_dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        fp32_upcasting = patch_pos_embed.dtype == torch.bfloat16
        if fp32_upcasting:
            logger.warning_once(
                "Upcasting patch_pos_embed to fp32 for interpolation since `upsample_bicubic2d_out_frame` in "
                "nn.functional.interpolate is not implemented for 'torch.bfloat16' dtype. This will result in a "
                "slight overhead."
            )
            patch_pos_embed = patch_pos_embed.to(torch.float)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            scale_factor=(num_h_patches / sqrt_num_positions, num_w_patches / sqrt_num_positions),
            mode="bicubic",
            align_corners=False,
        )
        if fp32_upcasting:
            patch_pos_embed = patch_pos_embed.to(torch.bfloat16)
        if int(num_h_patches) != patch_pos_embed.shape[-2] or int(num_w_patches) != patch_pos_embed.shape[-1]:
            raise ValueError(
                f"Number of patches for images ({int(num_h_patches), int(num_w_patches)}) don't match the "
                f"shape of position embedding ({patch_pos_embed.shape[-2], patch_pos_embed.shape[-1]})"
            )
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, embed_dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if not interpolate_pos_encoding:
            if height != self.image_size or width != self.image_size:
                raise ValueError(
                    f"Input image size ({height}*{width}) doesn't match model"
                    f" ({self.image_size}*{self.image_size}). You should try to set `interpolate_pos_encoding=True`"
                )

        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]

        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)

        return embeddings


class IdeficsVisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, embed_dim = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scale
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # apply the causal_attention_mask first
        if causal_attention_mask is not None:
            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
                    f" {causal_attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if output_attentions:
            # this operation is a bit awkward, but it's required to make sure that attn_weights
            # keeps its gradient; to do so, attn_weights is reshaped twice and reused afterwards
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped


class IdeficsVisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class IdeficsVisionEncoderLayer(nn.Module):
    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = IdeficsVisionAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = IdeficsVisionMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class IdeficsVisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`IdeficsVisionEncoderLayer`].

    Args:
        config: IdeficsVisionConfig
    """

    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([IdeficsVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


# Adapted from transformers.models.clip.modeling_clip.CLIPVisionTransformer
class IdeficsVisionTransformer(nn.Module):
    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = IdeficsVisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = IdeficsVisionEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = False,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        """
        Returns:

        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
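

if __name__ == "__main__":
    # Minimal smoke test: an illustrative sketch, not part of the upstream module.
    # It assumes the default `IdeficsVisionConfig` hyperparameters are usable
    # as-is and that `transformers` (with its relative imports) is installed.
    config = IdeficsVisionConfig()
    model = IdeficsVisionTransformer(config).eval()
    pixel_values = torch.randn(1, config.num_channels, config.image_size, config.image_size)
    with torch.no_grad():
        outputs = model(pixel_values)
    # one CLS token + (image_size // patch_size) ** 2 patch tokens per image
    print(outputs.last_hidden_state.shape)
    print(outputs.pooler_output.shape)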