
    g$                         d Z ddlmZmZ ddlZddlmZ ddlmZ  G d dej	                  Z
 G d d	ej	                  Z G d
 dej	                  ZdS )a  

Generic interface to various configurations of the Perceiver Resampler, that simply takes in a series of (potentially
time-indexed) contextual embeddings, and "resamples" (compresses) them down to a pre-specified number of latents! Note
that the Perceiver in general resamples based solely off the *long-range* context; there's a nice opportunity here to
prime the Perceiver Resampler with say a single layer's worth of language embeddings (the target domain), and use that
to softly "retrieve & compress" what we need --> this would be a novel contribution we should explore.

References:
    - DeepMind's Flamingo: https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model
    - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch

    )OptionalTupleN   )IdeficsConfigc                   `     e Zd Zdededededededdf fd	Zd
ej        dej        fdZ xZ	S )IdeficsPerceiverResamplerconfig	embed_dimdepthn_headshead_dim	n_latentsreturnNc                 $    t                                                       ||||f\   _         _         _         _        j        j         _        t          j
        t          j         j         j                  d           _        t          j        d          s
 j        dz  nj        j        dz   _        t          j         fdt%          |          D                        _        t          j         j                   _        dS )ao  
        Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
        MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then
        returns a Tensor of shape [bsz, n_latents, embed_dim]. :param embed_dim: Dimensionality of embeddings being fed
        to the Perceiver Resampler (also dimensionality of latent embeddings *returned* by the Perceiver Resampler.
        Could be e.g., VIT embed_dim, ResNet pool dim, and so on.

        Args:
            config (`IdeficsConfig`): config object
            embed_dim (`int`): The size of each embedding vector
            depth (`int`): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
            n_heads (`int`): Number of heads in each Transformer block (for multi-headed self-attention).
            head_dim (`int`): Dimensionality of each head projection in the Transformer block.
            n_latents (`int`):
                Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).

        T)requires_gradr
      c           
          g | ]O}t          j        t          j        j        j        j                  t          j                  g          PS  )	nn
ModuleListIdeficsPerceiverAttentionr
   r   r   qk_layer_norms
IdeficsMLPintermediate_dim).0_r	   selfs     a/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/idefics/perceiver.py
<listcomp>z6IdeficsPerceiverResampler.__init__.<locals>.<listcomp>S   si         1$.$,PTP]_c_rss"4#8&AA       N)super__init__r
   r   r   r   perceiver_configqk_layer_norms_perceiverr   r   	Parametertorchrandnlatentshasattrvision_configr   r   rangeblocks	LayerNorm
layer_norm)r   r	   r
   r   r   r   r   	__class__s   ``     r   r"   z"IdeficsPerceiverResampler.__init__1   s   ( 	FOQXZbdmFmCdmT^$5N |EK$O$O_cddd 6/==4DNQ%/!3 	 m     u  

 

 ,t~66r    contextc                     | j                             |j        d         dd          }| j        D ]"\  }} |||          |z   } ||          |z   }#|                     |          S )zWResample arbitrary length context & *compress* down to self.n_latents latent embeddingsr   r   )r(   repeatshaper,   r.   )r   r0   r(   attnffs        r   forwardz!IdeficsPerceiverResampler.forward_   sw     ,%%gmA&61==  	, 	,HD"d7G,,w6GbkkG+GGw'''r    )
__name__
__module____qualname__r   intr"   r&   Tensorr6   __classcell__r/   s   @r   r   r   0   s        ,7#,703,7<?,7JM,7Y\,7il,7	,7 ,7 ,7 ,7 ,7 ,7\
(u| 
( 
( 
( 
( 
( 
( 
( 
( 
(r    r   c            
       f     e Zd Zdededededdf
 fdZdej        d	ej        dej        fd
Z xZ	S )r   r
   r   r   r   r   Nc                    t                                                       |||c| _        | _        | _        || _        t          j        | j                  | _        t          j        | j                  | _	        | j        r<t          j        | j                  | _
        t          j        | j                  | _        | j        dz  | _        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        z  |d          | _        dS )ziPerceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`g      FbiasN)r!   r"   r
   r   r   r   r   r-   context_layer_normlatents_layer_normq_layer_normk_layer_normqk_scaleLinearq_projk_projv_projoutput_proj)r   r
   r   r   r   r/   s        r   r"   z"IdeficsPerceiverAttention.__init__m   s7   6?(3dm,"$,t~">">"$,t~">"> 	< "T] ; ;D "T] ; ;Dt+ it}0LSXYYYit}0LSXYYYit}0LSXYYY9T\DM%A9SXYYYr    r0   r(   c                 H                          |          }                     |          }|j        dd         \  }}                     |          }                     t          j        ||gd                    }                     t          j        ||gd                    } fd|||fD             \  }}} j        r* 	                    |          } 
                    |          }t          j        d| j        z  |          }||                    dd	                                          z
  }	|	                    d          }
t          j        d
|
|          }                     |                    dd                              d                    S )aF  
        Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!

        Args:
            context (`torch.Tensor`):
                Tensor of shape `[bsz, seq, embed_dim]` representing long-form context to resample.
            latents (`torch.Tensor`):
                Tensor of shape `[bsz, n_latents, embed_dim]` representing fixed length latents to compress to.

        Returns:
            `torch.Tensor`: Tensor of shape `[bsz, n_latents, embed_dim]` representing attention over latents w/ cross
            from context.
        N   )dimc                     g | ]C}|                     |j        d          j        j                                      d d          DS )r      )reshaper3   r   r   	transpose)r   x
batch_sizer   s     r   r   z5IdeficsPerceiverAttention.forward.<locals>.<listcomp>   sH    uuufg199ZT\4=QQ[[\]_`aauuur    z... i d, ... j d -> ... i jT)rO   keepdimz... i j, ... j d -> ... i dr   rQ   )rB   rC   r3   rH   rI   r&   catrJ   r   rD   rE   einsumrF   amaxdetachsoftmaxrK   rS   flatten)r   r0   r(   
seq_lengthr
   qkvscoresstabilized_scoresr4   	resampledrU   s   `           @r   r6   z!IdeficsPerceiverAttention.forward   s    ))'22))'22,3M"1",=)
J	 KK  KK	7G"4"===>>KK	7G"4"===>>
 vuuuulmoprsktuuu1a 	%!!!$$A!!!$$A;Q=NPQRR"fkkb$k&G&G&N&N&P&PQ ((R(00 L!>aHH		 3 3Aq 9 9 A A" E EFFFr    )
r7   r8   r9   r:   boolr"   r&   r;   r6   r<   r=   s   @r   r   r   l   s        Z# Z Zs ZTX Z]a Z Z Z Z Z Z*(Gu| (Gel (Gu| (G (G (G (G (G (G (G (Gr    r   c                   `     e Zd Zdef fdZdeeej                          dej        fdZ	 xZ
S )r   r	   c                 Z   t                                                       |j        j        | _        t	          j        | j                  | _        t	          j        | j        |d          | _        t	          j	                    | _
        t	          j        || j        d          | _        dS )z:Simple MLP block with intermediate_size and embedding sizeFr@   N)r!   r"   r*   r
   r   r-   lnrG   fcReLUactc_proj)r   intermediate_sizer	   r/   s      r   r"   zIdeficsMLP.__init__   s    -7,t~..)DN,=EJJJ799i 14>NNNr    hidden_statesr   c                     |                      |          }|                     |          }|                     |          }|                     |          }|S )N)rh   ri   rk   rl   )r   rn   s     r   r6   zIdeficsMLP.forward   sL    ....//M22r    )r7   r8   r9   r   r"   r   r   r&   FloatTensorr6   r<   r=   s   @r   r   r      s}        O- O O O O O OXeE4E.F%G EL]        r    r   )__doc__typingr   r   r&   torch.nnr   configuration_ideficsr   Moduler   r   r   r   r    r   <module>rv      s   4  # " " " " " " "        0 0 0 0 0 09( 9( 9( 9( 9(	 9( 9( 9(x>G >G >G >G >G	 >G >G >GB         r    