
    g'                         d Z ddlmZmZ ddlZddlmZ ddlm	Z	  G d d	ej
        j        j                  Z G d
 dej
        j        j                  Z G d dej
        j        j                  ZdS )a  

Generic interface to various configurations of the Perceiver Resampler, that simply takes in a series of (potentially
time-indexed) contextual embeddings, and "resamples" (compresses) them down to a pre-specified number of latents! Note
that the Perceiver in general resamples based solely off the *long-range* context; there's a nice opportunity here to
prime the Perceiver Resampler with say a single layer's worth of language embeddings (the target domain), and use that
to softly "retrieve & compress" what we need --> this would be a novel contribution we should explore.

References:
    - DeepMind's Flamingo: https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model
    - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch

"""

from typing import Optional, Tuple

import tensorflow as tf

from ...modeling_tf_utils import shape_list
from .configuration_idefics import IdeficsConfig
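
# Orientation note (not from the original source): TFIdeficsPerceiverResampler holds `depth`
# pairs of (TFIdeficsPerceiverAttention, TFIdeficsMLP) blocks. A [bsz, seq, embed_dim] context
# tensor goes in; a fixed set of learned latents cross-attends to it; a compressed
# [bsz, n_latents, embed_dim] tensor comes out.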


class TFIdeficsPerceiverResampler(tf.keras.layers.Layer):
    def __init__(
        self, config: IdeficsConfig, embed_dim: int, depth: int, n_heads: int, head_dim: int, n_latents: int, **kwargs
    ) -> None:
        """
        Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
        MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then
        returns a Tensor of shape [bsz, n_latents, embed_dim]. `embed_dim` is the dimensionality of the embeddings
        fed to the Perceiver Resampler (and also of the latent embeddings it *returns*); it could be, e.g., the ViT
        embed_dim, the ResNet pool dim, and so on.

        Args:
            config (`IdeficsConfig`): config object
            embed_dim (`int`): The size of each embedding vector
            depth (`int`): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
            n_heads (`int`): Number of heads in each Transformer block (for multi-headed self-attention).
            head_dim (`int`): Dimensionality of each head projection in the Transformer block.
            n_latents (`int`):
                Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).

        """
        super().__init__(**kwargs)
        self.embed_dim, self.n_heads, self.head_dim, self.n_latents = embed_dim, n_heads, head_dim, n_latents
        self.qk_layer_norms = config.perceiver_config.qk_layer_norms_perceiver

        # Create the Transformer blocks: `depth` pairs of (cross-attention, MLP), applied with residual connections.
        self.intermediate_dim = (
            self.embed_dim * 4
            if not hasattr(config.vision_config, "embed_dim")
            else config.vision_config.embed_dim * 4
        )
        self.blocks = []
        for i in range(depth):
            self.blocks.append(
                [
                    TFIdeficsPerceiverAttention(
                        self.embed_dim, self.n_heads, self.head_dim, self.qk_layer_norms, name=f"blocks.{i}.0"
                    ),
                    TFIdeficsMLP(self.intermediate_dim, config, name=f"blocks.{i}.1"),
                ]
            )

        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")

    def build(self, input_shape):
        # The learned latent queries, shared across the batch and tiled at call time.
        self.latents = self.add_weight(
            shape=(self.n_latents, self.embed_dim), initializer="random_normal", trainable=True, name="latents"
        )
        super().build(input_shape)

    def call(self, context: tf.Tensor) -> tf.Tensor:
        """Resample arbitrary length context & *compress* down to `self.n_latents` latent embeddings."""
        # Tile the latents across the batch: [n_latents, embed_dim] -> [bsz, n_latents, embed_dim]
        latents = tf.expand_dims(self.latents, axis=0)
        latents = tf.tile(latents, [tf.shape(context)[0], 1, 1])

        # Feed through the Perceiver attention blocks, with residual connections around attention and MLP.
        for attn, ff in self.blocks:
            latents = attn(context, latents) + latents
            latents = ff(latents) + latents

        return self.layer_norm(latents)
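

# A hedged usage sketch (not part of the original file): in the full model this resampler is
# built by the surrounding Idefics vision pipeline rather than by hand; the
# `perceiver_config.resampler_*` attribute names and the `IdeficsConfig()` defaults below are
# assumptions, and the shapes are illustrative:
#
#     config = IdeficsConfig()
#     vision_dim = config.vision_config.embed_dim
#     resampler = TFIdeficsPerceiverResampler(
#         config,
#         embed_dim=vision_dim,
#         depth=config.perceiver_config.resampler_depth,
#         n_heads=config.perceiver_config.resampler_n_heads,
#         head_dim=config.perceiver_config.resampler_head_dim,
#         n_latents=config.perceiver_config.resampler_n_latents,
#     )
#     image_features = tf.random.normal((1, 257, vision_dim))  # e.g. a ViT patch sequence
#     resampled = resampler(image_features)                    # [1, n_latents, vision_dim]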


class TFIdeficsPerceiverAttention(tf.keras.layers.Layer):
    def __init__(self, embed_dim: int, n_heads: int, head_dim: int, qk_layer_norms: bool, **kwargs) -> None:
        """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
        super().__init__(**kwargs)
        self.embed_dim, self.n_heads, self.head_dim = embed_dim, n_heads, head_dim
        self.qk_layer_norms = qk_layer_norms

        # Normalization & scaling
        self.context_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="context_layer_norm")
        self.latents_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="latents_layer_norm")
        if self.qk_layer_norms:
            self.q_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="q_layer_norm")
            self.k_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="k_layer_norm")

        self.qk_scale = self.head_dim**-0.5

        # Q, K, V projections (no bias -- a detail from the Perceiver/Flamingo papers).
        self.q_proj = tf.keras.layers.Dense(self.n_heads * self.head_dim, use_bias=False, name="q_proj")
        self.k_proj = tf.keras.layers.Dense(self.n_heads * self.head_dim, use_bias=False, name="k_proj")
        self.v_proj = tf.keras.layers.Dense(self.n_heads * self.head_dim, use_bias=False, name="v_proj")

        self.output_proj = tf.keras.layers.Dense(embed_dim, use_bias=False, name="output_proj")

    def call(self, context: tf.Tensor, latents: tf.Tensor) -> tf.Tensor:
        """
        Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!

        Args:
            context (`tf.Tensor`):
                Tensor of shape `[bsz, seq, embed_dim]` representing long-form context to resample.
            latents (`tf.Tensor`):
                Tensor of shape `[bsz, n_latents, embed_dim]` representing fixed length latents to compress to.

        Returns:
            `tf.Tensor`: Tensor of shape `[bsz, n_latents, embed_dim]` representing attention over latents w/ cross
            from context.
        """
        context = self.context_layer_norm(context)
        latents = self.latents_layer_norm(latents)
        batch_size, seq_length, embed_dim = shape_list(context)

        # Query, Key, Value projections --> note that in Flamingo, latents are *concatenated* with context prior to
        # attention! This yields queries w/ `seq = n_latents` and keys/values w/ `seq = len(context) + n_latents`.
        q = self.q_proj(latents)
        k = self.k_proj(tf.concat([context, latents], axis=-2))
        v = self.v_proj(tf.concat([context, latents], axis=-2))

        # Split heads: [bsz, seq, n_heads * head_dim] -> [bsz, n_heads, seq, head_dim]
        q, k, v = [
            tf.transpose(tf.reshape(x, (batch_size, -1, self.n_heads, self.head_dim)), perm=[0, 2, 1, 3])
            for x in (q, k, v)
        ]

        if self.qk_layer_norms:
            q = self.q_layer_norm(q)
            k = self.k_layer_norm(k)

        # Multiheaded attention w/ a numerically stable softmax (subtract the per-row max before the softmax call).
        scores = tf.einsum("... i d, ... j d -> ... i j", q * self.qk_scale, k)
        stabilized_scores = scores - tf.reduce_max(scores, axis=-1, keepdims=True)
        attn = tf.nn.softmax(stabilized_scores, axis=-1)

        # Attend, merge the heads back together, and project back to `embed_dim`.
        resampled = tf.einsum("... i j, ... j d -> ... i d", attn, v)
        return self.output_proj(
            tf.reshape(tf.transpose(resampled, perm=[0, 2, 1, 3]), (batch_size, -1, self.n_heads * self.head_dim))
        )


class TFIdeficsMLP(tf.keras.layers.Layer):
    def __init__(self, intermediate_size, config: IdeficsConfig, **kwargs):
        """Simple MLP block with intermediate_size and embedding size"""
        super().__init__(**kwargs)
        self.embed_dim = config.vision_config.embed_dim
        self.ln = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="ln")
        self.fc = tf.keras.layers.Dense(intermediate_size, use_bias=False, name="fc")
        self.act = tf.keras.layers.ReLU(name="act")
        self.c_proj = tf.keras.layers.Dense(self.embed_dim, use_bias=False, name="c_proj")

    def call(self, hidden_states: Optional[Tuple[tf.Tensor]]) -> tf.Tensor:
        hidden_states = self.ln(hidden_states)
        hidden_states = self.fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)

        return hidden_states
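
# A small block sketch for TFIdeficsMLP (hypothetical sizes, mirroring the residual use in
# TFIdeficsPerceiverResampler.call; assumes config.vision_config.embed_dim == 768):
#
#     mlp = TFIdeficsMLP(intermediate_size=4 * 768, config=config)
#     x = tf.random.normal((2, 64, 768))
#     y = mlp(x) + x    # LayerNorm -> Dense(3072) -> ReLU -> Dense(768), plus residual: [2, 64, 768]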