
    Ng1                         d dl mZmZmZ d dlZd dlmZ d dlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZ  G d
 dej                  Z G d dej                  Z G d dej                  ZdS )    )ListOptionalUnionN)nn)
functional   )use_fused_attn)create_conv2d)	to_2tuple)create_pool2dc                        e Zd ZdZ	 	 	 	 	 	 ddedee         ded	ed
ededef fdZd Zddee	j
                 fdZ xZS )MultiQueryAttentionV2a  Multi Query Attention.

    Fast Transformer Decoding: One Write-Head is All You Need
    https://arxiv.org/pdf/1911.02150.pdf

    This is an acceletor optimized version - removing multiple unneccessary
    tensor transpose by re-arranging indices according to the following rules: 1)
    contracted indices are at the end, 2) other indices have the same order in the
    input and output tensores.

    Compared to V1, this gives 3x speed up.
    N   @           dimdim_out	num_headskey_dim	value_dim	attn_drop	proj_dropc                    t                                                       |p|}|| _        || _        || _        |dz  | _        t          j        t          j	        | j        | j        |g                    | _
        t          j        t          j	        || j        g                    | _        t          j        t          j	        || j        g                    | _        t          j        |          | _        t          j        t          j	        || j        | j        g                    | _        t          j        |          | _        dS )zInitializer.      N)super__init__r   r   r   scaler   	Parametertorchrandn
query_projkey_proj
value_projDropoutr   out_projr   )	selfr   r   r   r   r   r   r   	__class__s	           S/var/www/html/ai-engine/env/lib/python3.11/site-packages/timm/layers/attention2d.pyr   zMultiQueryAttentionV2.__init__   s     	.S""_
,u{DNDLRU3V'W'WXXU[#t|1D%E%EFF,u{C3H'I'IJJI..U['4>4>1Z%[%[\\I..    c                 ~    |j         }|                    |d         |d         d                              dd          S )zBReshapes a tensor to three dimensions, keeping the first and last.r   r      )shapereshape	transposer&   tss      r(   _reshape_inputz$MultiQueryAttentionV2._reshape_input4   s9    G yy1qtR((221a888r)   mc                 F   |j         }|p|}|                     |          }|                     |          }t          j        d|| j                  }t          j        d|| j                  }t          j        d||          }|                    d          }|                     |          }t          j        d|| j                  }	t          j        d||	          }
t          j        d|
| j	                  }| 
                    |          }|                    |          S )	Run layer computation.zbnd,hkd->bnhkzbmd,dk->bmkzbnhk,bmk->bnhmr+   r   zbmd,dv->bmvzbnhm,bmv->bnhvzbnhv,dhv->bnd)r-   r3   r   einsumr!   r"   softmaxr   r#   r%   r   r.   )r&   xr4   r2   
reshaped_x
reshaped_mqkattnvoresults               r(   forwardzMultiQueryAttentionV2.forward<   s    GF((++
((++
L*doFFL
DMBB|,a33|||##~~d##L
DODDL)433oq$-@@''~~a   r)   )Nr   r   r   r   r   N)__name__
__module____qualname____doc__intr   floatr   r3   r   TensorrC   __classcell__r'   s   @r(   r   r      s           &*!!/ // c]/ 	/
 / / / / / / / / /29 9 9! !HU\2 ! ! ! ! ! ! ! !r)   r   c                   z    e Zd ZU dZej        j        e         ed<   ddddddddddde	j
        d	fd
edee         dedee         dee         dededededeeeee         f         dedede	j        def fdZd Zdej        fdZdej        dedefdZdej        dededefdZd"d eej                 fd!Z xZS )#MultiQueryAttention2da  Multi Query Attention with spatial downsampling.

     3 parameters are introduced for the spatial downsampling:
     1. kv_stride: downsampling factor on Key and Values only.
     2. query_strides: horizontal & vertical strides on Query only.

    This is an optimized version.
    1. Projections in Attention is explict written out as 1x1 Conv2D.
    2. Additional reshapes are introduced to bring a up to 3x speed up.
    
fused_attnNr   r       r   Fr   r   r   r   r   query_strides	kv_stridedw_kernel_sizedilationpaddingr   r   
norm_layeruse_biasc                 \   t                                                       |p|}|| _        |p||z  | _        |p||z  | _        t          |          | _        || _        t          d | j        D                       | _	        | j        dz  | _
        t                      | _        || _        t          j                    | _        | j	        r|
dk    r1| j                            dt%          d| j        d                     n.| j                            dt          j        |                     | j                            d ||                     | j                            d	t)          || j        | j        z  d
|                     t          j                    | _        |d
k    rS| j                            dt)          |||||	|
d                     | j                            d ||                     | j                            d	t)          || j        d
|
|                     t          j                    | _        |d
k    rS| j                            dt)          |||||	|
d                     | j                            d ||                     | j                            d	t)          || j        d
|                     t          j        |          | _        t          j                    | _        | j	        r5| j                            dt          j        | j        dd                     | j                            d	t)          | j        | j        z  |d
|                     | j                            dt          j        |                     d| _        dS )a{  Initializer.

        Args:
          num_heads: Number of attention heads.
          key_dim: Size of the attention key dimension.
          value_dim: Size of the attention value dimension.
          query_strides: Vertical stride size for query only.
          kv_stride: Key and value stride size.
          dw_kernel_size: Spatial dimension of the depthwise kernel.
        c                     g | ]}|d k    	S )r    ).0r2   s     r(   
<listcomp>z2MultiQueryAttention2d.__init__.<locals>.<listcomp>   s    %H%H%Ha!e%H%H%Hr)   r   same	down_poolavg)kernel_sizerW   )rb   normprojr   )rb   bias	down_convT)rb   striderV   rW   	depthwise)rb   rW   re   upsamplebilinearF)scale_factormodealign_cornersdropN)r   r   r   r   r   r   rS   rT   anyhas_query_stridesr   r	   rP   rn   r   
Sequentialquery
add_moduler   	AvgPool2dr
   keyvaluer$   r   outputUpsampler8   )r&   r   r   r   r   r   rS   rT   rU   rV   rW   r   r   rX   rY   r'   s                  r(   r   zMultiQueryAttention2d.__init___   s   6 	.S"2#"2"6cY&6&}55"!$%H%HT5G%H%H%H!I!I\T)
(**	]__
! 	;&  
%%k=$($6 &4 4 4     
%%k2<M3Z3Z3Z[[[J!!&**S//:::
fmNT\)	'
 '
 '
 	 	 	 =??q==H]* !. . .    H

3888FML%
 %
 %
 	 	 	 ]__
q==J!!+}* !0 0 0    J!!&**S//:::
fmN	'
 '
 '
 	 	 	 I..moo! 	CK"":r{HZak  |A  0B  0B  0B  C  C  Cv}NT^+	(
 (
 (
 	 	 	 	v
9(=(=>>>r)   c                 D   t           j                            | j        j        j                   t           j                            | j        j        j                   t           j                            | j        j        j                   | j        dk    r\t           j                            | j        j	        j                   t           j                            | j        j	        j                   t           j                            | j
        j        j                   d S )Nr   )r   initxavier_uniform_rr   rd   weightru   rv   rT   rf   rw   )r&   s    r(   init_weightsz"MultiQueryAttention2d.init_weights   s    

 6777
 4555

 6777>AG##DH$6$=>>>G##DJ$8$?@@@
 0 788888r)   r1   c                     |j         }|                    |d         |d         d                              dd          }| j        r|S |                    d                                          S )zFReshapes a tensor to three dimensions, keeping the batch and channels.r   r   r+   r,   )r-   r.   r/   r8   	unsqueeze
contiguousr0   s      r(   r3   z$MultiQueryAttention2d._reshape_input   sa    GIIadAaD"%%//155; 	/H;;q>>,,...r)   c                     |j         }|                    |d         ||d          }| j        r*|                    dddd                                          S |                    dd                                          S )z?Reshapes projected query: [b, n, n, h x k] -> [b, n x n, h, k].r   r+   rQ   r   r,   )r-   r.   r8   permuter   r/   )r&   r1   r   r   r2   s        r(   _reshape_projected_queryz.MultiQueryAttention2d._reshape_projected_query   sr    GIIadIw33; 	499Q1a((33555;;r2&&11333r)   h_pxw_pxc                     |j         }|d         |z  }| j        s|                    dd          }|                    |d         |||                              dddd                                          S )z2Reshape output:[b, n x n x h, k] -> [b, n, n, hk].r+   r   r,   r   rQ   )r-   r8   r/   r.   r   r   )r&   r1   r   r   r   r2   feat_dims          r(   _reshape_outputz%MultiQueryAttention2d._reshape_output   sp    GR59${ 	"Aq!!Ayy1tT844<<Q1aHHSSUUUr)   	attn_maskc                    |j         x\  }}}}}|                     |          }|                     || j        | j                  }|                     |          }	|                     |	          }	|                     |          }
|                     |
          }
| j        rgt          j        d||	          | j
        z  }|||z   }|                    d          }|                     |          }t          j        d||
          }n| j        r-t          j        ||	|
|| j        r| j        j        nd          }nZ|| j
        z  }||	                    dd          z  }|||z   }|                    d          }|                     |          }||
z  }|                     || j        || j        d	         z  || j        d
         z            }|                     |          }|S )r6   zblhk,bpk->blhpNr+   r7   zblhp,bpk->blhkr   r   	dropout_pr   r   r   )r-   rr   r   r   r   ru   r3   rv   r8   r   r   r9   r   rP   Fscaled_dot_product_attentiontrainingpr/   r   rS   rw   )r&   r:   r   BCHWr2   r=   r>   r@   r?   rA   s                r(   rC   zMultiQueryAttention2d.forward   s    
1aQJJqMM))!T^T\JJHHQKK""JJqMM""
 ; 	< 0!Q77$*DD$i'<<B<''D>>$''D-tQ77AA 2q!'26-Gdn..R   
N1;;r2...()+D|||++~~d++1H   DNA9KA9N4NPQUYUghiUjPjkkKKNNr)   rD   )rE   rF   rG   rH   r   jitFinalbool__annotations__r   BatchNorm2drI   r   r   strr   rJ   Moduler   r}   rK   r3   r   r   rC   rL   rM   s   @r(   rO   rO   R   s        	 	 	%%%%
 &*%)'+!""#24!!$&N"o oo c]o 	o
 c]o  }o o o  o o 3T#Y./o o o 	o o o o o o ob9 9 9/ / / / /4%, 43 4QT 4 4 4 4V V# VS VPS V V V V/ /HU\$: / / / / / / / /r)   rO   c                        e Zd ZU ej        j        e         ed<   	 	 	 	 	 	 	 	 ddede	e         d	ed
ededede
de
f fdZdde	ej                 fdZ xZS )Attention2drP   N    TFr   r   r   r   re   expand_first
head_firstr   r   c	                    t                                                       |p|}|r|n|}	|| _        |	|z  | _        || _        |dz  | _        t                      | _        t          j	        ||	dz  d|          | _
        t          j        |          | _        t          j	        |	|d|          | _        t          j        |          | _        d S )Nr   rQ   r   )re   )r   r   r   dim_headr   r   r	   rP   r   Conv2dqkvr$   r   rd   r   )r&   r   r   r   re   r   r   r   r   dim_attnr'   s             r(   r   zAttention2d.__init__*  s     	.S*377" I-$$&
(**9S(Q,===I..Ih>>>	I..r)   r   c                    |j         \  }}}}| j        rR|                     |                              || j        | j        dz  d                              dd          \  }}}	nM|                     |                              |d| j        | j        d                              d          \  }}}	| j	        rt          j        j                            |                    dd                                          |                    dd                                          |	                    dd                                          || j        r| j        j        nd                              dd                              |d||          }n|| j        z  }|                    dd          |z  }
||
|z   }
|
                    d          }
|                     |
          }
|	|
                    dd          z                      |d||          }|                     |          }|                     |          }|S )	NrQ   r+   r,   r7   r   r   r   r   )r-   r   r   viewr   r   chunkr.   unbindrP   r   r   r   r   r/   r   r   r   r   r   r9   rd   r   )r&   r:   r   r   r   r   r   r=   r>   r@   r?   s              r(   rC   zAttention2d.forwardC  s   W
1a? 	]hhqkk&&q$.$-!:KRPPVVWX^_V``GAq!!hhqkk))!QrRRYYZ[\\GAq!? 	?#@@B##..00B##..00B##..00#.2mC$.** A   iB2q! 4 4 A DJA;;r2&&*D$i'<<B<''D>>$''DT^^B+++11!RA>>AIIaLLNN1r)   )Nr   TFFr   r   rD   )rE   rF   rG   r   r   r   r   r   rI   r   rJ   r   rK   rC   rL   rM   s   @r(   r   r   &  s         	%%%%3 &*!&$!!/ // c]/ 	/
 / / / / / / / / / /2 HU\$:        r)   r   )typingr   r   r   r   r   torch.nnr   r   configr	   r
   helpersr   pool2d_samer   r   r   rO   r   r\   r)   r(   <module>r      s8   ( ( ( ( ( ( ( ( ( (        $ $ $ $ $ $ " " " " " " ( ( ( ( ( (       & & & & & &B! B! B! B! B!BI B! B! B!JQ Q Q Q QBI Q Q Qh9 9 9 9 9") 9 9 9 9 9r)   