
    Ng)                         d Z ddlmZ ddlZddlmZ ddlmc mZ ddlm	Z	 ddl
mZ ddlmZ d	ee         fd
Z G d dej                  Z G d dej                  ZdS )a]   Halo Self Attention

Paper: `Scaling Local Self-Attention for Parameter Efficient Visual Backbones`
    - https://arxiv.org/abs/2103.12731

@misc{2103.12731,
Author = {Ashish Vaswani and Prajit Ramachandran and Aravind Srinivas and Niki Parmar and Blake Hechtman and
    Jonathon Shlens},
Title = {Scaling Local Self-Attention for Parameter Efficient Visual Backbones},
Year = {2021},
}

Status:
This impl is a WIP, there is no official ref impl and some details in paper weren't clear to me.
The attention mechanism works but it's slow as implemented.

Hacked together by / Copyright 2021 Ross Wightman
    )ListN)nn   )make_divisible)trunc_normal_)_assertpermute_maskc                    | j         \  }}}}|j         d         }|dz   dz  }| |                    dd          z  }	|	                    d||          }	t          j        |	ddg                              d          }
t          j        |
d||z
  g          }
|
                    d|dz   |          }
|
ddd||dz
  df         }	|	                    ||d||                              dd|dd          }	|	                    |          S )a~   Compute relative logits along one dimension

    As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2
    Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925

    Args:
        q: (batch, height, width, dim)
        rel_k: (2 * window - 1, dim)
        permute_mask: permute output dim according to this
    r   r      N)shape	transposereshapeFpadflattenexpandpermute)qrel_kr	   BHWdimrel_sizewin_sizexx_pads              Q/var/www/html/ai-engine/env/lib/python3.11/site-packages/timm/layers/halo_attn.pyrel_logits_1dr!      s    7LAq!S{1~H1"H	
U__R$$	$A			"a""A E!aV$$Q''EE%!X\*++E MM"a!eX..Eaaa!X\]]"#A 	
		!Q1h''..r2xRHHA99\"""    c                   (     e Zd ZdZ fdZd Z xZS )PosEmbedRelz Relative Position Embedding
    As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2
    Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925

    c                 8   t                                                       || _        || _        t	          j        t          j        |dz  dz
  |          |z            | _        t	          j        t          j        |dz  dz
  |          |z            | _	        dS )z
        Args:
            block_size (int): block size
            win_size (int): neighbourhood window size
            dim_head (int): attention head dim
            scale (float): scale factor (for init)
        r   r   N)
super__init__
block_sizedim_headr   	Parametertorchrandn
height_rel	width_rel)selfr(   r   r)   scale	__class__s        r    r'   zPosEmbedRel.__init__C   s     	$ ,u{8a<!3CX'N'NQV'VWWek(Q,2BH&M&MPU&UVVr"   c                 .   |j         \  }}}}|                    d| j        | j        | j                  }t	          || j        d          }|                    dd          }t	          || j        d          }||z   }|                    |||d          }|S )Nr   )r   r      r      )r	   r   r   )r   r3   r   r4   r   )r   r   r(   r)   r!   r.   r   r-   )	r/   r   r   BBHW_rel_logits_wrel_logits_h
rel_logitss	            r    forwardzPosEmbedRel.forwardQ   s    w2r1 IIb$/4?DMJJ$Q_UUU KK1$QoVVV!L0
''2r266
r"   )__name__
__module____qualname____doc__r'   r;   __classcell__r1   s   @r    r$   r$   =   sV         
W W W W W      r"   r$   c                   4     e Zd ZdZ	 	 d fd	Zd	 Zd
 Z xZS )HaloAttna   Halo Attention

    Paper: `Scaling Local Self-Attention for Parameter Efficient Visual Backbones`
        - https://arxiv.org/abs/2103.12731

    The internal dimensions of the attention module are controlled by the interaction of several arguments.
      * the output dimension of the module is specified by dim_out, which falls back to input dim if not set
      * the value (v) dimension is set to dim_out // num_heads, the v projection determines the output dim
      * the query and key (qk) dimensions are determined by
        * num_heads * dim_head if dim_head is not None
        * num_heads * (dim_out * attn_ratio // num_heads) if dim_head is None
      * as seen above, attn_ratio determines the ratio of q and k relative to the output if dim_head not used

    Args:
        dim (int): input dimension to the module
        dim_out (int): output dimension of the module, same as dim if not set
        feat_size (Tuple[int, int]): size of input feature_map (not used, for arg compat with bottle/lambda)
        stride: output stride of the module, query downscaled if > 1 (default: 1).
        num_heads: parallel attention heads (default: 8).
        dim_head: dimension of query and key heads, calculated from dim_out * attn_ratio // num_heads if not set
        block_size (int): size of blocks. (default: 8)
        halo_size (int): size of halo overlap. (default: 3)
        qk_ratio (float): ratio of q and k dimensions to output dimension when dim_head not set. (default: 1.0)
        qkv_bias (bool) : add bias to q, k, and v projections
        avg_down (bool): use average pool downsample instead of strided query blocks
        scale_pos_embed (bool): scale the position embedding as well as Q @ K
    Nr      r3         ?Fc                 z   t                                                       |p|}||z  dk    sJ |dv sJ || _        |pt          ||	z  d          |z  | _        || j        z  | _        || j        z  | _        || j        z  | _        | j        dz  | _        || _	        |x| _
        | _        || _        ||dz  z   | _        d| _        d}|dk    r*|p||z  dk    }|rdn|| _        | j
        | j        z  | _        t          j        || j        d| j        |
	          | _        t          j        || j        | j        z   d|

          | _        t'          | j        | j        | j        | j                  | _        |rt          j        dd          nt          j                    | _        |                                  d S )Nr   )r   r   rD   )divisor      r   r   F)stridebias)rJ   )r(   r   r)   r0   )r&   r'   	num_headsr   dim_head_qk
dim_head_v
dim_out_qk	dim_out_vr0   scale_pos_embedr(   block_size_ds	halo_sizer   block_strider   Conv2dr   kvr$   	pos_embed	AvgPool2dIdentitypoolreset_parameters)r/   r   dim_out	feat_sizerI   rK   r)   r(   rR   qk_ratioqkv_biasavg_downrP   use_avg_poolr1   s                 r    r'   zHaloAttn.__init__}   s    	.S"a''''"#a~g6HRS'T'T'TXa'a!T^3#d&66"T_4%-
./99$,""Y]2A::#?zF':a'?L%1 =vD!%D4E!ED
 34;LS[\\\)C4>!A18TTT$)DMDL\dhdnp p p +7IBLA&&&BKMM	r"   c                 &   | j         j        j        d         dz  }t          | j         j        |           t          | j        j        |           t          | j        j        | j                   t          | j        j        | j                   d S )Nr   rH   )std)	r   weightr   r   rU   rV   r-   r0   r.   )r/   rb   s     r    rZ   zHaloAttn.reset_parameters   s    fm!!$,dfm----dgn#....dn/TZ@@@@dn.DJ??????r"   c                    |j         \  }}}}t          || j        z  dk    d           t          || j        z  dk    d           || j        z  }|| j        z  }||z  }|                     |          }	|	                    d| j        || j        || j                                      dddddd          }	|	                    || j        z  | j        d|          	                    dd          }	| 
                    |          }
t          j        |
| j        | j        | j        | j        g          }
|
                    d| j        | j                                      d| j        | j                                      || j        z  | j        | j        z   |d                              dddd          }
t#          j        |
| j        | j        gd	          \  }}| j        r8|	|	                    dd
          z  |                     |	          z   | j        z  }n7|	|	                    dd
          z  | j        z  |                     |	          z   }|                    d	          }||z  	                    dd          }|                    d| j        | j        ||          }|                    ddddd                                                              || j        || j        z  || j        z            }|                     |          }|S )Nr    r   r   r3      r   r4   )r   r   )r   r   r(   r   r   rL   rQ   r   rK   r   rU   r   r   rR   unfoldr   rM   r+   splitrP   rV   r0   softmax
contiguousviewrO   rS   rY   )r/   r   r   Cr   r   num_h_blocksnum_w_blocks
num_blocksr   rU   kvattnouts                  r    r;   zHaloAttn.forward   s   W
1aDO#q("---DO#q("---DO+DO+!L0
FF1IIII $,lD<NP PPWPWXY[\^_abdeghPiPi 	
 IIa$.($*:B
KKUUVWYZ[[ WWQZZ U2WXXYYq$-99@@DMSWSbcckk 04? BJPRT TT[T[\]_`bcefTgTg 	{2 0$/BKKK1  	NB+++dnnQ.?.??4:MDDB+++tz9DNN1<M<MMD|||##ax""1a((kk"d0$2DlT`aakk!Q1a((3355::t~qD$55qD<M7MO O iinn
r"   )NNr   rD   NrD   r3   rE   FFF)r<   r=   r>   r?   r'   rZ   r;   r@   rA   s   @r    rC   rC   a   ss         8 tuJO#  #  #  #  #  # J@ @ @) ) ) ) ) ) )r"   rC   )r?   typingr   r+   r   torch.nn.functional
functionalr   helpersr   weight_initr   trace_utilsr   intr!   Moduler$   rC    r"   r    <module>r}      s   $                       # # # # # # & & & & & &            #$s) # # # #>! ! ! ! !") ! ! !Hq q q q qry q q qh r"   