
    Ng                         d Z ddlmZ ddlZddlmZ ddlmc mZ ddl	m
Z
mZ ddlmZ ddlmZ dee         fd	Z G d
 dej                  Z G d dej                  ZdS )aP   Bottleneck Self Attention (Bottleneck Transformers)

Paper: `Bottleneck Transformers for Visual Recognition` - https://arxiv.org/abs/2101.11605

@misc{2101.11605,
Author = {Aravind Srinivas and Tsung-Yi Lin and Niki Parmar and Jonathon Shlens and Pieter Abbeel and Ashish Vaswani},
Title = {Bottleneck Transformers for Visual Recognition},
Year = {2021},
}

Based on ref gist at: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2

This impl is a WIP but given that it is based on the ref gist likely not too far off.

Hacked together by / Copyright 2021 Ross Wightman
    )ListN   )	to_2tuplemake_divisible)trunc_normal_)_assertpermute_maskc                     | j         \  }}}}| |                    dd          z  }|                    d|d|z  dz
            }t          j        |ddg                              d          }t          j        |d|dz
  g          }|                    d|dz   d|z  dz
            }|ddd||dz
  df         }|                    ||d||                              dd|dd          }|                    |          S )a   Compute relative logits along one dimension

    As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2
    Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925

    Args:
        q: (batch, heads, height, width, dim)
        rel_k: (2 * width - 1, dim)
        permute_mask: permute output dim according to this
       r   r   N)shape	transposereshapeFpadflattenexpandpermute)	qrel_kr	   BHWdimxx_pads	            W/var/www/html/ai-engine/env/lib/python3.11/site-packages/timm/layers/bottleneck_attn.pyrel_logits_1dr      s    7LAq!S	
U__R$$	$A			"aQ""A E!aV$$Q''EE%!QU$$E MM"a!eQUQY//Eaaa!QUVVmA 	
		!Q1a  ''B2r::A99\"""    c                   (     e Zd ZdZ fdZd Z xZS )PosEmbedRelz Relative Position Embedding
    As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2
    Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925
    c                 v   t                                                       t          |          \  | _        | _        || _        t          j        t          j	        | j        dz  dz
  |          |z            | _
        t          j        t          j	        | j        dz  dz
  |          |z            | _        d S )Nr   r   )super__init__r   heightwidthdim_headnn	Parametertorchrandn
height_rel	width_rel)self	feat_sizer(   scale	__class__s       r   r%   zPosEmbedRel.__init__=   s    "+I"6"6TZ ,u{4;?Q3F'Q'QTY'YZZek$*q.12Dh&O&ORW&WXXr    c                     |j         \  }}}|                    || j        | j        d          }t	          || j        d          }|                    dd          }t	          || j        d          }||z   }|                    |||          }|S )Nr   )r   r      r      )r	   r   r   )r   r4   r   r5   r   )r   r   r&   r'   r   r.   r   r-   )r/   r   r   HW_rel_logits_wrel_logits_h
rel_logitss           r   forwardzPosEmbedRel.forwardD   s    72q IIadj"55$Q_UUU KK1$QoVVV!L0
''2r22
r    )__name__
__module____qualname____doc__r%   r;   __classcell__r2   s   @r   r"   r"   8   sV         Y Y Y Y Y      r    r"   c                   4     e Zd ZdZ	 	 d
 fd	Zd Zd	 Z xZS )BottleneckAttna   Bottleneck Attention
    Paper: `Bottleneck Transformers for Visual Recognition` - https://arxiv.org/abs/2101.11605

    The internal dimensions of the attention module are controlled by the interaction of several arguments.
      * the output dimension of the module is specified by dim_out, which falls back to input dim if not set
      * the value (v) dimension is set to dim_out // num_heads, the v projection determines the output dim
      * the query and key (qk) dimensions are determined by
        * num_heads * dim_head if dim_head is not None
        * num_heads * (dim_out * attn_ratio // num_heads) if dim_head is None
      * as seen above, attn_ratio determines the ratio of q and k relative to the output if dim_head not used

    Args:
        dim (int): input dimension to the module
        dim_out (int): output dimension of the module, same as dim if not set
        stride (int): output stride of the module, avg pool used if stride == 2 (default: 1).
        num_heads (int): parallel attention heads (default: 4)
        dim_head (int): dimension of query and key heads, calculated from dim_out * attn_ratio // num_heads if not set
        qk_ratio (float): ratio of q and k dimensions to output dimension when dim_head not set. (default: 1.0)
        qkv_bias (bool): add bias to q, k, and v projections
        scale_pos_embed (bool): scale the position embedding as well as Q @ K
    Nr   r5         ?Fc
                 x   t                                                       |
J d            |p|}||z  dk    sJ || _        |pt          ||z  d          |z  | _        || j        z  | _        || j        z  | _        || j        z  | _        | j        dz  | _        |	| _	        t          j        || j        dz  | j        z   d|          | _        t          || j        | j        	          | _        |dk    rt          j        dd          nt          j                    | _        |                                  d S )
NzBA concrete feature size matching expected input (H, W) is requiredr      )divisor      r   r   )bias)r(   r1   )r$   r%   	num_headsr   dim_head_qk
dim_head_v
dim_out_qk	dim_out_vr1   scale_pos_embedr)   Conv2dqkvr"   	pos_embed	AvgPool2dIdentitypoolreset_parameters)r/   r   dim_outr0   striderJ   r(   qk_ratioqkv_biasrO   r2   s             r   r%   zBottleneckAttn.__init__j   sD    	$$&j$$$.S"a''''"#a~g6HRS'T'T'TXa'a!T^3#d&66"T_4%-
.9S$/A"5"FPXYYY %Y9IQUQ[\\\*0A++BLA&&&2;==	r    c                     t          | j        j        | j        j        j        d         dz             t          | j        j        | j                   t          | j        j        | j                   d S )Nr   rH   )std)r   rQ   weightr   rR   r-   r1   r.   )r/   s    r   rV   zBottleneckAttn.reset_parameters   sd    dho48?+@+Ct+KLLLLdn/TZ@@@@dn.DJ??????r    c                    |j         \  }}}}t          || j        j        k    d           t          || j        j        k    d           |                     |          }t          j        || j        | j        | j	        gd          \  }}}|
                    || j        z  | j        d                              dd          }|
                    || j        z  | j        d          }|
                    || j        z  | j        d                              dd          }| j        r$||z  |                     |          z   | j        z  }	n#||z  | j        z  |                     |          z   }	|	                    d          }	|	|z                      dd          
                    || j	        ||          }
|                     |
          }
|
S )N r   )r   r   r   )r   r   rR   r&   r'   rQ   r+   splitrM   rN   r   rJ   rK   r   rL   rO   r1   softmaxrU   )r/   r   r   Cr   r   r   kvattnouts              r   r;   zBottleneckAttn.forward   s   W
1aT^**B///T^))2...HHQKK +a$/4?DN!SYZ[[[1aIIa$.($*:B??II"bQQIIa$.($*:B??IIa$.($/2>>HHRPP 	<EDNN1---;DDETZ'$..*;*;;D|||##ax""2r**221dnaKKiinn
r    )NNr   r5   NrD   FF)r<   r=   r>   r?   r%   rV   r;   r@   rA   s   @r   rC   rC   T   sr         , VZ:?           0@ @ @
      r    rC   )r?   typingr   r+   torch.nnr)   torch.nn.functional
functionalr   helpersr   r   weight_initr   trace_utilsr   intr   Moduler"   rC    r    r   <module>rq      s                            . . . . . . . . & & & & & &            #$s) # # # #8    ")   8I I I I IRY I I I I Ir    