
    Ng2T              
          d Z ddlZddlZddlZddlmZ ddlZddlm	c m
Z ddlm	Z	 ddlmZmZ ddlmZmZmZmZmZmZ ddlmZmZmZmZmZ dd	lmZ dd
lmZ ddlm Z m!Z! ddl"m#Z#m$Z$m%Z% dgZ& ej'        e(          Z) G d de	j*                  Z+ G d de	j*                  Z, G d de	j*                  Z-de.fdZ/ede.fd            Z0 G d de	j*                  Z1 G d de	j*                  Z2d3de	j*        de3de4fdZ5d  Z6d! Z7d4d#Z8d5d$Z9 e$ e9             e9             e9             e9d%&           e9d%&           e9d%&          d'          Z:e#d4d(e2fd)            Z;e#d4d(e2fd*            Z<e#d4d(e2fd+            Z=e#d4d(e2fd,            Z>e#d4d(e2fd-            Z?e#d4d(e2fd.            Z@ e%e(d/d0d1d2           dS )6a   Nested Transformer (NesT) in PyTorch

A PyTorch implement of Aggregating Nested Transformers as described in:

'Aggregating Nested Transformers'
    - https://arxiv.org/abs/2105.12723

The official Jax code is released and available at https://github.com/google-research/nested-transformer. The weights
have been converted with convert/convert_nest_flax.py

Acknowledgments:
* The paper authors for sharing their research, code, and model weights
* Ross Wightman's existing code off which I based this

Copyright 2021 Alexander Soare
    N)partial)nnIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)
PatchEmbedMlpDropPathcreate_classifiertrunc_normal__assert)create_conv2dcreate_pool2d	to_ntupleuse_fused_attn	LayerNorm   )build_model_with_cfg)register_notrace_function)checkpoint_seqnamed_apply)register_modelgenerate_default_cfgsregister_model_deprecationsNestc                   V     e Zd ZU dZej        j        e         ed<   d fd	Z	d Z
 xZS )		Attentionz
    This is much like `.vision_transformer.Attention` but uses *localised* self attention by accepting an input with
     an extra "image block" dim
    
fused_attn   F        c                 p   t                                                       || _        ||z  }|dz  | _        t	                      | _        t          j        |d|z  |          | _        t          j	        |          | _
        t          j        ||          | _        t          j	        |          | _        d S )Ng         )bias)super__init__	num_headsscaler   r   r   LinearqkvDropout	attn_dropproj	proj_drop)selfdimr&   qkv_biasr+   r-   head_dim	__class__s          L/var/www/html/ai-engine/env/lib/python3.11/site-packages/timm/models/nest.pyr%   zAttention.__init__/   s    ")#%
(**9S!C%h777I..Ic3''	I..    c           	         |j         \  }}}}|                     |                              |||d| j        || j        z                                dddddd          }|                    d          \  }}}	| j        r,t          j        |||	| j	        r| j
        j        nd          }nS|| j        z  }||                    d	d
          z  }
|
                    d
          }
| 
                    |
          }
|
|	z  }|                    ddddd                              ||||          }|                     |          }|                     |          }|S )zm
        x is shape: B (batch_size), T (image blocks), N (seq length per image block), C (embed dim)
        r"   r      r         r    )	dropout_p)r/   )shaper)   reshaper&   permuteunbindr   Fscaled_dot_product_attentiontrainingr+   pr'   	transposesoftmaxr,   r-   )r.   xBTNCr)   qkvattns              r3   forwardzAttention.forward;   sS    W
1ahhqkk!!!Q1dna4>>QRRZZ[\^_abdeghjkll**Q--1a? 	.q!QVZVcBk$.BRBRiklllAADJAq{{2r***D<<B<''D>>$''DqA IIaAq!$$,,Q1a88IIaLLNN1r4   )r   Fr    r    )__name__
__module____qualname____doc__torchjitFinalbool__annotations__r%   rO   __classcell__r2   s   @r3   r   r   (   so           	%%%%
/ 
/ 
/ 
/ 
/ 
/      r4   r   c                   L     e Zd ZdZdddddej        ej        f fd	Zd Z xZ	S )TransformerLayerz
    This is much like `.vision_transformer.Block` but:
        - Called TransformerLayer here to allow for "block" as defined in the paper ("non-overlapping image blocks")
        - Uses modified Attention layer that handles the "block" dimension
          @Fr    c
                 h   t                                                        |	|          | _        t          |||||          | _        |dk    rt          |          nt          j                    | _         |	|          | _	        t          ||z            }
t          ||
||          | _        d S )N)r&   r0   r+   r-   r    )in_featureshidden_features	act_layerdrop)r$   r%   norm1r   rN   r
   r   Identity	drop_pathnorm2intr	   mlp)r.   r/   r&   	mlp_ratior0   r-   r+   re   ra   
norm_layermlp_hidden_dimr2   s              r3   r%   zTransformerLayer.__init__Z   s     	Z__

 
 
	 1:B),,,BKMMZ__
S9_--*	
 
 
r4   c                    |                      |          }||                     |                     |                    z   }||                     |                     |                     |                              z   }|S N)rc   re   rN   rh   rf   )r.   rF   ys      r3   rO   zTransformerLayer.forwardy   sb    JJqMMtyy||,,,txx

166777r4   )
rP   rQ   rR   rS   r   GELUr   r%   rO   rY   rZ   s   @r3   r\   r\   T   sq          g|
 
 
 
 
 
>      r4   r\   c                   &     e Zd Zd fd	Zd Z xZS )ConvPool c                     t                                                       t          ||d|d          | _         ||          | _        t          ddd|          | _        d S )Nr"   T)kernel_sizepaddingr#   maxr7   )rt   strideru   )r$   r%   r   convnormr   pool)r.   in_channelsout_channelsrj   pad_typer2   s        r3   r%   zConvPool.__init__   sb    !+|T\cghhh	J|,,	!%Qq(SSS			r4   c                 d   t          |j        d         dz  dk    d           t          |j        d         dz  dk    d           |                     |          }|                     |                    dddd                                        dddd          }|                     |          }|S )z:
        x is expected to have shape (B, C, H, W)
        r:   r7   r   z1BlockAggregation requires even input spatial dimsr;   r"   r   )r   r<   rx   ry   r>   rz   r.   rF   s     r3   rO   zConvPool.forward   s     	a1$&YZZZa1$&YZZZIIaLLIIaii1a++,,44Q1a@@IIaLLr4   rr   )rP   rQ   rR   r%   rO   rY   rZ   s   @r3   rq   rq      sR        T T T T T T
 
 
 
 
 
 
r4   rq   
block_sizec                     | j         \  }}}}t          ||z  dk    d           t          ||z  dk    d           ||z  }||z  }|                     ||||||          } |                     dd                              |||z  d|          } | S )zimage to blocks
    Args:
        x (Tensor): with shape (B, H, W, C)
        block_size (int): edge length of a single square block in units of H, W
    r   z,`block_size` must divide input height evenlyz+`block_size` must divide input width evenlyr7   r"   r;   )r<   r   r=   rD   )rF   r   rG   HWrJ   grid_height
grid_widths           r3   blockifyr      s     'JAq!QA
Na!OPPPA
Na!NOOOz/KjJ			![*j*aHHA	Aq!!![:%=r1EEAHr4   c                     | j         \  }}}}t          t          j        |                    }||z  x}}|                     ||||||          } |                     dd                              ||||          } | S )zblocks to image
    Args:
        x (Tensor): with shape (B, T, N, C) where T is number of blocks and N is sequence size per block
        block_size (int): edge length of a single square block in units of desired H, W
    r7   r"   )r<   rg   mathsqrtr=   rD   )	rF   r   rG   rH   _rJ   	grid_sizeheightwidths	            r3   
deblockifyr      s~     JAq!QDIaLL!!I++FU			!Y	:z1EEA	Aq!!!VUA66AHr4   c            	       <     e Zd ZdZdddddg dddf	 fd	Zd Z xZS )		NestLevelz7 Single hierarchical level of a Nested Transformer
    Nr]   Tr    rr   c                   	
 t                                                       || _        d| _        t	          j        t          j        d||                    | _        |t          ||          | _
        nt	          j                    | _
        t                    rt                    |k    s
J d            t	          j        
	f	dt          |          D              | _        d S )NFr   )rj   r}   zDMust provide as many drop path rates as there are transformer layersc                 L   	 g | ] }t          
	|          	  	        !S ))	r/   r&   ri   r0   r-   r+   re   rj   ra   )r\   ).0ira   r+   re   	embed_dimri   rj   r&   r-   r0   s     r3   
<listcomp>z&NestLevel.__init__.<locals>.<listcomp>   sY     3# 3# 3#  ##!###A,%#
 
 
3# 3# 3#r4   )r$   r%   r   grad_checkpointingr   	ParameterrT   zeros	pos_embedrq   rz   rd   len
Sequentialrangetransformer_encoder)r.   
num_blocksr   
seq_lengthr&   depthr   prev_embed_dimri   r0   r-   r+   re   rj   ra   r}   r2   s       ` ` ``````` r3   r%   zNestLevel.__init__   s   $ 	$"'ek!ZY&W&WXX% z\deeeDIIDI y>> 	sy>>U***,r***#%= 3# 3# 3# 3# 3# 3# 3# 3# 3# 3# 3# 3# 5\\3# 3# 3# $$   r4   c                    |                      |          }|                    dddd          }t          || j                  }|| j        z   }| j        r4t          j                                        st          | j
        |          }n| 
                    |          }t          || j                  }|                    dddd          S )z+
        expects x as (B, C, H, W)
        r   r7   r"   r   )rz   r>   r   r   r   r   rT   rU   is_scriptingr   r   r   r   s     r3   rO   zNestLevel.forward   s     IIaLLIIaAq!!Q((" 	,59+A+A+C+C 	,t7;;AA((++Aq$/**yyAq!$$$r4   )rP   rQ   rR   rS   r%   rO   rY   rZ   s   @r3   r   r      st           !,$ ,$ ,$ ,$ ,$ ,$\% % % % % % %r4   r   c                   ^    e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Zej        j        d d            Zej        j        d             Z	ej        j        d!d            Z
ej        j        d"d            Zej        j        dej        fd            Zd#dedefdZd Zd!defdZd Z xZS )$r   z Nested Transformer (NesT)

    A PyTorch impl of : `Aggregating Nested Transformers`
        - https://arxiv.org/abs/2105.12723
       r"   r6         i   r6   r      r7   r7        r]   Tr          ?Nrr   avgc                    t                                                       dD ]V}t                      |         }t          |t          j        j                  r!t          |          |k    sJ d| d            W t          |          |          } t          |          |          } t          |          |          }|| _	        |d         x| _
        | _        g | _        |pt          }|pt          j        }|| _        || _        t          |t          j        j                  r$|d         |d         k    s
J d            |d         }||z  dk    s
J d            || _        d	t'          j        |          z                      d                                          | _        ||z  t1          j        | j        d                   z  dk    s
J d
            t5          ||z  t1          j        | j        d                   z            | _        t9          ||||d         d          | _        | j        j        | _        | j        | j        d         z  | _        g }d t'          j         d|tC          |                    "                    |          D             }d}d	}tG          t          | j                            D ]}||         }|$                    tK          | j        |         | j        | j        ||         ||         |||	|
||||         |||                     | xj        tM          ||d|           gz  c_        |}|dz  }t          j'        | | _(         ||d                   | _)        tU          | j
        | j	        |          \  }}|| _+        t          j,        |          | _-        || _.        | /                    |           dS )a  
        Args:
            img_size (int, tuple): input image size
            in_chans (int): number of input channels
            patch_size (int): patch size
            num_levels (int): number of block hierarchies (T_d in the paper)
            embed_dims (int, tuple): embedding dimensions of each level
            num_heads (int, tuple): number of attention heads for each level
            depths (int, tuple): number of transformer layers for each level
            num_classes (int): number of classes for classification head
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim for MLP of transformer layers
            qkv_bias (bool): enable bias for qkv if True
            drop_rate (float): dropout rate for MLP of transformer layers, MSA final projection layer, and classifier
            attn_drop_rate (float): attention dropout rate
            drop_path_rate (float): stochastic depth rate
            norm_layer: (nn.Module): normalization layer for transformer layers
            act_layer: (nn.Module): activation layer in MLP of transformer layers
            pad_type: str: Type of padding to use '' for PyTorch symmetric, 'same' for TF SAME
            weight_init: (str): weight init scheme
            global_pool: (str): type of pooling operation to apply to final feature map

        Notes:
            - Default values follow NesT-B from the original Jax code.
            - `embed_dims`, `num_heads`, `depths` should be ints or tuples with length `num_levels`.
            - For those following the paper, Table A1 may have errors!
                - https://github.com/google-research/nested-transformer/issues/2
        
embed_dimsr&   depthszRequire `len(z) == num_levels`r;   r   r   z Model only handles square inputsz*`patch_size` must divide `img_size` evenlyr6   zUFirst level blocks don't fit evenly. Check `img_size`, `patch_size`, and `num_levels`F)img_size
patch_sizein_chansr   flattenc                 6    g | ]}|                                 S  )tolist)r   rF   s     r3   r   z!Nest.__init__.<locals>.<listcomp>[  s     eee1AHHJJeeer4   N)ri   r0   r-   r+   re   rj   ra   r}   zlevels.)num_chs	reductionmoduler7   	pool_type)0r$   r%   locals
isinstancecollectionsabcSequencer   r   num_classesnum_featureshead_hidden_sizefeature_infor   r   ro   	drop_rate
num_levelsr   rT   arangeflipr   r   r   r   rg   r   r   patch_embednum_patchesr   linspacesumsplitr   appendr   dictr   levelsry   r   global_poolr*   	head_dropheadinit_weights)r.   r   r   r   r   r   r&   r   r   ri   r0   r   proj_drop_rateattn_drop_ratedrop_path_raterj   ra   r}   weight_initr   
param_nameparam_valuer   dp_ratesprev_dimcurr_strider   r/   r   r2   s                                r3   r%   zNest.__init__   s   b 	? 	d 	dJ ((:.K+{'?@@ d;'':5557cz7c7c7c555*Yz**:66
)Ij)))44	&:&&v..&4>rNBD1,9
(	"$h 899 	#A;(1+---/Q---{H*$)))+W)))$ Z 8 88>>qAAHHJJJ&$)DOA4F*G*GG1LLLd MLL x:5$)DOTUDV:W:WWXX &! m
 
 
  +7*doa.@@ eeq.#f++(V(V(\(\]c(d(deees4?++,, 	 	AQ-CMM)"!q	#!(("1+%#!     " $skR_\]R_R_"`"`"`!aaH1KKmV, Jz"~..	 .d.?AQ]hiiiT&I..	+&&&&&r4   c                     |dv sJ d|v rt          j        | j                   nd}| j        D ]}t	          |j        ddd           t          t          t          |          |            d S )	N)nlhbrr   r   r    {Gz?r:   r7   stdab)	head_bias)	r   logr   r   r   r   r   r   _init_nest_weights)r.   moder   levels       r3   r   zNest.init_weights  s    |####39T>>TXd.////r	[ 	? 	?E%/sbA>>>>>G.)DDDdKKKKKr4   c                 X    d t          t          | j                            D             S )Nc                     h | ]}d | d	S )zlevel.z
.pos_embedr   )r   r   s     r3   	<setcomp>z'Nest.no_weight_decay.<locals>.<setcomp>  s$    HHH1&&&&HHHr4   )r   r   r   r.   s    r3   no_weight_decayzNest.no_weight_decay  s(    HHc$+6F6F0G0GHHHHr4   Fc                 :    t          d|rdndd fddg          }|S )Nz^patch_embedz^levels\.(\d+)z*^levels\.(\d+)\.transformer_encoder\.(\d+))z"^levels\.(\d+)\.(?:pool|pos_embed))r   )z^norm)i )stemblocks)r   )r.   coarsematchers      r3   group_matcherzNest.group_matcher  s@     &,_""2_aef=$
 
 
 r4   c                 (    | j         D ]	}||_        
d S rm   )r   r   )r.   enablels      r3   set_grad_checkpointingzNest.set_grad_checkpointing  s(     	* 	*A#)A  	* 	*r4   returnc                     | j         S rm   )r   r   s    r3   get_classifierzNest.get_classifier  s
    yr4   r   r   c                 f    || _         t          | j        | j         |          \  | _        | _        d S )Nr   )r   r   r   r   r   )r.   r   r   s      r3   reset_classifierzNest.reset_classifier  s<    &&7t/;'H 'H 'H#$)))r4   c                     |                      |          }|                     |          }|                     |                    dddd                                        dddd          }|S )Nr   r7   r"   r   )r   r   ry   r>   r   s     r3   forward_featureszNest.forward_features  s`    QKKNNIIaii1a++,,44Q1a@@r4   
pre_logitsc                     |                      |          }|                     |          }|r|n|                     |          S rm   )r   r   r   )r.   rF   r  s      r3   forward_headzNest.forward_head  s?    QNN10qqDIIaLL0r4   c                 Z    |                      |          }|                     |          }|S rm   )r  r  r   s     r3   rO   zNest.forward  s-    !!!$$a  r4   )r   r"   r6   r"   r   r   r   r   r]   Tr    r    r    r   NNrr   rr   r   r   F)T)r   )rP   rQ   rR   rS   r%   rT   rU   ignorer   r   r   r   r   Moduler  rg   strr  r  rW   r  rO   rY   rZ   s   @r3   r   r      s         & )C' C' C' C' C' C'J YL L L L YI I I Y	 	 	 	 Y* * * * Y	    H HC Hc H H H H
  1 1$ 1 1 1 1
      r4   rr   r    r   namer   c                 2   t          | t          j                  r|                    d          r?t	          | j        ddd           t          j                            | j        |           dS t	          | j        ddd           | j        &t          j        	                    | j                   dS dS t          | t          j
                  rEt	          | j        ddd           | j        (t          j        	                    | j                   dS dS dS )zn NesT weight initialization
    Can replicate Jax implementation. Otherwise follows vision_transformer.py
    r   r   r:   r7   r   N)r   r   r(   
startswithr   weightinit	constant_r#   zeros_Conv2d)r   r  r   s      r3   r   r     s    &")$$ (??6"" 	,&-SB!<<<<Gfk955555&-SB!<<<<{&v{+++++ '&	FBI	&	& (fma8888;"GNN6;'''''( (""r4   c                     t                               d| j        |j                   | j        d         }|j        dd         \  }}t          t	          j        ||z                      }t          | t          t	          j        |                                                  dddd          } t          j	        | ||gdd          } t          |                     dddd          t          t	          j        |                              } | S )	z
    Rescale the grid of position embeddings when loading from state_dict
    Expected shape of position embeddings is (1, T, N, C), and considers only square images
    z$Resized position embedding: %s to %sr7   r   r"   r   bicubicF)sizer   align_corners)_loggerinfor<   rg   r   r   r   r>   r@   interpolater   )posemb
posemb_newseq_length_oldnum_blocks_newseq_length_newsize_news         r3   resize_pos_embedr#    s    
 LL7zGWXXX\!_N%/%5ac%:"NN49^N:;;<<HDIn$=$= > >??GG1aQRSSF]68(<9\abbbFfnnQ1a00#di6O6O2P2PQQFMr4   c                     d |                                  D             }|D ]M}| |         j        t          ||          j        k    r't          | |         t          ||                    | |<   N| S )z4 resize positional embeddings of pretrained weights c                 <    g | ]}|                     d           |S )
pos_embed_)r  )r   rL   s     r3   r   z(checkpoint_filter_fn.<locals>.<listcomp>  s)    QQQAall<6P6PQaQQQr4   )keysr<   getattrr#  )
state_dictmodelpos_embed_keysrL   s       r3   checkpoint_filter_fnr,    sw    QQ!2!2QQQN O Oa='%"3"3"999,Z]GE1<M<MNNJqMr4   Fc                 \    t          t          | |ft          dd          t          d|}|S )N)r   r   r7   T)out_indicesflatten_sequential)feature_cfgpretrained_filter_fn)r   r   r   r,  )variant
pretrainedkwargsr*  s       r3   _create_nestr5    sJ      Y4HHH1   E Lr4   c                 :    | ddddgdddt           t          ddd	|S )
Nr   )r"   r   r      g      ?r  Tzpatch_embed.projr   )urlr   
input_size	pool_sizecrop_pctinterpolationfixed_input_sizemeanr   
first_conv
classifierr   )r8  r4  s     r3   _cfgrA    s;    =Bx9$%.B(   r4   ztimm/)	hf_hub_id)znest_base.untrainedznest_small.untrainedznest_tiny.untrainedznest_base_jx.goog_in1kznest_small_jx.goog_in1kznest_tiny_jx.goog_in1kr   c                 B    t          ddddd|}t          dd| i|}|S )	 Nest-B @ 224x224
    r   r   r   r   	nest_baser3  r   )rE  r   r5  r3  r4  model_kwargsr*  s       r3   rE  rE    sN      W"jW WOUW WLLLL|LLELr4   c                 B    t          ddddd|}t          dd| i|}|S )	 Nest-S @ 224x224
    `      i  r"         r   r   
nest_smallr3  r   )rQ  rF  rG  s       r3   rQ  rQ    s?     e>ZPZee^deeLMM*MMMELr4   c                 B    t          ddddd|}t          dd| i|}|S )	 Nest-T @ 224x224
    rK  rN  r7   r7   r   r   	nest_tinyr3  r   )rU  rF  rG  s       r3   rU  rU    s?     d>ZPYdd]cddLLLL|LLELr4   c                 n    |                     dd           t          d	dddd|}t          d
d| i|}|S )rD  r}   samer   r   r   r   nest_base_jxr3  r   )rX  
setdefaultr   r5  rG  s       r3   rX  rX  $  sd     j&))) W"jW WOUW WLOOJO,OOELr4   c                 n    |                     dd           t          d	dddd|}t          d
d| i|}|S )rJ  r}   rW  rK  rN  r   r   nest_small_jxr3  r   )r\  rY  rG  s       r3   r\  r\  /  sU     j&)))e>ZPZee^deeLPPZP<PPELr4   c                 n    |                     dd           t          d	dddd|}t          d
d| i|}|S )rS  r}   rW  rK  rN  rT  r   nest_tiny_jxr3  r   )r^  rY  rG  s       r3   r^  r^  9  sU     j&)))d>ZPYdd]cddLOOJO,OOELr4   rX  r\  r^  )jx_nest_basejx_nest_smalljx_nest_tiny)rr   r    r
  r   )ArS   collections.abcr   loggingr   	functoolsr   rT   torch.nn.functionalr   
functionalr@   	timm.datar   r   timm.layersr   r	   r
   r   r   r   r   r   r   r   r   _builderr   _features_fxr   _manipulater   r   	_registryr   r   r   __all__	getLoggerrP   r  r  r   r\   rq   rg   r   r   r   r   r  floatr   r#  r,  r5  rA  default_cfgsrE  rQ  rU  rX  r\  r^  r   r4   r3   <module>rq     s   "                             A A A A A A A A \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Z Z Z Z Z Z Z Z Z Z Z Z Z Z * * * * * * 3 3 3 3 3 3 4 4 4 4 4 4 4 4 Y Y Y Y Y Y Y Y Y Y(
'
H
%
%) ) ) ) )	 ) ) )X) ) ) ) )ry ) ) )X    ry   (C      c    ?% ?% ?% ?% ?%	 ?% ?% ?%DA A A A A29 A A AH( (ry ( (U ( ( ( ($  "  
 
 
 
    %$466 DFF466"dW555#tg666"dW555& &    T      d      T                        H"$"' '     r4   