
    Ng                        d Z ddlZddlZddlmZmZmZmZmZ ddl	Z	ddl
mZ ddlmZmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZ ddlmZ ddlmZ dd	l m!Z! dd
l"m#Z#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z* dgZ+ ej,        e-          Z.ee/ee/e/f         f         Z0de	j1        dee/e/f         de	j1        fdZ2e!dee/e/f         de/de/fd            Z3de/de/fdZ4 G d dej5                  Z6 G d dej5                  Z7 G d dej5                  Z8 G d dej5                  Z9 G d  dej5                  Z:d! Z;d^d#Z<d_d%Z= e&i d& e=d'd()          d* e=d'd+)          d, e=d'd-d.d/d01          d2 e=d'd3)          d4 e=d'd5d.d/d01          d6 e=d'd7)          d8 e=d'd9)          d: e=d'd;)          d< e=d'd=d.d/d01          d> e=d'd?)          d@ e=d'dAdBC          dD e=d'dEdBC          dF e=d'dGdBC          dH e=d'dId.d/d0dBJ          dK e=d'dLdBC          dM e=d'dNd.d/d0dBJ          dO e=d'dP)           e=d'dQ)           e=d'dR)          dS          Z>e'd^de:fdT            Z?e'd^de:fdU            Z@e'd^de:fdV            ZAe'd^de:fdW            ZBe'd^de:fdX            ZCe'd^de:fdY            ZDe'd^de:fdZ            ZEe'd^de:fd[            ZFe'd^de:fd\            ZG e(e-dFdHdKdMd]           dS )`a   Swin Transformer
A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`
    - https://arxiv.org/pdf/2103.14030

Code/weights from https://github.com/microsoft/Swin-Transformer, original copyright/license info below

S3 (AutoFormerV2, https://arxiv.org/abs/2111.14725) Swin weights from
    - https://github.com/microsoft/Cream/tree/main/AutoFormerV2

Modifications and additions for timm hacked together by / Copyright 2021, Ross Wightman
    N)CallableListOptionalTupleUnionIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)
PatchEmbedMlpDropPathClassifierHead	to_2tuple	to_ntupletrunc_normal__assertuse_fused_attnresize_rel_pos_bias_tableresample_patch_embedndgrid   )build_model_with_cfg)feature_take_indices)register_notrace_function)checkpoint_seqnamed_apply)generate_default_cfgsregister_modelregister_model_deprecations)get_init_weights_vitSwinTransformerxwindow_sizereturnc                 *   | j         \  }}}}|                     |||d         z  |d         ||d         z  |d         |          } |                     dddddd                                                              d|d         |d         |          }|S )aU  
    Partition into non-overlapping windows with padding if needed.
    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.

    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
    r   r               shapeviewpermute
contiguous)r"   r#   BHWCwindowss          X/var/www/html/ai-engine/env/lib/python3.11/site-packages/timm/models/swin_transformer.pywindow_partitionr6   *   s     JAq!Q	q!{1~%{1~qKN7JKXYN\]^^Aii1aAq))4466;;BAP[\]P^`abbGN    r1   r2   c                    | j         d         }|                     d||d         z  ||d         z  |d         |d         |          }|                    dddddd                                                              d|||          }|S )z
    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    r*   r   r   r&   r'   r(   r)   r+   )r4   r#   r1   r2   r3   r"   s         r5   window_reverser9   >   s     	bARk!n,a;q>.A;q>S^_`SacdeeA			!Q1a##..0055b!QBBAHr7   win_hwin_wc                    t          j        t          t          j        |           t          j        |                              }t          j        |d          }|d d d d d f         |d d d d d f         z
  }|                    ddd                                          }|d d d d dfxx         | dz
  z  cc<   |d d d d dfxx         |dz
  z  cc<   |d d d d dfxx         d|z  dz
  z  cc<   |                    d          S )Nr   r'   r   r*   )torchstackr   arangeflattenr.   r/   sum)r:   r;   coordscoords_flattenrelative_coordss        r5   get_relative_position_indexrE   P   s5   [U 3 3U\%5H5HIIJJF]61--N$QQQ4Z0>!!!T111*3MMO%--aA66AACCOAAAqqq!G	)AAAqqq!G	)AAAqqq!GE	A-r"""r7   c                        e Zd ZU dZej        j        e         ed<   	 	 	 	 	 dde	de	d	e
e	         d
edededef fdZd
ee	e	f         ddfdZdej        fdZdde
ej                 fdZ xZS )WindowAttentionz Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports shifted and non-shifted windows.
    
fused_attnN   T        dim	num_headshead_dimr#   qkv_bias	attn_drop	proj_dropc                    t                                                       || _        t          |          | _        | j        \  }}	||	z  | _        || _        |p||z  }||z  }
|dz  | _        t          d          | _	        t          j        t          j        d|z  dz
  d|	z  dz
  z  |                    | _        |                     dt!          ||	          d           t          j        ||
d	z  |
          | _        t          j        |          | _        t          j        |
|          | _        t          j        |          | _        t/          | j        d           t          j        d          | _        dS )a  
        Args:
            dim: Number of input channels.
            num_heads: Number of attention heads.
            head_dim: Number of channels per head (dim // num_heads if not set)
            window_size: The height and width of the window.
            qkv_bias:  If True, add a learnable bias to query, key, value.
            attn_drop: Dropout ratio of attention weight.
            proj_drop: Dropout ratio of output.
        g      T)experimentalr'   r   relative_position_indexF
persistentr&   biasg{Gz?)stdr*   )rK   N)super__init__rK   r   r#   window_arearL   scaler   rH   nn	Parameterr=   zerosrelative_position_bias_tableregister_bufferrE   LinearqkvDropoutrO   projrP   r   Softmaxsoftmax)selfrK   rL   rM   r#   rN   rO   rP   r:   r;   attn_dim	__class__s              r5   rZ   zWindowAttention.__init__b   so   ( 	$[11'u 5="/si/i'%
(d;;; -/La%iRSmXY\aXadeXeEfhq9r9r,s,s) 	68STY[`8a8ansttt9S(Q,X>>>I..Ih,,	I..d7SAAAAzb)))r7   r$   c                    t          |          }|| j        k    rdS || _        | j        \  }}||z  | _        t          j                    5  d|z  dz
  d|z  dz
  z  | j        f}t          j        t          | j	        | j        |                    | _	        | 
                    dt          ||          d           ddd           dS # 1 swxY w Y   dS )zzUpdate window size & interpolate position embeddings
        Args:
            window_size (int): New window size
        Nr'   r   new_window_sizenew_bias_shaperS   FrT   )r   r#   r[   r=   no_gradrL   r]   r^   r   r`   ra   rE   )rh   r#   r:   r;   rn   s        r5   set_window_sizezWindowAttention.set_window_size   sJ   
  ,,$***F&'u 5=]__ 	y 	y%i!mE	A>NN02)5$($4#1  1 1D-   !:<WX]_d<e<erw xxx	y 	y 	y 	y 	y 	y 	y 	y 	y 	y 	y 	y 	y 	y 	y 	y 	y 	ys   A2C

CCc                    | j         | j                            d                                       | j        | j        d          }|                    ddd                                          }|                    d          S )Nr*   r'   r   r   )r`   rS   r-   r[   r.   r/   	unsqueeze)rh   relative_position_biass     r5   _get_rel_pos_biasz!WindowAttention._get_rel_pos_bias   sv    !%!B(--b11"33748H$JZ\^3_3_ 	!7!?!?1a!H!H!S!S!U!U%//222r7   maskc                    |j         \  }}}|                     |                              ||d| j        d                              ddddd          }|                    d          \  }}}	| j        r|                                 }
|e|j         d         }|                    d|d||          	                    ||z  d| j        dd          }|
|                    d| j        ||          z   }
t          j        j                            |||	|
| j        r| j        j        nd	          }n|| j        z  }||                    d
d          z  }||                                 z   }|q|j         d         }|                    d|| j        ||          |                    d                              d          z   }|                    d| j        ||          }|                     |          }|                     |          }||	z  }|                    dd                              ||d          }|                     |          }|                     |          }|S )z
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        r&   r*   r'   r   r   r(   NrJ   )	attn_mask	dropout_p)r,   rc   reshaperL   r.   unbindrH   rt   r-   expandr=   r]   
functionalscaled_dot_product_attentiontrainingrO   pr\   	transposerr   rg   re   rP   )rh   r"   ru   B_Nr3   rc   qkvrw   num_winattns                r5   forwardzWindowAttention.forward   s>    7Aqhhqkk!!"aDNB??GG1aQRTUVV**Q--1a? 	..00I*Q-yyGQ155<<R7]BPTP^`bdfgg%RA(N(NN	#@@1a#.2mC$.** A  AA DJAq{{2r***D$00222D*Q-yyWdnaCCdnnUVFWFWFaFabcFdFddyyT^Q::<<%%D>>$''DqAKK1%%b!R00IIaLLNN1r7   )NrI   TrJ   rJ   N)__name__
__module____qualname____doc__r=   jitFinalbool__annotations__intr   _int_or_tuple_2_tfloatrZ   r   rp   Tensorrt   r   __classcell__rj   s   @r5   rG   rG   \   s3          	%%%% '+-.!!!+* +*+* +* sm	+*
 ++* +* +* +* +* +* +* +* +*Zy5c? yt y y y y*35< 3 3 3 3$ $x5 $ $ $ $ $ $ $ $r7   rG   c                       e Zd ZdZddddddddd	d	d	ej        ej        fd
ededede	e         dedede
de
dede
dededededef fdZd$de	ej                 de	ej                 fdZ	 d$deeeeef         f         de	eeeeef         f                  deeeef         eeef         f         fdZ	 d$d eeef         deeef         de	e
         fd!Zd" Zd# Z xZS )%SwinTransformerBlockz Swin Transformer Block.
    r(   NrI   r   F      @TrJ   rK   input_resolutionrL   rM   r#   
shift_sizealways_partitiondynamic_mask	mlp_ratiorN   rP   rO   	drop_path	act_layer
norm_layerc           	      &   t                                                       || _        || _        t	          |          | _        || _        || _        |                     ||          \  | _	        | _
        | j	        d         | j	        d         z  | _        |	| _         ||          | _        t          |||| j	        |
||          | _        |dk    rt!          |          nt#          j                    | _         ||          | _        t+          |t-          ||	z            ||          | _        |dk    rt!          |          nt#          j                    | _        |                     d| j        rdn|                                 d	           dS )
a  
        Args:
            dim: Number of input channels.
            input_resolution: Input resolution.
            window_size: Window size.
            num_heads: Number of attention heads.
            head_dim: Enforce the number of channels per head
            shift_size: Shift size for SW-MSA.
            always_partition: Always partition into full windows and shift
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: If True, add a learnable bias to query, key, value.
            proj_drop: Dropout rate.
            attn_drop: Attention dropout rate.
            drop_path: Stochastic depth rate.
            act_layer: Activation layer.
            norm_layer: Normalization layer.
        r   r   )rL   rM   r#   rN   rO   rP   rJ   )in_featureshidden_featuresr   droprw   NFrT   )rY   rZ   rK   r   r   target_shift_sizer   r   _calc_window_shiftr#   r   r[   r   norm1rG   r   r   r]   Identity
drop_path1norm2r   r   mlp
drop_path2ra   get_attn_mask)rh   rK   r   rL   rM   r#   r   r   r   r   rN   rP   rO   r   r   r   rj   s                   r5   rZ   zSwinTransformerBlock.__init__   s   F 	 0!*:!6!6 0(,0,C,CKQ[,\,\)$/+A.1A!1DD"Z__
#(
 
 
	 2;R(9---R[]]Z__
i00	
 
 
 2;R(9---R[]]%?DD4+=+=+?+? 	 	
 	
 	
 	
 	
r7   r"   r$   c           	      D   t          | j                  r|)|j        d         |j        d         }}|j        }|j        }n| j        \  }}d }d }t          j        || j        d         z            | j        d         z  }t          j        || j        d         z            | j        d         z  }t          j
        d||df||          }d}d| j        d          f| j        d          | j        d          f| j        d          d ffD ]n}d| j        d          f| j        d          | j        d          f| j        d          d ffD ]0}	||d d |d         |d         |	d         |	d         d d f<   |dz  }1ot          || j                  }
|
                    d| j                  }
|
                    d          |
                    d          z
  }|                    |dk    t!          d                                        |dk    t!          d                    }nd }|S )Nr   r'   r   )dtypedevicer*   g      YrJ   )anyr   r,   r   r   r   mathceilr#   r=   r_   r6   r-   r[   rr   masked_fillr   )rh   r"   r1   r2   r   r   img_maskcnthwmask_windowsrw   s               r5   r   z"SwinTransformerBlock.get_attn_mask  sU   t 	}wqz171:1,1	!d.q1122T5Ea5HHA	!d.q1122T5Ea5HHA{Aq!Q<uVLLLHC)!,,-&q))DOA,>+>?oa(($/   T-a001*1--0B/BC/!,,d3  A
 <?HQQQ!QqT	1Q4!9aaa781HCC ,Hd6FGGL',,R1ABBL$..q11L4J4J14M4MMI!--i1neFmmLLXXYbfgYginorisisttIIIr7   target_window_sizer   c                 l   t          |          }|-| j        }t          |          r|d         dz  |d         dz  f}nt          |          }| j        r||fS d t	          | j        |          D             }d t	          | j        ||          D             }t          |          t          |          fS )Nr   r'   r   c                 (    g | ]\  }}||k    r|n|S  r   ).0rr   s      r5   
<listcomp>z;SwinTransformerBlock._calc_window_shift.<locals>.<listcomp>Q  s(    eeedaAFFqqeeer7   c                 *    g | ]\  }}}||k    rd n|S r   r   )r   r   r   ss       r5   r   z;SwinTransformerBlock._calc_window_shift.<locals>.<listcomp>R  s*    sssWQ1166aaqsssr7   )r   r   r   r   zipr   tuple)rh   r   r   r#   r   s        r5   r   z'SwinTransformerBlock._calc_window_shift@  s    
 ''9::$ $ 6$%% ]%7%:a%?ASTUAVZ[A[$\! )*; < <  	9%'888eec$:OQc6d6deeessD<QS^`q8r8rsss
[!!5#4#444r7   	feat_sizec                 N   || _         ||| _        |                     |          \  | _        | _        | j        d         | j        d         z  | _        | j                            | j                   |                     d| j	        rdn| 
                                d           dS )z
        Args:
            feat_size: New input resolution
            window_size: New window size
            always_partition: Change always_partition attribute if not None
        Nr   r   rw   FrT   )r   r   r   r#   r   r[   r   rp   ra   r   r   )rh   r   r#   r   s       r5   set_input_sizez#SwinTransformerBlock.set_input_sizeU  s     !*'$4D!,0,C,CK,P,P)$/+A.1A!1DD	!!$"2333%?DD4+=+=+?+? 	 	
 	
 	
 	
 	
r7   c           	         |j         \  }}}}t          | j                  }|r2t          j        || j        d          | j        d          fd          }n|}| j        d         || j        d         z  z
  | j        d         z  }| j        d         || j        d         z  z
  | j        d         z  }	t          j        j                            |ddd|	d|f          }|j         \  }
}}}
t          || j                  }|
                    d| j        |          }t          | dd          r|                     |          }n| j        }|                     ||          }|
                    d| j        d         | j        d         |          }t!          || j        ||          }|d d d |d |d d f                                         }|rt          j        || j        d          }n|}|S )	Nr   r   )r   r'   )shiftsdimsr*   r   F)ru   )r,   r   r   r=   rollr#   r]   r}   padr6   r-   r[   getattrr   rw   r   r9   r/   )rh   r"   r0   r1   r2   r3   	has_shift	shifted_xpad_hpad_w_HpWp	x_windowsrw   attn_windowss                   r5   _attnzSwinTransformerBlock._attnm  s   W
1a ((	 	
1tq/A.ADOTUDVCV-W^deeeIII !!$q4+;A+>'>>$BRSTBUU!!$q4+;A+>'>>$BRSTBUUH'++I1a57QRR	 2r1 %Y0@AA	NN2t'7;;	 4// 	'**955IIIyyy;; $((T-=a-@$BRSTBUWXYY"<1A2rJJ	aaa!RaRl+6688	  	
9T_6JJJAAAr7   c                 t   |j         \  }}}}||                     |                     |                     |                              z   }|                    |d|          }||                     |                     |                     |                              z   }|                    ||||          }|S )Nr*   )r,   r   r   r   rz   r   r   r   )rh   r"   r0   r1   r2   r3   s         r5   r   zSwinTransformerBlock.forward  s    W
1a

4::a== 9 9:::IIaQA 7 7888IIaAq!!r7   r   )r   r   r   r   r]   GELU	LayerNormr   r   r   r   r   r   rZ   r=   r   r   r   r   r   r   r   r   r   r   s   @r5   r   r      sJ         &*-.%*!&!!!!!"$'#%<!F
 F
F
 0F
 	F

 smF
 +F
 F
 #F
 F
 F
 F
 F
 F
 F
  F
  !!F
 F
 F
 F
 F
 F
P! !x5 !%,AW ! ! ! !L HL5 5 %c5c?&: ;5  (c5c?.B(CD5 
uS#Xc3h/	0	5 5 5 52 04	
 
S#X
 sCx
 'tn	
 
 
 
0% % %N      r7   r   c                   R     e Zd ZdZdej        fdedee         def fdZ	d Z
 xZS )PatchMergingz Patch Merging Layer.
    NrK   out_dimr   c                     t                                                       || _        |pd|z  | _         |d|z            | _        t          j        d|z  | j        d          | _        dS )z
        Args:
            dim: Number of input channels.
            out_dim: Number of output channels (or 2 * dim if None)
            norm_layer: Normalization layer.
        r'   r(   FrV   N)rY   rZ   rK   r   normr]   rb   	reduction)rh   rK   r   r   rj   s       r5   rZ   zPatchMerging.__init__  si     	)!c'Jq3w''	1s7DLuEEEr7   c                 |   |j         \  }}}}ddd|dz  d|dz  f}t          j                            ||          }|j         \  }}}}|                    ||dz  d|dz  d|                              dddddd                              d          }|                     |          }|                     |          }|S )Nr   r'   r   r&   r(   r)   )	r,   r]   r}   r   rz   r.   r@   r   r   )rh   r"   r0   r1   r2   r3   
pad_valuesr   s           r5   r   zPatchMerging.forward  s    W
1aAq1uaQ/
Ma,,W
1aIIaaAFAq1199!Q1aKKSSTUVVIIaLLNN1r7   )r   r   r   r   r]   r   r   r   r   rZ   r   r   r   s   @r5   r   r     s          &*#%<	F FF c]F !	F F F F F F$
 
 
 
 
 
 
r7   r   c            !            e Zd ZdZdddddddddddej        fd	ed
edeeef         dededede	e         de
dededededededeee         ef         def  fdZ	 ddeeef         dede	e         fdZd Z xZS )SwinTransformerStagez3 A basic Swin Transformer layer for one stage.
    Tr(   NrI   Fr   rJ   rK   r   r   depth
downsamplerL   rM   r#   r   r   r   rN   rP   rO   r   r   c                    	
 t                                                       | _        | _        |rt	          d |D                       n| _        | _        d _        t                    t	          d D                       |rt          |           _
        n |k    sJ t          j                     _
        t          j        	
 fdt          |          D               _        dS )a  
        Args:
            dim: Number of input channels.
            out_dim: Number of output channels.
            input_resolution: Input resolution.
            depth: Number of blocks.
            downsample: Downsample layer at the end of the layer.
            num_heads: Number of attention heads.
            head_dim: Channels per head (dim // num_heads if not set)
            window_size: Local window size.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: If True, add a learnable bias to query, key, value.
            proj_drop: Projection dropout rate.
            attn_drop: Attention dropout rate.
            drop_path: Stochastic depth rate.
            norm_layer: Normalization layer.
        c              3       K   | ]	}|d z  V  
dS r'   Nr   r   is     r5   	<genexpr>z0SwinTransformerStage.__init__.<locals>.<genexpr>  s&      &H&H!qAv&H&H&H&H&H&Hr7   Fc                     g | ]}|d z  S r'   r   )r   r   s     r5   r   z1SwinTransformerStage.__init__.<locals>.<listcomp>  s    888qAF888r7   )rK   r   r   c                     g | ]L}t          
j        	|d z  dk    rdnt          t                    r|         n          MS )r'   r   )rK   r   rL   rM   r#   r   r   r   r   rN   rP   rO   r   r   )r   output_resolution
isinstancelist)r   r   r   rO   r   r   rM   r   r   rL   r   rP   rN   rh   r   r#   s     r5   r   z1SwinTransformerStage.__init__.<locals>.<listcomp>  s     &# &# &#" ! !!%!7#!'!"Q!11*!1)#!##*4Y*E*ET)A,,9%  &# &# &#r7   N)rY   rZ   rK   r   r   r   r   grad_checkpointingr   r   r   r]   r   
Sequentialrangeblocks)rh   rK   r   r   r   r   rL   rM   r#   r   r   r   rN   rP   rO   r   r   r   rj   s   ` `   ```````````@r5   rZ   zSwinTransformerStage.__init__  sf   H 	 0LV!l&H&H7G&H&H&H!H!H!H\l
"',,88K88899
  	,*%  DOO '>>>> kmmDO m &# &# &# &# &# &# &# &# &# &# &# &# &# &# &# &# &#" 5\\#&# &# &# $r7   r   c                     || _         t          | j        t          j                  r|| _        nt          d |D                       | _        | j        D ]}|                    | j        ||            dS )a   Updates the resolution, window size and so the pair-wise relative positions.

        Args:
            feat_size: New input (feature) resolution
            window_size: New window size
            always_partition: Always partition / shift the window
        c              3       K   | ]	}|d z  V  
dS r   r   r   s     r5   r   z6SwinTransformerStage.set_input_size.<locals>.<genexpr>!  s&      *E*Ea16*E*E*E*E*E*Er7   r   r#   r   N)	r   r   r   r]   r   r   r   r   r   )rh   r   r#   r   blocks        r5   r   z#SwinTransformerStage.set_input_size  s     !*dor{33 	F%.D""%**E*E9*E*E*E%E%ED"[ 	 	E  0'!1 !    	 	r7   c                     |                      |          }| j        r4t          j                                        st          | j        |          }n|                     |          }|S r   )r   r   r=   r   is_scriptingr   r   rh   r"   s     r5   r   zSwinTransformerStage.forward)  sZ    OOA" 	59+A+A+C+C 	t{A..AAAAr7   r   )r   r   r   r   r]   r   r   r   r   r   r   r   r   r   r   rZ   r   r   r   r   s   @r5   r   r     s          $&*-.%*!&!!!!35#%<#J$ J$J$ J$ $CHo	J$
 J$ J$ J$ smJ$ +J$ #J$ J$ J$ J$ J$ J$  T%[%/0!J$" !#J$ J$ J$ J$ J$ J$` 04	 S#X  'tn	   2      r7   r   c            +           e Zd ZdZdddddddd	d
dddddddddeej        dfdedededede	dede
edf         de
edf         dee         dedededed ed!ed"ed#ed$ed%ed&ee	ef         d'e	f* fd(Zej        j        dCd)            Zej        j        d*             Z	 	 	 	 	 dDdee
eef                  dee
eef                  dee
eef                  d,edee         d-d
fd.Zej        j        dEd/            Zej        j        dFd0            Zej        j        d-ej        fd1            ZdGdedee	         fd2Z	 	 	 	 	 dHd4ej        d5eeeee         f                  d6ed7ed8e	d9ed-eeej                 e
ej        eej                 f         f         fd:Z	 	 	 dId5eeee         f         d<ed=efd>Zd? Z dEd@efdAZ!dB Z" xZ#S )Jr!   z Swin Transformer

    A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
          https://arxiv.org/pdf/2103.14030
       r(   r&     avg`   r'   r'      r'   r&   r        NrI   FTr   rJ   g? img_size
patch_sizein_chansnum_classesglobal_pool	embed_dimdepths.rL   rM   r#   r   strict_img_sizer   rN   	drop_rateproj_drop_rateattn_drop_ratedrop_path_rateembed_layerr   weight_initc           
      <   t                                                       |dv sJ || _        || _        d| _        t          |          | _        | _        t          d| j        dz
  z  z            x| _	        | _
        g | _        t          t          t          f          s fdt          | j                  D              ||||d         ||d          | _        | j        j        } t%          | j                  |	          }	t          |
t          t          f          s t%          | j                  |
          }
nt          |
          dk    r|
f| j        z  }
t          |
          | j        k    sJ  t%          | j                  |          }d t'          j        d|t+          |                                        |          D             }g }d         }d}t          | j                  D ]}|         }|t/          di d	|d
|d|d         |z  |d         |z  fd||         d|dk    d||         d|	|         d|
|         d|d| d||         d|d|d|d||         d|gz  }|}|dk    r|dz  }| xj        t1          |||z  d|           gz  c_        t3          j        | | _         || j	                  | _        t;          | j	        |||| j                  | _        |dk    r|                     |           dS dS )a~  
        Args:
            img_size: Input image size.
            patch_size: Patch size.
            in_chans: Number of input image channels.
            num_classes: Number of classes for classification head.
            embed_dim: Patch embedding dimension.
            depths: Depth of each Swin Transformer layer.
            num_heads: Number of attention heads in different layers.
            head_dim: Dimension of self-attention heads.
            window_size: Window size.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: If True, add a learnable bias to query, key, value.
            drop_rate: Dropout rate.
            attn_drop_rate (float): Attention dropout rate.
            drop_path_rate (float): Stochastic depth rate.
            embed_layer: Patch embedding layer.
            norm_layer (nn.Module): Normalization layer.
        )r  r  NHWCr'   r   c                 :    g | ]}t          d |z  z            S r   )r   )r   r   r  s     r5   r   z,SwinTransformer.__init__.<locals>.<listcomp>r  s*    QQQQYa/00QQQr7   r   )r  r  r  r  r   r  
output_fmtc                 6    g | ]}|                                 S r   )tolist)r   r"   s     r5   r   z,SwinTransformer.__init__.<locals>.<listcomp>  s     ```aqxxzz```r7   rK   r   r   r   r   rL   rM   r#   r   r   r   rN   rP   rO   r   r   layers.)num_chsr   module)	pool_typer  	input_fmtskipNr   ) rY   rZ   r  r  r  len
num_layersr  r   num_featureshead_hidden_sizefeature_infor   r   r   r   patch_embed	grid_sizer   r=   linspacerA   splitr   dictr]   r   layersr   r   headinit_weights)rh   r  r  r  r  r  r  r  rL   rM   r#   r   r  r   rN   r  r  r  r  r  r   r  kwargs
patch_griddprr1  in_dimr\   r   r   rj   s         `                       r5   rZ   zSwinTransformer.__init__:  s   X 	k))))&& f++"47	A$/\]J]D^8^4_4__D1)eT]33 	RQQQQ%:P:PQQQI ';!l!+
 
 
 %/
 .9T_--h77+e}55 	;4)DO44[AAKK""&.4?:K;4?2222.Ido..y99	``5>!^S[[#Q#Q#W#WX^#_#_```1t'' 	m 	mAlG+   F qMU*qMU*" " Qii q55 $A,, "! (NN "2!1 "100 $A,, "  ).!" ).#$ a&&%& &:'  F* F1uu
$w*uBT]jgh]j]j"k"k"k!llmV,Jt011	"!o
 
 
	 &  k***** ! r7   c                     |dv sJ d|v rt          j        | j                   nd}t          t	          ||          |            d S )N)jaxjax_nlhbmocor  nlhbrJ   )	head_bias)r   logr  r   r    )rh   moder=  s      r5   r3  zSwinTransformer.init_weights  sZ    6666639T>>TXd.////r	(CCCTJJJJJr7   c                     t                      }|                                 D ]\  }}d|v r|                    |           |S )Nr`   )setnamed_parametersadd)rh   nwdnr   s       r5   no_weight_decayzSwinTransformer.no_weight_decay  sJ    ee))++ 	 	DAq-22



r7      window_ratior$   c                 R   ||(| j                             ||           | j         j        }|t          fd|D                       }t	          | j                  D ]G\  }}dt          |dz
  d          z  }	|                    |d         |	z  |d         |	z  f||           HdS )a   Updates the image resolution and window size.

        Args:
            img_size: New input resolution, if None current resolution is used
            patch_size (Optional[Tuple[int, int]): New patch size, if None use current patch size
            window_size: New window size, if None based on new_img_size // window_div
            window_ratio: divisor for calculating window size from grid size
            always_partition: always partition into windows and shift (even if window size < feat size)
        N)r  r  c                     g | ]}|z  S r   r   )r   pgrH  s     r5   r   z2SwinTransformer.set_input_size.<locals>.<listcomp>  s     I I I|!3 I I Ir7   r'   r   r   r   )r,  r   r-  r   	enumerater1  max)
rh   r  r  r#   rH  r   r5  indexstagestage_scales
       `     r5   r   zSwinTransformer.set_input_size  s    " :#9++X*+UUU)3J I I I Ij I I IJJK%dk22 	 	LE5s519a000K  %a=K7A+9UV'!1 !    	 	r7   c                 0    t          d|rdng d          S )Nz^patch_embedz^layers\.(\d+)))z^layers\.(\d+).downsampler   )z^layers\.(\d+)\.\w+\.(\d+)N)z^norm)i )stemr   )r0  )rh   coarses     r5   group_matcherzSwinTransformer.group_matcher  s9     (. $$ 5 5 5
 
 
 	
r7   c                 (    | j         D ]	}||_        
d S r   )r1  r   )rh   enablels      r5   set_grad_checkpointingz&SwinTransformer.set_grad_checkpointing  s(     	* 	*A#)A  	* 	*r7   c                     | j         j        S r   )r2  fc)rh   s    r5   get_classifierzSwinTransformer.get_classifier  s    y|r7   c                 L    || _         | j                            ||           d S )N)r$  )r  r2  reset)rh   r  r  s      r5   reset_classifierz SwinTransformer.reset_classifier  s(    &	{;;;;;r7   NCHWr"   indicesr   
stop_earlyr  intermediates_onlyc                 x   |dv s
J d            g }t          t          | j                  |          \  }}	|                     |          }t          | j                  }
t          j                                        s|s| j        }n| j        d|	dz            }t          |          D ]v\  }} ||          }||v rb|r||
dz
  k    r|                     |          }n|}|	                    dddd          
                                }|                    |           w|r|S |                     |          }||fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )r_  zOutput shape must be NCHW.Nr   r   r&   r'   )r   r'  r1  r,  r=   r   r   rL  r   r.   r/   append)rh   r"   r`  r   ra  r  rb  intermediatestake_indices	max_index
num_stagesstagesr   rO  x_inters                  r5   forward_intermediatesz%SwinTransformer.forward_intermediates  s^   * Y&&&(D&&&"6s4;7G7G"Q"Qi Q%%
9!!## 	1: 	1[FF[)a-0F!&)) 	. 	.HAuaAL    Aa//"iillGGG!//!Q155@@BB$$W--- 	!  IIaLL-r7   r   
prune_norm
prune_headc                     t          t          | j                  |          \  }}| j        d|dz            | _        |rt          j                    | _        |r|                     dd           |S )z@ Prune layers not required for specified intermediates.
        Nr   r   r  )r   r'  r1  r]   r   r   r^  )rh   r`  rl  rm  rf  rg  s         r5   prune_intermediate_layersz)SwinTransformer.prune_intermediate_layers,  sq     #7s4;7G7G"Q"Qik.9q=.1 	&DI 	)!!!R(((r7   c                     |                      |          }|                     |          }|                     |          }|S r   )r,  r1  r   r  s     r5   forward_featuresz SwinTransformer.forward_features<  s8    QKKNNIIaLLr7   
pre_logitsc                 ^    |r|                      |d          n|                      |          S )NT)rr  )r2  )rh   r"   rr  s      r5   forward_headzSwinTransformer.forward_headB  s,    0:Ltyyty,,,		!Lr7   c                 Z    |                      |          }|                     |          }|S r   )rq  rt  r  s     r5   r   zSwinTransformer.forwardE  s-    !!!$$a  r7   r  )NNNrG  NF)Tr   )NFFr_  F)r   FT)$r   r   r   r   r   r]   r   r   r   strr   r   r   r   r   r   rZ   r=   r   ignorer3  rF  r   rT  rX  Moduler[  r^  r   r   rk  ro  rq  rt  r   r   r   s   @r5   r!   r!   3  sR         +.#$&2)7&*-.%*$(!!!$&$&$'$./1|!-x+ x+'x+ x+ 	x+
 x+ x+ x+ #s(Ox+ S#Xx+ smx+ +x+ #x+ "x+ x+ x+  !x+" "#x+$ "%x+& "'x+( ")x+* c8m,+x+, -x+ x+ x+ x+ x+ x+t YK K K K
 Y   374859 !/3 uS#X/ !sCx1 "%S/2	
  'tn 
   @ Y
 
 
 
 Y* * * * Y	    < <C <hsm < < < < 8<$$',0  0 |0  eCcN340  	0 
 0  0  !%0  
tEL!5tEL7I)I#JJ	K0  0  0  0 h ./$#	 3S	>*  	      M M$ M M M M      r7   c                   
 d}d| v rd}ddl }i }|                     d|           } |                     d|           } |                                 D ]2\  
}t          
fdd	D                       r"d

v rS|j        j        j        j        \  }}}}|j        d         |k    s|j        d         |k    rt          |||fddd          }
	                    d          ro|
                    
dd                   }	|j        |	j        j        k    s|	j        d         |	j        d         k    r!t          ||	j        |	j        j                  }|r.|                    dd 
          

                    dd          
||
<   4|S )zJ convert patch embedding weight from manual patchify + linear proj to convTzhead.fc.weightFr   Nmodel
state_dictc                     g | ]}|v S r   r   )r   rE  r   s     r5   r   z(checkpoint_filter_fn.<locals>.<listcomp>U  s    III1QIIIr7   )rS   rw   zpatch_embed.proj.weightry   r*   bicubic)interpolation	antialiasverboser`   ir   rl   zlayers.(\d+).downsamplec                 T    dt          |                     d                    dz    dS )Nr!  r   z.downsample)r   group)r"   s    r5   <lambda>z&checkpoint_filter_fn.<locals>.<lambda>m  s)    =gs177ST::YZGZ=g=g=g r7   zhead.zhead.fc.)regetitemsr   r,  re   weightr,   r   endswithget_submoduler`   r#   r   subreplace)r}  r|  old_weightsr  out_dictr   r   r1   r2   mr   s             @r5   checkpoint_filter_fnr  K  s   K:%%IIIH44Jj99J  ""  1IIII HIIIJJ 	$))*/6<JAq!Qwr{a172;!#3#3(F"+"    ::455 	##AdsdG,,Aw!8>>>!-PQBRVWVcdeVfBfBf-$%M#$#A#G    	/13g3gijkkA		':..AOr7   Fc           	          t          d t          |                    dd                    D                       }|                    d|          }t	          t
          | |ft          t          d|          d|}|S )Nc              3       K   | ]	\  }}|V  
d S r   r   )r   r   r   s      r5   r   z+_create_swin_transformer.<locals>.<genexpr>u  s&      \\da\\\\\\r7   r  )r   r   r&   r   out_indicesT)flatten_sequentialr  )pretrained_filter_fnfeature_cfg)r   rL  r  popr   r!   r  r0  )variant
pretrainedr4  default_out_indicesr  r|  s         r5   _create_swin_transformerr  t  s    \\i

8\8Z8Z.[.[\\\\\**],?@@K *1DkJJJ  	 E Lr7   r  c                 8    | ddddddt           t          ddd	d
|S )Nr  )r&   r  r  )rI   rI   g?r  Tzpatch_embed.projzhead.fcmit)urlr  
input_size	pool_sizecrop_pctr  fixed_input_sizemeanrX   
first_conv
classifierlicenser   )r  r4  s     r5   _cfgr    s:    =v%.B(	  # r7   z.swin_small_patch4_window7_224.ms_in22k_ft_in1kztimm/zvhttps://github.com/SwinTransformer/storage/releases/download/v1.0.8/swin_small_patch4_window7_224_22kto1k_finetune.pth)	hf_hub_idr  z-swin_base_patch4_window7_224.ms_in22k_ft_in1kzlhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224_22kto1k.pthz.swin_base_patch4_window12_384.ms_in22k_ft_in1kzmhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22kto1k.pth)r&     r  )r
  r
  g      ?)r  r  r  r  r  z.swin_large_patch4_window7_224.ms_in22k_ft_in1kzmhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window7_224_22kto1k.pthz/swin_large_patch4_window12_384.ms_in22k_ft_in1kznhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22kto1k.pthz$swin_tiny_patch4_window7_224.ms_in1kzdhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pthz%swin_small_patch4_window7_224.ms_in1kzehttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pthz$swin_base_patch4_window7_224.ms_in1kzdhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224.pthz%swin_base_patch4_window12_384.ms_in1kzehttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384.pthz-swin_tiny_patch4_window7_224.ms_in22k_ft_in1kzuhttps://github.com/SwinTransformer/storage/releases/download/v1.0.8/swin_tiny_patch4_window7_224_22kto1k_finetune.pthz%swin_tiny_patch4_window7_224.ms_in22kzhhttps://github.com/SwinTransformer/storage/releases/download/v1.0.8/swin_tiny_patch4_window7_224_22k.pthiQU  )r  r  r  z&swin_small_patch4_window7_224.ms_in22kzihttps://github.com/SwinTransformer/storage/releases/download/v1.0.8/swin_small_patch4_window7_224_22k.pthz%swin_base_patch4_window7_224.ms_in22kzhhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224_22k.pthz&swin_base_patch4_window12_384.ms_in22kzihttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth)r  r  r  r  r  r  z&swin_large_patch4_window7_224.ms_in22kzihttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window7_224_22k.pthz'swin_large_patch4_window12_384.ms_in22kzjhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pthzswin_s3_tiny_224.ms_in1kzbhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/s3_t-1d53f6a8.pthzbhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/s3_s-3bb4c69d.pthzbhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/s3_b-a1e95db4.pth)zswin_s3_small_224.ms_in1kzswin_s3_base_224.ms_in1kc           	      ^    t          ddddd          }t          	 d	d| it          |fi |S )
z+ Swin-T @ 224x224, trained ImageNet-1k
    r(   rI   r  r  r	  r  r#   r  r  rL   swin_tiny_patch4_window7_224r  )r  r0  r  r  r4  
model_argss      r5   r  r    s^     R`noooJ#&] ]3=]AEjA[A[TZA[A[] ] ]r7   c           	      ^    t          ddddd          }t          	 d	d| it          |fi |S )
z Swin-S @ 224x224
    r(   rI   r  r'   r'      r'   r	  r  swin_small_patch4_window7_224r  )r  r  r  s      r5   r  r    s^     RaopppJ#'^ ^4>^BFzB\B\U[B\B\^ ^ ^r7   c           	      ^    t          ddddd          }t          	 d	d| it          |fi |S )
z Swin-B @ 224x224
    r(   rI      r  r(   rG         r  swin_base_patch4_window7_224r  )r  r  r  s      r5   r  r    s^     SbpqqqJ#&] ]3=]AEjA[A[TZA[A[] ] ]r7   c           	      ^    t          ddddd          }t          	 d	d| it          |fi |S )
z Swin-B @ 384x384
    r(   r
  r  r  r  r  swin_base_patch4_window12_384r  )r  r  r  s      r5   r  r    s^     c-cqrrrJ#'^ ^4>^BFzB\B\U[B\B\^ ^ ^r7   c           	      ^    t          ddddd          }t          	 d	d| it          |fi |S )
z Swin-L @ 224x224
    r(   rI      r  r  r
  r  0   r  swin_large_patch4_window7_224r  )r  r  r  s      r5   r  r    s^     SbqrrrJ#'^ ^4>^BFzB\B\U[B\B\^ ^ ^r7   c           	      ^    t          ddddd          }t          	 d	d| it          |fi |S )
z Swin-L @ 384x384
    r(   r
  r  r  r  r  swin_large_patch4_window12_384r  )r  r  r  s      r5   r  r    s^     c-crsssJ#(_ _5?_CG
C]C]V\C]C]_ _ _r7   c           	      \    t          ddddd          }t          d	d| it          |fi |S )
z; Swin-S3-T @ 224x224, https://arxiv.org/abs/2111.14725
    r(   rI   rI      rI   r  r  r	  r  swin_s3_tiny_224r  )r  r  r  s      r5   r  r    sS     -2l^ln n nJ#ll:lQUV`QkQkdjQkQklllr7   c           	      \    t          ddddd          }t          d	d| it          |fi |S )
z; Swin-S3-S @ 224x224, https://arxiv.org/abs/2111.14725
    r(   )r  r  r  rI   r  r  r	  r  swin_s3_small_224r  )r  r  r  s      r5   r  r    sS     /Raoq q qJ#mmJmRVWaRlRlekRlRlmmmr7   c           	      \    t          ddddd          }t          d	d| it          |fi |S )
z; Swin-S3-B @ 224x224, https://arxiv.org/abs/2111.14725
    r(   r  r  )r'   r'      r'   r	  r  swin_s3_base_224r  )r  r  r  s      r5   r  r    sS     -2m_mo o oJ#ll:lQUV`QkQkdjQkQklllr7   )"swin_base_patch4_window7_224_in22k#swin_base_patch4_window12_384_in22k#swin_large_patch4_window7_224_in22k$swin_large_patch4_window12_384_in22krw  rv  )Hr   loggingr   typingr   r   r   r   r   r=   torch.nnr]   	timm.datar	   r
   timm.layersr   r   r   r   r   r   r   r   r   r   r   r   _builderr   	_featuresr   _features_fxr   _manipulater   r   	_registryr   r   r   vision_transformerr    __all__	getLoggerr   _loggerr   r   r   r6   r9   rE   rz  rG   r   r   r   r!   r  r  r  default_cfgsr  r  r  r  r  r  r  r  r  r   r7   r5   <module>r     s  
 
"   9 9 9 9 9 9 9 9 9 9 9 9 9 9        A A A A A A A AU U U U U U U U U U U U U U U U U U U U U U U U U U U U * * * * * * + + + + + + 3 3 3 3 3 3 4 4 4 4 4 4 4 4 Y Y Y Y Y Y Y Y Y Y 4 4 4 4 4 4

'
H
%
%#uS#X./ <38_ \   ( sCx S S    "	#s 	#3 	# 	# 	# 	#r r r r rbi r r rjI I I I I29 I I IX         29      Fp p p p p29 p p pfU U U U Ubi U U Up& & &R
 
 
 
    %$ H&4dd E7H 7H 7HH& 4TTz6} 6} 6}	H& 5dd{ Hs7D 7D 7DH& 5dd{7~ 7~ 7~H& 6tt| Hs8D 8D 8DH&& +DDr-u -u -u'H&, ,TTs.v .v .v-H&2 +DDr-u -u -u3H&8 ,TTs Hs.D .D .D9H&D 4TT D6F 6F 6FEH&L ,TTv. . .MH&T -ddw/ / /UH&\ ,TTv. . .]H&d -ddw HsPU/W /W /WeH&l -ddw/ / /mH&t .ttx HsPU0W 0W 0WuH&~ p!r !r !rH&D "&p"r "r "r !%p!r !r !rKH& H& H& H HV ] ] ] ] ] ] ^ ^ ^ ^ ^ ^ ] ] ] ] ] ] ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ _ _/ _ _ _ _ m mO m m m m n n_ n n n n m mO m m m m  H*Q+S+S,U	' '     r7   