
    Ng                     R   d Z ddlZddlmZmZmZmZmZ ddlZddl	m
Z
 ddlm
c mZ ddlmc mZ ddlmZmZ ddlmZmZmZmZmZmZmZmZmZmZmZ ddl m!Z! ddl"m#Z# dd	l$m%Z% dd
l&m'Z'm(Z(m)Z) dgZ*ee+ee+e+f         f         Z,dej-        dee+e+f         dej-        fdZ.e%dej-        dee+e+f         dee+e+f         dej-        fd            Z/ G d de
j0                  Z1 G d de
j0                  Z2 G d de
j0                  Z3 G d de
j0                  Z4 G d de
j0                  Z5d Z6dKdZ7dLd Z8 e' e8d!d"#           e8d!d$d%d&d'(           e8d!d)#           e8d!d*d%d&d'(           e8d!d+#           e8d!d,#           e8d!d-#           e8d!d.#           e8d!d/#           e8d!d0#           e8d!d1d2d3d45           e8d!d6d2d3d45          d7          Z9e(dKde5fd8            Z:e(dKde5fd9            Z;e(dKde5fd:            Z<e(dKde5fd;            Z=e(dKde5fd<            Z>e(dKde5fd=            Z?e(dKde5fd>            Z@e(dKde5fd?            ZAe(dKde5fd@            ZBe(dKde5fdA            ZCe(dKde5fdB            ZDe(dKde5fdC            ZE e)eFdDdEdFdGdHdIdJ           dS )MaK   Swin Transformer V2
A PyTorch impl of : `Swin Transformer V2: Scaling Up Capacity and Resolution`
    - https://arxiv.org/abs/2111.09883

Code/weights from https://github.com/microsoft/Swin-Transformer, original copyright/license info below

Modifications and additions for timm hacked together by / Copyright 2022, Ross Wightman
    N)CallableListOptionalTupleUnionIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)
PatchEmbedMlpDropPath	to_2tupletrunc_normal__assertClassifierHeadresample_patch_embedndgridget_act_layer	LayerType   )build_model_with_cfg)feature_take_indices)register_notrace_function)generate_default_cfgsregister_modelregister_model_deprecationsSwinTransformerV2xwindow_sizereturnc                 *   | j         \  }}}}|                     |||d         z  |d         ||d         z  |d         |          } |                     dddddd                                                              d|d         |d         |          }|S )z
    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    r   r               shapeviewpermute
contiguous)r   r   BHWCwindowss          [/var/www/html/ai-engine/env/lib/python3.11/site-packages/timm/models/swin_transformer_v2.pywindow_partitionr2   $   s     JAq!Q	q!{1~%{1~qKN7JKXYN\]^^Aii1aAq))4466;;BAP[\]P^`abbGN    r0   img_sizec                    |\  }}| j         d         }|                     d||d         z  ||d         z  |d         |d         |          }|                    dddddd                                                              d|||          }|S )z
    Args:
        windows: (num_windows * B, window_size[0], window_size[1], C)
        window_size (Tuple[int, int]): Window size
        img_size (Tuple[int, int]): Image size

    Returns:
        x: (B, H, W, C)
    r&   r   r   r"   r#   r$   r%   r'   )r0   r   r4   r-   r.   r/   r   s          r1   window_reverser6   3   s     DAqbARk!n,a;q>.A;q>S^_`SacdeeA			!Q1a##..0055b!QBBAHr3   c                        e Zd ZdZ	 	 	 	 	 ddedeeef         ded	ed
edededeeef         ddf fdZd Z	deeef         ddfdZ
ddej        deej                 dej        fdZ xZS )WindowAttentiona   Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
        pretrained_window_size (tuple[int]): The height and width of the window in pre-training.
    TF        r   r   dimr   	num_headsqkv_biasqkv_bias_separate	attn_drop	proj_droppretrained_window_sizer    Nc	           
      @   t                                                       || _        || _        t	          |          | _        || _        || _        t          j	        t          j        dt          j        |ddf          z                      | _        t          j        t          j        ddd          t          j        d          t          j        d|d                    | _        t          j        ||d	z  d          | _        |rt          j	        t          j        |                    | _        |                     d
t          j        |          d           t          j	        t          j        |                    | _        nd | _        d | _        d | _        t          j        |          | _        t          j        ||          | _        t          j        |          | _        t          j        d          | _        |                                  d S )N
   r   r#   i   Tbias)inplaceFr"   k_bias
persistentr&   r;   )super__init__r;   r   r   rA   r<   r>   nn	Parametertorchlogoneslogit_scale
SequentialLinearReLUcpb_mlpqkvzerosq_biasregister_bufferv_biasrG   Dropoutr?   projr@   Softmaxsoftmax"_make_pair_wise_relative_positions)
selfr;   r   r<   r=   r>   r?   r@   rA   	__class__s
            r1   rL   zWindowAttention.__init__S   s    	&&/0F&G&G#"!2<	"uz9aQRBS7T7T2T(U(UVV }Ia4(((GD!!!Ic95111
 
 9S#'666 	,u{3'7'788DK  5;s+;+; NNN,u{3'7'788DKKDKDKDKI..Ic3''	I..zb)))//11111r3   c                    t          j        | j        d         dz
   | j        d                                       t           j                  }t          j        | j        d         dz
   | j        d                                       t           j                  }t          j        t          ||                    }|                    ddd                                          	                    d          }| j
        d         dk    rQ|d d d d d d dfxx         | j
        d         dz
  z  cc<   |d d d d d d dfxx         | j
        d         dz
  z  cc<   nP|d d d d d d dfxx         | j        d         dz
  z  cc<   |d d d d d d dfxx         | j        d         dz
  z  cc<   |dz  }t          j        |          t          j        t          j        |          dz             z  t          j        d          z  }|                     d|d           t          j        | j        d                   }t          j        | j        d                   }t          j        t          ||                    }t          j        |d          }|d d d d d f         |d d d d d f         z
  }|                    ddd                                          }|d d d d dfxx         | j        d         dz
  z  cc<   |d d d d dfxx         | j        d         dz
  z  cc<   |d d d d dfxx         d| j        d         z  dz
  z  cc<   |                    d	          }	|                     d
|	d           d S )Nr   r   r#            ?relative_coords_tableFrH   r&   relative_position_index)rO   aranger   tofloat32stackr   r*   r+   	unsqueezerA   signlog2absmathrZ   flattensum)
ra   relative_coords_hrelative_coords_wrf   coords_hcoords_wcoordscoords_flattenrelative_coordsrg   s
             r1   r`   z2WindowAttention._make_pair_wise_relative_positions~   s   !L4+;A+>+B)CTEUVWEXYY\\]b]jkk!L4+;A+>+B)CTEUVWEXYY\\]b]jkk %F3DFW,X,X Y Y 5 = =aA F F Q Q S S ] ]^_ ` `&q)A--!!!!QQQ1*---$2Ma2PST2TU---!!!!QQQ1*---$2Ma2PST2TU----!!!!QQQ1*---$2B12E2IJ---!!!!QQQ1*---$2B12E2IJ---" %
+@ A AEJI+,,s2E4 E4 !46:ill!C46KX]^^^ < 0 344< 0 344VHh7788vq11(AAAt4~aaaqqqj7QQ)11!Q::EEGG111a   D$4Q$7!$;;   111a   D$4Q$7!$;;   111a   A(8(;$;a$??   "1"5"5b"9"968O\abbbbbr3   c                 t    t          |          }|| j        k    r|| _        |                                  dS dS )zzUpdate window size & interpolate position embeddings
        Args:
            window_size (int): New window size
        N)r   r   r`   )ra   r   s     r1   set_window_sizezWindowAttention.set_window_size   sG    
  ,,$****D3355555 +*r3   r   maskc                    |j         \  }}}| j        |                     |          }nit          j        | j        | j        | j        f          }| j        r|                     |          }||z  }n!t          j	        || j        j
        |          }|                    ||d| j        d                              ddddd          }|                    d          \  }}	}
t          j        |d	          t          j        |	d	                              d
d          z  }t          j        | j        t'          j        d                                                    }||z  }|                     | j                                      d| j                  }|| j                            d                                       | j        d         | j        d         z  | j        d         | j        d         z  d          }|                    ddd                                          }dt          j        |          z  }||                    d          z   }||j         d         }|                    d|| j        ||          |                    d                              d          z   }|                    d| j        ||          }|                     |          }n|                     |          }|                     |          }||
z                      dd                              |||          }|                      |          }| !                    |          }|S )z
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        N)weightrE   r"   r&   r#   r   r   r$   rJ   g      Y@)max   )"r(   rY   rW   rO   catrG   r[   r>   Flinearr~   reshaper<   r*   unbind	normalize	transposeclamprR   rp   rP   exprV   rf   r)   rg   r   r+   sigmoidrl   r_   r?   r]   r@   )ra   r   r|   B_Nr/   rW   r=   qkvattnrR   relative_position_bias_tablerelative_position_biasnum_wins                   r1   forwardzWindowAttention.forward   s    7Aq;((1++CCy$+t{DK!HIIH% IhhqkkxhqxHHHkk"aDNB77??1aANN**Q--1a A2&&&QB)?)?)?)I)I"b)Q)QQk$"28K8KLLLPPRRk!'+||D4N'O'O'T'TUWY]Yg'h'h$!=d>Z>_>_`b>c>c!d!i!iQ$"21"55t7G7JTM]^_M`7`bd"f "f!7!?!?1a!H!H!S!S!U!U!#em4J&K&K!K,66q999jmG99R$.!Q??$..QRBSBSB]B]^_B`B``D99RA66D<<%%DD<<%%D~~d##AX  A&&..r1a88IIaLLNN1r3   )TFr9   r9   r:   N)__name__
__module____qualname____doc__intr   boolfloatrL   r`   r{   rO   Tensorr   r   __classcell__rb   s   @r1   r8   r8   E   s;        $ "&+!!6<)2 )2)2 sCx)2 	)2
 )2  $)2 )2 )2 %*#s(O)2 
)2 )2 )2 )2 )2 )2Vc c c<65c? 6t 6 6 6 6- - -Xel-C -u| - - - - - - - -r3   r8   c                       e Zd ZdZddddddddddej        dfd	ed
edededededede	dede	de	de	de
dej        def fdZd$deej                 deej                 fdZ	 d$dedee         deeeef         eeef         f         fdZ	 d$d eeef         deeef         dee         fd!Zdej        dej        fd"Zdej        dej        fd#Z xZS )%SwinTransformerV2Blockz Swin Transformer Block.
       r   F      @Tr9   gelur;   input_resolutionr<   r   
shift_sizealways_partitiondynamic_mask	mlp_ratior=   r@   r?   	drop_path	act_layer
norm_layerrA   c                    t                                                       || _        t          |          | _        || _        t          |          | _        || _        || _        | 	                    ||          \  | _
        | _        | j
        d         | j
        d         z  | _        || _        t          |          }t          |t          | j
                  ||	||
t          |                    | _         ||          | _        |dk    rt%          |          nt'          j                    | _        t-          |t/          ||z            ||
          | _         ||          | _        |dk    rt%          |          nt'          j                    | _        |                     d| j        rdn|                                 d	           dS )
a  
        Args:
            dim: Number of input channels.
            input_resolution: Input resolution.
            num_heads: Number of attention heads.
            window_size: Window size.
            shift_size: Shift size for SW-MSA.
            always_partition: Always partition into full windows and shift
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: If True, add a learnable bias to query, key, value.
            proj_drop: Dropout rate.
            attn_drop: Attention dropout rate.
            drop_path: Stochastic depth rate.
            act_layer: Activation layer.
            norm_layer: Normalization layer.
            pretrained_window_size: Window size in pretraining.
        r   r   )r   r<   r=   r?   r@   rA   r9   )in_featureshidden_featuresr   drop	attn_maskNFrH   )rK   rL   r;   r   r   r<   target_shift_sizer   r   _calc_window_shiftr   r   window_arear   r   r8   r   norm1r   rM   Identity
drop_path1r   r   mlpnorm2
drop_path2rZ   get_attn_mask)ra   r;   r   r<   r   r   r   r   r   r=   r@   r?   r   r   r   rA   rb   s                   r1   rL   zSwinTransformerV2Block.__init__   s   F 	 )*: ; ;"!*:!6!6 0(,0,C,CKQ[,\,\)$/+A.1A!1DD"!),,	#!$"233#,-C#D#D
 
 
	  Z__
1:R(9---R[]]i00	
 
 
  Z__
1:R(9---R[]]%?DD4+=+=+?+? 	 	
 	
 	
 	
 	
r3   Nr   r    c           	         t          | j                  r| t          j        dg| j        dR           }n;t          j        d|j        d         |j        d         df|j        |j                  }d}d| j        d          f| j        d          | j        d          f| j        d          d ffD ]n}d| j        d          f| j        d          | j        d          f| j        d          d ffD ]0}||d d |d         |d         |d         |d         d d f<   |dz  }1ot          || j                  }|
                    d| j                  }|                    d          |                    d          z
  }|                    |dk    t          d                                        |dk    t          d                    }nd }|S )Nr   r#   )dtypedevicer   r&   g      Yr9   )anyr   rO   rX   r   r(   r   r   r   r2   r)   r   rl   masked_fillr   )ra   r   img_maskcnthwmask_windowsr   s           r1   r   z$SwinTransformerV2Block.get_attn_mask$  s   t 	y ;'ED,A'E1'E'EFF ;171:qwqz1'EQW]^]efffC)!,,-&q))DOA,>+>?oa(($/   T-a001*1--0B/BC/!,,d3  A
 <?HQQQ!QqT	1Q4!9aaa781HCC ,Hd6FGGL',,R1ABBL$..q11L4J4J14M4MMI!--i1neFmmLLXXYbfgYginorisisttIIIr3   target_window_sizer   c                    t          |          }|-| j        }t          |          r|d         dz  |d         dz  f}nt          |          }| j        r||fS t          |          }t          |          }d t	          | j        |          D             }d t	          | j        ||          D             }t          |          t          |          fS )Nr   r#   r   c                 (    g | ]\  }}||k    r|n|S  r   ).0rr   s      r1   
<listcomp>z=SwinTransformerV2Block._calc_window_shift.<locals>.<listcomp>T  s(    eeedaAFFqqeeer3   c                 *    g | ]\  }}}||k    rd n|S r   r   )r   r   r   ss       r1   r   z=SwinTransformerV2Block._calc_window_shift.<locals>.<listcomp>U  s*    sssWQ1166aaqsssr3   )r   r   r   r   zipr   tuple)ra   r   r   r   r   s        r1   r   z)SwinTransformerV2Block._calc_window_shift@  s    
 ''9::$ $ 6$%% ]%7%:a%?ASTUAVZ[A[$\! )*; < <  	9%'888&'9::%&788eec$:OQc6d6deeessD<QS^`q8r8rsss
[!!5#4#444r3   	feat_sizec                 h   || _         ||| _        |                     t          |                    \  | _        | _        | j        d         | j        d         z  | _        | j                            | j                   | 	                    d| j
        rdn|                                 d           dS )z Updates the input resolution, window size.

        Args:
            feat_size (Tuple[int, int]): New input resolution
            window_size (int): New window size
            always_partition: Change always_partition attribute if not None
        Nr   r   r   FrH   )r   r   r   r   r   r   r   r   r{   rZ   r   r   )ra   r   r   r   s       r1   set_input_sizez%SwinTransformerV2Block.set_input_sizeX  s     !*'$4D!,0,C,CIkDZDZ,[,[)$/+A.1A!1DD	!!$"2333%?DD4+=+=+?+? 	 	
 	
 	
 	
 	
r3   c           	         |j         \  }}}}t          | j                  }|r2t          j        || j        d          | j        d          fd          }n|}| j        d         || j        d         z  z
  | j        d         z  }| j        d         || j        d         z  z
  | j        d         z  }	t          j        j                            |ddd|	d|f          }|j         \  }
}}}
t          || j                  }|
                    d| j        |          }t          | dd          r|                     |          }n| j        }|                     ||          }|
                    d| j        d         | j        d         |          }t!          || j        ||f          }|d d d |d |d d f                                         }|rt          j        || j        d          }n|}|S )	Nr   r   )r   r#   )shiftsdimsr&   r   F)r|   )r(   r   r   rO   rollr   rM   
functionalpadr2   r)   r   getattrr   r   r   r6   r+   )ra   r   r,   r-   r.   r/   	has_shift	shifted_xpad_hpad_w_HpWp	x_windowsr   attn_windowss                   r1   _attnzSwinTransformerV2Block._attnr  s   W
1a ((	 	
1tq/A.ADOTUDVCV-W^deeeIII!!$q4+;A+>'>>$BRSTBUU!!$q4+;A+>'>>$BRSTBUUH'++I1a57QRR	 2r1 %Y0@AA	NN2t'7;;	 4// 	'**955IIIyyy;; $((T-=a-@$BRSTBUWXYY"<1AB8LL	aaa!RaRl+6688	  	
9T_6JJJAAAr3   c                 t   |j         \  }}}}||                     |                     |                     |                              z   }|                    |d|          }||                     |                     |                     |                              z   }|                    ||||          }|S )Nr&   )r(   r   r   r   r   r   r   r   )ra   r   r,   r-   r.   r/   s         r1   r   zSwinTransformerV2Block.forward  s    W
1a

4::a== 9 9:::IIaQ

488A;; 7 7888IIaAq!!r3   r   )r   r   r   r   rM   	LayerNormr   _int_or_tuple_2_tr   r   r   ModulerL   r   rO   r   r   r   r   r   r   r   r   r   s   @r1   r   r      sL         ./,-%*!&!!!!!#)$&L89!H
 H
H
 0H
 	H

 +H
 *H
 #H
 H
 H
 H
 H
 H
 H
 !H
 	H
  %6!H
 H
 H
 H
 H
 H
T x5 %,AW    > >B5 5 15  ((9:5 
uS#Xc3h/	0	5 5 5 58 04	
 
S#X
 sCx
 'tn	
 
 
 
4$u| $ $ $ $ $L %,        r3   r   c                   z     e Zd ZdZdej        fdedee         dej        f fdZ	de
j        de
j        fd	Z xZS )
PatchMergingz Patch Merging Layer.
    Nr;   out_dimr   c                     t                                                       || _        |pd|z  | _        t	          j        d|z  | j        d          | _         || j                  | _        dS )z
        Args:
            dim (int): Number of input channels.
            out_dim (int): Number of output channels (or 2 * dim if None)
            norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
        r#   r$   FrD   N)rK   rL   r;   r   rM   rT   	reductionnorm)ra   r;   r   r   rb   s       r1   rL   zPatchMerging.__init__  sg     	)!c'1s7DLuEEEJt|,,			r3   r   r    c                 |   |j         \  }}}}ddd|dz  d|dz  f}t          j                            ||          }|j         \  }}}}|                    ||dz  d|dz  d|                              dddddd                              d          }|                     |          }|                     |          }|S )Nr   r#   r   r"   r$   r%   )	r(   rM   r   r   r   r*   rq   r   r   )ra   r   r,   r-   r.   r/   
pad_valuesr   s           r1   r   zPatchMerging.forward  s    W
1aAq1uaQ/
Ma,,W
1aIIaaAFAq1199!Q1aKKSSTUVVNN1IIaLLr3   )r   r   r   r   rM   r   r   r   r   rL   rO   r   r   r   r   s   @r1   r   r     s          &*$&L	- -- c]- 		- - - - - -$
 
%, 
 
 
 
 
 
 
 
r3   r   c            '           e Zd ZdZdddddddddej        ddfded	ed
ededededededede	dede	de	de	de
eef         dej        dededdf& fdZ	 d"deeef         dedee         fdZdej        dej        fd Zd#d!Z xZS )$SwinTransformerV2Stagez" A Swin Transformer V2 Stage.
    Fr   Tr9   r   r   r;   r   r   depthr<   r   r   r   
downsampler   r=   r@   r?   r   r   r   rA   output_nchwr    Nc                 4   
 t                                                       | _        | _        |	rt	          d |D                       n| _        | _        | _        d _        t                    t	          d D                       |	rt          |           _        n |k    sJ t          j                     _        t          j        
 fdt          |          D                        _        dS )a  
        Args:
            dim: Number of input channels.
            out_dim: Number of output channels.
            input_resolution: Input resolution.
            depth: Number of blocks.
            num_heads: Number of attention heads.
            window_size: Local window size.
            always_partition: Always partition into full windows and shift
            dynamic_mask: Create attention mask in forward based on current input size
            downsample: Use downsample layer at start of the block.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: If True, add a learnable bias to query, key, value.
            proj_drop: Projection dropout rate
            attn_drop: Attention dropout rate.
            drop_path: Stochastic depth rate.
            act_layer: Activation layer type.
            norm_layer: Normalization layer.
            pretrained_window_size: Local window size in pretraining.
            output_nchw: Output tensors on NCHW format instead of NHWC.
        c              3       K   | ]	}|d z  V  
dS r#   Nr   r   is     r1   	<genexpr>z2SwinTransformerV2Stage.__init__.<locals>.<genexpr>  s&      &H&H!qAv&H&H&H&H&H&Hr3   Fc                     g | ]}|d z  S r#   r   )r   r   s     r1   r   z3SwinTransformerV2Stage.__init__.<locals>.<listcomp>  s    888qAF888r3   )r;   r   r   c                     g | ]M}t          
j        	|d z  dk    rdnt          t                    r|         n          NS )r#   r   )r;   r   r<   r   r   r   r   r   r=   r@   r?   r   r   r   rA   )r   output_resolution
isinstancelist)r   r   r   r   r?   r   r   r   r   r<   r   rA   r@   r=   ra   r   r   s     r1   r   z3SwinTransformerV2Stage.__init__.<locals>.<listcomp>  s     %# %# %#$ # #!%!7#'!"Q!11*!1)#!##*4Y*E*ET)A,,9#%'=  %# %# %#r3   N)rK   rL   r;   r   r   r  r   r   grad_checkpointingr   r   r   rM   r   
ModuleListrangeblocks)ra   r;   r   r   r   r<   r   r   r   r   r   r=   r@   r?   r   r   r   rA   r   r   rb   s   ` `  ```` ```````` @r1   rL   zSwinTransformerV2Stage.__init__  so   T 	 0LV!l&H&H7G&H&H&H!H!H!H\l
&"',,88K88899
  	,*sGPZ[[[DOO'>>>> kmmDO m %# %# %# %# %# %# %# %# %# %# %# %# %# %# %# %# %# %#$ 5\\%%# %# %# $ $r3   r   c                 $   || _         t          | j        t          j                  r|| _        n:t          | j        t                    sJ t          d |D                       | _        | j        D ]}|	                    | j        ||            dS )a   Updates the resolution, window size and so the pair-wise relative positions.

        Args:
            feat_size: New input (feature) resolution
            window_size: New window size
            always_partition: Always partition / shift the window
        c              3       K   | ]	}|d z  V  
dS r   r   r   s     r1   r   z8SwinTransformerV2Stage.set_input_size.<locals>.<genexpr>*  s&      *E*Ea16*E*E*E*E*E*Er3   r   r   r   N)
r   r  r   rM   r   r  r   r   r  r   )ra   r   r   r   blocks        r1   r   z%SwinTransformerV2Stage.set_input_size  s     !*dor{33 	F%.D""do|<<<<<%**E*E9*E*E*E%E%ED"[ 	 	E  0'!1 !    	 	r3   r   c                     |                      |          }| j        D ]H}| j        r4t          j                                        st          j        ||          }= ||          }I|S r   )r   r  r  rO   jitis_scripting
checkpoint)ra   r   blks      r1   r   zSwinTransformerV2Stage.forward2  sl    OOA; 	 	C& uy/E/E/G/G )#q11CFFr3   c                 j   | j         D ]}t          j                            |j        j        d           t          j                            |j        j        d           t          j                            |j        j        d           t          j                            |j        j        d           d S )Nr   )r  rM   init	constant_r   rE   r~   r   )ra   r  s     r1   _init_respostnormz(SwinTransformerV2Stage._init_respostnorm<  s    ; 	3 	3CGcina000Gci.222Gcina000Gci.2222		3 	3r3   r   )r    N)r   r   r   r   rM   r   r   r   r   r   r   strr   r   rL   r   r   r   rO   r   r   r  r   r   s   @r1   r   r     s         &+!&$!!!!!.4$&L89 %'N$ N$N$ N$ 0	N$
 N$ N$ +N$ #N$ N$ N$ N$ N$ N$ N$ N$  S(]+!N$" 	#N$$ %6%N$& 'N$( 
)N$ N$ N$ N$ N$ N$h 04	 S#X  'tn	   4 %,    3 3 3 3 3 3 3 3r3   r   c            +       f    e Zd ZdZdddddddd	d
dddddddddej        dfdedededededede	edf         de	edf         dede
de
dede
d ed!ed"ed#ed$eeef         d%ed&e	edf         f( fd'Zd( Z	 	 	 	 	 dCdee	eef                  dee	eef                  dee	eef                  d+ee         dee
         f
d,Zej        j        d-             Zej        j        dDd.            Zej        j        dEd/            Zej        j        d0ej        fd1            ZdFdedee         fd2Z	 	 	 	 	 dGd4ej        d5eeeee         f                  d6e
d7e
d8ed9e
d0eeej                 e	ej        eej                 f         f         fd:Z	 	 	 dHd5eeee         f         d<e
d=e
fd>Zd? ZdDd@e
fdAZ dB Z! xZ"S )Ir   z Swin Transformer V2

    A PyTorch impl of : `Swin Transformer V2: Scaling Up Capacity and Resolution`
        - https://arxiv.org/abs/2111.09883
       r$   r"     avg`   r#   r#      r#   r"   r        r   FTr   r9   g?r   )r   r   r   r   r4   
patch_sizein_chansnum_classesglobal_pool	embed_dimdepths.r<   r   r   strict_img_sizer   r=   	drop_rateproj_drop_rateattn_drop_ratedrop_path_rater   r   pretrained_window_sizesc           
         t                                                       || _        |dv sJ || _        d| _        t          |          | _        | _        t          d| j        dz
  z  z            x| _	        | _
        g | _        t          t          t          f          s fdt          | j                  D             t!          |||d         ||d          | _        | j        j        }d t'          j        d|t+          |                                        |          D             }g }d         }d}t          | j                  D ]}|         }|t/          di d	|d
|d|d         |z  |d         |z  fd||         d|dk    d||         d|	d|
d| d|d|d|d|d||         d|d|d||         gz  }|}|dk    r|dz  }| xj        t1          |d|z  d|           gz  c_        t3          j        | | _         || j	                  | _        t;          | j	        |||| j                  | _        |                     | j                    | j        D ]}|!                                 dS ) a]  
        Args:
            img_size: Input image size.
            patch_size: Patch size.
            in_chans: Number of input image channels.
            num_classes: Number of classes for classification head.
            embed_dim: Patch embedding dimension.
            depths: Depth of each Swin Transformer stage (layer).
            num_heads: Number of attention heads in different layers.
            window_size: Window size.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: If True, add a learnable bias to query, key, value.
            drop_rate: Head dropout rate.
            proj_drop_rate: Projection dropout rate.
            attn_drop_rate: Attention dropout rate.
            drop_path_rate: Stochastic depth rate.
            norm_layer: Normalization layer.
            act_layer: Activation layer type.
            patch_norm: If True, add normalization after patch embedding.
            pretrained_window_sizes: Pretrained window sizes of each layer.
            output_fmt: Output tensor format if not None, otherwise output 'NHWC' by default.
        ) r  NHWCr#   r   c                 :    g | ]}t          d |z  z            S r   )r   )r   r   r$  s     r1   r   z.SwinTransformerV2.__init__.<locals>.<listcomp>  s*    QQQQYa/00QQQr3   r   )r4   r   r!  r$  r   r&  
output_fmtc                 6    g | ]}|                                 S r   )tolist)r   r   s     r1   r   z.SwinTransformerV2.__init__.<locals>.<listcomp>  s     ```aqxxzz```r3   r;   r   r   r   r   r<   r   r   r   r   r=   r@   r?   r   r   r   rA   r$   layers.)num_chsr   module)	pool_typer'  	input_fmtNr   )"rK   rL   r"  r#  r0  len
num_layersr$  r   num_featureshead_hidden_sizefeature_infor  r   r  r  r   patch_embed	grid_sizerO   linspacerr   splitr   dictrM   rS   layersr   r   headapply_init_weightsr  )ra   r4   r   r!  r"  r#  r$  r%  r<   r   r   r&  r   r=   r'  r(  r)  r*  r   r   r+  kwargsr>  dprrB  in_dimscaler   r   blyrb   s         `                       r1   rL   zSwinTransformerV2.__init__K  sp   \ 	&k))))& f++"47	A$/\]J]D^8^4_4__D1)eT]33 	RQQQQ%:P:PQQQI &!l!+
 
 
 $.	``5>!^S[[#Q#Q#W#WX^#_#_```1t'' 	d 	dAlG-   F #,A,%"719N!O!O Qii	
 q55 $A,, (K "2!1 "100 $) " ). ). a&& $)  &:!" (?q'A'A#  F& F1uu
$w!e)Ta^_TaTa"b"b"b!ccmV,Jt011	"!o
 
 
	 	

4%&&&; 	$ 	$C!!####	$ 	$r3   c                     t          |t          j                  r^t          |j        d           t          |t          j                  r0|j        +t          j                            |j        d           d S d S d S d S )Ng{Gz?)stdr   )r  rM   rT   r   r~   rE   r  r  )ra   ms     r1   rE  zSwinTransformerV2._init_weights  s    a## 	-!(,,,,!RY'' -AF,>!!!&!,,,,,	- 	-- -,>,>r3   Nrd   window_ratioc                 V   ||(| j                             ||           | j         j        }|t          fd|D                       }t	          | j                  D ]G\  }}dt          |dz
  d          z  }	|                    |d         |	z  |d         |	z  f||           HdS )aT  Updates the image resolution, window size, and so the pair-wise relative positions.

        Args:
            img_size (Optional[Tuple[int, int]]): New input resolution, if None current resolution is used
            patch_size (Optional[Tuple[int, int]): New patch size, if None use current patch size
            window_size (Optional[int]): New window size, if None based on new_img_size // window_div
            window_ratio (int): divisor for calculating window size from patch grid size
            always_partition: always partition / shift windows even if feat size is < window
        N)r4   r   c                     g | ]}|z  S r   r   )r   r   rN  s     r1   r   z4SwinTransformerV2.set_input_size.<locals>.<listcomp>  s     F F Fql!2 F F Fr3   r#   r   r   r
  )r=  r   r>  r   	enumeraterB  r   )
ra   r4   r   r   rN  r   r>  indexstagestage_scales
       `     r1   r   z SwinTransformerV2.set_input_size  s    " :#9++X*+UUU(2I<#; F F F FI F F FGGK%dk22 	 	LE5s519a000K  $Q<;6	!8ST'!1 !    	 	r3   c                     t                      }|                                 D ]5\  }t          fddD                       r|                               6|S )Nc                     g | ]}|v S r   r   )r   kwns     r1   r   z5SwinTransformerV2.no_weight_decay.<locals>.<listcomp>  s    AAAB!GAAAr3   )rV   rR   )setnamed_modulesr   add)ra   nodrM  rX  s      @r1   no_weight_decayz!SwinTransformerV2.no_weight_decay  sf    ee&&(( 	 	DAqAAAA&@AAABB 



r3   c                 0    t          d|rdng d          S )Nz^absolute_pos_embed|patch_embedz^layers\.(\d+)))z^layers\.(\d+).downsampler   )z^layers\.(\d+)\.\w+\.(\d+)N)z^norm)i )stemr  )rA  )ra   coarses     r1   group_matcherzSwinTransformerV2.group_matcher  s9    3(. $$ 5 5 5
 
 
 	
r3   c                 (    | j         D ]	}||_        
d S r   )rB  r  )ra   enablels      r1   set_grad_checkpointingz(SwinTransformerV2.set_grad_checkpointing  s(     	* 	*A#)A  	* 	*r3   r    c                     | j         j        S r   )rC  fc)ra   s    r1   get_classifierz SwinTransformerV2.get_classifier  s    y|r3   c                 J    || _         | j                            ||           d S r   )r"  rC  reset)ra   r"  r#  s      r1   reset_classifierz"SwinTransformerV2.reset_classifier  s&    &	[11111r3   NCHWr   indicesr   
stop_earlyr0  intermediates_onlyc                 x   |dv s
J d            g }t          t          | j                  |          \  }}	|                     |          }t          | j                  }
t          j                                        s|s| j        }n| j        d|	dz            }t          |          D ]v\  }} ||          }||v rb|r||
dz
  k    r|                     |          }n|}|	                    dddd          
                                }|                    |           w|r|S |                     |          }||fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )rl  zOutput shape must be NCHW.Nr   r   r"   r#   )r   r8  rB  r=  rO   r  r  rQ  r   r*   r+   append)ra   r   rm  r   rn  r0  ro  intermediatestake_indices	max_index
num_stagesstagesr   rS  x_inters                  r1   forward_intermediatesz'SwinTransformerV2.forward_intermediates  s^   * Y&&&(D&&&"6s4;7G7G"Q"Qi Q%%
9!!## 	1: 	1[FF[)a-0F!&)) 	. 	.HAuaAL    Aa//"iillGGG!//!Q155@@BB$$W--- 	!  IIaLL-r3   r   
prune_norm
prune_headc                     t          t          | j                  |          \  }}| j        d|dz            | _        |rt          j                    | _        |r|                     dd           |S )z@ Prune layers not required for specified intermediates.
        Nr   r   r-  )r   r8  rB  rM   r   r   rk  )ra   rm  ry  rz  rs  rt  s         r1   prune_intermediate_layersz+SwinTransformerV2.prune_intermediate_layers7  sq     #7s4;7G7G"Q"Qik.9q=.1 	&DI 	)!!!R(((r3   c                     |                      |          }|                     |          }|                     |          }|S r   )r=  rB  r   ra   r   s     r1   forward_featuresz"SwinTransformerV2.forward_featuresG  s8    QKKNNIIaLLr3   
pre_logitsc                 ^    |r|                      |d          n|                      |          S )NT)r  )rC  )ra   r   r  s      r1   forward_headzSwinTransformerV2.forward_headM  s,    0:Ltyyty,,,		!Lr3   c                 Z    |                      |          }|                     |          }|S r   )r  r  r~  s     r1   r   zSwinTransformerV2.forwardP  s-    !!!$$a  r3   )NNNrd   NF)Tr   )NFFrl  F)r   FT)#r   r   r   r   rM   r   r   r   r  r   r   r   r   r   rL   rE  r   r   rO   r  ignorer]  ra  re  r   rh  rk  r   r   rx  r|  r  r  r   r   r   s   @r1   r   r   D  s'         +.#$&2)7-.%*$(!!!$&$&$'.4#%<7C+r$ r$'r$ r$ 	r$
 r$ r$ r$ #s(Or$ S#Xr$ +r$ #r$ "r$ r$ r$ r$  "!r$" "#r$$ "%r$& S(]+'r$( !)r$* &+38_+r$ r$ r$ r$ r$ r$h- - - 374859*+/3 uS#X/ !sCx1 "%S/2	
 #3- 'tn   @ Y   Y
 
 
 
 Y* * * * Y	    2 2C 2hsm 2 2 2 2 8<$$',0  0 |0  eCcN340  	0 
 0  0  !%0  
tEL!5tEL7I)I#JJ	K0  0  0  0 h ./$#	 3S	>*  	      M M$ M M M M      r3   c                   	 |                      d|           } |                      d|           } d| v }i }dd l}|                                 D ]\  	}t          	fddD                       r!d	v rS|j        j        j        j        \  }}}}|j        d         |k    s|j        d	         |k    rt          |||fd
dd          }|s.|	                    dd 	          		
                    dd          	||	<   |S )Nmodel
state_dictzhead.fc.weightr   c                     g | ]}|v S r   r   )r   rX  r   s     r1   r   z(checkpoint_filter_fn.<locals>.<listcomp>]  s    bbb1Qbbbr3   )rg   rf   r   zpatch_embed.proj.weightr   r&   bicubicT)interpolation	antialiasverbosezlayers.(\d+).downsamplec                 T    dt          |                     d                    dz    dS )Nr3  r   z.downsample)r   group)r   s    r1   <lambda>z&checkpoint_filter_fn.<locals>.<lambda>m  s)    =gs177ST::YZGZ=g=g=g r3   zhead.zhead.fc.)getreitemsr   r=  r]   r~   r(   r   subreplace)
r  r  native_checkpointout_dictr  r   r   r-   r.   r   s
            @r1   checkpoint_filter_fnr  V  s>   44Jj99J(J6HIII  ""  1bbbb abbbcc 	$))*/6<JAq!Qwr{a172;!#3#3(F"+"    ! 	/13g3gijkkA		':..AOr3   Fc           	          t          d t          |                    dd                    D                       }|                    d|          }t	          t
          | |ft          t          d|          d|}|S )Nc              3       K   | ]	\  }}|V  
d S r   r   )r   r   r   s      r1   r   z._create_swin_transformer_v2.<locals>.<genexpr>u  s&      \\da\\\\\\r3   r%  )r   r   r   r   out_indicesT)flatten_sequentialr  )pretrained_filter_fnfeature_cfg)r   rQ  r  popr   r   r  rA  )variant
pretrainedrF  default_out_indicesr  r  s         r1   _create_swin_transformer_v2r  t  s    \\i

8\8Z8Z.[.[\\\\\**],?@@K 7J1DkJJJ  	 E
 Lr3   r-  c                 8    | ddddddt           t          ddd	d
|S )Nr  )r"      r  )rd   rd   g?r  Tzpatch_embed.projzhead.fcmit)urlr"  
input_size	pool_sizecrop_pctr  fixed_input_sizemeanrL  
first_conv
classifierlicenser   )r  rF  s     r1   _cfgr    s:    =v%.B(	  # r3   ztimm/z{https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12to16_192to256_22kto1k_ft.pth)	hf_hub_idr  z{https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12to24_192to384_22kto1k_ft.pth)r"     r  )r  r  re   )r  r  r  r  r  z|https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12to16_192to256_22kto1k_ft.pthz|https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12to24_192to384_22kto1k_ft.pthzfhttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_tiny_patch4_window8_256.pthzghttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_tiny_patch4_window16_256.pthzghttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_small_patch4_window8_256.pthzhhttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_small_patch4_window16_256.pthzfhttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window8_256.pthzghttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window16_256.pthzkhttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12_192_22k.pthiQU  )r"      r  )r  r  )r  r  r"  r  r  zlhttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12_192_22k.pth)2swinv2_base_window12to16_192to256.ms_in22k_ft_in1k2swinv2_base_window12to24_192to384.ms_in22k_ft_in1k3swinv2_large_window12to16_192to256.ms_in22k_ft_in1k3swinv2_large_window12to24_192to384.ms_in22k_ft_in1kzswinv2_tiny_window8_256.ms_in1kz swinv2_tiny_window16_256.ms_in1kz swinv2_small_window8_256.ms_in1kz!swinv2_small_window16_256.ms_in1kzswinv2_base_window8_256.ms_in1kz swinv2_base_window16_256.ms_in1k!swinv2_base_window12_192.ms_in22k"swinv2_large_window12_192.ms_in22kc           	      \    t          dddd          }t          	 dd| it          |fi |S )	
    r   r  r  r  r   r$  r%  r<   swinv2_tiny_window16_256r  )r  rA  r  r  rF  
model_argss      r1   r  r    s\     "<SabbbJ&"Y Y/9Y=A*=W=WPV=W=WY Y Yr3   c           	      \    t          dddd          }t          	 dd| it          |fi |S )	r  rd   r  r  r  r  swinv2_tiny_window8_256r  )r  r  r  s      r1   r  r    s[     !r,R`aaaJ&!X X.8X<@<V<Vv<V<VX X Xr3   c           	      \    t          dddd          }t          	 dd| it          |fi |S )	r  r   r  r#   r#      r#   r  r  swinv2_small_window16_256r  )r  r  r  s      r1   r  r    s\     "=TbcccJ&#Z Z0:Z>B:>X>XQW>X>XZ Z Zr3   c           	      \    t          dddd          }t          	 dd| it          |fi |S )	r  rd   r  r  r  r  swinv2_small_window8_256r  )r  r  r  s      r1   r  r    s\     !r-SabbbJ&"Y Y/9Y=A*=W=WPV=W=WY Y Yr3   c           	      \    t          dddd          }t          	 dd| it          |fi |S )	r  r      r  r$   rd   r       r  swinv2_base_window16_256r  )r  r  r  s      r1   r  r    \     "MUcdddJ&"Y Y/9Y=A*=W=WPV=W=WY Y Yr3   c           	      \    t          dddd          }t          	 dd| it          |fi |S )	r  rd   r  r  r  r  swinv2_base_window8_256r  )r  r  r  s      r1   r  r    s[     !s=TbcccJ&!X X.8X<@<V<Vv<V<VX X Xr3   c           	      \    t          dddd          }t          	 dd| it          |fi |S )	r  r  r  r  r  r  swinv2_base_window12_192r  )r  r  r  s      r1   r  r    r  r3   c           	      ^    t          ddddd          }t          	 d	d| it          |fi |S )
r  r   r  r  r  r  r  r  r  r   r$  r%  r<   r+  !swinv2_base_window12to16_192to256r  )r  r  r  s      r1   r  r    g     #m~ /1 1 1J '+b b8BbFJ:F`F`Y_F`F`b b br3   c           	      ^    t          ddddd          }t          	 d	d| it          |fi |S )
r  r  r  r  r  r  r  !swinv2_base_window12to24_192to384r  )r  r  r  s      r1   r  r    r  r3   c           	      \    t          dddd          }t          	 dd| it          |fi |S )	r  r  r  r  r  r  r  0   r  swinv2_large_window12_192r  )r  r  r  s      r1   r  r    s\     "MUdeeeJ&#Z Z0:Z>B:>X>XQW>X>XZ Z Zr3   c           	      ^    t          ddddd          }t          	 d	d| it          |fi |S )
r  r   r  r  r  r  r  "swinv2_large_window12to16_192to256r  )r  r  r  s      r1   r  r  #  g     #m /1 1 1J ',c c9CcGKJGaGaZ`GaGac c cr3   c           	      ^    t          ddddd          }t          	 d	d| it          |fi |S )
r  r  r  r  r  r  r  "swinv2_large_window12to24_192to384r  )r  r  r  s      r1   r  r  .  r  r3   r  r  r  r  r  r  )swinv2_base_window12_192_22k)swinv2_base_window12to16_192to256_22kft1k)swinv2_base_window12to24_192to384_22kft1kswinv2_large_window12_192_22k*swinv2_large_window12to16_192to256_22kft1k*swinv2_large_window12to24_192to384_22kft1kr  )r-  )Gr   rp   typingr   r   r   r   r   rO   torch.nnrM   torch.nn.functionalr   r   torch.utils.checkpointutilsr  	timm.datar	   r
   timm.layersr   r   r   r   r   r   r   r   r   r   r   _builderr   	_featuresr   _features_fxr   	_registryr   r   r   __all__r   r   r   r2   r6   r   r8   r   r   r   r   r  r  r  default_cfgsr  r  r  r  r  r  r  r  r  r  r  r  r   r   r3   r1   <module>r     s     9 9 9 9 9 9 9 9 9 9 9 9 9 9                 + + + + + + + + + A A A A A A A A; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; * * * * * * + + + + + + 3 3 3 3 3 3 Y Y Y Y Y Y Y Y Y Y
#uS#X./  5c? u|     EL uS#X RWX[]`X`Ra fkfr    "N N N N Nbi N N NbH H H H HRY H H HV         29      F}3 }3 }3 }3 }3RY }3 }3 }3@O O O O O	 O O Od  <	 	 	 	    %$:>$ J; ; ; ;?$ J Hs; ; ;
 <@4 K< < < <@4 K Hs< < < (,tt( ( ( )-u) ) ) )-u) ) ) *.v* * * (,tt( ( ( )-u) ) )
 *.ymv* * *
 +/$zmv+ + +e7& 7& 7 7t Y Y<M Y Y Y Y X X;L X X X X Z Z=N Z Z Z Z Y Y<M Y Y Y Y Y Y<M Y Y Y Y X X;L X X X X Y Y<M Y Y Y Y b bEV b b b b b bEV b b b b Z Z=N Z Z Z Z c cFW c c c c c cFW c c c c  H$G1e1e%I2g2g' '     r3   