
    %h`                        d dl Z d dlmZ d dlmc mZ d dlmc mZ d dl	Z
d dlmZmZmZ d dlmZ  e       Z G d dej"                        Zd Zd Z G d d	ej"                        Z G d
 dej"                        Z G d dej"                        Z G d dej"                        Z G d dej"                        Z G d dej"                        Zd Zd Zd Zd Zy)    N)DropPath	to_2tupletrunc_normal_)Configc                   F     e Zd ZdZddej
                  df fd	Zd Z xZS )Mlpz Multilayer perceptron.N        c                     t         |           |xs |}|xs |}t        j                  ||      | _         |       | _        t        j                  ||      | _        t        j                  |      | _        y N)	super__init__nnLinearfc1actfc2Dropoutdrop)selfin_featureshidden_featuresout_features	act_layerr   	__class__s         :/var/www/html/mariraj/BiRefNet/models/backbones/swin_v1.pyr   zMlp.__init__   s_    #2{)8[99[/:;99_l;JJt$	    c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   r   )r   xs     r   forwardzMlp.forward    sH    HHQKHHQKIIaLHHQKIIaLr   )	__name__
__module____qualname____doc__r   GELUr   r   __classcell__r   s   @r   r   r      s!    !48tWYW^W^eg %r   r   c                     | j                   \  }}}}| j                  |||z  |||z  ||      } | j                  dddddd      j                         j                  d|||      }|S )z
    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    r                  )shapeviewpermute
contiguous)r   window_sizeBHWCwindowss          r   window_partitionr8   )   sp     JAq!Q	q!{"Kk1A;PQRAii1aAq)446;;B[Z[\GNr   c                     t        | j                  d         }| j                  d||z  ||z  |||      }|j                  dddddd      j	                         j                  d|||      }|S )z
    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    r-   r   r(   r)   r*   r+   r,   )intr.   r/   r0   r1   )r7   r2   r4   r5   r6   r   s         r   window_reverser;   8   sq     	GMM"ARk)1+;[+WXYA			!Q1a#..055b!QBAHr   c                   ,     e Zd ZdZd fd	ZddZ xZS )WindowAttentiona   Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    c                    t         |           || _        || _        || _        ||z  }|xs |dz  | _        t        j                  t        j                  d|d   z  dz
  d|d   z  dz
  z  |            | _
        t        j                  | j                  d         }	t        j                  | j                  d         }
t        j                  t        j                  |	|
gd            }t        j                  |d      }|d d d d d f   |d d d d d f   z
  }|j                  ddd      j!                         }|d d d d dfxx   | j                  d   dz
  z  cc<   |d d d d dfxx   | j                  d   dz
  z  cc<   |d d d d dfxx   d| j                  d   z  dz
  z  cc<   |j#                  d      }| j%                  d|       t        j&                  ||d	z  |
      | _        || _        t        j,                  |      | _        t        j&                  ||      | _        t        j,                  |      | _        t5        | j                  d       t        j6                  d      | _        y )Ng      r*   r   r(   ij)indexingr-   relative_position_indexr)   bias{Gz?std)dim)r   r   rG   r2   	num_headsscaler   	Parametertorchzerosrelative_position_bias_tablearangestackmeshgridflattenr0   r1   sumregister_bufferr   qkvattn_drop_probr   	attn_dropproj	proj_dropr   Softmaxsoftmax)r   rG   r2   rH   qkv_biasqk_scalerV   rX   head_dimcoords_hcoords_wcoordscoords_flattenrelative_coordsrA   r   s                  r   r   zWindowAttention.__init__W   s-   &")#1T!1
 -/LLKK[^+a/AA4F4JKYW-Y) << 0 0 34<< 0 0 34U^^Xx,@4PQvq1(At4~aqj7QQ)11!Q:EEG1a D$4$4Q$7!$;; 1a D$4$4Q$7!$;; 1a A(8(8(;$;a$?? "1"5"5b"968OP99S#'9'I.IIc3'	I.d77SAzzb)r   c                 ,   |j                   \  }}}| j                  |      j                  ||d| j                  || j                  z        j	                  ddddd      }|d   |d   |d   }	}}|| j
                  z  }t        j                  r\t        j                  j                  j                  |||	d| j                  d      j                  dd      j                  |||      }n||j                  d	d
      z  }
| j                  | j                  j!                  d
         j!                  | j"                  d   | j"                  d   z  | j"                  d   | j"                  d   z  d
      }|j	                  ddd      j%                         }|
|j'                  d      z   }
||j                   d   }|
j!                  ||z  || j                  ||      |j'                  d      j'                  d      z   }
|
j!                  d
| j                  ||      }
| j)                  |
      }
n| j)                  |
      }
| j+                  |
      }
|
|	z  j                  dd      j                  |||      }| j-                  |      }| j/                  |      }|S )z Forward function.

        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        r)   r*   r   r(   r+   NF)	attn_mask	dropout_p	is_causalr-   )r.   rT   reshaperH   r0   rI   configSDPA_enabledrK   r   
functionalscaled_dot_product_attentionrU   	transposerM   rA   r/   r2   r1   	unsqueezerZ   rV   rW   rX   )r   r   maskB_Nr6   rT   qkvattnrelative_position_biasnWs                r   r   zWindowAttention.forwardz   sf    77Aqhhqk!!"aDNNA<OPXXYZ\]_`bcefga&#a&#a&a1

N##@@1a$*=*= A  i1oggb!Q/ 
 B++D%)%F%FtGcGcGhGhikGl%m%r%r  #d&6&6q&994;K;KA;NQUQaQabcQd;dfh&" &<%C%CAq!%L%W%W%Y"0::1==DZZ]yyr2t~~q!Dt~~VWGXGbGbcdGeeyyT^^Q:||D)||D)>>$'D$$Q*222q!<AIIaLNN1r   )TNr	   r	   r   r    r!   r"   r#   r   r   r%   r&   s   @r   r=   r=   I   s    !*F(r   r=   c            
       f     e Zd ZdZddddddddej
                  ej                  f
 fd	Zd	 Z xZ	S )
SwinTransformerBlocka]   Swin Transformer Block.

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
       r         @TNr	   c           	         t         |           || _        || _        || _        || _        || _        d| j
                  cxk  r| j                  k  sJ d        J d        ||      | _        t        |t        | j                        ||||	|      | _
        |
dkD  rt        |
      nt        j                         | _         ||      | _        t!        ||z        }t#        ||||      | _        d | _        d | _        y )Nr   z shift_size must in 0-window_size)r2   rH   r[   r\   rV   rX   r	   )r   r   r   r   )r   r   rG   rH   r2   
shift_size	mlp_rationorm1r=   r   ru   r   r   Identity	drop_pathnorm2r:   r   mlpr4   r5   )r   rG   rH   r2   r~   r   r[   r\   r   rV   r   r   
norm_layermlp_hidden_dimr   s                 r   r   zSwinTransformerBlock.__init__   s     	"&$"DOO6d&6&66Z8ZZ6Z8ZZ6_
#Yt'7'78IIQUW	 1:B),BKKM_
S9_-3R[bfgr   c           	         |j                   \  }}}| j                  | j                  }}|||z  k(  sJ d       |}| j                  |      }|j	                  ||||      }dx}	}
| j
                  || j
                  z  z
  | j
                  z  }| j
                  || j
                  z  z
  | j
                  z  }t        j                  |dd|	||
|f      }|j                   \  }}}}| j                  dkD  r3t        j                  || j                   | j                   fd      }|}n|}d}t        || j
                        }|j	                  d| j
                  | j
                  z  |      }| j                  ||      }|j	                  d| j
                  | j
                  |      }t        || j
                  ||      }| j                  dkD  r/t        j                  || j                  | j                  fd      }n|}|dkD  s|dkD  r|ddd|d|ddf   j                         }|j	                  |||z  |      }|| j                  |      z   }|| j                  | j!                  | j#                  |                  z   }|S )z Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
            mask_matrix: Attention mask for cyclic shift.
        input feature has wrong sizer   )r(   r*   )shiftsdimsNr-   )ro   )r.   r4   r5   r   r/   r2   Fpadr~   rK   rollr8   ru   r;   r1   r   r   r   )r   r   mask_matrixr3   Lr6   r4   r5   shortcutpad_lpad_tpad_rpad_b_HpWp	shifted_xrd   	x_windowsattn_windowss                       r   r   zSwinTransformerBlock.forward   se    ''1avvtvv1AEz999zJJqMFF1aA !!A(8(8$88D<L<LL!!A(8(8$88D<L<LLEE!aE5%78ww2r1 ??Q

1t.>@P-QX^_I#III %Y0@0@A	NN2t'7'7$:J:J'JAN	 yyy; $((T-=-=t?O?OQRS"<1A1A2rJ	 ??Q

9doot-OV\]AA19	!RaR!Q,**,AFF1a!eQ t~~a((txx

1677r   )
r    r!   r"   r#   r   r$   	LayerNormr   r   r%   r&   s   @r   rz   rz      s5    " 45t"PR^`77r||09r   rz   c                   @     e Zd ZdZej
                  f fd	Zd Z xZS )PatchMergingz Patch Merging Layer

    Args:
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    c                     t         |           || _        t        j                  d|z  d|z  d      | _         |d|z        | _        y )Nr+   r*   FrB   )r   r   rG   r   r   	reductionnorm)r   rG   r   r   s      r   r   zPatchMerging.__init__  sA    1s7AG%@q3w'	r   c           
         |j                   \  }}}|||z  k(  sJ d       |j                  ||||      }|dz  dk(  xs |dz  dk(  }|r"t        j                  |ddd|dz  d|dz  f      }|ddddddddddf   }|ddddddddddf   }	|ddddddddddf   }
|ddddddddddf   }t	        j
                  ||	|
|gd      }|j                  |dd|z        }| j                  |      }| j                  |      }|S ) Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
        r   r*   r(   r   Nr-   r+   )r.   r/   r   r   rK   catr   r   )r   r   r4   r5   r3   r   r6   	pad_inputx0x1x2x3s               r   r   zPatchMerging.forward  sL    ''1aAEz999zFF1aA UaZ0QUaZ	a!Q1q5!QU34Aq!$Q$1a q!$Q$1a q!$Q$1a q!$Q$1a IIr2r2&+FF1b!a% IIaLNN1r   	r    r!   r"   r#   r   r   r   r   r%   r&   s   @r   r   r     s     (*|| (r   r   c            
       R     e Zd ZdZdddddddej
                  ddf
 fd	Zd	 Z xZS )

BasicLayera   A basic Swin Transformer layer for one stage.

    Args:
        dim (int): Number of feature channels
        depth (int): Depths of this stage.
        num_heads (int): Number of attention head.
        window_size (int): Local window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    r{   r|   TNr	   Fc                 j   t         |           || _        |dz  | _        || _        || _        t        j                  t        |      D cg c]:  }t        ||||dz  dk(  rdn|dz  |||||	t        |
t              r|
|   n|
|      < c}      | _        | |||      | _        y d | _        y c c}w )Nr*   r   )rG   rH   r2   r~   r   r[   r\   r   rV   r   r   )rG   r   )r   r   r2   r~   depthuse_checkpointr   
ModuleListrangerz   
isinstancelistblocks
downsample)r   rG   r   rH   r2   r   r[   r\   r   rV   r   r   r   r   ir   s                  r   r   zBasicLayer.__init__I  s     	&%*
, mm 5\%#  !#'!"Q!1+2B#!!#*4Y*E)A,9%'%# $  !(SZHDO"DO'%#s   ?B0c           	         t        j                  t        j                  |      | j                  z        j	                  t         j
                        | j                  z  }t        j                  t        j                  |      | j                  z        j	                  t         j
                        | j                  z  }t        j                  d||df|j                        }t        d| j                         t        | j                   | j                         t        | j                   d      f}t        d| j                         t        | j                   | j                         t        | j                   d      f}d}	|D ]  }
|D ]  }|	|dd|
|ddf<   |	dz  }	  t        || j                        }|j                  d| j                  | j                  z        }|j                  d      |j                  d      z
  }|j                  |dk7  t        d            j                  |dk(  t        d            j	                  |j                        }| j                   D ]>  }||c|_        |_        | j&                  rt)        j(                  |||      }6 |||      }@ | j*                  +| j+                  |||      }|dz   dz  |dz   dz  }}||||||fS ||||||fS )	r   r(   )devicer   Nr-   r*   g      Yr	   )rK   ceiltensorr2   toint64rL   r   slicer~   r8   r/   rn   masked_fillfloatdtyper   r4   r5   r   
checkpointr   )r   r   r4   r5   r   r   img_maskh_slicesw_slicescnthwmask_windowsrd   blkx_downWhWws                     r   r   zBasicLayer.forwards  s    ZZQ$*:*::;>>u{{KdN^N^^ZZQ$*:*::;>>u{{KdN^N^^;;2r1~ahh?!d.../4+++doo-=>4??*D13 !d.../4+++doo-=>4??*D13  	A '*Aq!$q	
 ($2B2BC#((T-=-=@P@P-PQ **1-0F0Fq0II	)))q.%-HTTU^bcUcejkneopsstut{t{|	;; 	&CaLCE35""))#q)<9%	& ??&__Q1-F!e\AEa<BaFB**aAq!##r   r   r&   s   @r   r   r   6  s7    , LL  %(#T)$r   r   c                   *     e Zd ZdZd fd	Zd Z xZS )
PatchEmbedaH   Image to Patch Embedding

    Args:
        patch_size (int): Patch token size. Default: 4.
        in_channels (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
    c                     t         |           t        |      }|| _        || _        || _        t        j                  ||||      | _        | ||      | _	        y d | _	        y )N)kernel_sizestride)
r   r   r   
patch_sizein_channels	embed_dimr   Conv2drW   r   )r   r   r   r   r   r   s        r   r   zPatchEmbed.__init__  s^    z*
$&"IIk9*U_`	!"9-DIDIr   c           
         |j                         \  }}}}|| j                  d   z  dk7  r8t        j                  |d| j                  d   || j                  d   z  z
  f      }|| j                  d   z  dk7  r:t        j                  |ddd| j                  d   || j                  d   z  z
  f      }| j	                  |      }| j
                  |j                  d      |j                  d      }}|j                  d      j                  dd      }| j                  |      }|j                  dd      j                  d| j                  ||      }|S )Forward function.r(   r   r*   r)   r-   )
sizer   r   r   rW   r   rQ   rm   r/   r   )r   r   r   r4   r5   r   r   s          r   r   zPatchEmbed.forward  s*    VVX
1atq!!Q&a!T__Q/!dooa6H2HHIJAtq!!Q&a!Q4??1#5DOOA<N8N#NOPAIIaL99 VVAYq	B		!&&q!,A		!AAq!&&r4>>2rBAr   )r+   r)   `   Nrx   r&   s   @r   r   r     s    r   r   c                   ~     e Zd ZdZddddg dg ddd	d
ddddej
                  dd
dddf fd	Zd Zd Zd fd	Z	 xZ
S )SwinTransformera   Swin Transformer backbone.
        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
          https://arxiv.org/pdf/2103.14030

    Args:
        pretrain_img_size (int): Input image size for training the pretrained model,
            used in absolute postion embedding. Default 224.
        patch_size (int | tuple(int)): Patch size. Default: 4.
        in_channels (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        depths (tuple[int]): Depths of each Swin Transformer stage.
        num_heads (tuple[int]): Number of attention head of each stage.
        window_size (int): Window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
        drop_rate (float): Dropout rate.
        attn_drop_rate (float): Attention dropout rate. Default: 0.
        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
        out_indices (Sequence[int]): Output from which stages.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            -1 means not freezing any parameters.
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
       r+   r)   r   r*   r*      r*   r)   r         r{   r|   TNr	   g?F)r   r(   r*   r)   r-   c                    t         |           || _        t        |      | _        || _        || _        || _        || _        || _	        t        |||| j                  r|nd       | _        | j                  ryt        |      }t        |      }|d   |d   z  |d   |d   z  g}t        j                  t        j                   d||d   |d               | _        t%        | j"                  d       t        j&                  |      | _        t        j*                  d|t-        |            D cg c]  }|j/                          }}t        j0                         | _        t5        | j                        D ]  }t7        t9        |d|z  z        ||   ||   |||	|
|||t-        |d |       t-        |d |dz           ||| j                  dz
  k  rt:        nd |      }| j2                  j=                  |        t5        | j                        D cg c]  }t9        |d|z  z         }}|| _        |D ]$  } |||         }d	| }| jA                  ||       & | jC                          y c c}w c c}w )
N)r   r   r   r   r   r(   rD   rE   )pr*   )rG   r   rH   r2   r   r[   r\   r   rV   r   r   r   r   r   )"r   r   pretrain_img_sizelen
num_layersr   ape
patch_normout_indicesfrozen_stagesr   patch_embedr   r   rJ   rK   rL   absolute_pos_embedr   r   pos_droplinspacerR   itemr   layersr   r   r:   r   appendnum_features
add_module_freeze_stages)r   r   r   r   r   depthsrH   r2   r   r[   r\   	drop_rateattn_drop_ratedrop_path_rater   r   r   r   r   r   patches_resolutionr   dpri_layerlayerr   r   
layer_namer   s                               r   r   zSwinTransformer.__init__  sp   ( 	!2f+"$&* &!{i%)__z$@
 88 )*; <":.J"3A"6*Q-"GIZ[\I]aklmanIn!o&(ll5;;q)M_`aMbdvwxdy3z&{D#$11s;

Y/ "'>3v;!OPAqvvxPP mmoT__- 	&G	AL01Wo#G,'#!!(c&'"23C|!8L4MN%,3doo6I,I<PT-/E KKu%	&" :?t9OPAIQ./PP( # 	/G|G45Ey)JOOJ.	/
 	= Q* Qs   .I*I/c                     | j                   dk\  r@| j                  j                          | j                  j                         D ]	  }d|_         | j                   dk\  r| j
                  rd| j                  _        | j                   dk\  rt| j                  j                          t        d| j                   dz
        D ]=  }| j                  |   }|j                          |j                         D ]	  }d|_         ? y y )Nr   Fr(   r*   )
r   r   eval
parametersrequires_gradr   r   r   r   r   )r   paramr   ms       r   r   zSwinTransformer._freeze_stages6  s    "!!#))446 ,&+#, "txx49D##1"MM 1d00145 0KKN\\^ 0E*/E'00 #r   c                    | j                  |      }|j                  d      |j                  d      }}| j                  r)t        j                  | j
                  ||fd      }||z   }g }|j                  d      j                  dd      }| j                  |      }t        | j                        D ]  }| j                  |   } ||||      \  }}	}
}}}|| j                  v s2t        | d|       } ||      }|j                  d|	|
| j                  |         j!                  dddd      j#                         }|j%                  |        t'        |      S )	r   r*   r)   bicubic)r   moder(   r   r-   r   )r   r   r   r   interpolater   rQ   rm   r   r   r   r   r   getattrr/   r   r0   r1   r   tuple)r   r   r   r   r   outsr   r   x_outr4   r5   r   outs                r   r   zSwinTransformer.forwardH  sE   QAFF1IB88!"t/F/FbRTX\e!f''AIIaL""1a(MM!t' 		!AKKNE%*1b"%5"E1aBD$$$$TT!:6
"5)jjQ4+<+<Q+?@HHAqRST__aC 		! T{r   c                 L    t         t        |   |       | j                          y)z?Convert the model into training mode while keep layers freezed.N)r   r   trainr   )r   r  r   s     r   r  zSwinTransformer.trainb  s    ot*40r   )T)r    r!   r"   r#   r   r   r   r   r   r  r%   r&   s   @r   r   r     sb    : $'$) " #LL )! %'M^0$4 r   r   c                  ,    t        dg dg dd      } | S )Nr   r   r   r{   r   r   rH   r2   r   models    r   	swin_v1_tr  g  s    befgELr   c                  ,    t        dg dg dd      } | S )Nr   r*   r*      r*   r   r{   r  r  r  s    r   	swin_v1_sr  k  s    b.fghELr   c                  ,    t        dg dg dd      } | S )N   r  )r+             r   r  r  r  s    r   	swin_v1_br  o  s    c->gijELr   c                  ,    t        dg dg dd      } | S )N   r  )r   r   r   0   r   r  r  r  s    r   	swin_v1_lr!  s  s    c-?hjkELr   )rK   torch.nnr   torch.nn.functionalrk   r   torch.utils.checkpointutilsr   numpynptimm.layersr   r   r   ri   r   Moduler   r8   r;   r=   rz   r   r   r   r   r  r  r  r!   r   r   <module>r+     s        + +  : :  
")) *"Ybii Yxc299 cL(299 (Vf$ f$R( (V[bii [zr   