
""" Vision Transformer (ViT) in PyTorch

A PyTorch implementation of Vision Transformers as described in:

'Exploring Plain Vision Transformer Backbones for Object Detection'
    - https://arxiv.org/abs/2203.16527

'Segment Anything Model (SAM)'
    - https://github.com/facebookresearch/segment-anything/
"""
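# Usage sketch (assumes a standard timm install; nothing here runs on import): the models
# registered at the bottom of this file are normally built through timm's factory rather than
# instantiated directly. 'samvit_base_patch16' uses a 1024x1024 input, 16x16 patches and a
# 256-channel neck, so its feature map comes out as (B, 256, 64, 64):
#
#   import timm, torch
#   model = timm.create_model('samvit_base_patch16', pretrained=False).eval()
#   feats = model.forward_features(torch.randn(1, 3, 1024, 1024))  # torch.Size([1, 256, 64, 64])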
    N)partial)CallableListOptionalTupleUnion)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STDIMAGENET_INCEPTION_MEANIMAGENET_INCEPTION_STD)
PatchEmbedMlpDropPathPatchDropoutLayerNorm2dClassifierHeadNormMlpClassifierHeadFormatresample_abs_pos_embed_nhwcRotaryEmbeddingCatapply_rot_embed_cat	to_2tupleuse_fused_attn)Final   )build_model_with_cfg)feature_take_indices)register_notrace_function)checkpoint_seq)generate_default_cfgsregister_modelVisionTransformerSAMq_sizek_sizerel_posreturnc                    t          dt          | |          z  dz
            }|j        d         |k    rwt          j        |                    d|j        d         d                              ddd          |d          }|                    d|                              dd          }n|}t          j        |           dddf         t          || z  d          z  }t          j        |          dddf         t          | |z  d          z  }||z
  |dz
  t          | |z  d          z  z   }||	                                         S )	a\  
    Get relative positional embeddings according to the relative positions of
        query and key sizes.
    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).

    Returns:
        Extracted positional embeddings according to relative positions.
       r   r   linear)sizemodeN      ?)
intmaxshapeFinterpolatereshapepermutetorcharangelong)r#   r$   r%   max_rel_distrel_pos_resizedq_coordsk_coordsrelative_coordss           ^/var/www/html/ai-engine/env/lib/python3.11/site-packages/timm/models/vision_transformer_sam.pyget_rel_posr>   &   sK    q3vv...233L}Q<''-OOAw}Q/44<<Q1EE
 
 

 *11"lCCKKAqQQ! |F##AAAtG,s6F?C/H/HHH|F##D!!!G,s6F?C/H/HHH(*vzS&RU=V=V.VVO?//1122    q	rel_pos_h	rel_pos_wc                    |\  }}|\  }}t          |||          }	t          |||          }
| j        \  }}}|                     ||||          }t          j        d||	          }t          j        d||
          }|dddddddddf         |dddddddddf         z   }|                    d||z  ||z            S )a  
    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py
    Args:
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
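# Shape sketch for the helper above (hedged, illustrative values only): with a 14x14 window,
# 64-dim heads and q flattened to (B * num_heads, 14 * 14, 64), rel_pos_h / rel_pos_w are
# (2 * 14 - 1, 64) = (27, 64) tables, and the returned bias matches the attention logits:
#
#   q = torch.randn(12, 196, 64)                    # (B * num_heads, q_h * q_w, head_dim)
#   rel_h = torch.zeros(27, 64)
#   rel_w = torch.zeros(27, 64)
#   bias = get_decomposed_rel_pos_bias(q, rel_h, rel_w, (14, 14), (14, 14))
#   # bias.shape == torch.Size([12, 196, 196]), added to the (q_h * q_w, k_h * k_w) attn matrix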
class Attention(nn.Module):
    fused_attn: Final[bool]

    def __init__(
            self,
            dim,
            num_heads=8,
            qkv_bias=True,
            qk_norm=False,
            attn_drop=0.,
            proj_drop=0.,
            norm_layer=nn.LayerNorm,
            use_rel_pos: bool = False,
            input_size: Optional[Tuple[int, int]] = None,
            rope: Optional[nn.Module] = None,
    ):
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim ** -0.5
        self.fused_attn = use_fused_attn()

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.use_rel_pos = use_rel_pos
        if self.use_rel_pos:
            assert rope is None
            assert input_size is not None, \
                'Input size must be provided if using relative positional encoding.'
            # initialize relative positional embeddings
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, self.head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, self.head_dim))
        self.rope = rope

    def forward(self, x):
        B, H, W, _ = x.shape
        N = H * W
        x = x.reshape(B, N, -1)
        qkv = self.qkv(x).view(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        # qkv with shape (3, B, nHead, H * W, C)
        q, k, v = qkv.reshape(3, B * self.num_heads, N, -1).unbind(0)
        # q, k, v with shape (B * nHead, H * W, C)
        q, k = self.q_norm(q), self.k_norm(k)

        if self.use_rel_pos:
            attn_bias = get_decomposed_rel_pos_bias(q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))
        else:
            attn_bias = None
            if self.rope is not None:
                rope = self.rope.get_embed()
                q = apply_rot_embed_cat(q, rope).type_as(v)
                k = apply_rot_embed_cat(k, rope).type_as(v)

        if self.fused_attn:
            x = torch.nn.functional.scaled_dot_product_attention(
                q, k, v,
                attn_mask=attn_bias,
                dropout_p=self.attn_drop.p if self.training else 0.,
            )
        else:
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)
            if attn_bias is not None:
                attn = attn + attn_bias
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            x = attn @ v

        x = x.view(B, self.num_heads, N, -1).transpose(1, 2).reshape(B, N, -1)
        x = self.proj(x)
        x = self.proj_drop(x)
        x = x.view(B, H, W, -1)
        return x


class LayerScale(nn.Module):
    def __init__(self, dim, init_values=1e-5, inplace=False):
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x):
        return x.mul_(self.gamma) if self.inplace else x * self.gamma


class Block(nn.Module):

    def __init__(
            self,
            dim,
            num_heads,
            mlp_ratio=4.,
            qkv_bias=True,
            qk_norm=False,
            proj_drop=0.,
            attn_drop=0.,
            init_values=None,
            drop_path=0.,
            act_layer=nn.GELU,
            norm_layer=nn.LayerNorm,
            mlp_layer=Mlp,
            use_rel_pos=False,
            window_size=0,
            input_size=None,
            rope=None,
    ):
        super().__init__()
        self.window_size = window_size
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            attn_drop=attn_drop,
            proj_drop=proj_drop,
            norm_layer=norm_layer,
            use_rel_pos=use_rel_pos,
            input_size=input_size if window_size == 0 else (window_size, window_size),
            rope=rope,
        )
        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

        self.norm2 = norm_layer(dim)
        self.mlp = mlp_layer(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=proj_drop,
        )
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

    def forward(self, x):
        B, H, W, _ = x.shape

        shortcut = x
        x = self.norm1(x)
        # window partition
        pad_hw: Optional[Tuple[int, int]] = None
        if self.window_size > 0:
            x, pad_hw = window_partition(x, self.window_size)

        x = self.drop_path1(self.ls1(self.attn(x)))

        # reverse window partition
        if self.window_size > 0:
            x = window_unpartition(x, self.window_size, (H, W), pad_hw)

        x = shortcut + x

        x = x.reshape(B, H * W, -1)  # MLP is faster for (B, N, C) tensors
        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
        x = x.reshape(B, H, W, -1)

        return x


def window_partition(x, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]:
    """
    Partition into non-overlapping windows with padding if needed.

    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.

    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
    """
    B, H, W, C = x.shape
    pad_h = (window_size - H % window_size) % window_size
    pad_w = (window_size - W % window_size) % window_size
    x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
    Hp, Wp = H + pad_h, W + pad_w

    x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows, (Hp, Wp)


def window_unpartition(
        windows: torch.Tensor,
        window_size: int,
        hw: Tuple[int, int],
        pad_hw: Optional[Tuple[int, int]] = None,
) -> torch.Tensor:
    """
    Window unpartition into original sequences and removing padding.

    Args:
        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        hw (Tuple): original height and width (H, W) before padding.
        pad_hw (Tuple): padded height and width (Hp, Wp).

    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    """
    Hp, Wp = pad_hw if pad_hw is not None else hw
    H, W = hw
    B = windows.shape[0] // (Hp * Wp // window_size // window_size)
    x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
    x = x[:, :H, :W, :].contiguous()
    return x


class VisionTransformerSAM(nn.Module):
    """ Vision Transformer for Segment-Anything Model (SAM)

    A PyTorch impl of : `Exploring Plain Vision Transformer Backbones for Object Detection` or `Segment Anything Model (SAM)`
        - https://arxiv.org/abs/2203.16527
    """

    def __init__(
            self,
            img_size: int = 1024,
            patch_size: int = 16,
            in_chans: int = 3,
            num_classes: int = 768,
            embed_dim: int = 768,
            depth: int = 12,
            num_heads: int = 12,
            mlp_ratio: float = 4.,
            qkv_bias: bool = True,
            qk_norm: bool = False,
            init_values: Optional[float] = None,
            pre_norm: bool = False,
            drop_rate: float = 0.,
            pos_drop_rate: float = 0.,
            patch_drop_rate: float = 0.,
            proj_drop_rate: float = 0.,
            attn_drop_rate: float = 0.,
            drop_path_rate: float = 0.,
            weight_init: str = '',
            embed_layer: Callable = partial(PatchEmbed, output_fmt=Format.NHWC, strict_img_size=False),
            norm_layer: Optional[Callable] = nn.LayerNorm,
            act_layer: Optional[Callable] = nn.GELU,
            block_fn: Callable = Block,
            mlp_layer: Callable = Mlp,
            use_abs_pos: bool = True,
            use_rel_pos: bool = False,
            use_rope: bool = False,
            window_size: int = 14,
            global_attn_indexes: Tuple[int, ...] = (),
            neck_chans: int = 256,
            global_pool: str = 'avg',
            head_hidden_size: Optional[int] = None,
            ref_feat_shape: Optional[Tuple[Tuple[int, int], Tuple[int, int]]] = None,
    ):
        """
        Args:
            img_size: Input image size.
            patch_size: Patch size.
            in_chans: Number of image input channels.
            num_classes: Number of classes for classification head.
            embed_dim: Transformer embedding dimension.
            depth: Depth of transformer.
            num_heads: Number of attention heads.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: Enable bias for qkv projections if True.
            init_values: Layer-scale init values (layer-scale enabled if not None).
            drop_rate: Head dropout rate.
            pos_drop_rate: Position embedding dropout rate.
            attn_drop_rate: Attention dropout rate.
            drop_path_rate: Stochastic depth rate.
            weight_init: Weight initialization scheme.
            embed_layer: Patch embedding layer.
            norm_layer: Normalization layer.
            act_layer: MLP activation layer.
            block_fn: Transformer block layer.
            use_abs_pos: If True, use absolute positional embeddings.
            use_rel_pos: If True, add relative positional embeddings to the attention map.
            use_rope: If True, add rotary position embeddings to q/k in attention block.
            window_size: Window size for window attention blocks. If 0, window attention is not used.
            global_attn_indexes: Indexes for blocks using global attention. Used when window_size > 0.
            neck_chans: Number of channels in the SAM neck; if 0/None, no neck is used.
            global_pool: Type of global pooling for the final sequence.
            head_hidden_size: If set, use NormMlpClassifierHead.
            ref_feat_shape: Tuple of reference feature shapes for ROPE, (global, local).
        """
        super().__init__()
        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
        act_layer = act_layer or nn.GELU

        self.num_classes = num_classes
        self.global_pool = global_pool
        self.num_features = self.embed_dim = embed_dim  # for consistency with other models
        self.grad_checkpointing = False

        self.patch_embed = embed_layer(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            bias=not pre_norm,  # disable bias if pre-norm is used
        )
        grid_size = self.patch_embed.grid_size
        r = self.patch_embed.feat_ratio() if hasattr(self.patch_embed, 'feat_ratio') else patch_size

        if use_abs_pos:
            # initialize absolute positional embedding with pretrain image size
            self.pos_embed = nn.Parameter(torch.zeros(1, grid_size[0], grid_size[1], embed_dim))
        else:
            self.pos_embed = None
        self.pos_drop = nn.Dropout(p=pos_drop_rate)
        if patch_drop_rate > 0:
            self.patch_drop = PatchDropout(
                patch_drop_rate,
                num_prefix_tokens=0,
            )
        else:
            self.patch_drop = nn.Identity()
        self.norm_pre = norm_layer(embed_dim) if pre_norm else nn.Identity()

        if use_rope:
            assert not use_rel_pos, "ROPE and relative pos embeddings should not be enabled at same time"
            if ref_feat_shape is not None:
                assert len(ref_feat_shape) == 2
                ref_feat_shape_global = to_2tuple(ref_feat_shape[0])
                ref_feat_shape_window = to_2tuple(ref_feat_shape[1])
            else:
                ref_feat_shape_global = ref_feat_shape_window = None
            self.rope_global = RotaryEmbeddingCat(
                embed_dim // num_heads,
                in_pixels=False,
                feat_shape=grid_size,
                ref_feat_shape=ref_feat_shape_global,
            )
            self.rope_window = RotaryEmbeddingCat(
                embed_dim // num_heads,
                in_pixels=False,
                feat_shape=to_2tuple(window_size),
                ref_feat_shape=ref_feat_shape_window,
            )
        else:
            self.rope_global = None
            self.rope_window = None

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
        self.blocks = nn.Sequential(*[
            block_fn(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_norm=qk_norm,
                init_values=init_values,
                proj_drop=proj_drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                act_layer=act_layer,
                mlp_layer=mlp_layer,
                use_rel_pos=use_rel_pos,
                window_size=window_size if i not in global_attn_indexes else 0,
                input_size=grid_size,
                rope=self.rope_window if i not in global_attn_indexes else self.rope_global,
            )
            for i in range(depth)])
        self.feature_info = [
            dict(module=f'blocks.{i}', num_chs=embed_dim, reduction=r) for i in range(depth)]

        if neck_chans:
            self.neck = nn.Sequential(
                nn.Conv2d(
                    embed_dim,
                    neck_chans,
                    kernel_size=1,
                    bias=False,
                ),
                LayerNorm2d(neck_chans),
                nn.Conv2d(
                    neck_chans,
                    neck_chans,
                    kernel_size=3,
                    padding=1,
                    bias=False,
                ),
                LayerNorm2d(neck_chans),
            )
            self.num_features = neck_chans
        else:
            if head_hidden_size:
                self.neck = nn.Identity()
            else:
                # should have a final norm with standard ClassifierHead
                self.neck = LayerNorm2d(embed_dim)
            neck_chans = embed_dim

        if head_hidden_size:
            self.head = NormMlpClassifierHead(
                neck_chans,
                num_classes,
                hidden_size=head_hidden_size,
                pool_type=global_pool,
                drop_rate=drop_rate,
            )
        else:
            self.head = ClassifierHead(
                neck_chans,
                num_classes,
                pool_type=global_pool,
                drop_rate=drop_rate,
            )

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'pos_embed', 'dist_token'}

    @torch.jit.ignore
    def group_matcher(self, coarse=False):
        return dict(
            stem=r'^pos_embed|patch_embed',  # stem and embed
            blocks=[(r'^blocks\.(\d+)', None), (r'^norm', (99999,))],
        )

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        self.grad_checkpointing = enable

    @torch.jit.ignore
    def get_classifier(self) -> nn.Module:
        return self.head

    def reset_classifier(self, num_classes: int, global_pool: Optional[str] = None):
        self.head.reset(num_classes, global_pool)

    def forward_intermediates(
            self,
            x: torch.Tensor,
            indices: Optional[Union[int, List[int]]] = None,
            norm: bool = False,
            stop_early: bool = False,
            output_fmt: str = 'NCHW',
            intermediates_only: bool = False,
    ) -> Union[List[torch.Tensor], Tuple[torch.Tensor, List[torch.Tensor]]]:
        """ Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to all intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features

        Returns:
            A list of intermediate features, or a tuple of (final features, intermediates).
        """
        assert output_fmt == 'NCHW', 'Output shape for ViT-SAM must be NCHW.'
        intermediates = []
        take_indices, max_index = feature_take_indices(len(self.blocks), indices)

        # forward pass, collecting intermediates
        x = self.patch_embed(x)
        if self.pos_embed is not None:
            # dynamically resize abs pos embedding if needed
            x = x + resample_abs_pos_embed_nhwc(self.pos_embed, x.shape[1:3])
        x = self.pos_drop(x)
        x = self.patch_drop(x)
        x = self.norm_pre(x)
        if torch.jit.is_scripting() or not stop_early:  # can't slice blocks in torchscript
            blocks = self.blocks
        else:
            blocks = self.blocks[:max_index + 1]
        for i, blk in enumerate(blocks):
            x = blk(x)
            if i in take_indices:
                if norm:
                    # norm is intertwined with the final 'neck' convs so apply both, changes channels
                    intermediates.append(self.neck(x.permute(0, 3, 1, 2)))
                else:
                    intermediates.append(x.permute(0, 3, 1, 2))

        if intermediates_only:
            return intermediates

        x = self.neck(x.permute(0, 3, 1, 2))

        return x, intermediates

    def prune_intermediate_layers(
            self,
            indices: Optional[Union[int, List[int]]] = None,
            prune_norm: bool = False,
            prune_head: bool = True,
    ):
        """ Prune layers not required for specified intermediates.
        """
        take_indices, max_index = feature_take_indices(len(self.blocks), indices)
        self.blocks = self.blocks[:max_index + 1]  # truncate blocks
        if prune_norm:
            # neck is treated as equivalent to the final norm here
            self.neck = nn.Identity()
        if prune_head:
            self.reset_classifier(0, '')
        return take_indices

    def forward_features(self, x):
        x = self.patch_embed(x)
        if self.pos_embed is not None:
            # dynamically resize abs pos embedding if needed
            x = x + resample_abs_pos_embed_nhwc(self.pos_embed, x.shape[1:3])
        x = self.pos_drop(x)
        x = self.patch_drop(x)
        x = self.norm_pre(x)
        if self.grad_checkpointing and not torch.jit.is_scripting():
            x = checkpoint_seq(self.blocks, x)
        else:
            x = self.blocks(x)
        x = self.neck(x.permute(0, 3, 1, 2))
        return x

    def forward_head(self, x, pre_logits: bool = False):
        return self.head(x, pre_logits=True) if pre_logits else self.head(x)

    def forward(self, x):
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x


def checkpoint_filter_fn(state_dict, model):
    """ Remap SAM checkpoints -> timm """
    sam_checkpoint = 'image_encoder.patch_embed.proj.weight' in state_dict
    out_dict = {}
    for k, v in state_dict.items():
        if k.startswith('image_encoder.'):
            k = k[14:]
            k = k.replace('mlp.lin', 'mlp.fc')
        else:
            if sam_checkpoint:
                # drop prompt-encoder / mask-decoder entries from full SAM checkpoints
                continue
        out_dict[k] = v
    return out_dict
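# Key-remapping sketch for the filter above (key names shown for illustration only): the original
# SAM checkpoints nest the ViT under 'image_encoder.' and name the MLP layers 'lin1'/'lin2', e.g.
#   'image_encoder.blocks.0.mlp.lin1.weight' -> 'blocks.0.mlp.fc1.weight'
#   'image_encoder.patch_embed.proj.weight'  -> 'patch_embed.proj.weight'
# while entries without the 'image_encoder.' prefix are dropped when a SAM checkpoint is detected.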
def _cfg(url='', **kwargs):
    return {
        'url': url,
        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
        'crop_pct': 0.9, 'interpolation': 'bicubic', 'fixed_input_size': True,
        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
        'first_conv': 'patch_embed.proj', 'classifier': 'head.fc',
        **kwargs
    }


default_cfgs = generate_default_cfgs({

    # Segment-Anything Model (SAM) pretrained - https://github.com/facebookresearch/segment-anything
    # (image encoder only, no classifier head; for fine-tune / feature extraction)
    'samvit_base_patch16.sa1b': _cfg(
        url='https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth',
        hf_hub_id='timm/',
        license='apache-2.0',
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0,
        input_size=(3, 1024, 1024), crop_pct=1.0),
    'samvit_large_patch16.sa1b': _cfg(
        url='https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth',
        hf_hub_id='timm/',
        license='apache-2.0',
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0,
        input_size=(3, 1024, 1024), crop_pct=1.0),
    'samvit_huge_patch16.sa1b': _cfg(
        url='https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth',
        hf_hub_id='timm/',
        license='apache-2.0',
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0,
        input_size=(3, 1024, 1024), crop_pct=1.0),

    'samvit_base_patch16_224': _cfg(
        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
        num_classes=1000, input_size=(3, 224, 224), crop_pct=0.9),
})


def _create_vision_transformer(variant, pretrained=False, **kwargs):
    out_indices = kwargs.pop('out_indices', 3)
    return build_model_with_cfg(
        VisionTransformerSAM,
        variant,
        pretrained,
        pretrained_filter_fn=checkpoint_filter_fn,
        feature_cfg=dict(out_indices=out_indices, feature_cls='getter'),
        **kwargs,
    )


@register_model
def samvit_base_patch16(pretrained=False, **kwargs) -> VisionTransformerSAM:
    """ ViT-B/16 for Segment-Anything
    """
    model_args = dict(
        patch_size=16, embed_dim=768, depth=12, num_heads=12, global_attn_indexes=[2, 5, 8, 11],
        window_size=14, use_rel_pos=True, img_size=1024,
    )
    model = _create_vision_transformer(
        'samvit_base_patch16', pretrained=pretrained, **dict(model_args, **kwargs))
    return model


@register_model
def samvit_large_patch16(pretrained=False, **kwargs) -> VisionTransformerSAM:
    """ ViT-L/16 for Segment-Anything
    """
    model_args = dict(
        patch_size=16, embed_dim=1024, depth=24, num_heads=16, global_attn_indexes=[5, 11, 17, 23],
        window_size=14, use_rel_pos=True, img_size=1024,
    )
    model = _create_vision_transformer(
        'samvit_large_patch16', pretrained=pretrained, **dict(model_args, **kwargs))
    return model


@register_model
def samvit_huge_patch16(pretrained=False, **kwargs) -> VisionTransformerSAM:
    """ ViT-H/16 for Segment-Anything
    """
    model_args = dict(
        patch_size=16, embed_dim=1280, depth=32, num_heads=16, global_attn_indexes=[7, 15, 23, 31],
        window_size=14, use_rel_pos=True, img_size=1024,
    )
    model = _create_vision_transformer(
        'samvit_huge_patch16', pretrained=pretrained, **dict(model_args, **kwargs))
    return model


@register_model
def samvit_base_patch16_224(pretrained=False, **kwargs) -> VisionTransformerSAM:
    """ ViT-B/16 based on samvit arch
    """
    model_args = dict(
        patch_size=16, embed_dim=768, depth=12, num_heads=12, global_attn_indexes=[2, 5, 8, 11],
        window_size=14, use_rel_pos=True, use_abs_pos=False, img_size=224, neck_chans=None,
    )
    model = _create_vision_transformer(
        'samvit_base_patch16_224', pretrained=pretrained, **dict(model_args, **kwargs))
    return model
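

# Fine-tuning sketch (assumes a standard timm install; hyper-parameters are placeholders): the
# 'samvit_base_patch16_224' variant drops the SAM neck (neck_chans=None) and the absolute position
# embedding, so it behaves like any other timm classifier:
#
#   import timm, torch
#   model = timm.create_model('samvit_base_patch16_224', pretrained=False, num_classes=10)
#   logits = model(torch.randn(2, 3, 224, 224))  # torch.Size([2, 10])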