
    Ngll                     |   d Z ddlmZ ddlmZmZ ddlZddlmZ ddl	mc m
Z ddlmZ ddlmZmZ ddlmZmZmZmZmZmZmZ ddlmZmZ d	d
lmZ d	dlmZ d	dlmZ d	dl m!Z!m"Z" dgZ# G d dej$                  Z% G d dej$                  Z& G d dej$                  Z' G d dej$                  Z( G d dej$                  Z) G d dej$                  Z*dedee+e+f         fdZ,ededee+e+f         de+d e+fd!            Z- G d" d#ej$                  Z. G d$ d%ej$                  Z/ G d& d'ej$                  Z0 G d( dej$                  Z1d@d*Z2d+ Z3dAd-Z4dBd/Z5 e! e5d01           e5d01           e5d01           e5             e5             e5             e5d2dd34           e5d5dd34          d6          Z6e"dAd7e1fd8            Z7e"dAd7e1fd9            Z8e"dAd7e1fd:            Z9e"dAd7e1fd;            Z:e"dAd7e1fd<            Z;e"dAd7e1fd=            Z<e"dAd7e1fd>            Z=e"dAd7e1fd?            Z>dS )Caf   DaViT: Dual Attention Vision Transformers

As described in https://arxiv.org/abs/2204.03645

Input size invariant transformer architecture that combines channel and spacial
attention in each block. The attention mechanisms used are linear in complexity.

DaViT model defs and weights adapted from https://github.com/dingmyu/davit, original copyright below

    )partial)OptionalTupleN)TensorIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)DropPath	to_2tupletrunc_normal_MlpLayerNorm2dget_norm_layeruse_fused_attn)NormMlpClassifierHeadClassifierHead   )build_model_with_cfg)register_notrace_function)checkpoint_seq)generate_default_cfgsregister_modelDaVitc                   :     e Zd Zd	dededef fdZdefdZ xZS )

ConvPosEnc   Fdimkactc                     t          t          |                                            t          j        |||d|dz  |          | _        |rt          j                    nt          j                    | _        d S )Nr      )kernel_sizestridepaddinggroups)	superr   __init__nnConv2dprojGELUIdentityr   )selfr   r   r   	__class__s       M/var/www/html/ai-engine/env/lib/python3.11/site-packages/timm/models/davit.pyr'   zConvPosEnc.__init__"   so    j$((***IF
 
 
	 !$627999    xc                 `    |                      |          }||                     |          z   }|S N)r*   r   )r-   r1   feats      r/   forwardzConvPosEnc.forward/   s*    yy||r0   )r   F)	__name__
__module____qualname__intboolr'   r   r5   __classcell__r.   s   @r/   r   r   !   sr        7 7C 7C 7$ 7 7 7 7 7 7        r0   r   c                   8     e Zd ZdZdddef fd	ZdefdZ xZS )Stemz Size-agnostic implementation of 2D image to patch embedding,
        allowing input size to be adjusted during model forward operation
    r   `      c                    t                                                       t          |          }|| _        || _        || _        |d         dk    sJ t          j        ||d|d          | _         ||          | _	        d S )Nr   r@      r   r"   r#   r$   )
r&   r'   r   r#   in_chsout_chsr(   r)   convnorm)r-   rD   rE   r#   
norm_layerr.   s        r/   r'   zStem.__init__:   s     	6""ayA~~~~I
 
 
	 Jw''			r0   r1   c                 T   |j         \  }}}}| j        d         || j        d         z  z
  | j        d         z  }| j        d         || j        d         z  z
  | j        d         z  }t          j        |d|d|f          }|                     |          }|                     |          }|S )Nr   r   )shaper#   FpadrF   rG   )r-   r1   BCHWpad_rpad_bs           r/   r5   zStem.forwardP   s    W
1aQ!dk!n"44AFQ!dk!n"44AFE!a5)**IIaLLIIaLLr0   )	r6   r7   r8   __doc__r   r'   r   r5   r;   r<   s   @r/   r>   r>   5   sp          "( ( ( ( ( (,        r0   r>   c                   0     e Zd Zdef fd	ZdefdZ xZS )
Downsampler   c                     t                                                       || _        || _         ||          | _        |dz  dk    | _        t          j        |||d| j        rdn|dz            | _        d S )Nr!   r   rC   )	r&   r'   rD   rE   rG   even_kr(   r)   rF   )r-   rD   rE   r"   rH   r.   s        r/   r'   zDownsample.__init__[   s     	Jv&&	!Ao*I#:AA+*:
 
 
			r0   r1   c                     |j         \  }}}}|                     |          }| j        r>| j        j        \  }}|||z  z
  |z  }|||z  z
  |z  }	t          j        |d|d|	f          }|                     |          }|S )Nr   )rJ   rG   rW   rF   r"   rK   rL   )
r-   r1   rM   rN   rO   rP   k_hk_wrQ   rR   s
             r/   r5   zDownsample.forwardp   s    W
1aIIaLL; 	0y,HC1s7]c)E1s7]c)Ea!UQ.//AIIaLLr0   )r6   r7   r8   r   r'   r   r5   r;   r<   s   @r/   rU   rU   Z   s`        
 "
 
 
 
 
 
*	 	 	 	 	 	 	 	 	r0   rU   c                   &     e Zd Zd fd	Zd Z xZS )ChannelAttentionV2   Tc                     t                                                       || _        ||z  | _        || _        t          j        ||dz  |          | _        t          j        ||          | _        d S )Nr   bias)	r&   r'   r%   head_dimdynamic_scaler(   Linearqkvr*   )r-   r   	num_headsqkv_biasrb   r.   s        r/   r'   zChannelAttentionV2.__init__~   sg    y(*9S#'999Ic3''			r0   c                 `   |j         \  }}}|                     |                              ||d| j        || j        z                                ddddd          }|                    d          \  }}}| j        r	||dz  z  }n|| j        dz  z  }|                    dd          |z  }	|		                    d	          }	|	|                    dd          z                      dd          }|                    dd                              |||          }| 
                    |          }|S )
Nr   r!   r   r   r@         r   )rJ   rd   reshaper%   permuteunbindrb   ra   	transposesoftmaxr*   
r-   r1   rM   NrN   rd   qr   vattns
             r/   r5   zChannelAttentionV2.forward   s$   '1ahhqkk!!!Q4;T[8HIIQQRSUVXY[\^_``**Q--1a 	*AIAADMT))A{{2r""Q&|||##AKKB'''222r::KK1%%aA..IIaLLr0   )r]   TT)r6   r7   r8   r'   r5   r;   r<   s   @r/   r\   r\   |   sL        ( ( ( ( ( (      r0   r\   c                   ,     e Zd Zd fd	ZdefdZ xZS )ChannelAttentionr]   Fc                     t                                                       || _        ||z  }|dz  | _        t	          j        ||dz  |          | _        t	          j        ||          | _        d S )Nrh   r   r_   )r&   r'   re   scaler(   rc   rd   r*   )r-   r   re   rf   ra   r.   s        r/   r'   zChannelAttention.__init__   si    ")#%
9S#'999Ic3''			r0   r1   c                 :   |j         \  }}}|                     |                              ||d| j        || j        z                                ddddd          }|                    d          \  }}}|| j        z  }|                    dd          |z  }	|	                    d          }	|	|                    dd          z                      dd          }|                    dd                              |||          }| 	                    |          }|S )	Nr   r!   r   r   r@   ri   rj   rk   )
rJ   rd   rl   re   rm   rn   ry   ro   rp   r*   rq   s
             r/   r5   zChannelAttention.forward   s   '1ahhqkk!!!Q4>1;NOOWWXY[\^_abdeff**Q--1a
N{{2r""Q&|||##AKKB'''222r::KK1%%aA..IIaLLr0   )r]   F)r6   r7   r8   r'   r   r5   r;   r<   s   @r/   rw   rw      sX        ( ( ( ( ( (        r0   rw   c                   P     e Zd Zdddej        ej        dddf fd	ZdefdZ xZ	S )ChannelBlock      @F        Tc                 v   t                                                       t          |d|	          | _        || _         ||          | _        |
rt          nt          } ||||          | _        |dk    rt          |          nt          j                    | _        t          |d|	          | _        | j        rf ||          | _        t          |t!          ||z            |          | _        |dk    rt          |          nt          j                    | _        d S d | _        d | _        d | _        d S Nr   )r   r   r   )re   rf   r~   )in_featureshidden_features	act_layer)r&   r'   r   cpe1ffnnorm1r\   rw   ru   r
   r(   r,   
drop_path1cpe2norm2r   r9   mlp
drop_path2)r-   r   re   	mlp_ratiorf   	drop_pathr   rH   r   cpe_actv2
attn_layerr.   s               r/   r'   zChannelBlock.__init__   s8    	3!999	Z__
+-C''3C
J
 
 
	
 2;R(9---R[]]3!999	8 	##CDJ #C)O 4 4#  DH
 6?^^hy111DOOODJDH"DOOOr0   r1   c                    |j         \  }}}}|                     |                              d                              dd          }|                     |          }|                     |          }||                     |          z   }|                     |                    dd                              ||||                    }| j	        |                    d                              dd          }|| 
                    | 	                    |                     |                              z   }|                    dd                              ||||          }|S )Nr!   r   )rJ   r   flattenro   r   ru   r   r   viewr   r   r   )r-   r1   rM   rN   rO   rP   curs          r/   r5   zChannelBlock.forward   s$   W
1aIIaLL  ##--a33jjmmiinn$$$IIakk!Q'',,Q1a88998		!&&q!,,ADOODHHTZZ]]$;$;<<<AAq!!&&q!Q22Ar0   )
r6   r7   r8   r(   r+   	LayerNormr'   r   r5   r;   r<   s   @r/   r|   r|      sw         g|&# &# &# &# &# &#P        r0   r|   r1   window_sizec                 *   | j         \  }}}}|                     |||d         z  |d         ||d         z  |d         |          } |                     dddddd                                                              d|d         |d         |          }|S )z
    Args:
        x: (B, H, W, C)
        window_size (int): window size
    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    r   r   r   r!   r@      ri   rJ   r   rm   
contiguous)r1   r   rM   rO   rP   rN   windowss          r/   window_partitionr      s     JAq!Q	q!{1~%{1~qKN7JKXYN\]^^Aii1aAq))4466;;BAP[\]P^`abbGNr0   r   rO   rP   c                    | j         d         }|                     d||d         z  ||d         z  |d         |d         |          }|                    dddddd                                                              d|||          }|S )z
    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image
    Returns:
        x: (B, H, W, C)
    ri   r   r   r   r!   r@   r   r   )r   r   rO   rP   rN   r1   s         r/   window_reverser      s     	bARk!n,a;q>.A;q>S^_`SacdeeA			!Q1a##..0055b!QBBAHr0   c                   \     e Zd ZU dZej        j        e         ed<   d fd	Z	de
fdZ xZS )WindowAttentiona   Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.
    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
    
fused_attnTc                 \   t                                                       || _        || _        || _        ||z  }|dz  | _        t                      | _        t          j	        ||dz  |          | _
        t          j	        ||          | _        t          j        d          | _        d S )Nrh   r   r_   ri   rk   )r&   r'   r   r   re   ry   r   r   r(   rc   rd   r*   Softmaxrp   )r-   r   r   re   rf   ra   r.   s         r/   r'   zWindowAttention.__init__  s    &")#%
(**9S#'999Ic3''	zb)))r0   r1   c                 $   |j         \  }}}|                     |                              ||d| j        || j        z                                ddddd          }|                    d          \  }}}| j        rt          j        |||          }n=|| j	        z  }||
                    dd          z  }	|                     |	          }	|	|z  }|
                    dd                              |||          }|                     |          }|S )Nr   r!   r   r   r@   rj   ri   )rJ   rd   rl   re   rm   rn   r   rK   scaled_dot_product_attentionry   ro   rp   r*   )
r-   r1   B_rr   rN   rd   rs   r   rt   ru   s
             r/   r5   zWindowAttention.forward*  s   7Aqhhqkk!!"aDNA<OPPXXYZ\]_`bcefgg**Q--1a? 	.q!Q77AADJAB+++D<<%%DqAKK1%%b!Q//IIaLLr0   T)r6   r7   r8   rS   torchjitFinalr:   __annotations__r'   r   r5   r;   r<   s   @r/   r   r     s{           	%%%%* * * * * *        r0   r   c                   T     e Zd ZdZddddej        ej        ddf fd	Zdefd	Z	 xZ
S )
SpatialBlocka<   Windows Block.
    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    rB   r}   Tr~   Fc                    t                                                       || _        |	| _        || _        t          |          | _        || _        t          |d|
          | _	         ||          | _
        t          || j        ||          | _        |dk    rt          |          nt          j                    | _        t          |d|
          | _        | j        rh ||          | _        t'          ||z            }t)          |||          | _        |dk    rt          |          nt          j                    | _        d S d | _        d | _        d | _        d S r   )r&   r'   r   r   re   r   r   r   r   r   r   r   ru   r
   r(   r,   r   r   r   r9   r   r   r   )r-   r   re   r   r   rf   r   r   rH   r   r   mlp_hidden_dimr.   s               r/   r'   zSpatialBlock.__init__J  sU    	"$[11"3!999	Z__
#	
 
 
	 2;R(9---R[]]3!999	8 	##CDJ y11N .#  DH
 6?^^hy111DOOODJDH"DOOOr0   r1   c           	         |j         \  }}}}|                     |                              d                              dd          }|                     |          }|                    ||||          }dx}}| j        d         || j        d         z  z
  | j        d         z  }	| j        d         || j        d         z  z
  | j        d         z  }
t          j        |dd||	||
f          }|j         \  }}}}t          || j                  }|                    d| j        d         | j        d         z  |          }| 
                    |          }|                    d| j        d         | j        d         |          }t          || j        ||          }|d d d |d |d d f                                         }|                    |||z  |          }||                     |          z   }|                     |                    dd                              ||||                    }| j        |                    d                              dd          }||                     |                     |                     |                              z   }|                    dd                              ||||          }|S )Nr!   r   r   ri   )rJ   r   r   ro   r   r   r   rK   rL   r   ru   r   r   r   r   r   r   r   )r-   r1   rM   rN   rO   rP   shortcutpad_lpad_trQ   rR   _HpWp	x_windowsattn_windowss                   r/   r5   zSpatialBlock.forwardw  s   W
1a99Q<<''**44Q::JJx  FF1aA!!$q4+;A+>'>>$BRSTBUU!!$q4+;A+>'>>$BRSTBUUE!aE5%788w2r1$Q(899	NN2t'7':T=Ma=P'PRSTT	 yy++ $((T-=a-@$BRSTBUWXYY<)92rBB aaa!RaRlO&&((FF1a!eQtq)))IIakk!Q'',,Q1a88998		!&&q!,,ADOODHHTZZ]]$;$;<<<AAq!!&&q!Q22Ar0   )r6   r7   r8   rS   r(   r+   r   r'   r   r5   r;   r<   s   @r/   r   r   =  s        
 
  g|+# +# +# +# +# +#Z% % % % % % % % %r0   r   c                        e Zd Zddddddddeej        ddd	ddf fd
	Zej        j	        dd            Z
defdZ xZS )
DaVitStager   Tspatialchannelr   rB   r}   )r   r   Fr!   c                    t                                                       d| _        |rt          ||||          | _        nt          j                    | _        	 g }t          |          D ]}ddlm	} g }t          |          D ]x\  }}|dk    r4|                    dt          ||||	|
|         ||||	  	        f           ?|dk    r3|                    d	t          ||||	|
|         ||||
	  	        f           y|r1|                    t          j         ||                               |                    t          j        d |D                         t          j        | | _        d S )NF)r"   rH   r   )OrderedDictr   spatial_block)	r   re   r   rf   r   rH   r   r   r   r   channel_block)	r   re   r   rf   r   rH   r   r   r   c                     g | ]
}|d          S )r    ).0bs     r/   
<listcomp>z'DaVitStage.__init__.<locals>.<listcomp>  s    3W3W3WQAaD3W3W3Wr0   )r&   r'   grad_checkpointingrU   
downsampler(   r,   rangecollectionsr   	enumerateappendr   r|   
Sequentialblocks)r-   rD   rE   depthr   
attn_typesre   r   r   rf   drop_path_ratesrH   norm_layer_clr   r   down_kernel_sizenamed_blockschannel_attn_v2stage_blocks	block_idxr   dual_attention_blockattn_idx	attn_typer.   s                           r/   r'   zDaVitStage.__init__  s   ( 	"'  	,(FVcmnnnDOO kmmDO	 u 	Z 	ZI//////#% '0'<'<  #)	))(//,#"+"+!)"1)"<#0 '$/
C 
C 
C 
1 
 
 
 
 )++(//,#"+"+!)"1)"<#0 '*
C 
C 
C 
1 
 
 
  Z##BM++>R2S2S$T$TUUUU##BM3W3WBV3W3W3W$XYYYYm\2r0   c                     || _         d S r3   )r   )r-   enables     r/   set_grad_checkpointingz!DaVitStage.set_grad_checkpointing  s    "(r0   r1   c                     |                      |          }| j        r4t          j                                        st          | j        |          }n|                     |          }|S r3   )r   r   r   r   is_scriptingr   r   r-   r1   s     r/   r5   zDaVitStage.forward  sZ    OOA" 	59+A+A+C+C 	t{A..AAAAr0   r   )r6   r7   r8   r   r(   r   r'   r   r   ignorer   r   r5   r;   r<   s   @r/   r   r     s        
 -"",!%F3 F3 F3 F3 F3 F3P Y) ) ) )        r0   r   c                       e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d  fd	Zd Zej        j        d!d            Z	ej        j        d"d            Z
ej        j        dej        fd            Zd#dedee         fdZd Zd!defdZd Z xZS )$r   a   DaViT
        A PyTorch implementation of `DaViT: Dual Attention Vision Transformers`  - https://arxiv.org/abs/2204.03645
        Supports arbitrary input sizes and pyramid feature extraction
        
    Args:
        in_chans (int): Number of input image channels. Default: 3
        num_classes (int): Number of classes for classification head. Default: 1000
        depths (tuple(int)): Number of blocks in each stage. Default: (1, 1, 3, 1)
        embed_dims (tuple(int)): Patch embedding dimension. Default: (96, 192, 384, 768)
        num_heads (tuple(int)): Number of attention heads in different layers. Default: (3, 6, 12, 24)
        window_size (int): Window size. Default: 7
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        drop_path_rate (float): Stochastic depth rate. Default: 0.1
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
    r   r   r   r   r   r?           r            rB   r@   Tlayernorm2d	layernormh㈵>r   Fr!   r~     avgc                    t                                                       t          |          }|t          |          cxk    rt          |          k    sn J t          t	          |          |
          }t          t	          |	          |
          }	|| _        |d         x| _        | _        || _        d| _	        g | _
        t          ||d         |          | _        |d         }d t          j        d|t          |                                        |          D             }g }t#          |          D ]w}||         }t%          ||f||         |dk    |||         |||||         ||	|||||d}|}|                    |           | xj
        t)          |dd	| 
          gz  c_
        xt+          j        | | _        |r8 || j                  | _        t3          | j        ||| j                  | _        n;t+          j                    | _        t9          | j        ||| j        |          | _        |                     | j                   d S )N)epsri   Fr   )rH   c                 6    g | ]}|                                 S r   )tolist)r   r1   s     r/   r   z"DaVit.__init__.<locals>.<listcomp>-  s     ```aqxxzz```r0   )r   r   r   re   r   r   rf   r   rH   r   r   r   r   r   r   r!   zstages.)num_chs	reductionmodule)	pool_type	drop_rate)r   r   rH   )r&   r'   lenr   r   num_classesnum_featureshead_hidden_sizer   r   feature_infor>   stemr   linspacesumsplitr   r   r   dictr(   r   stagesnorm_prer   headr,   r   apply_init_weights)r-   in_chansdepths
embed_dimsre   r   r   rf   rH   r   norm_epsr   r   r   r   r   r   r   drop_path_rater   global_poolhead_norm_first
num_stagesrD   dprr  	stage_idxrE   stager.   s                                r/   r'   zDaVit.__init__  s   0 	__
S^^::::s6{{::::::^J77XFFF
} = =8LLL&4>rNBD1""':a=ZHHH	A``5>!^S[[#Q#Q#W#WX^#_#_```z** 	d 	dI +G Y'$q=%#I.'#! #I%+!1 /)#  E& FMM%   $w!LaV_LaLa"b"b"b!ccmV,
  	&Jt'899DM&!%.	  DII KMMDM-!%.%  DI 	

4%&&&&&r0   c                     t          |t          j                  r^t          |j        d           t          |t          j                  r0|j        +t          j                            |j        d           d S d S d S d S )Ng{Gz?)stdr   )
isinstancer(   rc   r   weightr`   init	constant_)r-   ms     r/   r  zDaVit._init_weights`  s    a## 	-!(,,,,!RY'' -AF,>!!!&!,,,,,	- 	-- -,>,>r0   c                 0    t          d|rdng d          S )Nz^stemz^stages\.(\d+)))z^stages\.(\d+).downsample)r   )z^stages\.(\d+)\.blocks\.(\d+)N)z	^norm_pre)i )r   r   )r  )r-   coarses     r/   group_matcherzDaVit.group_matcherf  s9    (. $$ 5 5 5
 
 
 	
r0   c                 T    || _         | j        D ]}|                    |           d S )N)r   )r   r  r   )r-   r   r  s      r/   r   zDaVit.set_grad_checkpointingq  s?    "([ 	8 	8E(((7777	8 	8r0   returnc                     | j         j        S r3   )r  fc)r-   s    r/   get_classifierzDaVit.get_classifierw  s    y|r0   Nr   r  c                 <    | j                             ||           d S r3   )r  reset)r-   r   r  s      r/   reset_classifierzDaVit.reset_classifier{  s    	[11111r0   c                     |                      |          }| j        r4t          j                                        st          | j        |          }n|                     |          }|                     |          }|S r3   )r   r   r   r   r   r   r  r  r   s     r/   forward_featureszDaVit.forward_features~  si    IIaLL" 	59+A+A+C+C 	t{A..AAAAMM!r0   
pre_logitsc                 ^    |r|                      |d          n|                      |          S )NT)r(  )r  )r-   r1   r(  s      r/   forward_headzDaVit.forward_head  s,    0:Ltyyty,,,		!Lr0   c                 Z    |                      |          }|                     |          }|S r3   )r'  r*  r   s     r/   r5   zDaVit.forward  s-    !!!$$a  r0   )r   r   r   r   rB   r@   Tr   r   r   r   TFr!   FFr~   r~   r   r   FFr   r3   )r6   r7   r8   rS   r'   r  r   r   r   r  r   r(   Moduler"  r9   r   strr%  r'  r:   r*  r5   r;   r<   s   @r/   r   r     s        & *$$%-!!-W' W' W' W' W' W'r- - - Y
 
 
 
 Y8 8 8 8
 Y	    2 2C 2hsm 2 2 2 2  M M$ M M M M      r0   vision_tower.c                    dd l }i }|                                 D ]B\  }}|                    |          r|                    |d          }n3|                    dd|          }|                    dd|          }|                    dd          }|                    d	d
          }|                    dd          }|                    dd          }|                    dd          }|                    dd          }|                    dd          }|                    dd          }|                    dd          }|                    dd          }|||<   D|S )Nr    zconvs.([0-9]+)stages.\1.downsamplezblocks.([0-9]+)stages.\1.blocksdownsample.projdownsample.convstages.0.downsampler   zwindow_attn.norm.znorm1.zwindow_attn.fn.zattn.zchannel_attn.norm.zchannel_attn.fn.z	ffn.norm.znorm2.zffn.fn.net.zmlp.zconv1.fn.dwz	cpe1.projzconv2.fn.dwz	cpe2.proj)reitems
startswithreplacesub)
state_dictmodelprefixr7  out_dictr   rt   s          r/   _convert_florence2r@    s`   IIIH  ""  1<< 			&"%%AAFF$&=qAAFF%':A>>II'):;;II+V44 II)844II'11II*H55II('22IIk8,,IImV,,IIm[11IIm[11Or0   c                     d| v r| S d| v r| d         } d| v rt          | |          S ddl}i }|                                 D ]\  }}|                    dd|          }|                    dd	|          }|                    d
d          }|                    dd          }|                    dd          }|                    dd          }|                    dd          }|                    dd          }|||<   |S )z  Remap MSFT checkpoints -> timm zhead.fc.weightr<  z vision_tower.convs.0.proj.weightr   Nzpatch_embeds.([0-9]+)r2  zmain_blocks.([0-9]+)r3  r4  r5  r6  r   zhead.zhead.fc.znorms.z
head.norm.zcpe.0r   zcpe.1r   )r@  r7  r8  r;  r:  )r<  r=  r7  r?  r   rt   s         r/   checkpoint_filter_fnrB    s#   :%%z!!-
)Z77!*e444IIIH  "" 	 	1FF+-DaHHFF*,?CCII'):;;II+V44IIgz**IIh--IIgv&&IIgv&&Or0   Fc           	      X   t          d t          |                    dd                    D                       }|                    d|          }|                    dd          }|                     d          rd}t          t          | |ft          t          d|	          |d
|}|S )Nc              3       K   | ]	\  }}|V  
d S r3   r   )r   ir   s      r/   	<genexpr>z _create_davit.<locals>.<genexpr>  s&      \\da\\\\\\r0   r
  r   out_indicespretrained_strictT_flF)flatten_sequentialrG  )pretrained_filter_fnfeature_cfgrH  )	tupler   getpopendswithr   r   rB  r  )variant
pretrainedkwargsdefault_out_indicesrG  strictr=  s          r/   _create_davitrV    s    \\i

8\8Z8Z.[.[\\\\\**],?@@KZZ+T22F   2DkJJJ    E Lr0   r1  c                 4    | dddddt           t          ddd
|S )	Nr   )r      rX  )rB   rB   gffffff?bicubicz	stem.convzhead.fc)
urlr   
input_size	pool_sizecrop_pctinterpolationmeanr  
first_conv
classifierr   )rZ  rS  s     r/   _cfgrb    s5    =v9%.B!   r0   ztimm/)	hf_hub_idzmicrosoft/Florence-2-base)r   r   r   )rc  r   r[  zmicrosoft/Florence-2-large)zdavit_tiny.msft_in1kzdavit_small.msft_in1kzdavit_base.msft_in1kdavit_large
davit_hugedavit_giantzdavit_base_fl.msft_florence2zdavit_huge_fl.msft_florence2r  c           	      X    t          ddd          }t          dd| it          |fi |S )Nr   r   r   r
  r  re   
davit_tinyrR  )ri  r  rV  rR  rS  
model_argss      r/   ri  ri    sB    \6IUcdddJ[[*[Z@Z@ZSY@Z@Z[[[r0   c           	      X    t          ddd          }t          dd| it          |fi |S )Nr   r   	   r   r   r   rh  davit_smallrR  )rp  rj  rk  s      r/   rp  rp    sB    \6IUcdddJ\\:\jA[A[TZA[A[\\\r0   c           	      X    t          ddd          }t          dd| it          |fi |S )Nrn              r@   r]          rh  
davit_baserR  )rz  rj  rk  s      r/   rz  rz    sB    \6KWefffJ[[*[Z@Z@ZSY@Z@Z[[[r0   c           	      X    t          ddd          }t          dd| it          |fi |S )Nrn  )r   r   r      )r   r   r   0   rh  rd  rR  )rd  rj  rk  s      r/   rd  rd    sB    \6KWfgggJ\\:\jA[A[TZA[A[\\\r0   c           	      X    t          ddd          }t          dd| it          |fi |S )Nrn  rt  ru  rv  i   r]   rx  ry  @   rh  re  rR  )re  rj  rk  s      r/   re  re    sB    \6LXghhhJ[[*[Z@Z@ZSY@Z@Z[[[r0   c           	      X    t          ddd          }t          dd| it          |fi |S )N)r   r   r   r   )r   r   r|  i   )r   r   r}  r?   rh  rf  rR  )rf  rj  rk  s      r/   rf  rf    sB    ]7MYijjjJ\\:\jA[A[TZA[A[\\\r0   c           	      `    t          ddddddd          }t          d
d	| it          |fi |S )Nrn  rr  rw  r   r   Tr
  r  re   r   r   r   r   davit_base_flrR  )r  rj  rk  s      r/   r  r     sS    (=Dt  J ^^Z^4
C]C]V\C]C]^^^r0   c           	      `    t          ddddddd          }t          d
d	| it          |fi |S )Nrn  r  r  r   r   Tr  davit_huge_flrR  )r  rj  rk  s      r/   r  r  )  sU     (>/Dt  J ^^Z^4
C]C]V\C]C]^^^r0   )r/  r,  )r1  )?rS   	functoolsr   typingr   r   r   torch.nnr(   torch.nn.functional
functionalrK   r   	timm.datar   r	   timm.layersr
   r   r   r   r   r   r   r   r   _builderr   _features_fxr   _manipulater   	_registryr   r   __all__r-  r   r>   rU   r\   rw   r|   r9   r   r   r   r   r   r   r@  rB  rV  rb  default_cfgsri  rp  rz  rd  re  rf  r  r  r   r0   r/   <module>r     s#  	 	       " " " " " " " "                       A A A A A A A A l l l l l l l l l l l l l l l l l l = = = = = = = = * * * * * * 3 3 3 3 3 3 ' ' ' ' ' ' < < < < < < < <)       (" " " " "29 " " "J       D       >    ry   4: : : : :29 : : :z U38_     F sCx S S     ) ) ) ) )bi ) ) )X_ _ _ _ _29 _ _ _DS S S S S S S SlX X X X XBI X X Xv   8  4   *    %$ D  !T   D  466$&&466$(D--%1 %1 %1 %)D.-%1 %1 %1& &  ( \ \e \ \ \ \
 ] ]u ] ] ] ]
 \ \e \ \ \ \
 ] ]u ] ] ] ]
 \ \e \ \ \ \
 ] ]u ] ] ] ] _ _ _ _ _ _ _ _ _ _ _ _ _ _r0   