
    NgLF                        d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
 ddlZddlmZ ddlmZmZ ddlmZmZmZmZmZmZ ddlmZ dd	lmZmZmZ dd
lmZm Z  ddl!m"Z"m#Z# ddl$m%Z%  G d dej&                  Z'dcdZ(dddZ)	 	 dedee*ej+        f         de%de*de,dee*ej+        f         f
dZ-dfdZ.dgdZ/ ei d e/d d!dd"#          d$ e/d%d!d"d&d'd(          d) e/d*d!d+          d, e/d-d!d&d'd.          d/ e/            d0 e/d1d!d&d'2          d3 e/d4d!d+          d5 e/d6d!d&d'd.          d7 e/d8d!d9d:d"d;          d< e/d=d!d9d:d>          d? e/d!dd:@          dA e/dBd!d9d:d>          dC e/eedDE          dF e/eedDE          dG e/eedDE          dH e/eedDE          dI e/dJdKdLdMdNdOP          dQ e/dRdSdLdMdNdOP          i          Z0edhde%fdT            Z1edhde%fdU            Z2edhde%fdV            Z3edhde%fdW            Z4edhde%fdX            Z5edhde%fdY            Z6edhde%fdZ            Z7edhde%fd[            Z8edhde%fd\            Z9edhde%fd]            Z:edhde%fd^            Z;edhde%fd_            Z<edhde%fd`            Z=edhde%fda            Z> ee?d7d<d?d?dAd0db           dS )ia   Hybrid Vision Transformer (ViT) in PyTorch

A PyTorch implement of the Hybrid Vision Transformers as described in:

'An Image Is Worth 16 x 16 Words: Transformers for Image Recognition at Scale'
    - https://arxiv.org/abs/2010.11929

`How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers`
    - https://arxiv.org/abs/2106.10270

NOTE These hybrid model definitions depend on code in vision_transformer.py.
They were moved here to keep file sizes sane.

Hacked together by / Copyright 2020, Ross Wightman
    N)partial)DictListOptionalTupleTypeUnion)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)StdConv2dSame	StdConv2dConvNormAct	to_2tuple	to_ntupleHybridEmbed   )build_model_with_cfg)generate_default_cfgsregister_modelregister_model_deprecations)	resnet26d	resnet50d)ResNetV2create_resnetv2_stem)VisionTransformerc                       e Zd Zddddddej        ej        fdededeeeedf         f         d	eeeedf         f         d
eeeedf         f         dee	eeedf         f         de
ej                 de
ej                 f fdZ xZS )ConvStem   @   )   r    r     in_chansdepthchannels.kernel_sizestridepadding
norm_layer	act_layerc	                    t                                                       t          t                    r1t	          fdt          |          D             d d d                    t          |          |          } t          |          |          }|t          |          cxk    r't          |          cxk    rt                    k    sn J |}	t          t                              D ]h}
|
t                    dz
  k    }|                     |
 t          |	|
         ||
         ||
         ||
         || | ||
  
                   |
         }	id S )Nc                      g | ]
}d |z  z  S )r     ).0ir$   s     a/var/www/html/ai-engine/env/lib/python3.11/site-packages/timm/models/vision_transformer_hybrid.py
<listcomp>z%ConvStem.__init__.<locals>.<listcomp>0   s"    EEE1h!Q$.EEE    r   )r%   r&   r'   bias
apply_norm	apply_actr(   r)   )
super__init__
isinstanceinttupleranger   len
add_moduler   )selfr"   r#   r$   r%   r&   r'   r(   r)   in_chsr.   	last_conv	__class__s      `        r/   r7   zConvStem.__init__"   s    	h$$ 	MEEEEeEEEdddKLLH&i&&{33")E""7++FHHHHs;'7'7HHHH3x==HHHHHHs8}}%% 	! 	!AS]]Q..IOOqFK'Nay
(='-%#% % %    a[FF	! 	!r1   )__name__
__module____qualname__nnBatchNorm2dReLUr9   r	   r   strr   Moduler7   __classcell__)rA   s   @r/   r   r   !   s        46782;8:*,.)+#! #!#! #! CsCx01	#!
 sE#s(O34#! #uS#X./#! 3U38_45#! RY#! BI#! #! #! #! #! #! #! #! #! #!r1   r   r      	   c           	      \   |                     dd          }|rdnd}|rt          t          d          nt          t          d          }t	          |           r+t          | dd|                     dd	          d
||          }n't          |                     dd	          |d
|          }|S )z ResNet-V2 backbone helperpadding_sameTsamer!   g:0yE>)epsr   r"   r   F)layersnum_classesglobal_poolr"   preact	stem_type
conv_layer)rV   rU   rW   )getr   r   r   r<   r   r   )rR   kwargsrO   rV   rW   backbones         r/   	_resnetv2r[   H   s    ::nd33L&.BI5AcD1111wy^bGcGcGcJ
6{{ aqb6::jZ[C\C\I*F F F (JJz1%%5U_a a aOr1   image_encoder.model.c                    i }|                                  D ]\  }}|                    |          s|                    |d          }|                    dd          }|                    dd          }|                    dd          }|                    dd	          }|                    d
d          }|                    dd          }|                    dd          }|                    dd          }|                    dd          }|                    dd          }|                    dd          }|                    dd          }|dk    rd}|                    d          }d|v rU|                    dd          }|                    dd          }|j        }t          j        |j        d                   ||<   |||<   |S ) Nr!   z
patch_emb.zpatch_embed.backbone.z
block.convconvz
block.normbnzpost_transformer_norm.znorm.zpre_norm_mha.0norm1zpre_norm_mha.1attnzpre_norm_ffn.0norm2zpre_norm_ffn.1zmlp.fc1zpre_norm_ffn.4zmlp.fc2z	qkv_proj.zqkv.z	out_proj.zproj.ztransformer.zblocks.zpos_embed.pos_embed.pos_embed	pos_embedr   zclassifier.projz	head.biaszhead.weight)items
startswithreplacesqueezeTtorchzerosshape)
state_dictmodelprefixoutkvbias_ks          r/   _convert_mobilecliprs   W   s   
C  ""  1||F## 	IIfb!!IIl$;<<IIlF++IIlD))II.88II&00II&//II&00II&	22II&	22IIk6**IIk7++IIni00///A		!A!!YY0+>>F		+];;AA+agaj11CKAJr1   bicubicTrl   rm   interpolation	antialiasreturnc                 T    ddl m} d| v rt          | |          }  || |||          S )Nr   )checkpoint_filter_fnz1image_encoder.model.patch_emb.0.block.conv.weight)ru   rv   )vision_transformerry   rs   )rl   rm   ru   rv   
_filter_fns        r/   ry   ry   u   sK     GFFFFF:jHH(U;;
:j%}PYZZZZr1   Fc                    |                     dd          }|pi }t          t          fd|i|}|                    d|           |                    dd           t	          t
          | |ft          t          |d          d	|S )
Nout_indicesr   rZ   embed_layer
patch_sizer   getter)r}   feature_cls)pretrained_filter_fnfeature_cfg)popr   r   
setdefaultr   r   ry   dict)variantrZ   
embed_args
pretrainedrY   r}   r~   s          r/   !_create_vision_transformer_hybridr      s    **]A..K!rJ+GGGJGGK
m[111
lA&&& 2[hGGG    r1   r!   c                 "    | ddd dddddddd	|S )
Ni  )r      r   ?rt   T)      ?r   r   zpatch_embed.backbone.stem.convhead)urlrS   
input_size	pool_sizecrop_pctru   fixed_input_sizemeanstd
first_conv
classifierr,   )r   rY   s     r/   _cfgr      s6    =t6f   r1   z*vit_tiny_r_s16_p8_224.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npzztimm/zpatch_embed.backbone.conv)r   	hf_hub_idcustom_loadr   z*vit_tiny_r_s16_p8_384.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz)r     r         ?)r   r   r   r   r   r   z*vit_small_r26_s32_224.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R26_S_32-i21k-300ep-lr_0.001-aug_light0-wd_0.03-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.03-res_224.npz)r   r   r   z*vit_small_r26_s32_384.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R26_S_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz)r   r   r   r   r   zvit_base_r26_s32_224.untrainedz'vit_base_r50_s16_384.orig_in21k_ft_in1kzthttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_resnet50_384-9fd3c705.pth)r   r   r   r   z*vit_large_r50_s32_224.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R50_L_32-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_224.npzz*vit_large_r50_s32_384.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/R50_L_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npzz"vit_tiny_r_s16_p8_224.augreg_in21kzohttps://storage.googleapis.com/vit_models/augreg/R_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npziSU  r   )r   r   rS   r   r   r   z"vit_small_r26_s32_224.augreg_in21kzshttps://storage.googleapis.com/vit_models/augreg/R26_S_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.03-do_0.0-sd_0.0.npz)r   r   rS   r   r   zvit_base_r50_s16_224.orig_in21k)r   rS   r   z"vit_large_r50_s32_224.augreg_in21kzrhttps://storage.googleapis.com/vit_models/augreg/R50_L_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.0-sd_0.0.npzz!vit_small_resnet26d_224.untrainedzpatch_embed.backbone.conv1.0)r   r   r   z%vit_small_resnet50d_s16_224.untrainedz vit_base_resnet26d_224.untrainedz vit_base_resnet50d_224.untrainedzvit_base_mci_224.apple_mclip_ltzapple/mobileclip_b_lt_timmzYhttps://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_blt.pti   )        r   r   )r   r   r   zpatch_embed.backbone.0.conv)r   r   rS   r   r   r   zvit_base_mci_224.apple_mclipzapple/mobileclip_b_timmzWhttps://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_b.ptc           	      ~    t          dddi|}t          dddd          }t          	 d
|| d	t          |fi |}|S )z3 R+ViT-Ti/S16 w/ 8x8 patch hybrid @ 224 x 224.
    rR   r,            r   r   	embed_dimr#   	num_headsvit_tiny_r_s16_p8_224rZ   r   )r   r[   r   r   r   rY   rZ   
model_argsrm   s        r/   r   r      w     ---f--HcqIIIJ-i*2zi iMQR\MgMg`fMgMgi iELr1   c           	      ~    t          dddi|}t          dddd          }t          	 d
|| d	t          |fi |}|S )z3 R+ViT-Ti/S16 w/ 8x8 patch hybrid @ 384 x 384.
    rR   r,   r   r   r   r   r   vit_tiny_r_s16_p8_384r   )r   r   r   s        r/   r   r      r   r1   c           	      x    t          di |}t          ddd          }t          	 d	|| dt          |fi |}|S )
 R26+ViT-S/S32 hybrid.
    r    r    r    r    r   r      r   r#   r   vit_small_r26_s32_224r   r   )r   r   r   s        r/   r   r     p     0000H2;;;J-i*2zi iMQR\MgMg`fMgMgi iELr1   c           	      x    t          di |}t          ddd          }t          	 d	|| dt          |fi |}|S )
r   r   r   r   r   r   vit_small_r26_s32_384r   r   )r   r   r   s        r/   r   r     r   r1   c           	      x    t          di |}t          ddd          }t          	 d|| dt          |fi |}|S )	z R26+ViT-B/S32 hybrid.
    r      r   r   vit_base_r26_s32_224r   r   )r   r   r   s        r/   r   r     sp     0000H2<<<J-h)1jh hLPQ[LfLf_eLfLfh hELr1   c           	      x    t          di |}t          ddd          }t          	 d|| dt          |fi |}|S )	zR R50+ViT-B/S16 hybrid from original paper (https://arxiv.org/abs/2010.11929).
    rK   r   r   r   vit_base_r50_s16_224r   rK   )r   r   r   s        r/   r   r   $  sp     --f--H2<<<J-h)1jh hLPQ[LfLf_eLfLfh hELr1   c           	      x    t          di |}t          ddd          }t          	 d|| dt          |fi |}|S )	z R50+ViT-B/16 hybrid from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
    rK   r   r   r   vit_base_r50_s16_384r   r   )r   r   r   s        r/   r   r   /  sp    
 --f--H2<<<J-h)1jh hLPQ[LfLf_eLfLfh hELr1   c           	      x    t          di |}t          ddd          }t          	 d	|| dt          |fi |}|S )
 R50+ViT-L/S32 hybrid.
    r   rL   r   r            r   vit_large_r50_s32_224r   r   )r   r   r   s        r/   r   r   ;  p     0000HB"===J-i*2zi iMQR\MgMg`fMgMgi iELr1   c           	      x    t          di |}t          ddd          }t          	 d	|| dt          |fi |}|S )
r   r   r   r   r   r   vit_large_r50_s32_384r   r   )r   r   r   s        r/   r   r   F  r   r1   c           	          t          | |                    dd          ddg          }t          dddd          }t          	 d|| d
t          |fi |}|S )zL Custom ViT small hybrid w/ ResNet26D stride 32. No pretrained weights.
    r"   r   TrL   r   r"   features_onlyr}   r   r   r   r#   r   	mlp_ratiovit_small_resnet26d_224r   )r   r   rX   r   r   r   s        r/   r   r   Q  s     JJPQ9R9RbfuvtwxxxH1QGGGJ-!k,4k kOST^OiOibhOiOik kELr1   c           	          t          | |                    dd          ddg          }t          dddd          }t          	 d
|| d	t          |fi |}|S )zV Custom ViT small hybrid w/ ResNet50D 3-stages, stride 16. No pretrained weights.
    r"   r   Tr   r   r   r   vit_small_resnet50d_s16_224r   )r   r   rX   r   r   r   s        r/   r   r   \  s     JJPQ9R9RbfuvtwxxxH1QGGGJ-%o08Zo oSWXbSmSmflSmSmo oELr1   c           	          t          | |                    dd          ddg          }t          ddd          }t          	 d|| d
t          |fi |}|S )zK Custom ViT base hybrid w/ ResNet26D stride 32. No pretrained weights.
    r"   r   TrL   r   r   r   r   vit_base_resnet26d_224r   )r   r   r   s        r/   r   r   g       JJPQ9R9RbfuvtwxxxH2<<<J- j+3
j jNRS]NhNhagNhNhj jELr1   c           	          t          | |                    dd          ddg          }t          ddd          }t          	 d|| d
t          |fi |}|S )K Custom ViT base hybrid w/ ResNet50D stride 32. No pretrained weights.
    r"   r   TrL   r   r   r   r   vit_base_resnet50d_224r   )r   r   r   s        r/   r   r   r  r   r1   c           
          t          dddd|                    dd          t          j                  }t	          dddd	
          }t          	 d|t	          d          | dt	          |fi |}|S )r   )r   r   r   )rL   r    r    r   r"   r   )r$   r&   r%   r'   r"   r)   r   r   T)r   r#   r   no_embed_classvit_base_mci_224F)proj)rZ   r   r   )r   )r   rX   rE   GELUr   r   r   s        r/   r   r   }  s     &J**'  H 2DQQQJ-%-$E:J:J:J !%j!;!;F!;!; E Lr1   )vit_tiny_r_s16_p8_224_in21kvit_small_r26_s32_224_in21kvit_base_r50_s16_224_in21kvit_base_resnet50_224_in21kvit_large_r50_s32_224_in21kvit_base_resnet50_384r   )r\   )rt   T)NF)r!   )F)@__doc__math	functoolsr   typingr   r   r   r   r   r	   ri   torch.nnrE   	timm.datar
   r   timm.layersr   r   r   r   r   r   _builderr   	_registryr   r   r   resnetr   r   resnetv2r   r   rz   r   
Sequentialr   r[   rs   rH   Tensorboolry   r   r   default_cfgsr   r   r   r   r   r   r   r   r   r   r   r   r   r   rB   r,   r1   r/   <module>r      s           ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ;        A A A A A A A A ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` * * * * * * Y Y Y Y Y Y Y Y Y Y ( ( ( ( ( ( ( ( 4 4 4 4 4 4 4 4 1 1 1 1 1 1$! $! $! $! $!r} $! $! $!N      B '	[ [el*+[ [ [ 	[
 
#u|
[ [ [ [        %$ L&0$$ f.	30 30 30L& 1$$ f.=SVdh3j 3j 3jL& 1$$ i3 3 3L&  1$$ j 3D3B 3B 3B!L&( %ddff)L&* .tt C 300 00 00+L&2 1$$ i3 3 33L&< 1$$ i 3D3 3 3=L&J )$$}C4O]a+c +c +cKL&R )$$ BCT+; +; +;SL&Z &tt(% (% (%[L&b )$$ ACT+; +; +;cL&n ("(<Ig*i *i *ioL&r ,TT"(<Ig.i .i .isL&v '"(<Ig)i )i )iwL&z '"(<Ig)i )i )i{L&@ &tt.g|8U	( ( (AL&L #DD+e|8U	% % %ML& L& L L^  9J      9J      9J      9J      8I      8I      8I      9J      9J      ;L      ?P      :K      :K      4E    &  H#G#G"C#D#GF' '     r1   