
    g              	       F   d Z ddlZddlmZmZmZmZmZ ddlZddl	Zddlm
Z
 ddlmZmZmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZmZmZmZm Z m!Z! ddl"m#Z#  ej$        e%          Z&dZ'dZ(g dZ)dZ*dZ+dCde,de,dee,         de,fdZ- G d de
j.                  Z/ G d de
j.                  Z0 G d de
j.                  Z1 G d de
j.                  Z2 G d  d!e
j.                  Z3 G d" d#e
j.                  Z4 G d$ d%e
j.                  Z5 G d& d'e
j.                  Z6 G d( d)e
j.                  Z7 G d* d+e
j.                  Z8 G d, d-e
j.                  Z9 G d. d/e
j.                  Z: G d0 d1e          Z;d2Z<d3Z= ed4e<           G d5 d6e;                      Z> ed7e<           G d8 d9e;                      Z? G d: d;e
j.                  Z@ G d< d=e
j.                  ZA G d> d?e
j.                  ZB ed@e<           G dA dBe;                      ZCdS )DzPyTorch MobileViT model.    N)DictOptionalSetTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings	torch_int   )MobileViTConfigr   zapple/mobilevit-small)r   i     r   ztabby, tabby catr   valuedivisor	min_valuereturnc                     ||}t          |t          | |dz  z             |z  |z            }|d| z  k     r||z  }t          |          S )a  
    Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
    original TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    N   g?)maxint)r   r   r    	new_values       l/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/mobilevit/modeling_mobilevit.pymake_divisibler(   >   s^     	Is57Q;#6777BWLMMI3;W	y>>    c                        e Zd Z	 	 	 	 	 	 ddededededed	ed
edededeeef         ddf fdZde	j
        de	j
        fdZ xZS )MobileViTConvLayerr   FTconfigin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr!   Nc                 n   t                                                       t          |dz
  dz            |z  }||z  dk    rt          d| d| d          ||z  dk    rt          d| d| d          t	          j        ||||||||d		  	        | _        |	rt	          j        |d
ddd          | _        nd | _        |
rjt          |
t                    rt          |
         | _        d S t          |j        t                    rt          |j                 | _        d S |j        | _        d S d | _        d S )Nr   r#   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r-   r.   r/   r0   paddingr3   r1   r2   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r%   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr   
activation
hidden_act)selfr,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r8   	__class__s               r'   r@   zMobileViTConvLayer.__init__N   sz    	{Q!+,,x71$$dddTZdddeee& A%%fffV\fffggg9#%# 

 

 

  		&!#)$(" " "D "&D 	#.#.. 4"("8F-s33 4"():";"("3"DOOOr)   featuresc                     |                      |          }| j        |                     |          }| j        |                     |          }|S N)rC   rE   rH   )rJ   rL   s     r'   forwardzMobileViTConvLayer.forward   sO    ##H--)))(33H?&x00Hr)   )r   r   Fr   TT)__name__
__module____qualname__r   r%   boolr   rG   r@   torchTensorrO   __classcell__rK   s   @r'   r+   r+   M   s         "&+/4# 4#4# 4# 	4#
 4# 4# 4# 4# 4#  4# dCi(4# 
4# 4# 4# 4# 4# 4#l         r)   r+   c                   d     e Zd ZdZ	 ddedededededd	f fd
Zdej        dej        fdZ	 xZ
S )MobileViTInvertedResidualzQ
    Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381
    r   r,   r-   r.   r0   r3   r!   Nc           	         t                                                       t          t          t	          ||j        z                      d          }|dvrt          d| d          |dk    o||k    | _        t          |||d          | _	        t          |||d|||          | _
        t          |||dd	
          | _        d S )Nr   )r   r#   zInvalid stride .r   r-   r.   r/   r   )r-   r.   r/   r0   r1   r3   Fr-   r.   r/   r5   )r?   r@   r(   r%   roundexpand_ratiorA   use_residualr+   
expand_1x1conv_3x3
reduce_1x1)rJ   r,   r-   r.   r0   r3   expanded_channelsrK   s          r'   r@   z"MobileViTInvertedResidual.__init__   s     	*3u[6CV5V/W/W+X+XZ[\\8v888999#q[K{l/J,:KYZ
 
 
 +)*$
 
 
 -)% 
 
 
r)   rL   c                     |}|                      |          }|                     |          }|                     |          }| j        r||z   n|S rN   )ra   rb   rc   r`   )rJ   rL   residuals      r'   rO   z!MobileViTInvertedResidual.forward   sR    ??8,,==**??8,,&*&7Ex(""XEr)   r   )rP   rQ   rR   __doc__r   r%   r@   rT   rU   rO   rV   rW   s   @r'   rY   rY      s         
 jk
 
%
47
GJ
TW
cf
	
 
 
 
 
 
BF F F F F F F F F Fr)   rY   c                   `     e Zd Z	 ddedededededdf fd	Zd
ej        dej        fdZ xZ	S )MobileViTMobileNetLayerr   r,   r-   r.   r0   
num_stagesr!   Nc                 
   t                                                       t          j                    | _        t          |          D ]9}t          ||||dk    r|nd          }| j                            |           |}:d S )Nr   r   )r-   r.   r0   )r?   r@   r   
ModuleListlayerrangerY   append)	rJ   r,   r-   r.   r0   rk   irn   rK   s	           r'   r@   z MobileViTMobileNetLayer.__init__   s     	]__
z"" 	' 	'A-')!"avvQ	  E Je$$$&KK	' 	'r)   rL   c                 0    | j         D ]} ||          }|S rN   rn   )rJ   rL   layer_modules      r'   rO   zMobileViTMobileNetLayer.forward   s)     J 	. 	.L#|H--HHr)   )r   r   
rP   rQ   rR   r   r%   r@   rT   rU   rO   rV   rW   s   @r'   rj   rj      s        op' '%'47'GJ'TW'il'	' ' ' ' ' '          r)   rj   c                   t     e Zd Zdededdf fdZdej        dej        fdZdej        dej        fd	Z	 xZ
S )
MobileViTSelfAttentionr,   hidden_sizer!   Nc                 4   t                                                       ||j        z  dk    rt          d|f d|j         d          |j        | _        t	          ||j        z            | _        | j        | j        z  | _        t          j        || j        |j	                  | _
        t          j        || j        |j	                  | _        t          j        || j        |j	                  | _        t          j        |j                  | _        d S )Nr   zThe hidden size z4 is not a multiple of the number of attention heads r[   )r2   )r?   r@   num_attention_headsrA   r%   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyr   Dropoutattention_probs_dropout_probdropoutrJ   r,   rx   rK   s      r'   r@   zMobileViTSelfAttention.__init__   s   33q887;. 7 737 7 7  
 $*#= #&{V5O'O#P#P !58PPY{D,>V_UUU
9[$*<6?SSSY{D,>V_UUU
z&"EFFr)   xc                     |                                 d d         | j        | j        fz   } |j        | }|                    dddd          S )Nr   r#   r   r   )sizerz   r{   viewpermute)rJ   r   new_x_shapes      r'   transpose_for_scoresz+MobileViTSelfAttention.transpose_for_scores   sM    ffhhssmt'?AY&ZZAFK yyAq!$$$r)   hidden_statesc                    |                      |          }|                     |                     |                    }|                     |                     |                    }|                     |          }t	          j        ||                    dd                    }|t          j        | j	                  z  }t          j                            |d          }|                     |          }t	          j        ||          }|                    dddd                                          }|                                d d         | j        fz   }	 |j        |	 }|S )Nr   dimr   r#   r   r   )r   r   r   r   rT   matmul	transposemathsqrtr{   r   
functionalsoftmaxr   r   
contiguousr   r|   r   )
rJ   r   mixed_query_layer	key_layervalue_layerquery_layerattention_scoresattention_probscontext_layernew_context_layer_shapes
             r'   rO   zMobileViTSelfAttention.forward   sD    JJ}55--dhh}.E.EFF	//

=0I0IJJ//0ABB !<Y5H5HR5P5PQQ+di8P.Q.QQ -//0@b/II ,,77_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S**,CDr)   )rP   rQ   rR   r   r%   r@   rT   rU   r   rO   rV   rW   s   @r'   rw   rw      s        G GS GT G G G G G G&%el %u| % % % %
U\ el        r)   rw   c                   P     e Zd Zdededdf fdZdej        dej        fdZ xZ	S )MobileViTSelfOutputr,   rx   r!   Nc                     t                                                       t          j        ||          | _        t          j        |j                  | _        d S rN   r?   r@   r   r}   denser   hidden_dropout_probr   r   s      r'   r@   zMobileViTSelfOutput.__init__  sD    Y{K88
z&"<==r)   r   c                 Z    |                      |          }|                     |          }|S rN   r   r   rJ   r   s     r'   rO   zMobileViTSelfOutput.forward  s*    

=11]33r)   ru   rW   s   @r'   r   r     sx        > >S >T > > > > > >
U\ el        r)   r   c                   l     e Zd Zdededdf fdZdee         ddfdZdej	        dej	        fd	Z
 xZS )
MobileViTAttentionr,   rx   r!   Nc                     t                                                       t          ||          | _        t	          ||          | _        t                      | _        d S rN   )r?   r@   rw   	attentionr   outputsetpruned_headsr   s      r'   r@   zMobileViTAttention.__init__  sM    /DD)&+>>EEr)   headsc                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   r   )lenr   r   rz   r{   r   r   r   r   r   r   r   r|   union)rJ   r   indexs      r'   prune_headszMobileViTAttention.prune_heads  s   u::??F74>5t~7Y[_[l
 
u
  2$.2FNN/0BEJJ1$.2FNN.t{/@%QOOO .2^-ORUV[R\R\-\*'+~'IDNLn'n$ -33E::r)   r   c                 Z    |                      |          }|                     |          }|S rN   )r   r   )rJ   r   self_outputsattention_outputs       r'   rO   zMobileViTAttention.forward,  s+    ~~m44;;|44r)   )rP   rQ   rR   r   r%   r@   r   r   rT   rU   rO   rV   rW   s   @r'   r   r     s        " "S "T " " " " " ";S ;d ; ; ; ;$ U\  el                r)   r   c                   T     e Zd Zdedededdf fdZdej        dej        fdZ xZ	S )	MobileViTIntermediater,   rx   intermediate_sizer!   Nc                     t                                                       t          j        ||          | _        t          |j        t                    rt          |j                 | _	        d S |j        | _	        d S rN   )
r?   r@   r   r}   r   rF   rI   rG   r   intermediate_act_fnrJ   r,   rx   r   rK   s       r'   r@   zMobileViTIntermediate.__init__3  si    Y{,=>>
f'-- 	9'-f.?'@D$$$'-'8D$$$r)   r   c                 Z    |                      |          }|                     |          }|S rN   )r   r   r   s     r'   rO   zMobileViTIntermediate.forward;  s,    

=1100??r)   ru   rW   s   @r'   r   r   2  s        9 9S 9UX 9]a 9 9 9 9 9 9U\ el        r)   r   c                   b     e Zd Zdedededdf fdZdej        dej        dej        fd	Z xZ	S )
MobileViTOutputr,   rx   r   r!   Nc                     t                                                       t          j        ||          | _        t          j        |j                  | _        d S rN   r   r   s       r'   r@   zMobileViTOutput.__init__B  sE    Y0+>>
z&"<==r)   r   input_tensorc                 d    |                      |          }|                     |          }||z   }|S rN   r   )rJ   r   r   s      r'   rO   zMobileViTOutput.forwardG  s4    

=11]33%4r)   ru   rW   s   @r'   r   r   A  s        > >S >UX >]a > > > > > >
U\  RWR^        r)   r   c                   T     e Zd Zdedededdf fdZdej        dej        fdZ xZ	S )	MobileViTTransformerLayerr,   rx   r   r!   Nc                 J   t                                                       t          ||          | _        t	          |||          | _        t          |||          | _        t          j	        ||j
                  | _        t          j	        ||j
                  | _        d S )Nr;   )r?   r@   r   r   r   intermediater   r   r   	LayerNormlayer_norm_epslayernorm_beforelayernorm_afterr   s       r'   r@   z"MobileViTTransformerLayer.__init__O  s    +FK@@1&+GXYY%fk;LMM "[f>S T T T!|KV=RSSSr)   r   c                     |                      |                     |                    }||z   }|                     |          }|                     |          }|                     ||          }|S rN   )r   r   r   r   r   )rJ   r   r   layer_outputs       r'   rO   z!MobileViTTransformerLayer.forwardW  sk    >>$*?*?*N*NOO(=8++M::((66{{<??r)   ru   rW   s   @r'   r   r   N  s        T TS TUX T]a T T T T T TU\ el        r)   r   c                   T     e Zd Zdedededdf fdZdej        dej        fdZ xZ	S )	MobileViTTransformerr,   rx   rk   r!   Nc           	         t                                                       t          j                    | _        t          |          D ]C}t          ||t          ||j        z                      }| j        	                    |           Dd S )N)rx   r   )
r?   r@   r   rm   rn   ro   r   r%   	mlp_ratiorp   )rJ   r,   rx   rk   _transformer_layerrK   s         r'   r@   zMobileViTTransformer.__init__b  s    ]__
z"" 	1 	1A 9'"%kF4D&D"E"E! ! !
 J/0000	1 	1r)   r   c                 0    | j         D ]} ||          }|S rN   rs   )rJ   r   rt   s      r'   rO   zMobileViTTransformer.forwardn  s*     J 	8 	8L(L77MMr)   ru   rW   s   @r'   r   r   a  s        
1 
1S 
1c 
1VZ 
1 
1 
1 
1 
1 
1U\ el        r)   r   c                        e Zd ZdZ	 ddedededededed	ed
df fdZdej        d
e	ej        e
f         fdZdej        de
d
ej        fdZdej        d
ej        fdZ xZS )MobileViTLayerz;
    MobileViT block: https://arxiv.org/abs/2110.02178
    r   r,   r-   r.   r0   rx   rk   r3   r!   Nc                 <   t                                                       |j        | _        |j        | _        |dk    r/t          ||||dk    r|nd|dk    r|dz  nd          | _        |}nd | _        t          ||||j                  | _	        t          |||ddd          | _
        t          |||          | _        t          j        ||j                  | _        t          |||d          | _        t          |d|z  ||j                  | _        d S )	Nr#   r   )r-   r.   r0   r3   r\   F)r-   r.   r/   r4   r5   )rx   rk   r   )r?   r@   
patch_sizepatch_widthpatch_heightrY   downsampling_layerr+   conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projectionfusion)	rJ   r,   r-   r.   r0   rx   rk   r3   rK   s	           r'   r@   zMobileViTLayer.__init__y  sg    	!,"-Q;;&?')!)QvvA*2Q,,QA' ' 'D# 'KK&*D#*#$/	
 
 
 +#$# 
 
 
 0#!
 
 
 kv7LMMM1+ST 
  
  
 )KkW]Wn
 
 
r)   rL   c                    | j         | j        }}t          ||z            }|j        \  }}}}t          j                                        r't          t	          j        ||z            |z            n&t          t          j        ||z            |z            }	t          j                                        r't          t	          j        ||z            |z            n&t          t          j        ||z            |z            }
d}|
|k    s|	|k    r't          j                            ||	|
fdd          }d}|
|z  }|	|z  }||z  }|                    ||z  |z  |||          }|                    dd          }|                    ||||          }|                    dd          }|                    ||z  |d          }||f||||||d	}||fS )
NFbilinearr   modealign_cornersTr   r#   r   r   )	orig_size
batch_sizechannelsinterpolatenum_patchesnum_patches_widthnum_patches_height)r   r   r%   shaperT   jit
is_tracingr   ceilr   r   r   r   reshaper   )rJ   rL   r   r   
patch_arear   r   orig_height
orig_width
new_height	new_widthr   num_patch_widthnum_patch_heightr   patches	info_dicts                    r'   	unfoldingzMobileViTLayer.unfolding  s   $($4d6G\|344
8@5
Hk: y##%%KIej|!;<<|KLLLTY{\9::\IJJ 	 y##%%HIejk!9::[HIIITYzK788;FGG 	 
""jK&?&?}00
I6ZW\ 1  H K ${2%5&8 ""!$44lOU`
 
 ##Aq))//*hZPP##Aq))//*z"9;KK &z2$ &&!0"2
 
	 	!!r)   r   r   c                    | j         | j        }}t          ||z            }|d         }|d         }|d         }|d         }	|d         }
|                                                    |||d          }|                    dd          }|                    ||z  |	z  |
||          }|                    dd	          }|                    |||	|z  |
|z            }|d
         r)t          j        	                    ||d         dd          }|S )Nr   r   r   r   r   r   r   r   r#   r   r   r   Fr   )
r   r   r%   r   r   r   r   r   r   r   )rJ   r   r   r   r   r   r   r   r   r   r   rL   s               r'   foldingzMobileViTLayer.folding  sA   $($4d6G\|344
|,
Z(.$%9:#$78 %%'',,Z[RTUU%%a++##!$44o|U`
 
 %%a++##"2\"A?U`C`
 
 ]# 	}00y5JV[ 1  H r)   c                    | j         r|                      |          }|}|                     |          }|                     |          }|                     |          \  }}|                     |          }|                     |          }|                     ||          }|                     |          }|                     t          j
        ||fd                    }|S Nr   r   )r   r   r   r   r   r   r  r   r   rT   cat)rJ   rL   rf   r   r   s        r'   rO   zMobileViTLayer.forward  s    " 	9..x88H ==**==** "^^H55 ""7++..)) <<33''11;;uy(H)=1EEEFFr)   rg   )rP   rQ   rR   rh   r   r%   r@   rT   rU   r   r   r   r  rO   rV   rW   s   @r'   r   r   t  s$         8
 8
8
 8
 	8

 8
 8
 8
 8
 
8
 8
 8
 8
 8
 8
t1"%, 1"5t9K3L 1" 1" 1" 1"fu|      :         r)   r   c                   `     e Zd Zdeddf fdZ	 	 ddej        ded	edee	e
f         fd
Z xZS )MobileViTEncoderr,   r!   Nc           	         t                                                       || _        t          j                    | _        d| _        dx}}|j        dk    rd}d}n|j        dk    rd}d}t          ||j	        d         |j	        d         dd          }| j        
                    |           t          ||j	        d         |j	        d         dd	          }| j        
                    |           t          ||j	        d         |j	        d	         d|j        d         d
          }| j        
                    |           |r|dz  }t          ||j	        d	         |j	        d         d|j        d         d|          }| j        
                    |           |r|dz  }t          ||j	        d         |j	        d         d|j        d         d	|          }	| j        
                    |	           d S )NFr   T   r   r   )r-   r.   r0   rk   r#   r   )r-   r.   r0   rx   rk      )r-   r.   r0   rx   rk   r3      )r?   r@   r,   r   rm   rn   gradient_checkpointingoutput_striderj   neck_hidden_sizesrp   r   hidden_sizes)rJ   r,   dilate_layer_4dilate_layer_5r3   layer_1layer_2layer_3layer_4layer_5rK   s             r'   r@   zMobileViTEncoder.__init__  s9   ]__
&+# +0/1$$!N!NN!R''!N)031!4
 
 
 	
'""")031!4
 
 
 	
'""" 031!4+A.
 
 
 	
'""" 	MH 031!4+A.
 
 
 	
'""" 	MH 031!4+A.
 
 
 	
'"""""r)   FTr   output_hidden_statesreturn_dictc                    |rdnd }t          | j                  D ]B\  }}| j        r#| j        r|                     |j        |          }n ||          }|r||fz   }C|st          d ||fD                       S t          ||          S )N c              3      K   | ]}||V  	d S rN   r  ).0vs     r'   	<genexpr>z+MobileViTEncoder.forward.<locals>.<genexpr>}  s"      XXq!-----XXr)   )last_hidden_stater   )	enumeratern   r  training_gradient_checkpointing_func__call__tupler   )rJ   r   r  r  all_hidden_statesrq   rt   s          r'   rO   zMobileViTEncoder.forwardh  s     #7@BBD(44 
	I 
	IOA|* <t} < $ A A )!! !
 !-] ; ;# I$58H$H! 	YXX]4E$FXXXXXX-]noooor)   )FT)rP   rQ   rR   r   r@   rT   rU   rS   r   r$  r   rO   rV   rW   s   @r'   r  r    s        H# H#4 H# H# H# H# H# H#Z &+ 	p p|p #p 	p
 
u44	5p p p p p p p pr)   r  c                   h    e Zd ZdZeZdZdZdZdgZ	de
ej        ej        ej        f         ddfd	ZdS )
MobileViTPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    	mobilevitpixel_valuesTr   moduler!   Nc                    t          |t          j        t          j        f          rT|j        j                            d| j        j                   |j	         |j	        j        
                                 dS dS t          |t          j                  r?|j	        j        
                                 |j        j                            d           dS dS )zInitialize the weightsg        )meanstdNg      ?)rF   r   r}   rB   weightdatanormal_r,   initializer_ranger2   zero_r   fill_)rJ   r*  s     r'   _init_weightsz&MobileViTPreTrainedModel._init_weights  s    fry")455 	* M&&CT[5R&SSS{& &&((((( '&-- 	*K""$$$M$$S)))))	* 	*r)   )rP   rQ   rR   rh   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr   r   r}   rB   r   r4  r  r)   r'   r'  r'    st         
 #L#$O&*#)*
*E")RY*L$M 
*RV 
* 
* 
* 
* 
* 
*r)   r'  aK  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`MobileViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
aF  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`MobileViTImageProcessor.__call__`] for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zWThe bare MobileViT model outputting raw hidden-states without any specific head on top.c                        e Zd Zddedef fdZd Z ee           e	e
eede          	 	 	 dd	eej                 d
ee         dee         deeef         fd                        Z xZS )MobileViTModelTr,   expand_outputc                 r   t                                          |           || _        || _        t	          ||j        |j        d         dd          | _        t          |          | _	        | j        r.t	          ||j        d         |j        d         d          | _
        |                                  d S )	Nr   r   r#   )r-   r.   r/   r0   r     r   r\   )r?   r@   r,   r<  r+   num_channelsr  	conv_stemr  encoderconv_1x1_exp	post_init)rJ   r,   r<  rK   s      r'   r@   zMobileViTModel.__init__  s       *++1!4
 
 
 (// 	 2"4Q7#5a8	! ! !D 	r)   c                     |                                 D ]U\  }}| j        j        |         }t          |t                    r)|j        j        D ]}|j                            |           VdS )zPrunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        N)itemsrA  rn   rF   r   r   r   r   )rJ   heads_to_prunelayer_indexr   mobilevit_layerr   s         r'   _prune_headszMobileViTModel._prune_heads  s     #1"6"6"8"8 	C 	CK"l0=O/>:: C)8)D)J C C%%/;;EBBBB		C 	Cr)   vision)
checkpointoutput_typer5  modalityexpected_outputNr)  r  r  r!   c                    ||n| j         j        }||n| j         j        }|t          d          |                     |          }|                     |||          }| j        r5|                     |d                   }t          j	        |ddgd          }n
|d         }d }|s|||fn|f}||dd          z   S t          |||j        	          S )
Nz You have to specify pixel_valuesr  r  r   r   r   F)r   keepdimr   )r  pooler_outputr   )r,   r  use_return_dictrA   r@  rA  r<  rB  rT   r,  r   r   )	rJ   r)  r  r  embedding_outputencoder_outputsr  pooled_outputr   s	            r'   rO   zMobileViTModel.forward  s'    %9$D  $+Jj 	 &1%<kk$+B]?@@@>>,77,,!5# ' 
 
  	! $ 1 1/!2D E E "J'8r2hPUVVVMM / 2 M 	0;H;T'77[lZnFOABB///7/')7
 
 
 	
r)   )T)NNN)rP   rQ   rR   r   rS   r@   rI  r   MOBILEVIT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   rT   rU   r   r$  rO   rV   rW   s   @r'   r;  r;    s       
  t      4C C C +*+EFF&<$.   04/3&*	'
 '
u|,'
 'tn'
 d^	'

 
u>>	?'
 '
 '
  GF'
 '
 '
 '
 '
r)   r;  z
    MobileViT model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    c                        e Zd Zdeddf fdZ ee           eee	e
e          	 	 	 	 ddeej                 dee         deej                 d	ee         deee	f         f
d
                        Z xZS )MobileViTForImageClassificationr,   r!   Nc                    t                                          |           |j        | _        t          |          | _        t          j        |j        d          | _        |j        dk    r%t          j	        |j
        d         |j                  nt          j                    | _        |                                  d S )NT)inplacer   r   )r?   r@   
num_labelsr;  r(  r   r   classifier_dropout_probr   r}   r  Identity
classifierrC  rJ   r,   rK   s     r'   r@   z(MobileViTForImageClassification.__init__  s        +'// z&"@$OOOJPJ[^_J_J_BIf.r2F4EFFFegeperer 	
 	r)   )rK  rL  r5  rN  r)  r  labelsr  c                    ||n| j         j        }|                     |||          }|r|j        n|d         }|                     |                     |                    }d}|Z| j         j        f| j        dk    rd| j         _        nN| j        dk    r7|j        t          j
        k    s|j        t          j        k    rd| j         _        nd| j         _        | j         j        dk    rWt                      }	| j        dk    r1 |	|                                |                                          }n |	||          }n| j         j        dk    rGt                      }	 |	|                    d| j                  |                    d                    }n*| j         j        dk    rt!                      }	 |	||          }|s|f|dd         z   }
||f|
z   n|
S t#          |||j        	          S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NrP  r   
regressionsingle_label_classificationmulti_label_classificationr   r#   )losslogitsr   )r,   rS  r(  rR  rb  r   problem_typer_  dtyperT   longr%   r   squeezer
   r   r	   r   r   )rJ   r)  r  rd  r  outputsrV  rj  ri  loss_fctr   s              r'   rO   z'MobileViTForImageClassification.forward%  s   ( &1%<kk$+B]..DXfq.rr1<L--'!*m!<!<=={'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FY,F)-)9TGf$$vE3!/
 
 
 	
r)   NNNN)rP   rQ   rR   r   r@   r   rW  r   _IMAGE_CLASS_CHECKPOINTr   rY  _IMAGE_CLASS_EXPECTED_OUTPUTr   rT   rU   rS   r   r$  rO   rV   rW   s   @r'   r\  r\    s        4       +*+EFF*8$4	   04/3)-&*4
 4
u|,4
 'tn4
 &	4

 d^4
 
u::	;4
 4
 4
  GF4
 4
 4
 4
 4
r)   r\  c                   T     e Zd Zdedededdf fdZdej        dej        fdZ xZ	S )	MobileViTASPPPoolingr,   r-   r.   r!   Nc           	          t                                                       t          j        d          | _        t          |||dddd          | _        d S )Nr   )output_sizeTrelu)r-   r.   r/   r0   r4   r5   )r?   r@   r   AdaptiveAvgPool2dglobal_poolr+   r   )rJ   r,   r-   r.   rK   s       r'   r@   zMobileViTASPPPooling.__init__d  s^    /A>>>*#%"!
 
 
r)   rL   c                     |j         dd          }|                     |          }|                     |          }t          j                            ||dd          }|S )Nr   r   Fr   )r   rz  r   r   r   r   )rJ   rL   spatial_sizes      r'   rO   zMobileViTASPPPooling.forwards  sZ    ~bcc*##H--==**=,,XLzin,oor)   ru   rW   s   @r'   ru  ru  c  s        
 
S 
PS 
X\ 
 
 
 
 
 
         r)   ru  c                   P     e Zd ZdZdeddf fdZdej        dej        fdZ xZ	S )MobileViTASPPzs
    ASPP module defined in DeepLab papers: https://arxiv.org/abs/1606.00915, https://arxiv.org/abs/1706.05587
    r,   r!   Nc                 v   t                                                       j        d         j        t	          j                  dk    rt          d          t          j                    | _	        t          dd          }| j	                            |           | j	                            fdj        D                        t                    }| j	                            |           t          dz  dd          | _        t          j        j        	          | _        d S )
Nr   r   z"Expected 3 values for atrous_ratesr   rx  r]   c           
      :    g | ]}t          d |d          S )r   rx  )r-   r.   r/   r3   r5   )r+   )r  rater,   r-   r.   s     r'   
<listcomp>z*MobileViTASPP.__init__.<locals>.<listcomp>  sL     
 
 
  # +!- !!#)  
 
 
r)   r  )p)r?   r@   r  aspp_out_channelsr   atrous_ratesrA   r   rm   convsr+   rp   extendru  projectr   aspp_dropout_probr   )rJ   r,   in_projection
pool_layerr-   r.   rK   s    `  @@r'   r@   zMobileViTASPP.__init__  s[   .r2/v"##q((ABBB]__
*#%!
 
 
 	
-(((

 
 
 
 
 
 #/
 
 
	
 	
 	
 *&+|LL

*%%%)L 0|YZkq
 
 
 zF$<===r)   rL   c                     g }| j         D ] }|                     ||                     !t          j        |d          }|                     |          }|                     |          }|S r  )r  rp   rT   r  r  r   )rJ   rL   pyramidconvpooled_featuress        r'   rO   zMobileViTASPP.forward  sq    J 	+ 	+DNN44>>****)G+++,,w//,,77r)   
rP   rQ   rR   rh   r   r@   rT   rU   rO   rV   rW   s   @r'   r~  r~  {  s|         )> )>4 )> )> )> )> )> )>V         r)   r~  c                   P     e Zd ZdZdeddf fdZdej        dej        fdZ xZ	S )MobileViTDeepLabV3zB
    DeepLabv3 architecture: https://arxiv.org/abs/1706.05587
    r,   r!   Nc           	          t                                                       t          |          | _        t	          j        |j                  | _        t          ||j	        |j
        dddd          | _        d S )Nr   FT)r-   r.   r/   r4   r5   r2   )r?   r@   r~  asppr   	Dropout2dr`  r   r+   r  r_  rb  rc  s     r'   r@   zMobileViTDeepLabV3.__init__  sq    !&))	|F$BCC,0*# 
 
 
r)   r   c                     |                      |d                   }|                     |          }|                     |          }|S )Nr   )r  r   rb  )rJ   r   rL   s      r'   rO   zMobileViTDeepLabV3.forward  s?    99]2.//<<))??8,,r)   r  rW   s   @r'   r  r    s{         
 
4 
 
 
 
 
 
 U\ el        r)   r  zX
    MobileViT model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                        e Zd Zdeddf fdZ ee           eee	          	 	 	 	 dde
ej                 de
ej                 de
e         d	e
e         deeef         f
d
                        Z xZS ) MobileViTForSemanticSegmentationr,   r!   Nc                     t                                          |           |j        | _        t          |d          | _        t          |          | _        |                                  d S )NF)r<  )r?   r@   r_  r;  r(  r  segmentation_headrC  rc  s     r'   r@   z)MobileViTForSemanticSegmentation.__init__  sa        +'eDDD!3F!;!; 	r)   )rL  r5  r)  rd  r  r  c                 B   ||n| j         j        }||n| j         j        }|| j         j        dk    rt	          d          |                     |d|          }|r|j        n|d         }|                     |          }d}|Vt          j	        
                    ||j        dd         dd	          }	t          | j         j        
          }
 |
|	|          }|s)|r|f|dd         z   }n|f|dd         z   }||f|z   n|S t          |||r|j        ndd          S )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, MobileViTForSemanticSegmentation

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-small")
        >>> model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTrP  r   r   Fr   )ignore_indexr#   )ri  rj  r   
attentions)r,   r  rS  r_  rA   r(  r   r  r   r   r   r   r
   semantic_loss_ignore_indexr   )rJ   r)  rd  r  r  ro  encoder_hidden_statesrj  ri  upsampled_logitsrp  r   s               r'   rO   z(MobileViTForSemanticSegmentation.forward  s   N %9$D  $+Jj 	 &1%<kk$+B]$+"8A"="=NOOO..!%# ! 
 
 :E T 5 5'RS*''(=>>!}88V\"##.Zu  9     (T[5[\\\H8,f55D 	F# 1 WQRR[0 WQRR[0)-)9TGf$$vE&3GQ'//T	
 
 
 	
r)   rq  )rP   rQ   rR   r   r@   r   rW  r   r   rY  r   rT   rU   rS   r   r$  rO   rV   rW   s   @r'   r  r    s	        4       +*+EFF+BQ`aaa 04)-/3&*K
 K
u|,K
 &K
 'tn	K

 d^K
 
u--	.K
 K
 K
 ba GFK
 K
 K
 K
 K
r)   r  )r   N)Drh   r   typingr   r   r   r   r   rT   torch.utils.checkpointr   torch.nnr	   r
   r   activationsr   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   r   r   r   configuration_mobilevitr   
get_loggerrP   loggerrY  rX  rZ  rr  rs  r%   r(   Moduler+   rY   rj   rw   r   r   r   r   r   r   r   r  r'  MOBILEVIT_START_DOCSTRINGrW  r;  r\  ru  r~  r  r  r  r)   r'   <module>r     s  "    4 4 4 4 4 4 4 4 4 4 4 4 4 4            A A A A A A A A A A ! ! ! ! ! !            . - - - - - Q Q Q Q Q Q Q Q                5 4 4 4 4 4 
	H	%	% $ . '  2 1  #  HSM UX    = = = = = = = =@-F -F -F -F -F	 -F -F -F`    bi   .0 0 0 0 0RY 0 0 0f	 	 	 	 	") 	 	 	               >    BI   
 
 
 
 
bi 
 
 
    	   &    29   &f f f f fRY f f fRbp bp bp bp bpry bp bp bpJ* * * * * * * *2	 
  ] T
 T
 T
 T
 T
- T
 T
	 T
n   K
 K
 K
 K
 K
&> K
 K
 K
\    29   08 8 8 8 8BI 8 8 8v       8  	 X
 X
 X
 X
 X
'? X
 X
 X
 X
 X
r)   