
    g=              	          d Z ddlmZmZmZ ddlZddlZddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZmZmZmZmZ ddlmZ  ej        e          ZdZ dZ!g dZ"dZ#dZ$dAde%de%dee%         de%fdZ& e'd           e'd          fde'de'de'de'fdZ( G d dej)                  Z* G d dej)                  Z+ G d  d!ej)                  Z, G d" d#ej)                  Z- G d$ d%ej)                  Z. G d& d'ej)                  Z/ G d( d)ej)                  Z0 G d* d+ej)                  Z1 G d, d-ej)                  Z2 G d. d/e          Z3d0Z4d1Z5 ed2e4           G d3 d4e3                      Z6 ed5e4           G d6 d7e3                      Z7 G d8 d9ej)                  Z8 G d: d;ej)                  Z9 G d< d=ej)                  Z: ed>e4           G d? d@e3                      Z;dS )BzPyTorch MobileViTV2 model.    )OptionalTupleUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )MobileViTV2Configr   z$apple/mobilevitv2-1.0-imagenet1k-256)r         r   ztabby, tabby catr   valuedivisor	min_valuereturnc                     ||}t          |t          | |dz  z             |z  |z            }|d| z  k     r||z  }t          |          S )a  
    Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
    original TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    N   g?)maxint)r   r   r   	new_values       p/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/mobilevitv2/modeling_mobilevitv2.pymake_divisibler$   <   s^     	Is57Q;#6777BWLMMI3;W	y>>    z-infinfmin_valmax_valc                 >    t          |t          ||                     S N)r    minr   r'   r(   s      r#   clipr-   K   s    wGU++,,,r%   c                        e Zd Z	 	 	 	 	 	 ddededededed	ed
edededeeef         ddf fdZde	j
        de	j
        fdZ xZS )MobileViTV2ConvLayerr   FTconfigin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                 n   t                                                       t          |dz
  dz            |z  }||z  dk    rt          d| d| d          ||z  dk    rt          d| d| d          t	          j        ||||||||d		  	        | _        |	rt	          j        |d
ddd          | _        nd | _        |
rjt          |
t                    rt          |
         | _        d S t          |j        t                    rt          |j                 | _        d S |j        | _        d S d | _        d S )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r1   r2   r3   r4   paddingr7   r5   r6   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r!   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr   
activation
hidden_act)selfr0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r<   	__class__s               r#   rD   zMobileViTV2ConvLayer.__init__Q   sz    	{Q!+,,x71$$dddTZdddeee& A%%fffV\fffggg9#%# 

 

 

  		&!#)$(" " "D "&D 	#.#.. 4"("8F-s33 4"():";"("3"DOOOr%   featuresc                     |                      |          }| j        |                     |          }| j        |                     |          }|S r*   )rG   rI   rL   )rN   rP   s     r#   forwardzMobileViTV2ConvLayer.forward   sO    ##H--)))(33H?&x00Hr%   )r   r   Fr   TT)__name__
__module____qualname__r   r!   boolr   rK   rD   torchTensorrR   __classcell__rO   s   @r#   r/   r/   P   s         "&+/4# 4#!4# 4# 	4#
 4# 4# 4# 4# 4#  4# dCi(4# 
4# 4# 4# 4# 4# 4#l         r%   r/   c                   d     e Zd ZdZ	 ddedededededd	f fd
Zdej        dej        fdZ	 xZ
S )MobileViTV2InvertedResidualzQ
    Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381
    r   r0   r1   r2   r4   r7   r   Nc           	         t                                                       t          t          t	          ||j        z                      d          }|dvrt          d| d          |dk    o||k    | _        t          |||d          | _	        t          |||d|||          | _
        t          |||dd	
          | _        d S )Nr   )r   r   zInvalid stride .r   )r1   r2   r3   r
   )r1   r2   r3   r4   r5   r7   Fr1   r2   r3   r9   )rC   rD   r$   r!   roundexpand_ratiorE   use_residualr/   
expand_1x1conv_3x3
reduce_1x1)rN   r0   r1   r2   r4   r7   expanded_channelsrO   s          r#   rD   z$MobileViTV2InvertedResidual.__init__   s     	*3u[6CV5V/W/W+X+XZ[\\8v888999#q[K{l/J.:KYZ
 
 
 -)*$
 
 
 /)% 
 
 
r%   rP   c                     |}|                      |          }|                     |          }|                     |          }| j        r||z   n|S r*   )rc   rd   re   rb   )rN   rP   residuals      r#   rR   z#MobileViTV2InvertedResidual.forward   sR    ??8,,==**??8,,&*&7Ex(""XEr%   )r   rS   rT   rU   __doc__r   r!   rD   rW   rX   rR   rY   rZ   s   @r#   r\   r\      s         
 lm
 
'
69
IL
VY
eh
	
 
 
 
 
 
BF F F F F F F F F Fr%   r\   c                   `     e Zd Z	 ddedededededdf fd	Zd
ej        dej        fdZ xZ	S )MobileViTV2MobileNetLayerr   r0   r1   r2   r4   
num_stagesr   Nc                 
   t                                                       t          j                    | _        t          |          D ]9}t          ||||dk    r|nd          }| j                            |           |}:d S )Nr   r   )r1   r2   r4   )rC   rD   r   
ModuleListlayerranger\   append)	rN   r0   r1   r2   r4   rm   irp   rO   s	           r#   rD   z"MobileViTV2MobileNetLayer.__init__   s     	]__
z"" 	' 	'A/')!"avvQ	  E Je$$$&KK	' 	'r%   rP   c                 0    | j         D ]} ||          }|S r*   rp   )rN   rP   layer_modules      r#   rR   z!MobileViTV2MobileNetLayer.forward   s)     J 	. 	.L#|H--HHr%   )r   r   
rS   rT   rU   r   r!   rD   rW   rX   rR   rY   rZ   s   @r#   rl   rl      s        qr' '''69'IL'VY'kn'	' ' ' ' ' '          r%   rl   c                   T     e Zd ZdZdededdf fdZdej        dej        fdZ	 xZ
S )	MobileViTV2LinearSelfAttentionaq  
    This layer applies a self-attention with linear complexity, as described in MobileViTV2 paper:
    https://arxiv.org/abs/2206.02680

    Args:
        config (`MobileVitv2Config`):
             Model configuration object
        embed_dim (`int`):
            `input_channels` from an expected input of size :math:`(batch_size, input_channels, height, width)`
    r0   	embed_dimr   Nc           	         t                                                       t          ||dd|z  z   dddd          | _        t	          j        |j                  | _        t          |||dddd          | _        || _        d S )Nr   r   TF)r0   r1   r2   r6   r3   r8   r9   p)	rC   rD   r/   qkv_projr   Dropoutattn_dropoutout_projrz   )rN   r0   rz   rO   s      r#   rD   z'MobileViTV2LinearSelfAttention.__init__   s    ,!a)m,# 
 
 
 J)<===,!"# 
 
 
 #r%   hidden_statesc                    |                      |          }t          j        |d| j        | j        gd          \  }}}t          j        j                            |d          }|                     |          }||z  }t          j        |dd          }t          j        j        	                    |          |
                    |          z  }|                     |          }|S )Nr   )split_size_or_sectionsdimr   Tr   keepdim)r~   rW   splitrz   r   
functionalsoftmaxr   sumrelu	expand_asr   )	rN   r   qkvquerykeyr   context_scorescontext_vectorouts	            r#   rR   z&MobileViTV2LinearSelfAttention.forward   s    mmM**
 "KQX\XfDgmnooosE ,44U4CC**>:: ~->r4HHH h!&&u--0H0H0O0OOmmC  
r%   ri   rZ   s   @r#   ry   ry      s        	 	#0 #S #T # # # # # #2U\ el        r%   ry   c                   \     e Zd Z	 ddededededdf
 fdZd	ej        dej        fd
Z	 xZ
S )MobileViTV2FFN        r0   rz   ffn_latent_dimffn_dropoutr   Nc           
         t                                                       t          |||ddddd          | _        t	          j        |          | _        t          |||ddddd          | _        t	          j        |          | _        d S )Nr   TF)r0   r1   r2   r3   r4   r6   r8   r9   )	rC   rD   r/   conv1r   r   dropout1conv2dropout2)rN   r0   rz   r   r   rO   s        r#   rD   zMobileViTV2FFN.__init__  s     	)!'#	
 	
 	

 
;//)&"# 	
 	
 	

 
;//r%   r   c                     |                      |          }|                     |          }|                     |          }|                     |          }|S r*   )r   r   r   r   )rN   r   s     r#   rR   zMobileViTV2FFN.forward9  sL    

=11m44

=11m44r%   r   rS   rT   rU   r   r!   floatrD   rW   rX   rR   rY   rZ   s   @r#   r   r     s         !0 0!0 0 	0
 0 
0 0 0 0 0 0@U\ el        r%   r   c                   \     e Zd Z	 ddededededdf
 fdZd	ej        dej        fd
Z	 xZ
S )MobileViTV2TransformerLayerr   r0   rz   r   dropoutr   Nc                 b   t                                                       t          j        d||j                  | _        t          ||          | _        t          j        |          | _	        t          j        d||j                  | _
        t          ||||j                  | _        d S )Nr   
num_groupsnum_channelsr?   r|   )rC   rD   r   	GroupNormlayer_norm_epslayernorm_beforery   	attentionr   r   layernorm_afterr   r   ffn)rN   r0   rz   r   r   rO   s        r#   rD   z$MobileViTV2TransformerLayer.__init__B  s     	 "	W]Wl m m m7	JJ
W---!|qyV\Vklll!&)^VEWXXr%   r   c                     |                      |          }|                     |          }||z   }|                     |          }|                     |          }||z   }|S r*   )r   r   r   r   )rN   r   layernorm_1_outattention_outputlayer_outputs        r#   rR   z#MobileViTV2TransformerLayer.forwardP  se    //>>>>/::(=8++M::xx--#m3r%   r   r   rZ   s   @r#   r   r   A  s         Y Y!Y Y 	Y
 Y 
Y Y Y Y Y Y	U\ 	el 	 	 	 	 	 	 	 	r%   r   c                   T     e Zd Zdedededdf fdZdej        dej        fdZ xZ	S )	MobileViTV2Transformerr0   n_layersd_modelr   Nc                 8   t                                                       |j        }||z  g|z  }d |D             }t          j                    | _        t          |          D ]4}t          ||||                   }| j                            |           5d S )Nc                 8    g | ]}t          |d z  d z            S )   )r!   ).0ds     r#   
<listcomp>z3MobileViTV2Transformer.__init__.<locals>.<listcomp>e  s(    :::ACbB'':::r%   )rz   r   )	rC   rD   ffn_multiplierr   ro   rp   rq   r   rr   )	rN   r0   r   r   r   ffn_dims	block_idxtransformer_layerrO   s	           r#   rD   zMobileViTV2Transformer.__init__]  s    ."W,-8 ;::::]__
x 	1 	1I ;'(9:M! ! ! J/0000		1 	1r%   r   c                 0    | j         D ]} ||          }|S r*   ru   )rN   r   rv   s      r#   rR   zMobileViTV2Transformer.forwardn  s*     J 	8 	8L(L77MMr%   rw   rZ   s   @r#   r   r   \  s        10 1C 1# 1RV 1 1 1 1 1 1"U\ el        r%   r   c                        e Zd ZdZ	 	 	 ddededededed	ed
eddf fdZdej        de	ej        e	eef         f         fdZ
dej        de	eef         dej        fdZdej        dej        fdZ xZS )MobileViTV2Layerz=
    MobileViTV2 layer: https://arxiv.org/abs/2206.02680
    r   r   r0   r1   r2   attn_unit_dimn_attn_blocksr7   r4   r   Nc                    t                                                       |j        | _        |j        | _        |}|dk    r/t          ||||dk    r|nd|dk    r|dz  nd          | _        |}nd | _        t          ||||j        |          | _	        t          |||ddd          | _
        t          |||          | _        t          j        d||j                  | _        t          |||dd	d          | _        d S )
Nr   r   )r1   r2   r4   r7   )r1   r2   r3   r5   F)r1   r2   r3   r8   r9   )r   r   r   T)rC   rD   
patch_sizepatch_widthpatch_heightr\   downsampling_layerr/   conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projection)
rN   r0   r1   r2   r   r   r7   r4   cnn_out_dimrO   s
            r#   rD   zMobileViTV2Layer.__init__y  sO    	!,"-#Q;;&A')!)QvvA*2Q,,QA' ' 'D# 'KK&*D# -#$/
 
 
 -#$# 
 
 
 2&-Zghhh TZTijjj  4#$"  
  
  
r%   feature_mapc                     |j         \  }}}}t          j                            || j        | j        f| j        | j        f          }|                    ||| j        | j        z  d          }|||ffS )N)r3   r4   r   )shaper   r   unfoldr   r   reshape)rN   r   
batch_sizer1   
img_height	img_widthpatchess          r#   	unfoldingzMobileViTV2Layer.unfolding  s    9D9J6
KY-&&*D,<=%t'78 ' 
 

 //*k4;LtO_;_acddY///r%   r   output_sizec                     |j         \  }}}}|                    |||z  |          }t          j                            ||| j        | j        f| j        | j        f          }|S )N)r   r3   r4   )r   r   r   r   foldr   r   )rN   r   r   r   in_dimr   	n_patchesr   s           r#   foldingzMobileViTV2Layer.folding  sr    4;M1
FJ	//*fz.A9MMm((#*D,<=%t'78	 ) 
 
 r%   rP   c                 l   | j         r|                      |          }|                     |          }|                     |          }|                     |          \  }}|                     |          }|                     |          }|                     ||          }|                     |          }|S r*   )r   r   r   r   r   r   r   r   )rN   rP   r   r   s       r#   rR   zMobileViTV2Layer.forward  s    " 	9..x88H ==**==**  $~~h77 ""7++..)) <<55''11r%   )r   r   r   )rS   rT   rU   rj   r   r!   rD   rW   rX   r   r   r   rR   rY   rZ   s   @r#   r   r   t  s<         ;
 ;
!;
 ;
 	;

 ;
 ;
 ;
 ;
 
;
 ;
 ;
 ;
 ;
 ;
z	0U\ 	0eEL%PSUXPX/<Y6Z 	0 	0 	0 	0u| %S/ el             r%   r   c                   `     e Zd Zdeddf fdZ	 	 ddej        ded	edee	e
f         fd
Z xZS )MobileViTV2Encoderr0   r   Nc           	      "   t                                                       || _        t          j                    | _        d| _        dx}}|j        dk    rd}d}n|j        dk    rd}d}t          t          d|j
        z  dd          dd	          }t          d|j
        z  d
          }t          d|j
        z  d
          }t          d|j
        z  d
          }t          d|j
        z  d
          }	t          d|j
        z  d
          }
t          |||dd          }| j                            |           t          |||dd          }| j                            |           t          |||t          |j        d         |j
        z  d
          |j        d                   }| j                            |           |r|dz  }t          |||	t          |j        d         |j
        z  d
          |j        d         |          }| j                            |           |r|dz  }t          ||	|
t          |j        d         |j
        z  d
          |j        d         |          }| j                            |           d S )NFr   Tr   r       @   r,   r   r   r         i  r   )r1   r2   r4   rm   r   r   )r1   r2   r   r   )r1   r2   r   r   r7   )rC   rD   r0   r   ro   rp   gradient_checkpointingoutput_strider$   r-   width_multiplierrl   rr   r   base_attn_unit_dimsr   )rN   r0   dilate_layer_4dilate_layer_5r7   layer_0_dimlayer_1_dimlayer_2_dimlayer_3_dimlayer_4_dimlayer_5_dimlayer_1layer_2layer_3layer_4layer_5rO   s                   r#   rD   zMobileViTV2Encoder.__init__  s   ]__
&+# +0/1$$!N!NN!R''!N$rF33RLLLVWce
 
 
 %R&*A%A2NNN$S6+B%BANNN$S6+B%BANNN$S6+B%BANNN$S6+B%BANNN+#$
 
 
 	
'"""+#$
 
 
 	
'""""#$()CA)FI`)`jklll .q1
 
 
 	
'""" 	MH"#$()CA)FI`)`jklll .q1
 
 
 	
'""" 	MH"#$()CA)FI`)`jklll .q1
 
 
 	
'"""""r%   FTr   output_hidden_statesreturn_dictc                    |rdnd }t          | j                  D ]B\  }}| j        r#| j        r|                     |j        |          }n ||          }|r||fz   }C|st          d ||fD                       S t          ||          S )N c              3      K   | ]}||V  	d S r*   r  )r   vs     r#   	<genexpr>z-MobileViTV2Encoder.forward.<locals>.<genexpr>M  s"      XXq!-----XXr%   )last_hidden_stater   )	enumeraterp   r   training_gradient_checkpointing_func__call__tupler   )rN   r   r   r   all_hidden_statesrs   rv   s          r#   rR   zMobileViTV2Encoder.forward8  s     #7@BBD(44 
	I 
	IOA|* <t} < $ A A )!! !
 !-] ; ;# I$58H$H! 	YXX]4E$FXXXXXX-]noooor%   )FT)rS   rT   rU   r   rD   rW   rX   rV   r   r  r   rR   rY   rZ   s   @r#   r   r     s        O#0 O#T O# O# O# O# O# O#h &+ 	p p|p #p 	p
 
u44	5p p p p p p p pr%   r   c                   h    e Zd ZdZeZdZdZdZdgZ	de
ej        ej        ej        f         ddfd	ZdS )
MobileViTV2PreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    mobilevitv2pixel_valuesTr   moduler   Nc                    t          |t          j        t          j        f          rT|j        j                            d| j        j                   |j	         |j	        j        
                                 dS dS t          |t          j                  r?|j	        j        
                                 |j        j                            d           dS dS )zInitialize the weightsr   )meanstdNg      ?)rJ   r   LinearrF   weightdatanormal_r0   initializer_ranger6   zero_	LayerNormfill_)rN   r  s     r#   _init_weightsz(MobileViTV2PreTrainedModel._init_weights_  s    fry")455 	* M&&CT[5R&SSS{& &&((((( '&-- 	*K""$$$M$$S)))))	* 	*r%   )rS   rT   rU   rj   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr   r   r  rF   r  r  r  r%   r#   r  r  S  st         
 %L%$O&*#+,
*E")RY*L$M 
*RV 
* 
* 
* 
* 
* 
*r%   r  aM  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`MobileViTV2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
aF  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`MobileViTImageProcessor.__call__`] for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zYThe bare MobileViTV2 model outputting raw hidden-states without any specific head on top.c                        e Zd Zddedef fdZd Z ee           e	e
eede          	 	 	 dd	eej                 d
ee         dee         deeef         fd                        Z xZS )MobileViTV2ModelTr0   expand_outputc           	      J   t                                          |           || _        || _        t	          t          d|j        z  dd          dd          }t          ||j        |ddd	d	
          | _	        t          |          | _        |                                  d S )Nr   r   r   r,   r   r   r
   r   Tr1   r2   r3   r4   r8   r9   )rC   rD   r0   r%  r$   r-   r   r/   r   	conv_stemr   encoder	post_init)rN   r0   r%  r   rO   s       r#   rD   zMobileViTV2Model.__init__  s       *$rF33RLLLVWce
 
 
 .+$"
 
 
 *&11 	r%   c                     |                                 D ]U\  }}| j        j        |         }t          |t                    r)|j        j        D ]}|j                            |           VdS )zPrunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        N)itemsr)  rp   rJ   r   r   r   prune_heads)rN   heads_to_prunelayer_indexheadsmobilevitv2_layerr   s         r#   _prune_headszMobileViTV2Model._prune_heads  s     #1"6"6"8"8 	C 	CK $ 2; ?+-=>> C):)F)L C C%%/;;EBBBB		C 	Cr%   vision)
checkpointoutput_typer  modalityexpected_outputNr  r   r   r   c                    ||n| j         j        }||n| j         j        }|t          d          |                     |          }|                     |||          }| j        r"|d         }t          j        |ddgd          }n
|d         }d }|s|||fn|f}||dd          z   S t          |||j
        	          S )
Nz You have to specify pixel_valuesr   r   r   r   Fr   r   )r  pooler_outputr   )r0   r   use_return_dictrE   r(  r)  r%  rW   r  r   r   )	rN   r  r   r   embedding_outputencoder_outputsr  pooled_outputoutputs	            r#   rR   zMobileViTV2Model.forward  s    %9$D  $+Jj 	 &1%<kk$+B]?@@@>>,77,,!5# ' 
 
  	! / 2 "J'8r2hPUVVVMM / 2 M 	0;H;T'77[lZnFOABB///7/')7
 
 
 	
r%   )T)NNN)rS   rT   rU   r   rV   rD   r2  r   MOBILEVITV2_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   rW   rX   r   r  rR   rY   rZ   s   @r#   r$  r$    s       
 0       .C C C +*+GHH&<$.   04/3&*	'
 '
u|,'
 'tn'
 d^	'

 
u>>	?'
 '
 '
  IH'
 '
 '
 '
 '
r%   r$  z
    MobileViTV2 model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    c                        e Zd Zdeddf fdZ ee           eee	e
e          	 	 	 	 ddeej                 dee         deej                 d	ee         deee	f         f
d
                        Z xZS )!MobileViTV2ForImageClassificationr0   r   Nc                 `   t                                          |           |j        | _        t          |          | _        t          d|j        z  d          }|j        dk    rt          j        ||j                  nt          j	                    | _
        |                                  d S )Nr   r   r   r   )in_featuresout_features)rC   rD   
num_labelsr$  r  r$   r   r   r  Identity
classifierr*  )rN   r0   r2   rO   s      r#   rD   z*MobileViTV2ForImageClassification.__init__  s        ++F33%cF,C&CQOOO  1$$ I,V=NOOOO 	 	r%   )r4  r5  r  r7  r  r   labelsr   c                    ||n| j         j        }|                     |||          }|r|j        n|d         }|                     |          }d}|Z| j         j        f| j        dk    rd| j         _        nN| j        dk    r7|j        t          j	        k    s|j        t          j
        k    rd| j         _        nd| j         _        | j         j        dk    rWt                      }	| j        dk    r1 |	|                                |                                          }n |	||          }n| j         j        dk    rGt                      }	 |	|                    d| j                  |                    d                    }n*| j         j        dk    rt                      }	 |	||          }|s|f|dd         z   }
||f|
z   n|
S t!          |||j        	          S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr9  r   
regressionsingle_label_classificationmulti_label_classificationr   r   )losslogitsr   )r0   r<  r  r;  rL  problem_typerJ  dtyperW   longr!   r	   squeezer   viewr   r   r   )rN   r  r   rM  r   outputsr?  rS  rR  loss_fctr@  s              r#   rR   z)MobileViTV2ForImageClassification.forward  s   ( &1%<kk$+B]""<FZhs"tt1<L--'!*//{'/?a''/;DK,,_q((flej.H.HFL\a\eLeLe/LDK,,/KDK,{'<77"99?a''#8FNN$4$4fnn6F6FGGDD#8FF33DD)-JJJ+--xB @ @&++b//RR)-III,..x// 	FY,F)-)9TGf$$vE3!/
 
 
 	
r%   NNNN)rS   rT   rU   r   rD   r   rA  r   _IMAGE_CLASS_CHECKPOINTr   rC  _IMAGE_CLASS_EXPECTED_OUTPUTr   rW   rX   rV   r   r  rR   rY   rZ   s   @r#   rF  rF    s	       0 T      " +*+GHH*8$4	   04/3)-&*4
 4
u|,4
 'tn4
 &	4

 d^4
 
u::	;4
 4
 4
  IH4
 4
 4
 4
 4
r%   rF  c                   T     e Zd Zdedededdf fdZdej        dej        fdZ xZ	S )	MobileViTV2ASPPPoolingr0   r1   r2   r   Nc           	          t                                                       t          j        d          | _        t          |||dddd          | _        d S )Nr   )r   Tr   r'  )rC   rD   r   AdaptiveAvgPool2dglobal_poolr/   r   )rN   r0   r1   r2   rO   s       r#   rD   zMobileViTV2ASPPPooling.__init__5  s^    /A>>>,#%"!
 
 
r%   rP   c                     |j         dd          }|                     |          }|                     |          }t          j                            ||dd          }|S )Nr:  bilinearFsizemodealign_corners)r   rb  r   r   r   interpolate)rN   rP   spatial_sizes      r#   rR   zMobileViTV2ASPPPooling.forwardD  sZ    ~bcc*##H--==**=,,XLzin,oor%   rw   rZ   s   @r#   r_  r_  4  s        
0 
s 
RU 
Z^ 
 
 
 
 
 
         r%   r_  c                   P     e Zd ZdZdeddf fdZdej        dej        fdZ xZ	S )MobileViTV2ASPPzs
    ASPP module defined in DeepLab papers: https://arxiv.org/abs/1606.00915, https://arxiv.org/abs/1706.05587
    r0   r   Nc                    t                                                       t          dj        z  d          }|j        t          j                  dk    rt          d          t          j	                    | _
        t          dd          }| j
                            |           | j
                            fd	j        D                        t                    }| j
                            |           t          d
z  dd          | _        t          j        j                  | _        d S )Nr   r   r   r
   z"Expected 3 values for atrous_ratesr   r   r_   c           
      :    g | ]}t          d |d          S )r
   r   )r1   r2   r3   r7   r9   )r/   )r   rater0   r1   r2   s     r#   r   z,MobileViTV2ASPP.__init__.<locals>.<listcomp>g  sL     
 
 
  % +!- !!#)  
 
 
r%      r|   )rC   rD   r$   r   aspp_out_channelslenatrous_ratesrE   r   ro   convsr/   rr   extendr_  projectr   aspp_dropout_probr   )rN   r0   encoder_out_channelsin_projection
pool_layerr1   r2   rO   s    `   @@r#   rD   zMobileViTV2ASPP.__init__Q  so   -cF4K.KUVWWW*/v"##q((ABBB]__
,#%!
 
 
 	
-(((

 
 
 
 
 
 #/
 
 
	
 	
 	
 ,FKNN

*%%%+L 0|YZkq
 
 
 zF$<===r%   rP   c                     g }| j         D ] }|                     ||                     !t          j        |d          }|                     |          }|                     |          }|S )Nr   r   )rt  rr   rW   catrv  r   )rN   rP   pyramidconvpooled_featuress        r#   rR   zMobileViTV2ASPP.forward}  sq    J 	+ 	+DNN44>>****)G+++,,w//,,77r%   
rS   rT   rU   rj   r   rD   rW   rX   rR   rY   rZ   s   @r#   rl  rl  L  s}         *>0 *>T *> *> *> *> *> *>X         r%   rl  c                   P     e Zd ZdZdeddf fdZdej        dej        fdZ xZ	S )MobileViTV2DeepLabV3zB
    DeepLabv3 architecture: https://arxiv.org/abs/1706.05587
    r0   r   Nc           	          t                                                       t          |          | _        t	          j        |j                  | _        t          ||j	        |j
        dddd          | _        d S )Nr   FT)r1   r2   r3   r8   r9   r6   )rC   rD   rl  asppr   	Dropout2dclassifier_dropout_probr   r/   rq  rJ  rL  rN   r0   rO   s     r#   rD   zMobileViTV2DeepLabV3.__init__  sq    #F++	|F$BCC.0*# 
 
 
r%   r   c                     |                      |d                   }|                     |          }|                     |          }|S )Nr   )r  r   rL  )rN   r   rP   s      r#   rR   zMobileViTV2DeepLabV3.forward  s?    99]2.//<<))??8,,r%   r  rZ   s   @r#   r  r    s|         
0 
T 
 
 
 
 
 
 U\ el        r%   r  zZ
    MobileViTV2 model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                        e Zd Zdeddf fdZ ee           eee	          	 	 	 	 dde
ej                 de
ej                 de
e         d	e
e         deeef         f
d
                        Z xZS )"MobileViTV2ForSemanticSegmentationr0   r   Nc                     t                                          |           |j        | _        t          |d          | _        t          |          | _        |                                  d S )NF)r%  )rC   rD   rJ  r$  r  r  segmentation_headr*  r  s     r#   rD   z+MobileViTV2ForSemanticSegmentation.__init__  sb        ++F%HHH!5f!=!= 	r%   )r5  r  r  rM  r   r   c                 B   ||n| j         j        }||n| j         j        }|| j         j        dk    rt	          d          |                     |d|          }|r|j        n|d         }|                     |          }d}|Vt          j	        
                    ||j        dd         dd	          }	t          | j         j        
          }
 |
|	|          }|s)|r|f|dd         z   }n|f|dd         z   }||f|z   n|S t          |||r|j        ndd          S )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, MobileViTV2ForSemanticSegmentation

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")
        >>> model = MobileViTV2ForSemanticSegmentation.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr9  r:  rd  Fre  )ignore_indexr   )rR  rS  r   
attentions)r0   r   r<  rJ  rE   r  r   r  r   r   ri  r   r   semantic_loss_ignore_indexr   )rN   r  rM  r   r   rY  encoder_hidden_statesrS  rR  upsampled_logitsrZ  r@  s               r#   rR   z*MobileViTV2ForSemanticSegmentation.forward  s   N %9$D  $+Jj 	 &1%<kk$+B]$+"8A"="=NOOO""!%# # 
 
 :E T 5 5'RS*''(=>>!}88V\"##.Zu  9     (T[5[\\\H8,f55D 	F# 1 WQRR[0 WQRR[0)-)9TGf$$vE&3GQ'//T	
 
 
 	
r%   r[  )rS   rT   rU   r   rD   r   rA  r   r   rC  r   rW   rX   rV   r   r  rR   rY   rZ   s   @r#   r  r    s
       0 T       +*+GHH+BQ`aaa 04)-/3&*K
 K
u|,K
 &K
 'tn	K

 d^K
 
u--	.K
 K
 K
 ba IHK
 K
 K
 K
 K
r%   r  )r   N)<rj   typingr   r   r   rW   torch.utils.checkpointr   torch.nnr   r   r	   activationsr   modeling_outputsr   r   r   r   modeling_utilsr   utilsr   r   r   r   r   configuration_mobilevitv2r   
get_loggerrS   loggerrC  rB  rD  r\  r]  r!   r$   r   r-   Moduler/   r\   rl   ry   r   r   r   r   r   r  MOBILEVITV2_START_DOCSTRINGrA  r$  rF  r_  rl  r  r  r  r%   r#   <module>r     su  " !   ) ) ) ) ) ) ) ) ) )            A A A A A A A A A A ! ! ! ! ! !            . - - - - -              9 8 8 8 8 8 
	H	%	% & = '  A 1  #  HSM UX     ).fe - - - - -Y^ - - - -
= = = = =29 = = =B-F -F -F -F -F") -F -F -Fb    	   .< < < < <RY < < <~& & & & &RY & & &R    ")   6    RY   0o o o o ory o o odip ip ip ip ip ip ip ipZ* * * * * * * *2	 
   _ Q
 Q
 Q
 Q
 Q
1 Q
 Q
	 Q
h    M
 M
 M
 M
 M
(B M
 M
 M
b    RY   09 9 9 9 9bi 9 9 9z    29   8   	 X
 X
 X
 X
 X
)C X
 X
 X
 X
 X
r%   