"""PyTorch ViTDet backbone."""

import collections.abc
import math
from typing import Dict, List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_outputs import BackboneOutput, BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_vitdet import VitDetConfig


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "VitDetConfig"


class VitDetEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) to be consumed by a Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.pretrain_image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        if config.use_absolute_position_embeddings:
            # Initialize absolute positional embedding with pretrain image size.
            num_positions = num_patches + 1
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_positions, config.hidden_size))
        else:
            self.position_embeddings = None

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def get_absolute_positions(self, abs_pos_embeddings, has_cls_token, height, width):
        """
        Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token dimension for the
        original embeddings.

        Args:
            abs_pos_embeddings (`torch.Tensor`):
                Absolute positional embeddings with (1, num_position, num_channels).
            has_cls_token (`bool`):
                If true, has 1 embedding in abs_pos_embeddings for cls token.
            height (`int`):
                Height of input image tokens.
            width (`int`):
                Width of input image tokens.

        Returns:
            Absolute positional embeddings after processing with shape (1, height, width, num_channels)
        """
        if has_cls_token:
            abs_pos_embeddings = abs_pos_embeddings[:, 1:]
        num_position = abs_pos_embeddings.shape[1]
        size = int(math.sqrt(num_position))
        if size * size != num_position:
            raise ValueError("Absolute position embeddings must be a square number.")

        if torch.jit.is_tracing() or (size != height or size != width):
            new_abs_pos_embeddings = nn.functional.interpolate(
                abs_pos_embeddings.reshape(1, size, size, -1).permute(0, 3, 1, 2),
                size=(height, width),
                mode="bicubic",
                align_corners=False,
            )
            return new_abs_pos_embeddings.permute(0, 2, 3, 1)
        else:
            return abs_pos_embeddings.reshape(1, height, width, -1)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the"
                f" configuration. Expected {self.num_channels} but got {num_channels}."
            )
        embeddings = self.projection(pixel_values)

        if self.position_embeddings is not None:
            # (batch_size, num_channels, height, width) -> (batch_size, height, width, num_channels)
            embeddings = embeddings.permute(0, 2, 3, 1)
            # add position embeddings
            embeddings = embeddings + self.get_absolute_positions(
                self.position_embeddings, True, embeddings.shape[1], embeddings.shape[2]
            )
            # (batch_size, height, width, num_channels) -> (batch_size, num_channels, height, width)
            embeddings = embeddings.permute(0, 3, 1, 2)

        return embeddings


@torch.jit.script_if_tracing
def get_rel_pos(q_size, k_size, rel_pos):
    """
    Get relative positional embeddings according to the relative positions of query and key sizes.

    Args:
        q_size (`int`):
            Size of query q.
        k_size (`int`):
            Size of key k.
        rel_pos (`torch.Tensor`):
            Relative position embeddings (num_embeddings, num_channels).

    Returns:
        Extracted positional embeddings according to relative positions.
    """
    max_rel_dist = int(2 * max(q_size, k_size) - 1)
    if rel_pos.shape[0] != max_rel_dist:
        # Interpolate rel position embeddings.
        rel_pos_resized = nn.functional.interpolate(
            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
            size=max_rel_dist,
            mode="linear",
        )
        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
    else:
        rel_pos_resized = rel_pos

    # Scale the coords with short length if shapes for q and k are different.
    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)

    return rel_pos_resized[relative_coords.long()]


def add_decomposed_relative_positions(attn, queries, rel_pos_h, rel_pos_w, q_size, k_size):
    """
    Calculate decomposed Relative Positional Embeddings as introduced in
    [MViT2](https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py).

    Args:
        attn (`torch.Tensor`):
            Attention map.
        queries (`torch.Tensor`):
            Query q in the attention layer with shape (batch_size, queries_height * queries_width, num_channels).
        rel_pos_h (`torch.Tensor`):
            Relative position embeddings (Lh, num_channels) for height axis.
        rel_pos_w (`torch.Tensor`):
            Relative position embeddings (Lw, num_channels) for width axis.
        q_size (`Tuple[int]`):
            Spatial sequence size of query q with (queries_height, queries_width).
        k_size (`Tuple[int]`):
            Spatial sequence size of key k with (keys_height, keys_width).

    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    """
    queries_height, queries_width = q_size
    keys_height, keys_width = k_size
    relative_height = get_rel_pos(queries_height, keys_height, rel_pos_h)
    relative_width = get_rel_pos(queries_width, keys_width, rel_pos_w)

    batch_size, _, dim = queries.shape
    r_q = queries.reshape(batch_size, queries_height, queries_width, dim)
    relative_height = torch.einsum("bhwc,hkc->bhwk", r_q, relative_height)
    relative_weight = torch.einsum("bhwc,wkc->bhwk", r_q, relative_width)

    attn = (
        attn.view(batch_size, queries_height, queries_width, keys_height, keys_width)
        + relative_height[:, :, :, :, None]
        + relative_weight[:, :, :, None, :]
    ).view(batch_size, queries_height * queries_width, keys_height * keys_width)

    return attn


class VitDetAttention(nn.Module):
    """Multi-head Attention block with relative position embeddings."""

    def __init__(self, config, input_size=None):
        """
        Args:
            config (`VitDetConfig`):
                Model configuration.
            input_size (`Tuple[int]`, *optional*):
                Input resolution, only required in case relative position embeddings are added.
        """
        super().__init__()

        dim = config.hidden_size
        num_heads = config.num_attention_heads

        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=config.qkv_bias)
        self.proj = nn.Linear(dim, dim)

        self.use_relative_position_embeddings = config.use_relative_position_embeddings
        if self.use_relative_position_embeddings:
            # initialize relative positional embeddings
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))

    def forward(self, hidden_state, output_attentions=False):
        batch_size, height, width, _ = hidden_state.shape
        # qkv with shape (3, batch_size, num_heads, height * width, num_channels)
        qkv = self.qkv(hidden_state).reshape(batch_size, height * width, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        # queries, keys and values have shape (batch_size * num_heads, height * width, num_channels)
        queries, keys, values = qkv.reshape(3, batch_size * self.num_heads, height * width, -1).unbind(0)

        attention_scores = (queries * self.scale) @ keys.transpose(-2, -1)

        if self.use_relative_position_embeddings:
            attention_scores = add_decomposed_relative_positions(
                attention_scores, queries, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
            )

        attention_probs = attention_scores.softmax(dim=-1)

        hidden_state = attention_probs @ values
        hidden_state = hidden_state.view(batch_size, self.num_heads, height, width, -1)
        hidden_state = hidden_state.permute(0, 2, 3, 1, 4)
        hidden_state = hidden_state.reshape(batch_size, height, width, -1)
        hidden_state = self.proj(hidden_state)

        if output_attentions:
            attention_probs = attention_probs.reshape(
                batch_size, self.num_heads, attention_probs.shape[-2], attention_probs.shape[-1]
            )
            outputs = (hidden_state, attention_probs)
        else:
            outputs = (hidden_state,)

        return outputs


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class VitDetDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class VitDetLayerNorm(nn.Module):
    """
    A LayerNorm variant, popularized by Transformers, that performs point-wise mean and variance normalization over the
    channel dimension for inputs that have shape (batch_size, channels, height, width).
    https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119
    """

    def __init__(self, normalized_shape, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.eps)
        x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x


class VitDetResBottleneckBlock(nn.Module):
    """
    The standard bottleneck residual block without the last activation layer. It contains 3 conv layers with kernels
    1x1, 3x3, 1x1.
    """

    def __init__(self, config, in_channels, out_channels, bottleneck_channels):
        """
        Args:
            config (`VitDetConfig`):
                Model configuration.
            in_channels (`int`):
                Number of input channels.
            out_channels (`int`):
                Number of output channels.
            bottleneck_channels (`int`):
                Number of output channels for the 3x3 "bottleneck" conv layers.
        """
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, bottleneck_channels, 1, bias=False)
        self.norm1 = VitDetLayerNorm(bottleneck_channels)
        self.act1 = ACT2FN[config.hidden_act]

        self.conv2 = nn.Conv2d(bottleneck_channels, bottleneck_channels, 3, padding=1, bias=False)
        self.norm2 = VitDetLayerNorm(bottleneck_channels)
        self.act2 = ACT2FN[config.hidden_act]

        self.conv3 = nn.Conv2d(bottleneck_channels, out_channels, 1, bias=False)
        self.norm3 = VitDetLayerNorm(out_channels)

    def forward(self, x):
        out = x
        for layer in self.children():
            out = layer(out)

        out = x + out
        return out


class VitDetMlp(nn.Module):
    def __init__(self, config, in_features: int, hidden_features: int) -> None:
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = ACT2FN[config.hidden_act]
        self.fc2 = nn.Linear(hidden_features, in_features)
        self.drop = nn.Dropout(config.dropout_prob)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)

        return x


def window_partition(hidden_state, window_size):
    """
    Partition into non-overlapping windows with padding if needed.

    Args:
        hidden_state (`torch.Tensor`):
            Input tokens with [batch_size, height, width, num_channels].
        window_size (`int`):
            Window size.

    Returns:
        `tuple(torch.FloatTensor)` comprising various elements:
        - windows: windows after partition with [batch_size * num_windows, window_size, window_size, num_channels].
        - (padded_height, padded_width): padded height and width before partition
    """
    batch_size, height, width, num_channels = hidden_state.shape

    pad_height = (window_size - height % window_size) % window_size
    pad_width = (window_size - width % window_size) % window_size
    hidden_state = nn.functional.pad(hidden_state, (0, 0, 0, pad_width, 0, pad_height))
    padded_height, padded_width = height + pad_height, width + pad_width

    hidden_state = hidden_state.view(
        batch_size, padded_height // window_size, window_size, padded_width // window_size, window_size, num_channels
    )
    windows = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
    return windows, (padded_height, padded_width)


def window_unpartition(windows, window_size, pad_height_width, height_width):
    """
    Window unpartition into original sequences and removing padding.

    Args:
        windows (`torch.Tensor`):
            Input tokens with [batch_size * num_windows, window_size, window_size, num_channels].
        window_size (`int`):
            Window size.
        pad_height_width (`Tuple[int]`):
            Padded height and width (padded_height, padded_width).
        height_width (`Tuple[int]`):
            Original height and width before padding.

    Returns:
        hidden_state: unpartitioned sequences with [batch_size, height, width, num_channels].
    """
    padded_height, padded_width = pad_height_width
    height, width = height_width
    batch_size = windows.shape[0] // (padded_height * padded_width // window_size // window_size)
    hidden_state = windows.view(
        batch_size, padded_height // window_size, padded_width // window_size, window_size, window_size, -1
    )
    hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous()
    hidden_state = hidden_state.view(batch_size, padded_height, padded_width, -1)

    hidden_state = hidden_state[:, :height, :width, :].contiguous()
    return hidden_state


class VitDetLayer(nn.Module):
    """This corresponds to the Block class in the original implementation."""

    def __init__(
        self, config: VitDetConfig, drop_path_rate: float = 0, window_size: int = 0, use_residual_block: bool = False
    ) -> None:
        super().__init__()

        dim = config.hidden_size
        input_size = (config.image_size // config.patch_size, config.image_size // config.patch_size)

        self.norm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.attention = VitDetAttention(
            config, input_size=input_size if window_size == 0 else (window_size, window_size)
        )

        self.drop_path = VitDetDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.norm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.mlp = VitDetMlp(config=config, in_features=dim, hidden_features=int(dim * config.mlp_ratio))

        self.window_size = window_size

        self.use_residual_block = use_residual_block
        if self.use_residual_block:
            # Use a residual block with bottleneck channel as dim // 2
            self.residual = VitDetResBottleneckBlock(
                config=config,
                in_channels=dim,
                out_channels=dim,
                bottleneck_channels=dim // 2,
            )

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        hidden_states = hidden_states.permute(0, 2, 3, 1)

        shortcut = hidden_states

        hidden_states = self.norm1(hidden_states)

        # Window partition
        if self.window_size > 0:
            height, width = hidden_states.shape[1], hidden_states.shape[2]
            hidden_states, pad_height_width = window_partition(hidden_states, self.window_size)

        self_attention_outputs = self.attention(
            hidden_states,
            output_attentions=output_attentions,
        )
        hidden_states = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]

        # Reverse window partition
        if self.window_size > 0:
            hidden_states = window_unpartition(hidden_states, self.window_size, pad_height_width, (height, width))

        # first residual connection
        hidden_states = shortcut + self.drop_path(hidden_states)

        hidden_states = hidden_states + self.drop_path(self.mlp(self.norm2(hidden_states)))

        hidden_states = hidden_states.permute(0, 3, 1, 2)

        if self.use_residual_block:
            hidden_states = self.residual(hidden_states)

        outputs = (hidden_states,) + outputs

        return outputs


class VitDetEncoder(nn.Module):
    def __init__(self, config: VitDetConfig) -> None:
        super().__init__()
        self.config = config
        depth = config.num_hidden_layers

        # stochastic depth decay rule
        drop_path_rate = [x.item() for x in torch.linspace(0, config.drop_path_rate, depth)]

        layers = []
        for i in range(depth):
            layers.append(
                VitDetLayer(
                    config,
                    drop_path_rate=drop_path_rate[i],
                    window_size=config.window_size if i in config.window_block_indices else 0,
                    use_residual_block=i in config.residual_block_indices,
                )
            )

        self.layer = nn.ModuleList(layers)
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


def caffe2_msra_fill(module: nn.Module) -> None:
    """
    Initialize `module.weight` using the "MSRAFill" implemented in Caffe2. Also initializes `module.bias` to 0.

    Source: https://detectron2.readthedocs.io/en/latest/_modules/fvcore/nn/weight_init.html.

    Args:
        module (torch.nn.Module): module to initialize.
    """
    nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
    if module.bias is not None:
        nn.init.constant_(module.bias, 0)


class VitDetPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = VitDetConfig
    base_model_prefix = "vitdet"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = []

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Upcast the input in `fp32` and cast it back to the desired `dtype` to avoid
            # `trunc_normal_cpu` not implemented in `half` issues
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        elif isinstance(module, VitDetEmbeddings):
            module.position_embeddings.data = nn.init.trunc_normal_(
                module.position_embeddings.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.position_embeddings.dtype)

        elif isinstance(module, VitDetAttention) and self.config.use_relative_position_embeddings:
            module.rel_pos_h.data = nn.init.trunc_normal_(
                module.rel_pos_h.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            )
            module.rel_pos_w.data = nn.init.trunc_normal_(
                module.rel_pos_w.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            )

        elif isinstance(module, VitDetResBottleneckBlock):
            for layer in [module.conv1, module.conv2, module.conv3]:
                caffe2_msra_fill(layer)
            for layer in [module.norm1, module.norm2]:
                layer.weight.data.fill_(1.0)
                layer.bias.data.zero_()
            # zero init last norm layer.
            module.norm3.weight.data.zero_()
            module.norm3.bias.data.zero_()


VITDET_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`VitDetConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

VITDET_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
            for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare VitDet Transformer model outputting raw hidden-states without any specific head on top.",
    VITDET_START_DOCSTRING,
)
class VitDetModel(VitDetPreTrainedModel):
    def __init__(self, config: VitDetConfig):
        super().__init__(config)
        self.config = config

        self.embeddings = VitDetEmbeddings(config)
        self.encoder = VitDetEncoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> VitDetEmbeddings:
        return self.embeddings.projection

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(VITDET_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import VitDetConfig, VitDetModel
        >>> import torch

        >>> config = VitDetConfig()
        >>> model = VitDetModel(config)

        >>> pixel_values = torch.randn(1, 3, 224, 224)

        >>> with torch.no_grad():
        ...     outputs = model(pixel_values)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 768, 14, 14]
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed; head_mask has shape [num_hidden_layers] x batch x num_heads x seq_len x seq_len
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]

        if not return_dict:
            return (sequence_output,) + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@add_start_docstrings(
    """
    ViTDet backbone, to be used with frameworks like Mask R-CNN.
    """,
    VITDET_START_DOCSTRING,
)
class VitDetBackbone(VitDetPreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        self.embeddings = VitDetEmbeddings(config)
        self.encoder = VitDetEncoder(config)
        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]

        # initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> VitDetEmbeddings:
        return self.embeddings.projection

    @add_start_docstrings_to_model_forward(VITDET_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        """
        Returns:

        Examples:

        ```python
        >>> from transformers import VitDetConfig, VitDetBackbone
        >>> import torch

        >>> config = VitDetConfig()
        >>> model = VitDetBackbone(config)

        >>> pixel_values = torch.randn(1, 3, 224, 224)

        >>> with torch.no_grad():
        ...     outputs = model(pixel_values)

        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 14, 14]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        embedding_output = self.embeddings(pixel_values)

        outputs = self.encoder(
            embedding_output,
            output_hidden_states=True,
            output_attentions=output_attentions,
            return_dict=return_dict,
        )

        hidden_states = outputs.hidden_states if return_dict else outputs[1]

        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                feature_maps += (hidden_state,)

        if not return_dict:
            if output_hidden_states:
                output = (feature_maps,) + outputs[1:]
            else:
                output = (feature_maps,) + outputs[2:]
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )