"""PyTorch GroupViT model."""

import collections.abc
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union

import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
    torch_int,
)
from .configuration_groupvit import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "nvidia/groupvit-gcc-yfcc"


# contrastive loss function, adapted from
# https://sachinruk.github.io/blog/2021-03-07-clip.html
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def groupvit_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0


def hard_softmax(logits: torch.Tensor, dim: int):
    y_soft = logits.softmax(dim)
    # Straight-through: one-hot in the forward pass, soft gradients in the backward pass.
    index = y_soft.max(dim, keepdim=True)[1]
    y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
    ret = y_hard - y_soft.detach() + y_soft

    return ret


def gumbel_softmax(logits: torch.Tensor, tau: float = 1, hard: bool = False, dim: int = -1) -> torch.Tensor:
    gumbel_dist = torch.distributions.gumbel.Gumbel(
        torch.tensor(0.0, device=logits.device, dtype=logits.dtype),
        torch.tensor(1.0, device=logits.device, dtype=logits.dtype),
    )
    gumbels = gumbel_dist.sample(logits.shape)

    gumbels = (logits + gumbels) / tau  # ~Gumbel(logits, tau)
    y_soft = gumbels.softmax(dim)

    if hard:
        # Straight-through.
        index = y_soft.max(dim, keepdim=True)[1]
        y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
        ret = y_hard - y_soft.detach() + y_soft
    else:
        # Reparametrization trick.
        ret = y_soft
    return ret
def resize_attention_map(attentions, height, width, align_corners=False):
    """
    Args:
        attentions (`torch.Tensor`): attention map of shape [batch_size, groups, feat_height*feat_width]
        height (`int`): height of the output attention map
        width (`int`): width of the output attention map
        align_corners (`bool`, *optional*): the `align_corner` argument for `nn.functional.interpolate`.

    Returns:
        `torch.Tensor`: resized attention map of shape [batch_size, groups, height, width]
    """
    scale = (height * width // attentions.shape[2]) ** 0.5
    if height > width:
        feat_width = int(np.round(width / scale))
        feat_height = attentions.shape[2] // feat_width
    else:
        feat_height = int(np.round(height / scale))
        feat_width = attentions.shape[2] // feat_height

    batch_size = attentions.shape[0]
    # number of group tokens
    groups = attentions.shape[1]

    # [batch_size, groups, height x width] -> [batch_size, groups, height, width]
    attentions = attentions.reshape(batch_size, groups, feat_height, feat_width)
    attentions = nn.functional.interpolate(
        attentions, size=(height, width), mode="bilinear", align_corners=align_corners
    )
    return attentions


def get_grouping_from_attentions(attentions, hw_shape):
    """
    Args:
        attentions (`tuple(torch.FloatTensor)`): tuple of attention maps returned by `GroupViTVisionTransformer`
        hw_shape (`tuple(int)`): height and width of the output attention map
    Returns:
        `torch.Tensor`: the attention map of shape [batch_size, groups, height, width]
    """

    attn_maps = []
    with torch.no_grad():
        prev_attn_masks = None
        for attn_masks in attentions:
            # [batch_size, num_groups, height x width] -> [batch_size, height x width, num_groups]
            attn_masks = attn_masks.permute(0, 2, 1).contiguous()
            if prev_attn_masks is None:
                prev_attn_masks = attn_masks
            else:
                prev_attn_masks = prev_attn_masks @ attn_masks
            # [batch_size, height x width, num_groups] -> [batch_size, num_groups, height, width]
            cur_attn_map = resize_attention_map(prev_attn_masks.permute(0, 2, 1).contiguous(), *hw_shape)
            attn_maps.append(cur_attn_map)

    # the attention map of the last grouping layer
    final_grouping = attn_maps[-1]

    return final_grouping


class GroupViTCrossAttentionLayer(nn.Module):
    def __init__(self, config: GroupViTVisionConfig):
        super().__init__()
        self.attn = GroupViTAttention(config)
        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.mlp = GroupViTMLP(config)
        self.norm_post = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, query, key):
        x = query
        x = x + self.attn(query, encoder_hidden_states=key)[0]
        x = x + self.mlp(self.norm2(x))
        x = self.norm_post(x)
        return x


class GroupViTAssignAttention(nn.Module):
    def __init__(self, config: GroupViTVisionConfig):
        super().__init__()
        self.scale = config.hidden_size**-0.5

        self.q_proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.k_proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.v_proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.assign_eps = config.assign_eps

    def get_attn(self, attn, gumbel=True, hard=True):
        if gumbel and self.training:
            attn = gumbel_softmax(attn, dim=-2, hard=hard)
        else:
            if hard:
                attn = hard_softmax(attn, dim=-2)
            else:
                attn = nn.functional.softmax(attn, dim=-2)

        return attn

    def forward(self, query, key):
        value = key
        # [batch_size, query_length, channels]
        query = self.q_proj(query)

        # [batch_size, key_length, channels]
        key = self.k_proj(key)

        # [batch_size, key_length, channels]
        value = self.v_proj(value)

        # [batch_size, query_length, key_length]
        raw_attn = (query @ key.transpose(1, 2)) * self.scale

        attn = self.get_attn(raw_attn)
        soft_attn = self.get_attn(raw_attn, gumbel=False, hard=False)

        attn = attn / (attn.sum(dim=-1, keepdim=True) + self.assign_eps)

        out = attn @ value

        out = self.proj(out)

        return out, soft_attn


class GroupViTTokenAssign(nn.Module):
    def __init__(self, config: GroupViTVisionConfig, num_group_token, num_output_group):
        super().__init__()
        self.num_output_group = num_output_group
        # norm on group_tokens
        self.norm_tokens = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        assign_mlp_ratio = (
            config.assign_mlp_ratio
            if isinstance(config.assign_mlp_ratio, collections.abc.Iterable)
            else (config.assign_mlp_ratio, config.assign_mlp_ratio)
        )
        tokens_dim, channels_dim = [int(x * config.hidden_size) for x in assign_mlp_ratio]
        self.mlp_inter = GroupViTMixerMLP(config, num_group_token, tokens_dim, num_output_group)
        self.norm_post_tokens = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # norm on x
        self.norm_x = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pre_assign_attn = GroupViTCrossAttentionLayer(config)

        self.assign = GroupViTAssignAttention(config)
        self.norm_new_x = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.mlp_channels = GroupViTMLP(config, config.hidden_size, channels_dim, config.hidden_size)

    def project_group_token(self, group_tokens):
        """
        Args:
            group_tokens (torch.Tensor): group tokens, [batch_size, num_group_tokens, channels]

        Returns:
            projected_group_tokens (torch.Tensor): [batch_size, num_output_groups, channels]
        """
        # [batch_size, num_output_groups, channels] <- [batch_size, num_group_tokens, channels]
        projected_group_tokens = self.mlp_inter(group_tokens)
        projected_group_tokens = self.norm_post_tokens(projected_group_tokens)
        return projected_group_tokens

    def forward(self, image_tokens, group_tokens):
        """
        Args:
            image_tokens (`torch.Tensor`): image tokens, of shape [batch_size, input_length, channels]
            group_tokens (`torch.Tensor`): group tokens, [batch_size, num_group_tokens, channels]
        """

        group_tokens = self.norm_tokens(group_tokens)
        image_tokens = self.norm_x(image_tokens)
        # [batch_size, num_output_groups, channels]
        projected_group_tokens = self.project_group_token(group_tokens)
        projected_group_tokens = self.pre_assign_attn(projected_group_tokens, image_tokens)
        new_image_tokens, attention = self.assign(projected_group_tokens, image_tokens)
        new_image_tokens += projected_group_tokens

        new_image_tokens = new_image_tokens + self.mlp_channels(self.norm_new_x(new_image_tokens))

        return new_image_tokens, attention
@dataclass
class GroupViTModelOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        segmentation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
            Classification scores for each pixel.

            <Tip warning={true}>

            The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
            to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
            original image size as post-processing. You should always check your logits shape and resize as needed.

            </Tip>

        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of
            [`GroupViTTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`GroupViTVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`GroupViTTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`GroupViTVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: torch.FloatTensor = None
    logits_per_text: torch.FloatTensor = None
    segmentation_logits: torch.FloatTensor = None
    text_embeds: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )
class GroupViTPatchEmbeddings(nn.Module):
    """
    Image to Patch Embedding.
    """

    def __init__(
        self,
        image_size: int = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        num_channels: int = 3,
        embed_dim: int = 768,
    ):
        super().__init__()
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size[0] or width != self.image_size[1]):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model"
                f" ({self.image_size[0]}*{self.image_size[1]})."
            )
        x = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return x


class GroupViTVisionEmbeddings(nn.Module):
    def __init__(self, config: GroupViTVisionConfig):
        super().__init__()

        self.patch_embeddings = GroupViTPatchEmbeddings(
            image_size=config.image_size,
            patch_size=config.patch_size,
            num_channels=config.num_channels,
            embed_dim=config.hidden_size,
        )
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches, config.hidden_size))
        self.dropout = nn.Dropout(config.dropout)
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.config = config

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and no class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """

        num_patches = embeddings.shape[1]
        num_positions = self.position_embeddings.shape[1]

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        patch_pos_embed = self.position_embeddings

        dim = embeddings.shape[-1]

        new_height = height // self.config.patch_size
        new_width = width // self.config.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return patch_pos_embed

    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        embeddings = self.layernorm(embeddings)

        batch_size, seq_len, _ = embeddings.size()

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embeddings

        embeddings = self.dropout(embeddings)

        return embeddings


class GroupViTTextEmbeddings(nn.Module):
    def __init__(self, config: GroupViTTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


class GroupViTStage(nn.Module):
    """This corresponds to the `GroupingLayer` class in the GroupViT implementation."""

    def __init__(
        self,
        config: GroupViTVisionConfig,
        depth: int,
        num_prev_group_token: int,
        num_group_token: int,
        num_output_group: int,
    ):
        super().__init__()
        self.depth = depth
        self.num_group_token = num_group_token
        if num_group_token > 0:
            self.group_token = nn.Parameter(torch.zeros(1, num_group_token, config.hidden_size))
        else:
            self.group_token = None
        self.layers = nn.ModuleList([GroupViTEncoderLayer(config) for _ in range(depth)])

        if num_group_token > 0:
            self.downsample = GroupViTTokenAssign(
                config=config,
                num_group_token=num_group_token,
                num_output_group=num_output_group,
            )
        else:
            self.downsample = None

        if num_prev_group_token > 0 and num_group_token > 0:
            self.group_projector = nn.Sequential(
                nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps),
                GroupViTMixerMLP(config, num_prev_group_token, config.hidden_size // 2, num_group_token),
            )
        else:
            self.group_projector = None

    @property
    def with_group_token(self):
        return self.group_token is not None

    def split_x(self, x):
        if self.with_group_token:
            return x[:, : -self.num_group_token], x[:, -self.num_group_token :]
        else:
            return x, None

    def concat_x(self, x: torch.Tensor, group_token: Optional[torch.Tensor] = None) -> torch.Tensor:
        if group_token is None:
            return x
        return torch.cat([x, group_token], dim=1)

    def forward(
        self,
        hidden_states: torch.Tensor,
        prev_group_token: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the grouping tensors of Grouping block.
        """
        if self.with_group_token:
            group_token = self.group_token.expand(hidden_states.size(0), -1, -1)
            if self.group_projector is not None:
                group_token = group_token + self.group_projector(prev_group_token)
        else:
            group_token = None

        x = hidden_states

        cat_x = self.concat_x(x, group_token)
        for layer in self.layers:
            layer_out = layer(cat_x, attention_mask=None, causal_attention_mask=None)
            cat_x = layer_out[0]

        x, group_token = self.split_x(cat_x)

        attention = None
        if self.downsample is not None:
            x, attention = self.downsample(x, group_token)

        outputs = (x, group_token)
        if output_attentions:
            outputs = outputs + (attention,)

        return outputs
class GroupViTMLP(nn.Module):
    def __init__(
        self,
        config: GroupViTVisionConfig,
        hidden_size: Optional[int] = None,
        intermediate_size: Optional[int] = None,
        output_size: Optional[int] = None,
    ):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        hidden_size = hidden_size if hidden_size is not None else config.hidden_size
        intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size
        output_size = output_size if output_size is not None else hidden_size
        self.fc1 = nn.Linear(hidden_size, intermediate_size)
        self.fc2 = nn.Linear(intermediate_size, output_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class GroupViTMixerMLP(GroupViTMLP):
    def forward(self, x):
        x = super().forward(hidden_states=x.transpose(1, 2))
        return x.transpose(1, 2)


class GroupViTAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, embed_dim = hidden_states.size()
        is_cross_attention = encoder_hidden_states is not None

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scale
        if is_cross_attention:
            key_states = self._shape(self.k_proj(encoder_hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(encoder_hidden_states), -1, bsz)
        else:
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # apply the causal_attention_mask first
        if causal_attention_mask is not None:
            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
                    f" {causal_attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if output_attentions:
            # this operation is a bit awkward, but it's required to make sure that attn_weights keeps its gradient.
            # In order to do so, attn_weights have to be reshaped twice and have to be reused in the following
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped


class GroupViTEncoderLayer(nn.Module):
    def __init__(self, config: GroupViTConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = GroupViTAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = GroupViTMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class GroupViTPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = GroupViTConfig
    base_model_prefix = "groupvit"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""

        init_range = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=init_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        factor = self.config.initializer_factor
        if isinstance(module, GroupViTTextEmbeddings):
            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, GroupViTAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, GroupViTMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`GroupViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`CLIPTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
aE  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`CLIPTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`CLIPImageProcessor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
c                        e Zd Zdeddf fdZ	 	 	 d
dej        dee         dee         dee         de	e
ef         f
d	Z xZS )GroupViTVisionEncoderrr   r   Nc                     t                                                       | _        t          j        fdt          t          j                            D                       | _        d| _	        d S )Nc                     g | ]M}t          j        |         j        |         j        |         |d k    rj        |dz
           nd           NS )r   r   )rr   r$  r   r   r%  )r#  depthsnum_group_tokensnum_output_groups)r   irr   s     r$   r   z2GroupViTVisionEncoder.__init__.<locals>.<listcomp>|  s{     	 	 	  ! -*$*$;A$>%+%=a%@LMPQEE)A!a%)H)HWX  	 	 	r&   F)
rw   rx   rr   r   r,  r-  r#   r  stagesgradient_checkpointingr   s    `r$   rx   zGroupViTVisionEncoder.__init__x  s~    m	 	 	 	 s6=1122	 	 	
 
 ',###r&   r:  output_hidden_statesr<  return_dictc                    ||n| j         j        }||n| j         j        }||n| j         j        }|rdnd }|rdnd }d }t	          | j                  D ]@\  }}	|r||fz   } |	|||          }
|
d         }|
d         }|r|
d         ||
d         fz   }A|r||fz   }|st          d |||fD                       S t          |||          S )Nr   r   r   rO   c              3      K   | ]}||V  	d S r3  r   r   vs     r$   r   z0GroupViTVisionEncoder.forward.<locals>.<genexpr>  s(      ggqYZYfYfYfYfYfggr&   last_hidden_stater:  r[   )rr   r<  r  use_return_dict	enumerater  r   r   )r   r:  r  r<  r  all_hidden_statesall_groupingsr   r  stagelayer_outputss              r$   r   zGroupViTVisionEncoder.forward  sU    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]"6@BBD/9T!$+.. 
	D 
	DHAu# I$58H$H!!E-?PQQM)!,M(+L  D]1%5%A -q1A0C C 	E 1]4D D 	hgg]4E}$Ugggggg+;LYf
 
 
 	
r&   r   )r   r   r   r   rx   r!   r   r   r   r   r   r   r   r   r   s   @r$   r  r  w  s        ,3 , , , , , , ,( 04,0&*%
 %
|%
 'tn%
 $D>	%

 d^%
 
uo%	&%
 %
 %
 %
 %
 %
 %
 %
r&   r  c                        e Zd ZdZdef fdZ	 	 	 	 	 ddeej                 deej                 dee	         dee	         d	ee	         d
e
eef         fdZ xZS )GroupViTTextEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is a
    [`GroupViTEncoderLayer`].

    Args:
        config: GroupViTTextConfig
    rr   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S r   r(  r*  s     r$   r   z0GroupViTTextEncoder.__init__.<locals>.<listcomp>  s"    $k$k$ka%9&%A%A$k$k$kr&   F)	rw   rx   rr   r   r,  r-  r~  r.  r  r   s    `r$   rx   zGroupViTTextEncoder.__init__  sa    m$k$k$k$k5QWQiKjKj$k$k$kll&+###r&   Nr>  r?  r<  r  r  r   c                    ||n| j         j        }||n| j         j        }||n| j         j        }|rdnd}|rdnd}|}	t	          | j                  D ]_\  }
}|r||	fz   }| j        r&| j        r|                     |j	        |	|||          }n ||	|||          }|d         }	|r||d         fz   }`|r||	fz   }|st          d |	||fD                       S t          |	||          S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr   )r<  r   r   c              3      K   | ]}||V  	d S r3  r   r  s     r$   r   z.GroupViTTextEncoder.forward.<locals>.<genexpr>  s(      eeqWXWdWdWdWdWdeer&   r  )rr   r<  r  r  r  r.  r  r   _gradient_checkpointing_func__call__r   r   )r   r  r>  r?  r<  r  r  encoder_statesall_attentionsr:  idxencoder_layerr  s                r$   r   zGroupViTTextEncoder.forward  s   L 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]3=0:d%"+DK"8"8 	F 	FC# C!/=2B!B* t}  $ A A!*!")%! ! !.!")&7	! ! ! *!,M  F!/=3C2E!E 	?+}.>>N 	fee]NN$Seeeeee+>Vd
 
 
 	
r&   )NNNNN)r   r   r   r   r   rx   r   r!   r   r   r   r   r   r   r   r   s   @r$   r  r    s         ,1 , , , , , , 268<,0/3&*O
 O
 !.O
  (5	O

 $D>O
 'tnO
 d^O
 
uo%	&O
 O
 O
 O
 O
 O
 O
 O
r&   r  c                       e Zd Zdef fdZ ee           eee          	 	 	 	 	 	 dde	e
j                 de	e
j                 de	e
j                 de	e         d	e	e         d
e	e         deeef         fd                        Z xZS )GroupViTTextTransformerrr   c                    t                                                       || _        |j        }t	          |          | _        t          |          | _        t          j	        ||j
                  | _        |j        | _        d S rt   )rw   rx   rr   r|   r  r  r  encoderr   r{   r}   final_layer_normeos_token_idr  s      r$   rx   z GroupViTTextTransformer.__init__  ss    &	088*622 "YF<Q R R R #/r&   output_typer  Nr  r>  r  r<  r  r  r   c                 *   ||n| j         j        }||n| j         j        }||n| j         j        }|t	          d          |                                }|                    d|d                   }|                     ||          }t          ||j	        |j
                  }	|t          ||j	                  }|                     |||	|||          }
|
d         }|                     |          }| j        dk    rg|t          j        |j        d         |j
                  |                    t          j        |j
        	                              d
          f         }n|t          j        |j        d         |j
                  |                    t          j        |j
        	          | j        k                                                        d
          f         }|s||f|
dd         z   S t+          |||
j        |
j                  S )
        Returns:

        NzYou have to specify input_idsr?   )r  r  r   )r  r>  r?  r<  r  r  r   rO   )rD   r   r   r   r  pooler_outputr:  r[   )rr   r<  r  r  r   rS   r  r  r   rD   r   r   r  r  r  r!   r"   rJ   torV   argmaxr   r:  r[   )r   r  r>  r  r<  r  r  input_shaper:  r?  encoder_outputsr  pooled_outputs                r$   r   zGroupViTTextTransformer.forward  sH    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]<===nn&&NN2{277	),WW !A,]5I!
 !
 !

 %7H[\\N,,')"7/!5# ' 
 
 ,A. 112CDD!! ..4Q7@Q@XYYY595F5MNNUUZ\U]]_MM ..4Q7@Q@XYYY EI6G6NOOSWSddB!M  	L%}58KKK)/')7&1	
 
 
 	
r&   NNNNNN)r   r   r   r   rx   r   GROUPVIT_TEXT_INPUTS_DOCSTRINGr   r   r   r!   r   r   r   r   r   r   r   s   @r$   r  r    s*       	01 	0 	0 	0 	0 	0 	0 +*+IJJ+ETfggg -115/3,0/3&*P
 P
EL)P
 !.P
 u|,	P

 $D>P
 'tnP
 d^P
 
u00	1P
 P
 P
 hg KJP
 P
 P
 P
 P
r&   r  c                   4    e Zd ZeZdef fdZdej        fdZd Z	 e
e           eee          	 	 	 	 	 	 ddeej                 d	eej                 d
eej                 dee         dee         dee         deeef         fd                        Z xZS )GroupViTTextModelrr   c                     t                                          |           t          |          | _        |                                  d S r3  )rw   rx   r  
text_model	post_initr   s     r$   rx   zGroupViTTextModel.__init__v  s@       1&99r&   r   c                 $    | j         j        j        S r3  r  r  r  r   s    r$   get_input_embeddingsz&GroupViTTextModel.get_input_embeddings|  s    )99r&   c                 (    || j         j        _        d S r3  r  )r   r   s     r$   set_input_embeddingsz&GroupViTTextModel.set_input_embeddings  s    5:"222r&   r  Nr  r>  r  r<  r  r  c                 8    |                      ||||||          S )aK  
        Returns:

        Examples:

        ```python
        >>> from transformers import CLIPTokenizer, GroupViTTextModel

        >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> model = GroupViTTextModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class GroupViTVisionTransformer(nn.Module):
    def __init__(self, config: GroupViTVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = GroupViTVisionEmbeddings(config)
        self.encoder = GroupViTVisionEncoder(config)
        self.layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    @add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=GroupViTVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            hidden_states=hidden_states,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]

        # normalize the final group tokens, then mean-pool them into one image embedding
        last_hidden_state = self.layernorm(last_hidden_state)
        pooled_output = last_hidden_state.mean(dim=1)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

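# Shape flow through GroupViTVisionTransformer, assuming the default GroupViTVisionConfig
# (224x224 input, 16x16 patches, grouping stages with 64 and 8 output groups):
#   pixel_values       [B, 3, 224, 224]
#   patch embeddings   [B, 196, hidden]   # (224 / 16) ** 2 = 196 tokens
#   after stage 1      [B, 64, hidden]    # patches hard-assigned to 64 learned group tokens
#   after stage 2      [B, 8, hidden]     # regrouped into 8 segment-level tokens
#   pooler_output      [B, hidden]        # mean over the final group tokens
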
           eee          	 	 	 	 ddeej                 dee         d	ee         d
ee         deeef         f
d                        Z xZS )GroupViTVisionModelr   rr   c                     t                                          |           t          |          | _        |                                  d S r3  )rw   rx   r  vision_modelr  r   s     r$   rx   zGroupViTVisionModel.__init__  sA       5f==r&   r   c                 $    | j         j        j        S r3  )r  r  r   r   s    r$   r  z(GroupViTVisionModel.get_input_embeddings  s     +<<r&   r  Nr<  r  r  c                 4    |                      ||||          S )a  
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, GroupViTVisionModel

        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> model = GroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled states (mean over the final group tokens)
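        >>> # note: GroupViT has no CLS token; last_hidden_state holds the final group tokens,
        >>> # one row per learned group rather than one row per image patch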
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

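# GroupViTVisionModel stops at the transformer: its outputs live in the vision tower's own
# hidden_size space. The joint image-text embedding space (and the segmentation readout) is
# produced only by GroupViTModel below, whose projection heads map both towers to projection_dim.
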
                 dee	j
                 dee	j
                 dee         dee         d	ee         d
e	j        fd            Z ee          	 	 	 	 ddee	j                 dee         dee         d	ee         d
e	j        f
d            Z ee           eee          	 	 	 	 	 	 	 	 	 ddee	j                 dee	j                 dee	j
                 dee	j                 dee         dee         dee         dee         d	ee         d
eeef         fd                        Z xZS )GroupViTModelrr   c           
         t                                          |           t          |j        t                    s%t          dt          |j                   d          t          |j        t                    s%t          dt          |j                   d          |j        }|j        }|j	        | _	        |j
        | _
        |j        | _        |j        | _        t          |          | _        t!          |          | _        t%          j        t%          j        | j        | j
        d          t%          j        | j
                  t%          j        d          t%          j        | j
        | j	        d                    | _        t%          j        t%          j        | j        | j
        d          t%          j        | j
                  t%          j        d          t%          j        | j
        | j	        d                    | _        t%          j        t5          j        | j        j                            | _        |                                  d S )NzOconfig.text_config is expected to be of type GroupViTTextConfig but is of type .zSconfig.vision_config is expected to be of type GroupViTVisionConfig but is of type T)rz  )inplace) rw   rx   r   text_configr   	TypeErrortypevision_configr   projection_dimprojection_intermediate_dimr|   text_embed_dimvision_embed_dimr  r  r  r  r   r0  r   BatchNorm1dReLUvisual_projectiontext_projectionr   r!   rH   rr   logit_scale_init_valuelogit_scaler  )r   rr   r  r  r   s       r$   rx   zGroupViTModel.__init__  s      &,.@AA 	0+,,0 0 0  
 &.0DEE 	2-..2 2 2  
 (,$3+1+M()5 - 91+>>5mDD!#Id+T-MTXYYYN4;<<GD!!!Id68KRVWWW	"
 "
  "}Id)4+KRVWWWN4;<<GD!!!Id68KRVWWW	 
  
 <T[5W(X(XYY 	r&   Nr  r>  r  r<  r  r  r   c                     ||n| j         j        }||n| j         j        }||n| j         j        }|                     ||||||          }|d         }|                     |          }	|	S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`GroupViTTextModel`].

        Examples:

        ```python
        >>> from transformers import CLIPTokenizer, GroupViTModel

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
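        >>> # text_features: [batch_size, projection_dim], mapped into the joint image-text space
        >>> # by the BatchNorm MLP head; L2-normalize before computing cosine similarities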
        ```"""
        # Use GROUPVIT model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = text_outputs[1]
        text_features = self.text_projection(pooled_output)

        return text_features

    @add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`GroupViTVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, GroupViTModel

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
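        >>> # image_features: [batch_size, projection_dim], the mean-pooled image embedding after the
        >>> # visual projection head; group-level embeddings are exposed via forward(output_segmentation=True)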
        ```"""
        # Use GROUPVIT model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = vision_outputs[1]
        image_features = self.visual_projection(pooled_output)

        return image_features

    @add_start_docstrings_to_model_forward(GROUPVIT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=GroupViTModelOutput, config_class=GroupViTConfig)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_segmentation: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, GroupViTModelOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, GroupViTModel

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
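        >>> # passing output_segmentation=True additionally fills outputs.segmentation_logits with
        >>> # per-pixel scores of shape [batch_size_image, batch_size_text, height, width]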
        ```"""
        # Use GROUPVIT model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_segmentation = (
            output_segmentation if output_segmentation is not None else self.config.output_segmentation
        )
        if output_segmentation:
            # the segmentation readout is derived from the grouping attentions, so they must be returned
            output_attentions = True
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)

        # cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
        logits_per_image = logits_per_text.t()

        seg_logits = None
        if output_segmentation:
            # grouped features: [batch_size_image, num_group, hidden_size]
            image_group_embeds = vision_outputs[0]
            # flatten to [batch_size_image * num_group, hidden_size] for the BatchNorm projection head
            image_group_embeds = self.visual_projection(image_group_embeds.reshape(-1, image_group_embeds.shape[-1]))
            if output_hidden_states:
                attentions = vision_outputs[3]
            else:
                attentions = vision_outputs[2]
            # [batch_size_image, num_group, height, width]
            grouping = get_grouping_from_attentions(attentions, pixel_values.shape[2:])

            # normalized group features
            image_group_embeds = image_group_embeds / image_group_embeds.norm(dim=-1, keepdim=True)
            # [batch_size_image x num_group, batch_size_text]
            logits_per_image_group = torch.matmul(image_group_embeds, text_embeds.t()) * logit_scale
            # [batch_size_image, batch_size_text, num_group]
            logits_per_image_group = logits_per_image_group.reshape(
                image_embeds.shape[0], -1, text_embeds.shape[0]
            ).permute(0, 2, 1)

            # [batch_size_image, batch_size_text, height x width]
            flatten_grouping = grouping.reshape(grouping.shape[0], grouping.shape[1], -1)

            # [batch_size_image, batch_size_text, height, width]
            seg_logits = torch.matmul(logits_per_image_group, flatten_grouping) * logit_scale
            seg_logits = seg_logits.reshape(
                seg_logits.shape[0], seg_logits.shape[1], grouping.shape[2], grouping.shape[3]
            )

        loss = None
        if return_loss:
            loss = groupvit_loss(logits_per_text)

        if not return_dict:
            if seg_logits is not None:
                output = (
                    logits_per_image,
                    logits_per_text,
                    seg_logits,
                    text_embeds,
                    image_embeds,
                    text_outputs,
                    vision_outputs,
                )
            else:
                output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
            return ((loss,) + output) if loss is not None else output

        return GroupViTModelOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            segmentation_logits=seg_logits,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )
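
# A minimal zero-shot segmentation sketch built on `forward(output_segmentation=True)`.
# The label prompts and the argmax readout below are illustrative choices, not part of the
# library API:
#
#     import requests
#     from PIL import Image
#     from transformers import AutoProcessor, GroupViTModel
#
#     processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
#     model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
#
#     url = "http://images.cocodataset.org/val2017/000000039769.jpg"
#     image = Image.open(requests.get(url, stream=True).raw)
#     inputs = processor(
#         text=["a photo of a cat", "a photo of a remote"], images=image, return_tensors="pt", padding=True
#     )
#
#     outputs = model(**inputs, output_segmentation=True)
#     # segmentation_logits: [batch_size_image, batch_size_text, height, width], one score map per prompt
#     mask = outputs.segmentation_logits.argmax(dim=1)  # per-pixel index of the best-matching prompt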