
"""PyTorch OWL-ViT model."""

from dataclasses import dataclass
from functools import lru_cache
from typing import Any, Dict, Optional, Tuple, Union

import numpy as np
import torch
import torch.utils.checkpoint
from torch import Tensor, nn

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_vision_available,
    logging,
    replace_return_docstrings,
)
from .configuration_owlvit import OwlViTConfig, OwlViTTextConfig, OwlViTVisionConfig


if is_vision_available():
    from transformers.image_transforms import center_to_corners_format


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "google/owlvit-base-patch32"


def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def owlvit_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0
@dataclass
class OwlViTOutput(ModelOutput):
    r"""
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size * num_max_text_queries, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`OwlViTTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`OwlViTVisionModel`].
        text_model_output (Tuple[`BaseModelOutputWithPooling`]):
            The output of the [`OwlViTTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`OwlViTVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: torch.FloatTensor = None
    logits_per_text: torch.FloatTensor = None
    text_embeds: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


def _upcast(t: Tensor) -> Tensor:
    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
    if t.is_floating_point():
        return t if t.dtype in (torch.float32, torch.float64) else t.float()
    else:
        return t if t.dtype in (torch.int32, torch.int64) else t.int()


def box_area(boxes: Tensor) -> Tensor:
    """
    Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.

    Args:
        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
            < x2` and `0 <= y1 < y2`.

    Returns:
        `torch.FloatTensor`: a tensor containing the area for each box.
    """
    boxes = _upcast(boxes)
    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])


def box_iou(boxes1: Tensor, boxes2: Tensor) -> Tuple[Tensor, Tensor]:
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    width_height = (right_bottom - left_top).clamp(min=0)  # [N,M,2]
    inter = width_height[:, :, 0] * width_height[:, :, 1]  # [N,M]

    union = area1[:, None] + area2 - inter

    iou = inter / union
    return iou, union


def generalized_box_iou(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.

    Returns:
        `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
    """
    # degenerate boxes give inf / nan results, so do an early check
    if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
        raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
    if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
        raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
    iou, union = box_iou(boxes1, boxes2)

    top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

    width_height = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
    area = width_height[:, :, 0] * width_height[:, :, 1]

    return iou - (area - union) / area
@dataclass
class OwlViTObjectDetectionOutput(ModelOutput):
    """
    Output type of [`OwlViTForObjectDetection`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
            Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
            scale-invariant IoU loss.
        loss_dict (`Dict`, *optional*):
            A dictionary containing the individual losses. Useful for logging.
        logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
            Classification logits (including no-object) for all queries.
        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
            possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to retrieve the
            unnormalized bounding boxes.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, num_max_text_queries, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`OwlViTTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
            Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes
            image embeddings for each patch.
        class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
            Class embeddings of all image patches. OWL-ViT represents images as a set of image patches where the total
            number of patches is (image_size / patch_size)**2.
        text_model_output (Tuple[`BaseModelOutputWithPooling`]):
            The output of the [`OwlViTTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`OwlViTVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    loss_dict: Optional[Dict] = None
    logits: torch.FloatTensor = None
    pred_boxes: torch.FloatTensor = None
    text_embeds: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    class_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


@dataclass
class OwlViTImageGuidedObjectDetectionOutput(ModelOutput):
    """
    Output type of [`OwlViTForObjectDetection.image_guided_detection`].

    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
            Classification logits (including no-object) for all queries.
        target_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual target image in the batch
            (disregarding possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to
            retrieve the unnormalized bounding boxes.
        query_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual query image in the batch
            (disregarding possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to
            retrieve the unnormalized bounding boxes.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
            Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes
            image embeddings for each patch.
        query_image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
            Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes
            image embeddings for each patch.
        class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
            Class embeddings of all image patches. OWL-ViT represents images as a set of image patches where the total
            number of patches is (image_size / patch_size)**2.
        text_model_output (Tuple[`BaseModelOutputWithPooling`]):
            The output of the [`OwlViTTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`OwlViTVisionModel`].
    """

    logits: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    query_image_embeds: torch.FloatTensor = None
    target_pred_boxes: torch.FloatTensor = None
    query_pred_boxes: torch.FloatTensor = None
    class_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class OwlViTVisionEmbeddings(nn.Module):
    def __init__(self, config: OwlViTVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.class_embedding = nn.Parameter(torch.randn(config.hidden_size))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=config.patch_size,
            stride=config.patch_size,
            bias=False,
        )

        self.num_patches = (config.image_size // config.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch_size = pixel_values.shape[0]
        patch_embeds = self.patch_embedding(pixel_values)  # shape = [batch_size, num_channels, height, width]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        embeddings = embeddings + self.position_embedding(self.position_ids)

        return embeddings


class OwlViTTextEmbeddings(nn.Module):
    def __init__(self, config: OwlViTTextConfig):
        super().__init__()
        self.token_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


class OwlViTAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, embed_dim = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scale
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # apply the causal_attention_mask first
        if causal_attention_mask is not None:
            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
                    f" {causal_attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if output_attentions:
            # this operation is a bit awkward, but it's required to make sure that attn_weights
            # keeps its gradient; it has to be reshaped and then reused
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # For int8 compatibility, sometimes the `attn_probs` are in `fp32`
        attn_probs = attn_probs.to(value_states.dtype)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped


class OwlViTMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class OwlViTEncoderLayer(nn.Module):
    def __init__(self, config: OwlViTConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = OwlViTAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = OwlViTMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class OwlViTPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = OwlViTConfig
    base_model_prefix = "owlvit"
    supports_gradient_checkpointing = True
    _no_split_modules = ["OwlViTEncoderLayer"]

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, OwlViTTextEmbeddings):
            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, OwlViTVisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, OwlViTAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, OwlViTMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, OwlViTModel):
            nn.init.normal_(
                module.text_projection.weight,
                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
            )
            nn.init.normal_(
                module.visual_projection.weight,
                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
            )
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


OWLVIT_START_DOCSTRING = r"""

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`OwlViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

OWLVIT_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, num_max_text_queries, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

OWLVIT_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

OWLVIT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

OWLVIT_OBJECT_DETECTION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids).
        attention_mask (`torch.Tensor` of shape `(batch_size, num_max_text_queries, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the last hidden state. See `text_model_last_hidden_state` and
            `vision_model_last_hidden_state` under returned tensors for more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

OWLVIT_IMAGE_GUIDED_OBJECT_DETECTION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values.
        query_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values of query image(s) to be detected. Pass in one query image per target image.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


class OwlViTEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`OwlViTEncoderLayer`].

    Args:
        config: OwlViTConfig
    """

    def __init__(self, config: OwlViTConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([OwlViTEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`).
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class OwlViTTextTransformer(nn.Module):
    def __init__(self, config: OwlViTTextConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = OwlViTTextEmbeddings(config)
        self.encoder = OwlViTEncoder(config)
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    @add_start_docstrings_to_model_forward(OWLVIT_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=OwlViTTextConfig)
    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])
        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        # OWL-ViT's text model uses a causal mask; prepare it here.
        causal_attention_mask = _create_4d_causal_attention_mask(
            input_shape, hidden_states.dtype, device=hidden_states.device
        )
        # expand attention_mask
        if attention_mask is not None:
            # [num_samples, seq_len] -> [num_samples, 1, tgt_seq_len, src_seq_len]
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        # take features from the end-of-token embedding (end-of-token is the highest token id in each sequence);
        # cast to int for ONNX compatibility
        pooled_output = last_hidden_state[
            torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
            input_ids.to(torch.int).argmax(dim=-1).to(last_hidden_state.device),
        ]

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class OwlViTTextModel(OwlViTPreTrainedModel):
    config_class = OwlViTTextConfig

    def __init__(self, config: OwlViTTextConfig):
        super().__init__(config)
        self.text_model = OwlViTTextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    @add_start_docstrings_to_model_forward(OWLVIT_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=OwlViTTextConfig)
    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        Examples:
        ```python
        >>> from transformers import AutoProcessor, OwlViTTextModel

        >>> model = OwlViTTextModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> inputs = processor(
        ...     text=[["a photo of a cat", "a photo of a dog"], ["photo of a astranaut"]], return_tensors="pt"
        ... )
        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        # Get embeddings for all text queries in all batch samples
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class OwlViTVisionTransformer(nn.Module):
    def __init__(self, config: OwlViTVisionConfig):
        super().__init__()
        self.config = config

        self.embeddings = OwlViTVisionEmbeddings(config)
        self.pre_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.encoder = OwlViTEncoder(config)
        self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    @add_start_docstrings_to_model_forward(OWLVIT_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=OwlViTVisionConfig)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Cast the input to the expected `dtype`
        expected_input_dtype = self.embeddings.patch_embedding.weight.dtype
        pixel_values = pixel_values.to(expected_input_dtype)

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layernorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]

        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class OwlViTVisionModel(OwlViTPreTrainedModel):
    config_class = OwlViTVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: OwlViTVisionConfig):
        super().__init__(config)
        self.vision_model = OwlViTVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @add_start_docstrings_to_model_forward(OWLVIT_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=OwlViTVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, OwlViTVisionModel

        >>> model = OwlViTVisionModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


@add_start_docstrings(OWLVIT_START_DOCSTRING)
class OwlViTModel(OwlViTPreTrainedModel):
    config_class = OwlViTConfig

    def __init__(self, config: OwlViTConfig):
        super().__init__(config)

        if not isinstance(config.text_config, OwlViTTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type OwlViTTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, OwlViTVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type OwlViTVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = OwlViTTextTransformer(text_config)
        self.vision_model = OwlViTVisionTransformer(vision_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
        self.logit_scale = nn.Parameter(torch.tensor(config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(OWLVIT_TEXT_INPUTS_DOCSTRING)
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`OwlViTTextModel`].

        Examples:
        ```python
        >>> from transformers import AutoProcessor, OwlViTModel

        >>> model = OwlViTModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> inputs = processor(
        ...     text=[["a photo of a cat", "a photo of a dog"], ["photo of a astranaut"]], return_tensors="pt"
        ... )
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # Use OWL-ViT model's config for some fields (if specified) instead of those of vision & text components
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Get embeddings for all text queries in all batch samples
        text_output = self.text_model(input_ids=input_ids, attention_mask=attention_mask, return_dict=return_dict)
        pooled_output = text_output[1]
        text_features = self.text_projection(pooled_output)

        return text_features

    @add_start_docstrings_to_model_forward(OWLVIT_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`OwlViTVisionModel`].

        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, OwlViTModel

        >>> model = OwlViTModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")
        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Use OWL-ViT model's config for some fields (if specified) instead of those of vision & text components
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = vision_outputs[1]
        image_features = self.visual_projection(pooled_output)

        return image_features

    @add_start_docstrings_to_model_forward(OWLVIT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=OwlViTOutput, config_class=OwlViTConfig)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_base_image_embeds: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, OwlViTOutput]:
        r"""
        Returns:

        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, OwlViTModel

        >>> model = OwlViTModel.from_pretrained("google/owlvit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(text=[["a photo of a cat", "a photo of a dog"]], images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        # Use OWL-ViT model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Get pooled outputs and project them into the shared text-image embedding space
        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)
        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        # normalized features
        image_embeds = image_embeds / torch.linalg.norm(image_embeds, ord=2, dim=-1, keepdim=True)
        text_embeds_norm = text_embeds / torch.linalg.norm(text_embeds, ord=2, dim=-1, keepdim=True)

        # cosine similarity as logits and set it on the correct device
        logit_scale = self.logit_scale.exp().to(image_embeds.device)

        logits_per_text = torch.matmul(text_embeds_norm, image_embeds.t()) * logit_scale
        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            loss = owlvit_loss(logits_per_text)

        text_embeds = text_embeds_norm

        if not return_dict:
            output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
            return ((loss,) + output) if loss is not None else output

        return OwlViTOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )
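

# A short, illustrative restatement of the similarity computed in `OwlViTModel.forward` above
# (comments only, not executed; tensor names mirror the code):
#
#     logits_per_text  = logit_scale * (text_embeds_norm @ image_embeds.T)
#     logits_per_image = logits_per_text.T
#
# i.e. every entry is a temperature-scaled cosine similarity between one text query and one image.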


class OwlViTBoxPredictionHead(nn.Module):
    def __init__(self, config: OwlViTConfig, out_dim: int = 4):
        super().__init__()

        width = config.vision_config.hidden_size
        self.dense0 = nn.Linear(width, width)
        self.dense1 = nn.Linear(width, width)
        self.gelu = nn.GELU()
        self.dense2 = nn.Linear(width, out_dim)

    def forward(self, image_features: torch.Tensor) -> torch.FloatTensor:
        output = self.dense0(image_features)
        output = self.gelu(output)
        output = self.dense1(output)
        output = self.gelu(output)
        output = self.dense2(output)
        return output
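

# A minimal shape sketch for the box head above (illustrative only; exact sizes depend on the
# checkpoint). For google/owlvit-base-patch32 the vision tower yields 24 x 24 patch tokens with a
# hidden size of 768, so `image_features` is (batch_size, 576, 768) and the head returns
# (batch_size, 576, 4) raw box parameters, later biased and squashed by a sigmoid in
# `OwlViTForObjectDetection.box_predictor`.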
 xZS )OwlViTClassPredictionHeadr   c                 l   t                                                       |j        j        }|j        j        | _        t          j        | j        |          | _        t          j        | j        d          | _	        t          j        | j        d          | _
        t          j                    | _        d S )Nr   )r   r   rt  r   rw  	query_dimr   r   r  logit_shiftrz  ELUelu)rC   r   r  r   s      r(   r   z"OwlViTClassPredictionHead.__init__  s    $0-9i889T^Q779T^Q77688r*   r7   query_embeds
query_maskr   c                     |                      |          }|L|j        }|j        d d         \  }}t          j        ||| j        f                              |          }||fS |t          j                            |dd          dz   z  }|t          j                            |dd          dz   z  }t          j	        d||          }| 
                    |          }	|                     |          }
|                     |
          dz   }
||	z   |
z  }|v|j        dk    rt          j        |d	          }t          j        |d
k    t          j        |j                  j        |          }|                    t          j                  }||fS )Nr_   r   T)r   r  gư>z...pd,...qd->...pqr   r   r   r   )r  r"   r   r%   zerosr  r   r  r  einsumr  rz  r  ndim	unsqueezewherefinforU   rc   rV   )rC   r7   r  r  image_class_embedsr"   r   r   pred_logitsr  rz  s              r(   r   z!OwlViTClassPredictionHead.forward  s    "[[66'.F&8&>rr&B#J+z;&OPPSSTZ[[K!344 05<3D3DEW]_im3D3n3nqu3uv#u|'8'82W['8'\'\_c'cd l#79K\ZZ &&|44&&|44hh{++a/"[0K?!"""_ZR@@@
+jAou{;CT7U7U7Y[fggK%..77K/00r*   )rK   rL   rM   r   r   r%   rO   r   r
   r   r   r   r   s   @r(   r  r    s        	| 	 	 	 	 	 	!1'!1 u01!1 U\*	!1
 
u 	!!1 !1 !1 !1 !1 !1 !1 !1r*   r  c                       e Zd ZeZdef fdZededej	        fd            Z
 ed          dded	eej                 dej	        fd
            Zdej        d	ej        dej        fdZ	 	 d dej        deej                 deej	                 deej                 fdZ	 	 d dej	        dej        dej	        dee         dee         deej                 fdZ	 	 d dej        dee         dee         deej                 fdZdej        dej        dej        fdZ ee           eee          	 	 	 	 d!dej        deej                 dee         dee         dee         defd                        Z ee           eee          	 	 	 	 d!dej	        dej        deej	                 dee         dee         dee         defd                        Z xZS )"OwlViTForObjectDetectionr   c                    t                                          |           t          |          | _        t	          |          | _        t          |          | _        t          j	        |j
        j        |j
        j                  | _        t          j                    | _        |j
        j        |j
        j        z  | _        |                     | j                  | _        d S r   )r   r   r  r
  r  
class_headr  box_headr   r   rw  r   r  
layer_normSigmoidsigmoidr   r   sqrt_num_patchescompute_box_biasbox_biasr   s     r(   r   z!OwlViTForObjectDetection.__init__  s       !&))3F;;/77,v';'GVMaMpqqqz|| & 4 ?6CWCb b--d.CDDr*   r   r   c                 0   t          j        d| dz   t           j                  }t          j        d| dz   t           j                  }t          j        ||d          \  }}t          j        ||fd          }|| z  }|                    dd          }|S )Nr   )rU   xy)indexingr   r   r_   )r%   r&   rV   meshgridstackr   )r   x_coordinatesy_coordinatesxxyybox_coordinatess         r(   !normalize_grid_corner_coordinatesz:OwlViTForObjectDetection.normalize_grid_corner_coordinates  s     Qau}MMMQau}MMM}tLLLB  +r2hB777;& *..r155r*   r_   )maxsizeNfeature_mapc                    |t          d          |                     |          }t          j        |dd          }t          j        |dz             t          j        | dz             z
  }t          j        |d|z            }t          j        |dz             t          j        | dz             z
  }t          j        ||gd          }|S )NzOfeature_map has been deprecated as an input. Please pass in num_patches insteadr  r  g-C6?r   r   )rs   r  r%   cliploglog1p	full_liker   )rC   r   r  r  box_coord_biasbox_sizebox_size_biasr  s           r(   r  z)OwlViTForObjectDetection.compute_box_bias  s    "nooo@@MM*_c3?? ?T#9::U[/IY\`I`=a=aa ?>33DEE	(T/22U[(TAQ5R5RR 9nm<"EEEr*   image_featsc                     |                      |          }| j                            |j                  }||z  }|                     |          }|S )a  
        Args:
            image_feats:
                Features extracted from the image, returned by the `image_text_embedder` method.
            feature_map:
                A spatial re-arrangement of image_features, also returned by the `image_text_embedder` method.
        Returns:
            pred_boxes:
                List of predicted boxes (cxcywh normalized to 0, 1) nested within a dictionary.
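
                A minimal, illustrative sketch (not part of this method) of converting the
                normalized cxcywh boxes to corner format with the `center_to_corners_format`
                helper imported at the top of this module, where `pred_boxes` stands for the
                tensor returned here:

                ```python
                corner_boxes = center_to_corners_format(pred_boxes)  # (x0, y0, x1, y1), still in [0, 1]
                ```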
        """
        # Bounding box detection head [batch_size, num_boxes, 4].
        pred_boxes = self.box_head(image_feats)

        # Compute the location of each token on the grid and use it to compute a bias for the bbox prediction
        box_bias = self.box_bias.to(feature_map.device)
        pred_boxes += box_bias
        pred_boxes = self.sigmoid(pred_boxes)
        return pred_boxes

    def class_predictor(
        self,
        image_feats: torch.FloatTensor,
        query_embeds: Optional[torch.FloatTensor] = None,
        query_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            image_feats:
                Features extracted from the `image_text_embedder`.
            query_embeds:
                Text query embeddings.
            query_mask:
                Must be provided with query_embeddings. A mask indicating which query embeddings are valid.
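
        A rough shape sketch (illustrative only; `p` is the number of image patches and `q` the
        number of queries):

        ```python
        # image_feats:   (batch_size, p, hidden_dim)
        # query_embeds:  (batch_size, q, hidden_dim)
        # returns (pred_logits, image_class_embeds): (batch_size, p, q) and (batch_size, p, out_dim)
        ```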
        """
        (pred_logits, image_class_embeds) = self.class_head(image_feats, query_embeds, query_mask)

        return (pred_logits, image_class_embeds)

    def image_text_embedder(
        self,
        input_ids: torch.Tensor,
        pixel_values: torch.FloatTensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Tuple[torch.FloatTensor]:
        # Encode text and image
        outputs = self.owlvit(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        # Get image embeddings
        last_hidden_state = outputs.vision_model_output[0]
        image_embeds = self.owlvit.vision_model.post_layernorm(last_hidden_state)

        # Resize class token
        class_token_out = torch.broadcast_to(image_embeds[:, :1, :], image_embeds[:, :-1].shape)

        # Merge image embedding with class tokens
        image_embeds = image_embeds[:, 1:, :] * class_token_out
        image_embeds = self.layer_norm(image_embeds)

        # Resize to [batch_size, num_patches, num_patches, hidden_size]
        new_size = (
            image_embeds.shape[0],
            self.sqrt_num_patches,
            self.sqrt_num_patches,
            image_embeds.shape[-1],
        )
        image_embeds = image_embeds.reshape(new_size)
        text_embeds = outputs[-4]

        return (text_embeds, image_embeds, outputs)

    def image_embedder(
        self,
        pixel_values: torch.FloatTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Tuple[torch.FloatTensor]:
        # Get OwlViTModel vision embeddings (same as CLIP)
        vision_outputs = self.owlvit.vision_model(pixel_values=pixel_values, return_dict=True)

        # Apply post_layernorm to last_hidden_state, return non-projected output
        last_hidden_state = vision_outputs[0]
        image_embeds = self.owlvit.vision_model.post_layernorm(last_hidden_state)

        # Resize class token
        class_token_out = torch.broadcast_to(image_embeds[:, :1, :], image_embeds[:, :-1].shape)

        # Merge image embedding with class tokens
        image_embeds = image_embeds[:, 1:, :] * class_token_out
        image_embeds = self.layer_norm(image_embeds)

        # Resize to [batch_size, num_patches, num_patches, hidden_size]
        new_size = (
            image_embeds.shape[0],
            self.sqrt_num_patches,
            self.sqrt_num_patches,
            image_embeds.shape[-1],
        )
        image_embeds = image_embeds.reshape(new_size)

        return (image_embeds, vision_outputs)

    def embed_image_query(
        self, query_image_features: torch.FloatTensor, query_feature_map: torch.FloatTensor
    ) -> torch.FloatTensor:
        _, class_embeds = self.class_predictor(query_image_features)
        pred_boxes = self.box_predictor(query_image_features, query_feature_map)
        pred_boxes_as_corners = center_to_corners_format(pred_boxes)

        # Loop over query images
        best_class_embeds = []
        best_box_indices = []
        pred_boxes_device = pred_boxes_as_corners.device

        for i in range(query_image_features.shape[0]):
            each_query_box = torch.tensor([[0, 0, 1, 1]], device=pred_boxes_device)
            each_query_pred_boxes = pred_boxes_as_corners[i]
            ious, _ = box_iou(each_query_box, each_query_pred_boxes)

            # If there are no overlapping boxes, fall back to generalized IoU
            if torch.all(ious[0] == 0.0):
                ious = generalized_box_iou(each_query_box, each_query_pred_boxes)

            # Use an adaptive threshold to include all boxes within 80% of the best IoU
            iou_threshold = torch.max(ious) * 0.8

            selected_inds = (ious[0] >= iou_threshold).nonzero()
            if selected_inds.numel():
                selected_embeddings = class_embeds[i][selected_inds.squeeze(1)]
                mean_embeds = torch.mean(class_embeds[i], axis=0)
                mean_sim = torch.einsum("d,id->i", mean_embeds, selected_embeddings)
                best_box_ind = selected_inds[torch.argmin(mean_sim)]
                best_class_embeds.append(class_embeds[i][best_box_ind])
                best_box_indices.append(best_box_ind)

        if best_class_embeds:
            query_embeds = torch.stack(best_class_embeds)
            box_indices = torch.stack(best_box_indices)
        else:
            query_embeds, box_indices = None, None

        return query_embeds, box_indices, pred_boxes

    @add_start_docstrings_to_model_forward(OWLVIT_IMAGE_GUIDED_OBJECT_DETECTION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=OwlViTImageGuidedObjectDetectionOutput, config_class=OwlViTConfig)
    def image_guided_detection(
        self,
        pixel_values: torch.FloatTensor,
        query_pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> OwlViTImageGuidedObjectDetectionOutput:
        r"""
        Returns:

        Examples:
        ```python
        >>> import requests
        >>> from PIL import Image
        >>> import torch
        >>> from transformers import AutoProcessor, OwlViTForObjectDetection

        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch16")
        >>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch16")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> query_url = "http://images.cocodataset.org/val2017/000000001675.jpg"
        >>> query_image = Image.open(requests.get(query_url, stream=True).raw)
        >>> inputs = processor(images=image, query_images=query_image, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model.image_guided_detection(**inputs)
        >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
        >>> target_sizes = torch.Tensor([image.size[::-1]])
        >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
        >>> results = processor.post_process_image_guided_detection(
        ...     outputs=outputs, threshold=0.6, nms_threshold=0.3, target_sizes=target_sizes
        ... )
        >>> i = 0  # Retrieve predictions for the first image
        >>> boxes, scores = results[i]["boxes"], results[i]["scores"]
        >>> for box, score in zip(boxes, scores):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(f"Detected similar object with confidence {round(score.item(), 3)} at location {box}")
        Detected similar object with confidence 0.856 at location [10.94, 50.4, 315.8, 471.39]
        Detected similar object with confidence 1.0 at location [334.84, 25.33, 636.16, 374.71]
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        # Compute feature maps for the input and query images
        query_feature_map = self.image_embedder(pixel_values=query_pixel_values)[0]
        feature_map, vision_outputs = self.image_embedder(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        batch_size, num_patches, num_patches, hidden_dim = feature_map.shape
        image_feats = torch.reshape(feature_map, (batch_size, num_patches * num_patches, hidden_dim))

        batch_size, num_patches, num_patches, hidden_dim = query_feature_map.shape
        query_image_feats = torch.reshape(query_feature_map, (batch_size, num_patches * num_patches, hidden_dim))

        # Get top class embedding and best box index for each query image in batch
        query_embeds, best_box_indices, query_pred_boxes = self.embed_image_query(query_image_feats, query_feature_map)

        # Predict object classes [batch_size, num_patches, num_queries+1]
        (pred_logits, class_embeds) = self.class_predictor(image_feats=image_feats, query_embeds=query_embeds)

        # Predict object boxes
        target_pred_boxes = self.box_predictor(image_feats, feature_map)

        if not return_dict:
            output = (
                feature_map,
                query_feature_map,
                target_pred_boxes,
                query_pred_boxes,
                pred_logits,
                class_embeds,
                vision_outputs.to_tuple(),
            )
            output = tuple(x for x in output if x is not None)
            return output

        return OwlViTImageGuidedObjectDetectionOutput(
            image_embeds=feature_map,
            query_image_embeds=query_feature_map,
            target_pred_boxes=target_pred_boxes,
            query_pred_boxes=query_pred_boxes,
            logits=pred_logits,
            class_embeds=class_embeds,
            text_model_output=None,
            vision_model_output=vision_outputs,
        )

    @add_start_docstrings_to_model_forward(OWLVIT_OBJECT_DETECTION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=OwlViTObjectDetectionOutput, config_class=OwlViTConfig)
    def forward(
        self,
        input_ids: torch.Tensor,
        pixel_values: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> OwlViTObjectDetectionOutput:
        r"""
        Returns:

        Examples:
        ```python
        >>> import requests
        >>> from PIL import Image
        >>> import torch
        >>> from transformers import AutoProcessor, OwlViTForObjectDetection

        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
        >>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> texts = [["a photo of a cat", "a photo of a dog"]]
        >>> inputs = processor(text=texts, images=image, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
        >>> target_sizes = torch.Tensor([image.size[::-1]])
        >>> # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
        >>> results = processor.post_process_object_detection(
        ...     outputs=outputs, threshold=0.1, target_sizes=target_sizes
        ... )

        >>> i = 0  # Retrieve predictions for the first image for the corresponding text queries
        >>> text = texts[i]
        >>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]

        >>> for box, score, label in zip(boxes, scores, labels):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
        Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29]
        Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17]
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        # Embed images and text queries
        query_embeds, feature_map, outputs = self.image_text_embedder(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        # Text and vision model outputs
        text_outputs = outputs.text_model_output
        vision_outputs = outputs.vision_model_output

        batch_size, num_patches, num_patches, hidden_dim = feature_map.shape
        image_feats = torch.reshape(feature_map, (batch_size, num_patches * num_patches, hidden_dim))

        # Reshape from [batch_size * max_text_queries, hidden_dim] -> [batch_size, max_text_queries, hidden_dim]
        max_text_queries = input_ids.shape[0] // batch_size
        query_embeds = query_embeds.reshape(batch_size, max_text_queries, query_embeds.shape[-1])

        # If first token is 0, then this is a padded query [batch_size, num_queries].
        input_ids = input_ids.reshape(batch_size, max_text_queries, input_ids.shape[-1])
        query_mask = input_ids[..., 0] > 0

        # Predict object classes [batch_size, num_patches, num_queries+1]
        (pred_logits, class_embeds) = self.class_predictor(image_feats, query_embeds, query_mask)

        # Predict object boxes
        pred_boxes = self.box_predictor(image_feats, feature_map)

        if not return_dict:
            output = (
                pred_logits,
                pred_boxes,
                query_embeds,
                feature_map,
                class_embeds,
                text_outputs.to_tuple(),
                vision_outputs.to_tuple(),
            )
            output = tuple(x for x in output if x is not None)
            return output

        return OwlViTObjectDetectionOutput(
            image_embeds=feature_map,
            text_embeds=query_embeds,
            pred_boxes=pred_boxes,
            logits=pred_logits,
            class_embeds=class_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )
   r   activationsr   modeling_attn_mask_utilsr   r   modeling_outputsr   r   modeling_utilsr   utilsr   r   r   r   r   r   configuration_owlvitr   r   r   transformers.image_transformsr   
get_loggerrK   logger_CHECKPOINT_FOR_DOCr)   r0   r2   r\   r`   rp   rw   ry   r   r`  r   r   r   r   r   r	  OWLVIT_START_DOCSTRINGrQ  rh  r  r  r  r*  rC  rS  rb  rj  r  r  r  r  rQ   r*   r(   <module>r     s     ! ! ! ! ! !       4 4 4 4 4 4 4 4 4 4 4 4 4 4              ! ! ! ! ! ! d d d d d d d d K K K K K K K K - - - - - -                U T T T T T T T T T  GFFFFFF 
	H	%	%2 `U\ `el ` ` ` `
-EL -U\ - - - - "
 "
 "
 "
 "
; "
 "
 "
LGv G& G G G GEF Ev E E E E"  "' ' '0 .
 .
 .
 .
 .
+ .
 .
 .
b -
 -
 -
 -
 -
[ -
 -
 -
`    RY   @    29   >h2 h2 h2 h2 h2bi h2 h2 h2X    	    / / / / / / / /d1% 1% 1% 1% 1%O 1% 1% 1%h    *"  2, ((9 5"V
 V
 V
 V
 V
BI V
 V
 V
rG
 G
 G
 G
 G
BI G
 G
 G
T1
 1
 1
 1
 1
+ 1
 1
 1
h7
 7
 7
 7
 7
bi 7
 7
 7
t/
 /
 /
 /
 /
- /
 /
 /
d ,--J
 J
 J
 J
 J
' J
 J
 .-J
Z    bi   &-1 -1 -1 -1 -1	 -1 -1 -1`V
 V
 V
 V
 V
4 V
 V
 V
 V
 V
r*   