
    g<                    
   d Z ddlZddlZddlmZ ddlmZmZmZm	Z	 ddl
Z
ddlZ
ddl
mZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZmZmZ ddlmZm Z m!Z!  ej"        e#          Z$dZ%de
j&        de
j&        fdZ'de
j&        de
j&        fdZ(e G d de                      Z)e G d de                      Z*e G d de                      Z+ G d dej,                  Z- G d dej,                  Z. G d dej,                  Z/ G d  d!ej,                  Z0 G d" d#ej,                  Z1 G d$ d%e          Z2d&Z3d'Z4d(Z5d)Z6 G d* d+ej,                  Z7 G d, d-ej,                  Z8 G d. d/e2          Z9 G d0 d1ej,                  Z: G d2 d3e2          Z; ee3           G d4 d5e2                      Z< G d6 d7ej,                  Z= G d8 d9e2          Z> ed:e3           G d; d<e2                      Z?dS )=zPyTorch CLIPSeg model.    N)	dataclass)AnyOptionalTupleUnion)nn   )ACT2FN) _create_4d_causal_attention_mask_prepare_4d_attention_mask)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings	torch_int   )CLIPSegConfigCLIPSegTextConfigCLIPSegVisionConfigzCIDAS/clipseg-rd64-refinedlogitsreturnc                     t           j                            | t          j        t          |           | j                            S )Ndevice)r   
functionalcross_entropytorcharangelenr   )r   s    h/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/clipseg/modeling_clipseg.pycontrastive_lossr%   1   s3    =&&vu|CKKPVP]/^/^/^___    
similarityc                 r    t          |           }t          |                                           }||z   dz  S )Ng       @)r%   t)r'   caption_loss
image_losss      r$   clipseg_lossr,   6   s4    #J//L!*,,..11J:%,,r&   c                       e Zd ZU dZdZeej                 ed<   dZ	ej        ed<   dZ
ej        ed<   dZej        ed<   dZej        ed<   dZeed<   dZeed	<   d
ee         fdZdS )CLIPSegOutputa  
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegVisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                 ^     t           fd                                 D                       S )Nc              3   t   K   | ]2}|d vr|         n!t          |                                          V  3dS ))r4   r5   Ngetattrto_tuple.0kselfs     r$   	<genexpr>z)CLIPSegOutput.to_tuple.<locals>.<genexpr>\   sc       
 
  LLLDGGRYZ^`aRbRbRkRkRmRm
 
 
 
 
 
r&   tuplekeysr>   s   `r$   r:   zCLIPSegOutput.to_tuple[   C     
 
 
 
YY[[
 
 
 
 
 	
r&   )__name__
__module____qualname____doc__r/   r   r!   FloatTensor__annotations__r0   r1   r2   r3   r4   r   r5   r   r   r:    r&   r$   r.   r.   <   s          ( )-D(5$
%,,,*.e'...)-OU&---%)K")))&*L%#***4818886:3:::
%* 
 
 
 
 
 
r&   r.   c                       e Zd ZU dZdZej        ed<   dZe	e
ej                          ed<   dZe	e
ej                          ed<   dS )CLIPSegDecoderOutputa  
    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
            Classification scores for each pixel.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    Nr   hidden_states
attentions)rE   rF   rG   rH   r   r!   rI   rJ   rN   r   r   rO   rK   r&   r$   rM   rM   b   sk           !%FE$$$8<M8E%"345<<<59Ju01299999r&   rM   c                       e Zd ZU dZdZeej                 ed<   dZ	ej        ed<   dZ
ej        ed<   dZej        ed<   dZeed<   dZeed<   d	ee         fd
ZdS )CLIPSegImageSegmentationOutputa,  
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        ...
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegVisionModel`].
    Nr/   r   conditional_embeddingspooled_outputr5   decoder_outputr   c                 ^     t           fd                                 D                       S )Nc              3   t   K   | ]2}|d vr|         n!t          |                                          V  3dS ))r5   rT   Nr8   r;   s     r$   r?   z:CLIPSegImageSegmentationOutput.to_tuple.<locals>.<genexpr>   sb       
 
  IIIDGGwW[]^O_O_OhOhOjOj
 
 
 
 
 
r&   r@   rC   s   `r$   r:   z'CLIPSegImageSegmentationOutput.to_tuple   rD   r&   )rE   rF   rG   rH   r/   r   r!   rI   rJ   r   rR   rS   r5   r   rT   rM   r   r   r:   rK   r&   r$   rQ   rQ   v   s           )-D(5$
%,,, $FE$$$04E-444'+M5$+++6:3:::+/N(///
%* 
 
 
 
 
 
r&   rQ   c                   v     e Zd Zdef fdZdej        dededej        fdZdd	ej	        dej        fd
Z
 xZS )CLIPSegVisionEmbeddingsconfigc                 z   t                                                       || _        |j        | _        |j        | _        |j        | _        t          j        t          j
        | j                            | _        t          j        |j        | j        | j        | j        d          | _        | j        | j        z  dz  | _        | j        dz   | _        t          j        | j        | j                  | _        |                     dt          j        | j                                      d          d           d S )NF)in_channelsout_channelskernel_sizestridebias   r   position_idsr   
persistent)super__init__rY   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr!   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr"   expandr>   rY   	__class__s     r$   rg   z CLIPSegVisionEmbeddings.__init__   s   + + +!|EK,G,GHH!y+? 
  
  
 !Ot>1D!-1"$,t/A4>"R"R^U\$:L-M-M-T-TU\-]-]jopppppr&   
embeddingsheightwidthr   c                    |j         d         dz
  }| j        j                            d          }|j         d         dz
  }t          j                                        s&||k    r ||k    r|                     | j                  S |ddddf         }|ddddf         }|j         d         }	|| j        z  }
|| j        z  }t          |dz            }|
                    d|||	          }|                    dddd          }t          j                            ||
|fdd	
          }|                    dddd                              dd|	          }t	          j        ||fd          S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nrc   g      ?r	   r`   bicubicF)sizemodealign_cornersdim)shaperu   weight	unsqueezer!   jit
is_tracingra   rk   r   reshapepermuter   r   interpolateviewcat)r>   rz   r{   r|   rr   ru   rs   class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r$   interpolate_pos_encodingz0CLIPSegVisionEmbeddings.interpolate_pos_encoding   s    !&q)A-!4;EEaHH*03a7 y##%% 	>+*F*F6UZ??**4+<===,QQQU3,QQQU3r"t.
T_,	&}c'9::)11!5GI[]`aa)11!Q1==-33i(	 4 
 
 *11!Q1==BB1b#NNy/?;CCCCr&   Fpixel_valuesc                    |j         \  }}}}|s<|| j        k    s|| j        k    r&t          d| d| d| j         d| j         d	          |                     |          }|                    d                              dd          }| j                            |dd          }t          j	        ||gd          }	|r|	| 
                    |	||          z   }	n|	|                     | j                  z   }	|	S )	NzInput image size (*z) doesn't match model ().r`   r   rc   r   )r   rj   
ValueErrorrq   flatten	transposern   rw   r!   r   r   ru   ra   )
r>   r   r   
batch_size_r{   r|   patch_embedsclass_embedsrz   s
             r$   forwardzCLIPSegVisionEmbeddings.forward   s*   '3'9$
Avu' 	Vt-F-F%SWSbJbJbuVuueuuPTP_uubfbquuu   ++L99#++A..88A>>+22:q"EEYl;CCC
# 	Q#d&C&CJPVX]&^&^^JJ#d&=&=d>O&P&PPJr&   F)rE   rF   rG   r   rg   r!   Tensorintr   rI   r   __classcell__ry   s   @r$   rX   rX      s        q2 q q q q q q,'D5< 'D 'DUX 'D]b]i 'D 'D 'D 'DR E$5 Z_Zf        r&   rX   c            	            e Zd Zdef fdZ	 	 	 d	deej                 deej                 deej                 dej	        fdZ
 xZS )
CLIPSegTextEmbeddingsrY   c                 V   t                                                       |j        }t          j        |j        |          | _        t          j        |j        |          | _        | 	                    dt          j        |j                                      d          d           d S )Nra   rb   Frd   )rf   rg   rh   r   rt   
vocab_sizetoken_embeddingmax_position_embeddingsru   rv   r!   r"   rw   r>   rY   ri   ry   s      r$   rg   zCLIPSegTextEmbeddings.__init__   s    &	!|F,=yII"$,v/My"Y"Y 	EL)GHHOOPWXXej 	 	
 	
 	
 	
 	
r&   N	input_idsra   inputs_embedsr   c                     ||j         d         n|j         d         }|| j        d d d |f         }||                     |          }|                     |          }||z   }|S )Nrc   )r   ra   r   ru   )r>   r   ra   r   
seq_lengthposition_embeddingsrz   s          r$   r   zCLIPSegTextEmbeddings.forward   s     -6,AY_R((}GZ[]G^
,QQQ^<L  00;;M"55lCC"%88
r&   )NNN)rE   rF   rG   r   rg   r   r!   
LongTensorrI   r   r   r   r   s   @r$   r   r      s        

0 

 

 

 

 

 

 153759	 E,- u/0   12	
 
       r&   r   c                        e Zd ZdZ fdZdej        dedefdZ	 	 	 dd	ej        d
e	ej                 de	ej                 de	e
         deej        e	ej                 f         f
dZ xZS )CLIPSegAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                 t   t                                                       || _        |j        | _        |j        | _        | j        | j        z  | _        | j        | j        z  | j        k    r t          d| j         d| j         d          | j        dz  | _	        |j
        | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        t          j        | j        | j                  | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r         )rf   rg   rY   rh   ri   num_attention_heads	num_headshead_dimr   scaleattention_dropoutdropoutr   Lineark_projv_projq_projout_projrx   s     r$   rg   zCLIPSegAttention.__init__  s   +3$.8=4>)T^;;'dn ' 'N' ' '   ]D(
/i??i??i??	$.$.AAr&   tensorseq_lenbszc                     |                     ||| j        | j                                      dd                                          S )Nr   r`   )r   r   r   r   
contiguous)r>   r   r   r   s       r$   _shapezCLIPSegAttention._shape  s<    {{3GGQQRSUVWWbbdddr&   NFrN   attention_maskcausal_attention_maskoutput_attentionsr   c                    |                                 \  }}}|                     |          | j        z  }|                     |                     |          d|          }	|                     |                     |          d|          }
|| j        z  d| j        f} |                     |||          j        | } |	j        | }	 |
j        | }
|	                     d          }t          j
        ||	                    dd                    }|                                 || j        z  ||fk    r2t          d|| j        z  ||f d|                                            ||                                 |d||fk    r+t          d|d||f d|                                            |                    || j        ||          |z   }|                    || j        z  ||          }||                                 |d||fk    r+t          d|d||f d|                                            |                    || j        ||          |z   }|                    || j        z  ||          }t          j                            |d          }|r=|                    || j        ||          }|                    || j        z  ||          }nd}t          j                            || j        | j        	          }t          j
        ||
          }|                                 || j        z  || j        fk    r5t          d
|| j        || j        f d|                                            |                    || j        || j                  }|                    dd          }|                    |||          }|                     |          }||fS )z#Input shape: Batch x Time x Channelrc   r   r`   z$Attention weights should be of size z	, but is Nz!Attention mask should be of size r   )ptrainingz `attn_output` should be of size )r   r   r   r   r   r   r   r   r   r!   bmmr   r   r   r   softmaxr   r   r   r   )r>   rN   r   r   r   r   tgt_lenri   query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                    r$   r   zCLIPSegAttention.forward  s0    #0"4"4"6"6Wi {{=11DJ>[[]!;!;REE
{{4;;}#=#=r3GGDN*B>
Ct{{<#>>CZP$Z_j1
(|(*5//!$$yz/C/CAq/I/IJJ3#7'"JJJ*dn8LgW^7_ * * %%''* *   !,$))++Q/III 7a'8R 7 7-22447 7   (,,S$.'7SSVkkL',,S4>-A7GTTL%""$$a'(BBB ta'8Rtt]k]p]p]r]rtt   (,,S$.'7SSVddL',,S4>-A7GTTL},,\r,BB 	)
 %1$5$5c4>7T[$\$\!055cDN6JGU\]]LL$(!]**<4<RVR_*``
i
L99#"6!OOO)CRVR_3` ) )$$&&) )  
 "&&sDNGT]SS!++Aq11!))#w	BBmmK00111r&   )NNF)rE   rF   rG   rH   rg   r!   r   r   r   r   boolr   r   r   r   s   @r$   r   r     s        GGB B B B B&eU\ eC ec e e e e 268<,1L2 L2|L2 !.L2  (5	L2
 $D>L2 
u|Xel33	4L2 L2 L2 L2 L2 L2 L2 L2r&   r   c                   B     e Zd Z fdZdej        dej        fdZ xZS )
CLIPSegMLPc                    t                                                       || _        t          |j                 | _        t          j        |j        |j	                  | _
        t          j        |j	        |j                  | _        d S N)rf   rg   rY   r
   
hidden_actactivation_fnr   r   rh   intermediate_sizefc1fc2rx   s     r$   rg   zCLIPSegMLP.__init__o  sf    #F$569V/1IJJ9V5v7IJJr&   rN   r   c                     |                      |          }|                     |          }|                     |          }|S r   )r   r   r   )r>   rN   s     r$   r   zCLIPSegMLP.forwardv  s=    //**=99//r&   )rE   rF   rG   rg   r!   r   r   r   r   s   @r$   r   r   n  sc        K K K K KU\ el        r&   r   c                        e Zd Zdef fdZ	 d
dej        dej        dej        dee         de	ej
                 f
d	Z xZS )CLIPSegEncoderLayerrY   c                 D   t                                                       |j        | _        t	          |          | _        t          j        | j        |j                  | _	        t          |          | _        t          j        | j        |j                  | _        d S N)epsrf   rg   rh   ri   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2rx   s     r$   rg   zCLIPSegEncoderLayer.__init__      +)&11<F<QRRRf%%<F<QRRRr&   FrN   r   r   r   r   c                     |}|                      |          }|                     ||||          \  }}||z   }|}|                     |          }|                     |          }||z   }|f}|r||fz  }|S aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rN   r   r   r   )r   r   r   r   r>   rN   r   r   r   residualr   outputss           r$   r   zCLIPSegEncoderLayer.forward  s    " !((77&*nn')"7/	 '5 '
 '
#| !=0 ((77// =0 " 	'&Gr&   r   )rE   rF   rG   r   rg   r!   r   r   r   r   rI   r   r   r   s   @r$   r   r   ~  s        S} S S S S S S -2& &|& &  %|	&
 $D>& 
u 	!& & & & & & & &r&   r   c                   $    e Zd ZdZeZdZdZd ZdS )CLIPSegPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    clipTc                 4   | j         j        }t          |t                    rT|j        j        j                            d|dz             |j        j        j                            d|dz             nt          |t                    r| j         j        }t          j                            |j        d|j        dz  |z             t          j                            |j        j        |j         j        |z             t          j                            |j        j        |j         j        |z             nCt          |t                     r| j         j        }|j        dz  d|j         j        z  dz  z  |z  }|j        dz  |z  }t          j                            |j        j        |           t          j                            |j        j        |           t          j                            |j        j        |           t          j                            |j        j        |           nGt          |t,                    r| j         j        }|j         j        dz  d|j         j        z  dz  z  |z  }d|j         j        z  dz  |z  }t          j                            |j        j        |           t          j                            |j        j        |           nt          |t4                    rt          j                            |j        j        |j        dz  | j         j        z             t          j                            |j        j        |j        dz  | j         j        z             t          |t          j                  r=|j         j        !                                 |j        j        "                    d           t          |t          j#                  r'|j         "|j         j        !                                 dS dS dS )	zInitialize the weightsg        g{Gz?)meanstdr   )r   r`   g      ?N)$rY   initializer_factor
isinstancer   r   r   datanormal_ru   rX   r   initrn   ri   rq   initializer_ranger   num_hidden_layersr   r   r   r   r   rh   r   r   CLIPSegModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimr   r_   zero_fill_r   )r>   modulefactorin_proj_stdout_proj_stdfc_stds         r$   _init_weightsz$CLIPSegPreTrainedModel._init_weights  s   /f344 	").66CVd]6SSS%,199sQU9VVVV 788 	[3FGOOF2&BRTXBX[aBaObbbGOOF29v}?^ag?gOhhhGOOF5<&-BadjBjOkkkk 011 	[3F!+T1q6=;Z7Z_c6cdgmmK",d2f<LGOOFM0kOBBBGOOFM0kOBBBGOOFM0kOBBBGOOFO2OEEEE
++ 	[3F!=4d:FMDc@chl?lmpvvK&-33<vEFGOOFJ-6O:::GOOFJ-;O????-- 	GOO&-)4/$+2PP     GOO(/+T1DK4RR    
 fbl++ 	*K""$$$M$$S)))fbi(( 	%V[-DK""$$$$$	% 	%-D-Dr&   N)	rE   rF   rG   rH   r   config_classbase_model_prefixsupports_gradient_checkpointingr  rK   r&   r$   r   r     s@         
 !L&*#'% '% '% '% '%r&   r   aI  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`CLIPSegConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
a  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
            Whether to interpolate the pre-trained position encodings.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
            Whether to interpolate the pre-trained position encodings.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
c                        e Zd ZdZdef fdZ	 	 	 	 	 ddeej                 deej                 dee	         dee	         d	ee	         d
e
eef         fdZ xZS )CLIPSegEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPSegEncoderLayer`].

    Args:
        config: CLIPSegConfig
    rY   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S rK   )r   r<   r   rY   s     r$   
<listcomp>z+CLIPSegEncoder.__init__.<locals>.<listcomp>U  s"    $j$j$jQ%8%@%@$j$j$jr&   F)	rf   rg   rY   r   
ModuleListranger  layersgradient_checkpointingrx   s    `r$   rg   zCLIPSegEncoder.__init__R  sa    m$j$j$j$j%PVPhJiJi$j$j$jkk&+###r&   Nr   r   r   output_hidden_statesreturn_dictr   c                    ||n| j         j        }||n| j         j        }||n| j         j        }|rdnd}|rdnd}|}	t	          | j                  D ]_\  }
}|r||	fz   }| j        r&| j        r|                     |j	        |	|||          }n ||	|||          }|d         }	|r||d         fz   }`|r||	fz   }|st          d |	||fD                       S t          |	||          S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NrK   )r   r   r   c              3      K   | ]}||V  	d S r   rK   r<   vs     r$   r?   z)CLIPSegEncoder.forward.<locals>.<genexpr>  s(      eeqWXWdWdWdWdWdeer&   )last_hidden_staterN   rO   )rY   r   r  use_return_dict	enumerater  r  r   _gradient_checkpointing_func__call__rA   r   )r>   r   r   r   r   r  r  encoder_statesall_attentionsrN   idxencoder_layerlayer_outputss                r$   r   zCLIPSegEncoder.forwardX  s   L 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]3=0:d%"+DK"8"8 	F 	FC# C!/=2B!B* t}  $ A A!*!")%! ! !.!")&7	! ! ! *!,M  F!/=3C2E!E 	?+}.>>N 	fee]NN$Seeeeee+>Vd
 
 
 	
r&   NNNNN)rE   rF   rG   rH   r   rg   r   r!   r   r   r   r   r   r   r   r   s   @r$   r  r  I  s         ,} , , , , , , 268<,0/3&*O
 O
 !.O
  (5	O

 $D>O
 'tnO
 d^O
 
uo%	&O
 O
 O
 O
 O
 O
 O
 O
r&   r  c                       e Zd Zdef fdZ ee           eee          	 	 	 	 	 	 dde	e
j                 de	e
j                 de	e
j                 de	e         d	e	e         d
e	e         deeef         fd                        Z xZS )CLIPSegTextTransformerrY   c                    t                                                       || _        |j        }t	          |          | _        t          |          | _        t          j	        ||j
                  | _        |j        | _        d S r   )rf   rg   rY   rh   r   rz   r  encoderr   r   r   final_layer_normeos_token_idr   s      r$   rg   zCLIPSegTextTransformer.__init__  ss    &	/77%f-- "YF<Q R R R #/r&   output_typer  Nr   r   ra   r   r  r  r   c                 *   ||n| j         j        }||n| j         j        }||n| j         j        }|t	          d          |                                }|                    d|d                   }|                     ||          }t          ||j	        |j
                  }	|t          ||j	                  }|                     |||	|||          }
|
d         }|                     |          }| j        dk    rg|t          j        |j        d         |j
                  |                    t          j        |j
        	                              d
          f         }n|t          j        |j        d         |j
                  |                    t          j        |j
        	          | j        k                                                        d
          f         }|s||f|
dd         z   S t+          |||
j        |
j                  S )
        Returns:

        NzYou have to specify input_idsrc   )r   ra   r   )r   r   r   r   r  r  r   r`   )dtyper   r   r   r$  pooler_outputrN   rO   )rY   r   r  r%  r   r   r   rz   r   r9  r   r   r2  r3  r4  r!   r"   r   tor   argmaxr   rN   rO   )r>   r   r   ra   r   r  r  input_shaperN   r   encoder_outputsr$  rS   s                r$   r   zCLIPSegTextTransformer.forward  sH     2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]<===nn&&NN2{277	),WW !A,]5I!
 !
 !
 %7H[\\N,,')"7/!5# ' 
 
 ,A. 112CDD!! ..4Q7@Q@XYYY595F5MNNUUZ\U]]_MM ..4Q7@Q@XYYY EI6G6NOOSWSddB!M  	L%}58KKK)/')7&1	
 
 
 	
r&   NNNNNN)rE   rF   rG   r   rg   r   CLIPSEG_TEXT_INPUTS_DOCSTRINGr   r   r   r!   r   r   r   r   r   r   r   s   @r$   r0  r0    s*       	00 	0 	0 	0 	0 	0 	0 +*+HII+ETefff -115/3,0/3&*O
 O
EL)O
 !.O
 u|,	O

 $D>O
 'tnO
 d^O
 
u00	1O
 O
 O
 gf JIO
 O
 O
 O
 O
r&   r0  c                   <    e Zd ZeZddgZdef fdZdej        fdZ	d Z
 ee           eee          	 	 	 	 	 	 dd
eej                 deej                 deej                 dee         dee         dee         deeef         fd                        Z xZS )CLIPSegTextModelr   r   rY   c                     t                                          |           t          |          | _        |                                  d S r   )rf   rg   r0  
text_model	post_initrx   s     r$   rg   zCLIPSegTextModel.__init__  s@       088r&   r   c                 $    | j         j        j        S r   rE  rz   r   rC   s    r$   get_input_embeddingsz%CLIPSegTextModel.get_input_embeddings  s    )99r&   c                 (    || j         j        _        d S r   rH  )r>   values     r$   set_input_embeddingsz%CLIPSegTextModel.set_input_embeddings  s    5:"222r&   r5  Nr   r   ra   r   r  r  c                 8    |                      ||||||          S )aM  
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegTextModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   ra   r   r  r  )rE  )r>   r   r   ra   r   r  r  s          r$   r   zCLIPSegTextModel.forward  s1    8 )%/!5#  
 
 	
r&   r@  )rE   rF   rG   r   r  _no_split_modulesrg   r   ModulerI  rL  r   rA  r   r   r   r!   r   r   r   r   r   r   r   s   @r$   rC  rC    sW       $L02GH0      :bi : : : :; ; ; +*+HII+ETefff -115/3,0/3&*!
 !
EL)!
 !.!
 u|,	!

 $D>!
 'tn!
 d^!
 
u00	1!
 !
 !
 gf JI!
 !
 !
 !
 !
r&   rC  c                        e Zd Zdef fdZ ee           eee          	 	 	 	 	 dde	e
j                 de	e         de	e         d	e	e         d
e	e         deeef         fd                        Z xZS )CLIPSegVisionTransformerrY   c                 4   t                                                       || _        |j        }t	          |          | _        t          j        ||j                  | _	        t          |          | _        t          j        ||j                  | _        d S r   )rf   rg   rY   rh   rX   rz   r   r   r   pre_layrnormr  r2  post_layernormr   s      r$   rg   z!CLIPSegVisionTransformer.__init__D  s    &	1&99L8MNNN%f-- l9&:OPPPr&   r5  NFr   r   r  r  r   r   c                    ||n| j         j        }||n| j         j        }||n| j         j        }|t	          d          |                     ||          }|                     |          }|                     ||||          }|d         }|dddddf         }	|                     |	          }	|s||	f|dd         z   S t          ||	|j
        |j                  S )r8  Nz You have to specify pixel_values)r   )r   r   r  r  r   r   r:  )rY   r   r  r%  r   rz   rT  r2  rU  r   rN   rO   )
r>   r   r   r  r  r   rN   r?  r$  rS   s
             r$   r   z CLIPSegVisionTransformer.forwardN  s/    2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]?@@@Oghh))-88,,'/!5#	 ' 
 
 ,A.)!!!Q'2++M:: 	L%}58KKK)/')7&1	
 
 
 	
r&   )NNNNF)rE   rF   rG   r   rg   r   CLIPSEG_VISION_INPUTS_DOCSTRINGr   r   r   r!   rI   r   r   r   r   r   r   s   @r$   rR  rR  B  s       Q2 Q Q Q Q Q Q +*+JKK+ETghhh 59,0/3&*38+
 +
u01+
 $D>+
 'tn	+

 d^+
 #+4.+
 
u00	1+
 +
 +
 ih LK+
 +
 +
 +
 +
r&   rR  c                       e Zd ZeZdZdef fdZdej        fdZ	 e
e           eee          	 	 	 	 	 ddeej                 d	ee         d
ee         dee         dee         deeef         fd                        Z xZS )CLIPSegVisionModelr   rY   c                     t                                          |           t          |          | _        |                                  d S r   )rf   rg   rR  vision_modelrF  rx   s     r$   rg   zCLIPSegVisionModel.__init__  sA       4V<<r&   r   c                 $    | j         j        j        S r   )r[  rz   rq   rC   s    r$   rI  z'CLIPSegVisionModel.get_input_embeddings  s     +;;r&   r5  NFr   r  r   r  c                 6    |                      |||||          S )a  
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegVisionModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   r  r   r  )r[  )r>   r   r   r  r   r  s         r$   r   zCLIPSegVisionModel.forward  s1    @   %/!5%=# ! 
 
 	
r&   NNNFN)rE   rF   rG   r   r  main_input_namerg   r   rP  rI  r   rW  r   r   r   r!   rI   r   r   r   r   r   r   s   @r$   rY  rY    s+       &L$O2      <bi < < < < +*+JKK+ETghhh 59,0/338&*$
 $
u01$
 $D>$
 'tn	$

 #+4.$
 d^$
 
u00	1$
 $
 $
 ih LK$
 $
 $
 $
 $
r&   rY  c                       e Zd ZeZdef fdZ ee          	 	 	 	 	 	 ddee	j
                 dee	j
                 dee	j
                 dee         dee         d	ee         d
e	j        fd            Z ee          	 	 	 	 	 ddee	j                 dee         dee         ded	ee         d
e	j        fd            Z ee           eee          	 	 	 	 	 	 	 	 	 ddee	j                 dee	j                 dee	j
                 dee	j                 dee         dee         dee         ded	ee         d
eeef         fd                        Z xZS )r  rY   c                 $   t                                          |           t          |j        t                    s%t          dt          |j                   d          t          |j        t                    s%t          dt          |j                   d          |j        }|j        }|j	        | _	        |j
        | _        |j
        | _        t          |          | _        t          |          | _        t#          j        | j        | j	        d          | _        t#          j        | j        | j	        d          | _        t#          j        t-          j        | j        j                            | _        |                                  d S )NzNconfig.text_config is expected to be of type CLIPSegTextConfig but is of type .zRconfig.vision_config is expected to be of type CLIPSegVisionConfig but is of type F)r_   )rf   rg   r   text_configr   	TypeErrortypevision_configr   projection_dimrh   r  r  r0  rE  rR  r[  r   r   r  r  rl   r!   r   rY   logit_scale_init_valuelogit_scalerF  )r>   rY   rd  rg  ry   s       r$   rg   zCLIPSegModel.__init__  s|      &,.?@@ 	0+,,0 0 0  
 &.0CDD 	2-..2 2 2  
 (,$3)5 - 90==4]CC!#4+@$BU\a!b!b!b!y)<d>QX]^^^<T[5W(X(XYY 	r&   Nr   r   ra   r   r  r  r   c                     ||n| j         j        }||n| j         j        }||n| j         j        }|                     ||||||          }|d         }|                     |          }	|	S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```NrN  r   )rY   r   r  r%  rE  r  )
r>   r   r   ra   r   r  r  text_outputsrS   text_featuress
             r$   get_text_featureszCLIPSegModel.get_text_features  s    6 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B])%/!5# ' 
 
 %Q,,];;r&   Fr   r   c                     ||n| j         j        }||n| j         j        }||n| j         j        }|                     |||||          }|d         }|                     |          }|S )aI  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```Nr^  r   )rY   r   r  r%  r[  r  )	r>   r   r   r  r   r  vision_outputsrS   image_featuress	            r$   get_image_featureszCLIPSegModel.get_image_features  s    @ 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]**%/!5%=# + 
 
 'q)//>>r&   r5  return_lossc
           	         ||n| j         j        }||n| j         j        }|	|	n| j         j        }	|                     |||||	          }
|                     ||||||	          }|
d         }|                     |          }|d         }|                     |          }||                    ddd          z  }||                    ddd          z  }| j	        
                                }t          j        ||                                          |z  }|                                }d}|rt          |          }|	s||||||
f}||f|z   n|S t          |||||||
	          S )
a  
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```Nr^  rN  r   r`   rc   T)r   r   keepdim)r/   r0   r1   r2   r3   r4   r5   )rY   r   r  r%  r[  rE  r  r  normrj  expr!   matmulr)   r,   r.   )r>   r   r   r   ra   rs  r   r  r   r  rp  rl  r3   r2   rj  r1   r0   r/   outputs                      r$   r   zCLIPSegModel.forward;  s   N 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]**%/!5%=# + 
 
 )%/!5# ' 
 
 &a(--l;;"1o**;77 $l&7&7!T&7&R&RR!K$4$4qb$$4$O$OO &**,,,{LNN4D4DEES*,,.. 	100D 	F&lT`bpqF)-)9TGf$$vE-+#%* .
 
 
 	
r&   r@  r_  )	NNNNNNNFN)rE   rF   rG   r   r  rg   r   rA  r   r!   r   r   rI   rn  rW  rr  CLIPSEG_INPUTS_DOCSTRINGr   r.   r   r   r   r   r   r   s   @r$   r  r    s        L}      @ +*+HII -115/3,0/3&*, ,EL), !., u|,	,
 $D>, 'tn, d^, 
	, , , JI,\ +*+JKK 59,0/3).&*0 0u010 $D>0 'tn	0
 #'0 d^0 
	0 0 0 LK0d +*+CDD=}UUU 15481537&*,0/3).&*[
 [
E,-[
 u01[
 !.	[

 u/0[
 d^[
 $D>[
 'tn[
 #'[
 d^[
 
um#	$[
 [
 [
 VU ED[
 [
 [
 [
 [
r&   r  c                        e Zd ZdZdef fdZ	 ddej        dej        dej        dee	         d	e
ej                 f
d
Z xZS )CLIPSegDecoderLayerz
    CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after
    self-attention/MLP, rather than before.
    rY   c                 D   t                                                       |j        | _        t	          |          | _        t          j        | j        |j                  | _	        t          |          | _        t          j        | j        |j                  | _        d S r   r   rx   s     r$   rg   zCLIPSegDecoderLayer.__init__  r   r&   FrN   r   r   r   r   c                     |}|                      ||||          \  }}||z   }|                     |          }|}|                     |          }||z   }|                     |          }|f}|r||fz  }|S r   )r   r   r   r   r   s           r$   r   zCLIPSegDecoderLayer.forward  s    " !&*nn')"7/	 '5 '
 '
#| !=0((77 // =0((77 " 	'&Gr&   r   )rE   rF   rG   rH   r   rg   r!   r   r   r   r   rI   r   r   r   s   @r$   r|  r|    s         S} S S S S S S -2' '|' '  %|	'
 $D>' 
u 	!' ' ' ' ' ' ' 'r&   r|  c                        e Zd Zdef fdZ	 	 	 ddeej                 dej        dee	         dee	         d	ee	         f
d
Z
 xZS )CLIPSegDecoderrY   c                    t                                                     j        | _        t          j        j        j                  | _        t          j        j        j                  | _        j	        r׉j
        j        dz  j
        j        dz  f}t          j        t          j        j        j        dd          t          j                    t          j        j        j        dz  |d         |d                   t          j                    t          j        j        dz  d|d         |d                             | _        n6t          j        j        dj
        j        j
        j                  | _        t#          j                  }t          j        fd	t)          |          D                       | _        t-          j        j
                  j        _        j        _        j        _        d
_        t          j        fdt)          t#          j                            D                       | _        d S )N   r	   r   )r]   paddingr`   r   )r]   r^   )r^   c                 X    g | ]&}t          j        j        j        j                  'S rK   )r   r   rg  rh   
reduce_dimr  s     r$   r  z+CLIPSegDecoder.__init__.<locals>.<listcomp>  s/    bbbPQRYv+79JKKbbbr&   reluc                 .    g | ]}t                    S rK   )r|  )r<   r   decoder_configs     r$   r  z+CLIPSegDecoder.__init__.<locals>.<listcomp>  s"    $t$t$tQ%8%H%H$t$t$tr&   )rf   rg   conditional_layerr   r   rh  r  film_mulfilm_add"use_complex_transposed_convolutionrg  rk   
Sequentialro   ReLUConvTranspose2dtransposed_convolutionr#   extract_layersr  r  reducescopydeepcopyrh   decoder_num_attention_headsr   decoder_intermediate_sizer   r   r  )r>   rY   transposed_kernelsdepthr  ry   s    `  @r$   rg   zCLIPSegDecoder.__init__  s/      !'!9	&"79JKK	&"79JKK4 	"("6"AQ"FH\HgklHl!m*,-	&+V->AWXYYY		"%%* 21 5-a0	   		"%*A;Ma;PYklmYn  + +D'' +-*<!1f&:&EfNbNm+ + +D' F)**}bbbbUZ[`UaUabbb
 
 v';<<%+%6"-3-O*+1+K($*!m$t$t$t$tRWX[\b\qXrXrRsRs$t$t$tuur&   NTrN   rR   r   r  r  c                    |rdnd }|rdnd }|d d d         }d }	t          t          || j        | j                            D ]\  }
\  }}}|	 ||          |	z   }	n ||          }	|
| j        k    rZ|                     |          |	                    ddd          z  |                     |          z   }	|	                    ddd          }	 ||	d d |          }|d         }	|r||	fz  }|r||d         fz  }|	d d dd d d f                             ddd          }	t          t          j
        |	j        d                             }|j        d         }|	                    ||	j        d         ||          }	|                     |	                              d          }|st          d |||fD                       S t!          |||          S )	NrK   rc   r   r   r`   )r   r   r   c              3      K   | ]}||V  	d S r   rK   r"  s     r$   r?   z)CLIPSegDecoder.forward.<locals>.<genexpr>/  s(      aaqSTS`S`S`S`S`aar&   )r   rN   rO   )r&  zipr  r  r  r  r   r  r   mathsqrtr   r   r  squeezerA   rM   )r>   rN   rR   r   r  r  all_hidden_statesr*  activationsry  i
activationlayerreducer-  r   r   r   s                     r$   r   zCLIPSegDecoder.forward  sB    #7@BBD0:d#DDbD).7KVZVb8c8c.d.d 	6 	6*A*
E6!
++f4
++D***'=>>PQSTVWAXAXX[_[h[h*\ \   1a00!Et4[l  M #1%F# /!fY.!  6=#3"55122qqq!))!Q2249V\!_--..+1!4
Za$EE,,V44<<Q?? 	baaV->$Oaaaaaa#+%
 
 
 	
r&   )NNT)rE   rF   rG   r   rg   r   r!   r   r   r   r   r   r   s   @r$   r  r    s        (v} (v (v (v (v (v (v\ -1/3&*6
 6
U\*6
 !&6
 $D>	6

 'tn6
 d^6
 6
 6
 6
 6
 6
 6
 6
r&   r  zn
    CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation.
    c                       e Zd ZeZdef fdZ	 	 	 	 	 ddedeej	                 deej	                 deej	                 deej	                 f
d	Z
 ee           eee
          	 	 	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 deej                 deej	                 deej                 deej                 dee         dee         dedee         deeef         fd                        Z xZS )CLIPSegForImageSegmentationrY   c                     t                                          |           || _        t          |          | _        |j        | _        t          |          | _        |                                  d S r   )	rf   rg   rY   r  r   r  r  decoderrF  rx   s     r$   rg   z$CLIPSegForImageSegmentation.__init__A  sc        ((	$3%f-- 	r&   Nr   r   r   ra   conditional_pixel_valuesc                    |kt          |          |k    rt          d          t          j                    5  | j                            |||          }d d d            n# 1 swxY w Y   ny|ht          |          |k    rt          d          t          j                    5  | j                            |          }d d d            n# 1 swxY w Y   nt          d          |S )Nz@Make sure to pass as many prompt texts as there are query images)r   ra   zAMake sure to pass as many prompt images as there are query imagesz[Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`)r#   r   r!   no_gradr   rn  rr  )r>   r   r   r   ra   r  rR   s          r$   get_conditional_embeddingsz6CLIPSegForImageSegmentation.get_conditional_embeddingsN  s     9~~++ !cddd  )-)D)Dn< *E * *&               &1+,,
:: !deee ` `)-)E)EF^)_)_&` ` ` ` ` ` ` ` ` ` ` ` ` ` ` m   &%s#   A""A&)A&%CCCr5  Fr   rR   labelsr   r  r   r  r   c                    ||n| j         j        }t          j                    5  | j                            ||d|
|          }| j                            |d                   }|r|j        n|d         fd| j        D             }|r,t          |j
        |j        |	r|j        nd|j                  }n|	s|dd         |dd         z   n|}ddd           n# 1 swxY w Y   |&|                     |j        d	         ||||
          }nU|j        d	         |j        d	         k    rt          d          |j        d         | j         j        k    rt          d          |                     ||||	|          }|r|j        n|d	         }d}|9|                    |j                  }t+          j                    } |||          }|s|||||f}||f|z   n|S t/          ||||||          S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoProcessor, CLIPSegForImageSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> texts = ["a cat", "a remote", "a blanket"]
        >>> inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> print(logits.shape)
        torch.Size([3, 352, 352])
        ```NTr^  r   r`   c                 &    g | ]}|d z            S )r   rK   )r<   r  rN   s     r$   r  z7CLIPSegForImageSegmentation.forward.<locals>.<listcomp>  s"    MMMA=Q/MMMr&   r:  r	   r   )r   r   r   ra   r  zWMake sure to pass as many conditional embeddings as there are query images in the batchzcMake sure that the feature dimension of the conditional embeddings matches `config.projection_dim`.)r   r  r  )r/   r   rR   rS   r5   rT   )rY   r%  r!   r  r   r[  r  rN   r  r   r$  r;  rO   r  r   r   rh  r  r   r<  r   r   BCEWithLogitsLossrQ   )r>   r   r   r  rR   r   ra   r  r   r  r   r  rp  rS   r  decoder_outputsr   r/   loss_fnry  rN   s                       @r$   r   z#CLIPSegForImageSegmentation.forwardk  s   Z &1%<kk$+B] ]__ 	 	!Y33)"3%))A' 4  N !I77q8IJJM<G^N88^\]M^MMMMM9LMMMK  
!;&4&F"0">BV"`.">">\`-8	" " " DXkN2A2&);;;]k /	 	 	 	 	 	 	 	 	 	 	 	 	 	 	8 ")%)%D%D'-a0#-))A &E & &"" &+A.,2DQ2GGG m   &+A.$+2LLL 0   ,,"/!5# ' 
 
 ,7N''OA<NYYv}--F*,,G766**D 	F4m^UdeF)-)9TGf$$vE-#9' .*
 
 
 	
s   B+CC #C r.  )NNNNNNNNNFN)rE   rF   rG   r   r  rg   r   r   r!   r   r  r   rz  r   rQ   r   rI   r   r   r   r   r.   r   r   r   s   @r$   r  r  8  s        !L}       ,015/3;?& && EL)& !.	&
 u|,& #+5<"8& & & &: +*+CDD+IXijjj 2648@D>B1537-1,0/3).&*y
 y
E-.y
 u01y
 #+5+<"=	y

 !)): ;y
 !.y
 u/0y
 )*y
 $D>y
 'tny
 #'y
 d^y
 
um#	$y
 y
 y
 kj EDy
 y
 y
 y
 y
r&   r  )@rH   r  r  dataclassesr   typingr   r   r   r   r!   torch.utils.checkpointr   r  r
   modeling_attn_mask_utilsr   r   modeling_outputsr   r   modeling_utilsr   utilsr   r   r   r   r   r   configuration_clipsegr   r   r   
get_loggerrE   logger_CHECKPOINT_FOR_DOCr   r%   r,   r.   rM   rQ   rP  rX   r   r   r   r   r   CLIPSEG_START_DOCSTRINGrA  rW  rz  r  r0  rC  rR  rY  r  r|  r  r  rK   r&   r$   <module>r     s       ! ! ! ! ! ! . . . . . . . . . . . .            ! ! ! ! ! ! d d d d d d d d K K K K K K K K - - - - - -                Y X X X X X X X X X 
	H	%	% 3 
`U\ `el ` ` ` `
-U\ -el - - - - !
 !
 !
 !
 !
K !
 !
 !
H : : : : :; : : :& 
 
 
 
 
[ 
 
 
0P P P P Pbi P P Ph    BI   De2 e2 e2 e2 e2ry e2 e2 e2R        / / / / /") / / /d1% 1% 1% 1% 1%_ 1% 1% 1%h	 ! @# "% R^
 ^
 ^
 ^
 ^
RY ^
 ^
 ^
B^
 ^
 ^
 ^
 ^
RY ^
 ^
 ^
B4
 4
 4
 4
 4
- 4
 4
 4
n:
 :
 :
 :
 :
ry :
 :
 :
z3
 3
 3
 3
 3
/ 3
 3
 3
l -..b
 b
 b
 b
 b
) b
 b
 /.b
J6 6 6 6 6") 6 6 6ra
 a
 a
 a
 a
+ a
 a
 a
H  	 h
 h
 h
 h
 h
"8 h
 h
 h
 h
 h
r&   