
"""PyTorch SAM model."""

import collections
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import Tensor, nn

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
from .configuration_sam import SamConfig, SamMaskDecoderConfig, SamPromptEncoderConfig, SamVisionConfig


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "SamConfig"
_CHECKPOINT_FOR_DOC = "facebook/sam-vit-huge"


@dataclass
class SamVisionEncoderOutput(ModelOutput):
    """
    Base class for sam vision model's outputs that also contains image embeddings obtained by applying the projection
    layer to the pooler_output.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   r        `/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/sam/modeling_sam.pyr   r   (   s          , 15L(5,-444+/u(///=AM8E%"3S"89:AAA:>Ju0#567>>>>>r&   r   c                       e Zd ZU dZdZej        ed<   dZej        ed<   dZ	e
eej        df                  ed<   dZe
eej        df                  ed<   dZe
eej        df                  ed<   dS )	SamImageSegmentationOutputa  
    Base class for Segment-Anything model's output

    Args:
        iou_scores (`torch.FloatTensor` of shape `(batch_size, num_masks)`):
            The iou scores of the predicted masks.
        pred_masks (`torch.FloatTensor` of shape `(batch_size, num_masks, height, width)`):
            The predicted low resolution masks. They need to be post-processed by the processor.
        vision_hidden_states  (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the vision model at the output of each layer plus the optional initial embedding outputs.
        vision_attentions  (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        mask_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    iou_scores: torch.FloatTensor = None
    pred_masks: torch.FloatTensor = None
    vision_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    vision_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    mask_decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


class SamPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values):
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        if height != self.image_size[0] or width != self.image_size[1]:
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
            )
        embeddings = self.projection(pixel_values).permute(0, 2, 3, 1)
        return embeddings


class SamMLPBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.lin1 = nn.Linear(config.hidden_size, config.mlp_dim)
        self.lin2 = nn.Linear(config.mlp_dim, config.hidden_size)
        self.act = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.lin1(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.lin2(hidden_states)
        return hidden_states


class SamLayerNorm(nn.Module):
    r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError(f"Unsupported data format: {self.data_format}")
        self.normalized_shape = (normalized_shape,)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.data_format == "channels_last":
            x = torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        elif self.data_format == "channels_first":
            input_dtype = x.dtype
            x = x.float()
            u = x.mean(1, keepdim=True)
            s = (x - u).pow(2).mean(1, keepdim=True)
            x = (x - u) / torch.sqrt(s + self.eps)
            x = x.to(dtype=input_dtype)
            x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x


class SamAttention(nn.Module):
    """
    SAM's attention layer that allows for downscaling the size of the embedding after projection to queries, keys, and
    values.
    """

    def __init__(self, config, downsample_rate=None):
        super().__init__()
        self.hidden_size = config.hidden_size

        downsample_rate = config.attention_downsample_rate if downsample_rate is None else downsample_rate

        self.internal_dim = config.hidden_size // downsample_rate
        self.num_attention_heads = config.num_attention_heads
        if self.internal_dim % config.num_attention_heads != 0:
            raise ValueError("num_attention_heads must divide hidden_size.")

        self.q_proj = nn.Linear(self.hidden_size, self.internal_dim)
        self.k_proj = nn.Linear(self.hidden_size, self.internal_dim)
        self.v_proj = nn.Linear(self.hidden_size, self.internal_dim)
        self.out_proj = nn.Linear(self.internal_dim, self.hidden_size)

    def _separate_heads(self, hidden_states: Tensor, num_attention_heads: int) -> Tensor:
        batch, point_batch_size, n_tokens, channel = hidden_states.shape
        c_per_head = channel // num_attention_heads
        hidden_states = hidden_states.reshape(batch * point_batch_size, n_tokens, num_attention_heads, c_per_head)
        return hidden_states.transpose(1, 2)

    def _recombine_heads(self, hidden_states: Tensor, point_batch_size: int) -> Tensor:
        batch, n_heads, n_tokens, c_per_head = hidden_states.shape
        hidden_states = hidden_states.transpose(1, 2)
        return hidden_states.reshape(batch // point_batch_size, point_batch_size, n_tokens, n_heads * c_per_head)

    def forward(self, query: Tensor, key: Tensor, value: Tensor, attention_similarity: Tensor = None) -> Tensor:
        # Input projections
        query = self.q_proj(query)
        key = self.k_proj(key)
        value = self.v_proj(value)

        point_batch_size = query.shape[1]
        # Separate into heads
        query = self._separate_heads(query, self.num_attention_heads)
        key = self._separate_heads(key, self.num_attention_heads)
        value = self._separate_heads(value, self.num_attention_heads)

        # Attention
        _, _, _, c_per_head = query.shape
        attn = query @ key.permute(0, 1, 3, 2)  # batch_size * point_batch_size x n_heads x n_tokens x n_tokens
        attn = attn / (c_per_head**0.5)
        attn = torch.softmax(attn, dim=-1)

        if attention_similarity is not None:
            attn = attn + attention_similarity
            attn = torch.softmax(attn, dim=-1)

        # Get output
        out = attn @ value
        out = self._recombine_heads(out, point_batch_size)
        out = self.out_proj(out)

        return out


class SamTwoWayAttentionBlock(nn.Module):
    def __init__(self, config, attention_downsample_rate: int = 2, skip_first_layer_pe: bool = False):
        """
        A transformer block with four layers:
            (1) self-attention of sparse inputs (2) cross attention of sparse inputs -> dense inputs (3) mlp block on
            sparse inputs (4) cross attention of dense inputs -> sparse inputs

        Arguments:
            config (`SamMaskDecoderConfig`):
                The configuration file used to instantiate the block
            attention_downsample_rate (*optional*, int, defaults to 2):
                The downsample ratio of the block used to reduce the inner dim of the attention.
            skip_first_layer_pe (*optional*, bool, defaults to `False`):
                Whether or not to skip the addition of the query_point_embedding on the first layer.
        """
        super().__init__()

        self.hidden_size = config.hidden_size
        self.layer_norm_eps = config.layer_norm_eps

        self.self_attn = SamAttention(config, downsample_rate=1)
        self.layer_norm1 = nn.LayerNorm(self.hidden_size, eps=self.layer_norm_eps)

        self.cross_attn_token_to_image = SamAttention(config, downsample_rate=attention_downsample_rate)
        self.layer_norm2 = nn.LayerNorm(self.hidden_size, eps=self.layer_norm_eps)

        self.mlp = SamMLPBlock(config)
        self.layer_norm3 = nn.LayerNorm(self.hidden_size, eps=self.layer_norm_eps)

        self.layer_norm4 = nn.LayerNorm(self.hidden_size, eps=self.layer_norm_eps)
        self.cross_attn_image_to_token = SamAttention(config, downsample_rate=attention_downsample_rate)

        self.skip_first_layer_pe = skip_first_layer_pe

    def forward(
        self,
        queries: Tensor,
        keys: Tensor,
        query_point_embedding: Tensor,
        key_point_embedding: Tensor,
        attention_similarity: Tensor,
        output_attentions: bool = False,
    ):
        # Self attention block
        if self.skip_first_layer_pe:
            queries = self.self_attn(query=queries, key=queries, value=queries)
        else:
            query = queries + query_point_embedding
            attn_out = self.self_attn(query=query, key=query, value=queries)
            queries = queries + attn_out
        queries = self.layer_norm1(queries)

        # Cross attention block, tokens attending to image embedding
        query = queries + query_point_embedding
        key = keys + key_point_embedding

        attn_out = self.cross_attn_token_to_image(
            query=query, key=key, value=keys, attention_similarity=attention_similarity
        )
        queries = queries + attn_out

        queries = self.layer_norm2(queries)

        # MLP block
        mlp_out = self.mlp(queries)
        queries = queries + mlp_out
        queries = self.layer_norm3(queries)

        # Cross attention block, image embedding attending to tokens
        query = queries + query_point_embedding
        key = keys + key_point_embedding

        attn_out = self.cross_attn_image_to_token(query=key, key=query, value=queries)
        keys = keys + attn_out

        keys = self.layer_norm4(keys)

        outputs = (queries, keys)

        if output_attentions:
            outputs = outputs + (attn_out,)
        else:
            outputs = outputs + (None,)

        return outputs


class SamTwoWayTransformer(nn.Module):
    def __init__(self, config: SamMaskDecoderConfig):
        super().__init__()
        self.config = config

        self.num_hidden_layers = config.num_hidden_layers
        self.layers = nn.ModuleList()

        for i in range(self.num_hidden_layers):
            self.layers.append(SamTwoWayAttentionBlock(config, skip_first_layer_pe=(i == 0)))

        self.final_attn_token_to_image = SamAttention(config)
        self.layer_norm_final_attn = nn.LayerNorm(config.hidden_size)

    def forward(
        self,
        point_embeddings: Tensor,
        image_embeddings: Tensor,
        image_positional_embeddings: Tensor,
        attention_similarity: Tensor,
        target_embedding=None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        all_attentions = ()

        if image_embeddings is None:
            raise ValueError("You have to specify an image_embedding")

        image_embeddings = image_embeddings.flatten(2).permute(0, 2, 1).unsqueeze(1)
        image_positional_embeddings = image_positional_embeddings.flatten(2).permute(0, 2, 1).unsqueeze(1)

        # Prepare queries
        queries = point_embeddings
        keys = image_embeddings

        # Apply transformer blocks and final layernorm
        for layer in self.layers:
            if target_embedding is not None:
                queries += target_embedding

            queries, keys, attention_outputs = layer(
                queries=queries,
                keys=keys,
                query_point_embedding=point_embeddings,
                key_point_embedding=image_positional_embeddings,
                attention_similarity=attention_similarity,
                output_attentions=output_attentions,
            )

            if output_attentions:
                all_attentions = all_attentions + (attention_outputs,)

        # Apply the final attention layer from the points to the image
        query = queries + point_embeddings
        key = keys + image_positional_embeddings

        attn_out = self.final_attn_token_to_image(query=query, key=key, value=keys)

        queries = queries + attn_out
        queries = self.layer_norm_final_attn(queries)
        return queries, keys, all_attentions


class SamFeedForward(nn.Module):
    def __init__(
        self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, sigmoid_output: bool = False
    ):
        super().__init__()
        self.num_layers = num_layers
        self.activation = nn.ReLU()
        self.proj_in = nn.Linear(input_dim, hidden_dim)
        self.proj_out = nn.Linear(hidden_dim, output_dim)
        self.layers = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for _ in range(num_layers - 2)])
        self.sigmoid_output = sigmoid_output

    def forward(self, hidden_states):
        hidden_states = self.proj_in(hidden_states)
        hidden_states = self.activation(hidden_states)
        for layer in self.layers:
            hidden_states = self.activation(layer(hidden_states))

        hidden_states = self.proj_out(hidden_states)
        if self.sigmoid_output:
            hidden_states = F.sigmoid(hidden_states)
        return hidden_states


class SamMaskDecoder(nn.Module):
    def __init__(self, config: SamMaskDecoderConfig):
        super().__init__()

        self.hidden_size = config.hidden_size

        self.num_multimask_outputs = config.num_multimask_outputs
        self.num_mask_tokens = config.num_multimask_outputs + 1

        self.iou_token = nn.Embedding(1, self.hidden_size)
        self.mask_tokens = nn.Embedding(self.num_mask_tokens, self.hidden_size)

        self.transformer = SamTwoWayTransformer(config)

        self.upscale_conv1 = nn.ConvTranspose2d(self.hidden_size, self.hidden_size // 4, kernel_size=2, stride=2)
        self.upscale_conv2 = nn.ConvTranspose2d(self.hidden_size // 4, self.hidden_size // 8, kernel_size=2, stride=2)
        self.upscale_layer_norm = SamLayerNorm(self.hidden_size // 4, data_format="channels_first")
        self.activation = nn.GELU()

        mlps_list = []
        for _ in range(self.num_mask_tokens):
            mlps_list += [SamFeedForward(self.hidden_size, self.hidden_size, self.hidden_size // 8, 3)]
        self.output_hypernetworks_mlps = nn.ModuleList(mlps_list)

        self.iou_prediction_head = SamFeedForward(
            self.hidden_size, config.iou_head_hidden_dim, self.num_mask_tokens, config.iou_head_depth
        )

    def forward(
        self,
        image_embeddings: torch.Tensor,
        image_positional_embeddings: torch.Tensor,
        sparse_prompt_embeddings: torch.Tensor,
        dense_prompt_embeddings: torch.Tensor,
        multimask_output: bool,
        output_attentions: Optional[bool] = None,
        attention_similarity: torch.Tensor = None,
        target_embedding: torch.Tensor = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Predict masks given image and prompt embeddings.

        Args:
            image_embeddings (`torch.Tensor`):
                the embeddings from the image encoder
            image_positional_embeddings (`torch.Tensor`):
                positional encoding with the shape of image_embeddings
            sparse_prompt_embeddings (`torch.Tensor`):
                The embeddings of the points and boxes
            dense_prompt_embeddings (`torch.Tensor`):
                the embeddings of the mask inputs
            multimask_output (bool):
                Whether to return multiple masks or a single mask.
            output_attentions (bool, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
        """
        batch_size, num_channels, height, width = image_embeddings.shape
        point_batch_size = sparse_prompt_embeddings.shape[1]

        # Concatenate output tokens (IoU token + mask tokens) with the sparse prompt embeddings
        output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
        output_tokens = output_tokens.repeat(batch_size, point_batch_size, 1, 1)

        if sparse_prompt_embeddings.sum().item() != 0:
            tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=2)
        else:
            tokens = output_tokens
        point_embeddings = tokens.to(self.iou_token.weight.dtype)

        # Expand per-image data in batch direction to be per-point
        image_embeddings = image_embeddings + dense_prompt_embeddings
        image_embeddings = image_embeddings.repeat_interleave(point_batch_size, 0)
        image_positional_embeddings = image_positional_embeddings.repeat_interleave(point_batch_size, 0)

        # Run the two-way transformer; image positional embeddings are consumed here
        point_embedding, image_embeddings, attentions = self.transformer(
            point_embeddings=point_embeddings,
            image_embeddings=image_embeddings,
            image_positional_embeddings=image_positional_embeddings,
            attention_similarity=attention_similarity,
            target_embedding=target_embedding,
            output_attentions=output_attentions,
        )
        iou_token_out = point_embedding[:, :, 0, :]
        mask_tokens_out = point_embedding[:, :, 1 : (1 + self.num_mask_tokens), :]

        # Upscale mask embeddings and predict masks using the mask tokens
        image_embeddings = image_embeddings.transpose(2, 3).reshape(
            batch_size * point_batch_size, num_channels, height, width
        )

        upscaled_embedding = self.upscale_conv1(image_embeddings)
        upscaled_embedding = self.activation(self.upscale_layer_norm(upscaled_embedding))
        upscaled_embedding = self.activation(self.upscale_conv2(upscaled_embedding))

        hyper_in_list = []
        for i in range(self.num_mask_tokens):
            current_mlp = self.output_hypernetworks_mlps[i]
            hyper_in_list += [current_mlp(mask_tokens_out[:, :, i, :])]
        hyper_in = torch.stack(hyper_in_list, dim=2)

        _, num_channels, height, width = upscaled_embedding.shape
        upscaled_embedding = upscaled_embedding.reshape(batch_size, point_batch_size, num_channels, height * width)
        masks = (hyper_in @ upscaled_embedding).reshape(batch_size, point_batch_size, -1, height, width)

        # Generate mask quality predictions
        iou_pred = self.iou_prediction_head(iou_token_out)

        # Select the correct mask or masks for output
        if multimask_output:
            mask_slice = slice(1, None)
        else:
            mask_slice = slice(0, 1)
        masks = masks[:, :, mask_slice, :, :]
        iou_pred = iou_pred[:, :, mask_slice]

        outputs = (masks, iou_pred)

        if output_attentions:
            outputs = outputs + (attentions,)
        else:
            outputs = outputs + (None,)

        return outputs


class SamPositionalEmbedding(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.scale = config.hidden_size // 2
        self.register_buffer("positional_embedding", self.scale * torch.randn((2, config.num_pos_feats)))

    def forward(self, input_coords, input_shape=None):
        """Positionally encode points that are normalized to [0,1]."""
        coordinates = input_coords.clone()

        if input_shape is not None:
            coordinates[:, :, :, 0] = coordinates[:, :, :, 0] / input_shape[1]
            coordinates[:, :, :, 1] = coordinates[:, :, :, 1] / input_shape[0]

        # Assuming coords are in [0, 1]^2 and have shape d_1 x ... x d_n x 2
        coordinates = 2 * coordinates - 1
        coordinates = coordinates.to(self.positional_embedding.dtype)
        coordinates = coordinates @ self.positional_embedding
        coordinates = 2 * np.pi * coordinates
        # Outputs d_1 x ... x d_n x channel
        return torch.cat([torch.sin(coordinates), torch.cos(coordinates)], dim=-1)


class SamMaskEmbedding(nn.Module):
    def __init__(self, config: SamPromptEncoderConfig):
        super().__init__()
        self.mask_input_channels = config.mask_input_channels // 4
        self.activation = ACT2FN[config.hidden_act]
        self.conv1 = nn.Conv2d(1, self.mask_input_channels, kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(self.mask_input_channels, config.mask_input_channels, kernel_size=2, stride=2)
        self.conv3 = nn.Conv2d(config.mask_input_channels, config.hidden_size, kernel_size=1)
        self.layer_norm1 = SamLayerNorm(
            self.mask_input_channels, eps=config.layer_norm_eps, data_format="channels_first"
        )
        self.layer_norm2 = SamLayerNorm(
            self.mask_input_channels * 4, eps=config.layer_norm_eps, data_format="channels_first"
        )

    def forward(self, masks):
        hidden_states = self.conv1(masks)
        hidden_states = self.layer_norm1(hidden_states)
        hidden_states = self.activation(hidden_states)

        hidden_states = self.conv2(hidden_states)
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.activation(hidden_states)
        dense_embeddings = self.conv3(hidden_states)
        return dense_embeddings


class SamPromptEncoder(nn.Module):
    def __init__(self, config: SamPromptEncoderConfig, shared_patch_embedding):
        super().__init__()
        self.shared_embedding = shared_patch_embedding
        self.mask_embed = SamMaskEmbedding(config)
        self.no_mask_embed = nn.Embedding(1, config.hidden_size)

        self.image_embedding_size = (config.image_embedding_size, config.image_embedding_size)
        self.input_image_size = config.image_size

        self.point_embed = nn.ModuleList(
            [nn.Embedding(1, config.hidden_size) for i in range(config.num_point_embeddings)]
        )
        self.hidden_size = config.hidden_size
        self.not_a_point_embed = nn.Embedding(1, config.hidden_size)

    def _embed_points(self, points: torch.Tensor, labels: torch.Tensor, pad: bool) -> torch.Tensor:
        """Embeds point prompts."""
        points = points + 0.5  # Shift to center of pixel
        if pad:
            target_point_shape = (points.shape[0], points.shape[1], 1, points.shape[-1])
            target_labels_shape = (points.shape[0], points.shape[1], 1)
            padding_point = torch.zeros(target_point_shape, device=points.device)
            padding_label = -torch.ones(target_labels_shape, device=labels.device)
            points = torch.cat([points, padding_point], dim=2)
            labels = torch.cat([labels, padding_label], dim=2)
        input_shape = (self.input_image_size, self.input_image_size)
        point_embedding = self.shared_embedding(points, input_shape)

        # Background points (label -1) get the dedicated "not a point" embedding
        point_embedding = torch.where(labels[..., None] == -1, self.not_a_point_embed.weight, point_embedding)

        # Padding points (label -10) are zeroed out; dtype/device are made explicit for export friendliness
        point_embedding = torch.where(
            labels[..., None] != -10,
            point_embedding,
            torch.tensor(0.0, dtype=point_embedding.dtype, device=point_embedding.device),
        )

        point_embedding = torch.where(
            (labels == 0)[:, :, :, None],
            point_embedding + self.point_embed[0].weight[None, None, :, :],
            point_embedding,
        )

        point_embedding = torch.where(
            (labels == 1)[:, :, :, None],
            point_embedding + self.point_embed[1].weight[None, None, :, :],
            point_embedding,
        )

        return point_embedding

    def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
        """Embeds box prompts."""
        boxes = boxes + 0.5  # Shift to center of pixel
        batch_size, nb_boxes = boxes.shape[:2]
        coords = boxes.reshape(batch_size, nb_boxes, 2, 2)
        input_shape = (self.input_image_size, self.input_image_size)
        corner_embedding = self.shared_embedding(coords, input_shape)
        corner_embedding[:, :, 0, :] += self.point_embed[2].weight
        corner_embedding[:, :, 1, :] += self.point_embed[3].weight
        return corner_embedding

    def forward(
        self,
        input_points: Optional[Tuple[torch.Tensor, torch.Tensor]],
        input_labels: Optional[torch.Tensor],
        input_boxes: Optional[torch.Tensor],
        input_masks: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Embeds different types of prompts, returning both sparse and dense embeddings.

        Args:
            points (`torch.Tensor`, *optional*):
                point coordinates and labels to embed.
            boxes (`torch.Tensor`, *optional*):
                boxes to embed
            masks (`torch.Tensor`, *optional*):
                masks to embed
        """
        sparse_embeddings = None
        batch_size = 1
        target_device = self.shared_embedding.positional_embedding.device
        if input_points is not None:
            batch_size, point_batch_size = input_points.shape[:2]
            if input_labels is None:
                raise ValueError("If points are provided, labels must also be provided.")
            point_embeddings = self._embed_points(input_points, input_labels, pad=(input_boxes is None))
            sparse_embeddings = point_embeddings
        if input_boxes is not None:
            batch_size = input_boxes.shape[0]
            box_embeddings = self._embed_boxes(input_boxes)
            if sparse_embeddings is None:
                sparse_embeddings = box_embeddings
            else:
                sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=2)
        if input_masks is not None:
            dense_embeddings = self.mask_embed(input_masks)
        else:
            dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand(
                batch_size, -1, self.image_embedding_size[0], self.image_embedding_size[1]
            )

        if sparse_embeddings is None:
            sparse_embeddings = torch.zeros((batch_size, 1, 1, self.hidden_size), device=target_device)

        return sparse_embeddings, dense_embeddings


class SamVisionAttention(nn.Module):
    """Multi-head Attention block with relative position embeddings."""

    def __init__(self, config, window_size):
        super().__init__()
        input_size = (
            (config.image_size // config.patch_size, config.image_size // config.patch_size)
            if window_size == 0
            else (window_size, window_size)
        )

        self.num_attention_heads = config.num_attention_heads
        head_dim = config.hidden_size // config.num_attention_heads
        self.scale = head_dim**-0.5
        self.dropout = config.attention_dropout

        self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.qkv_bias)
        self.proj = nn.Linear(config.hidden_size, config.hidden_size)

        self.use_rel_pos = config.use_rel_pos
        if self.use_rel_pos:
            if input_size is None:
                raise ValueError("Input size must be provided if using relative positional encoding.")

            # Initialize relative positional embeddings
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))

    def get_rel_pos(self, q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
        """
        Get relative positional embeddings according to the relative positions of
            query and key sizes.

        Args:
            q_size (int):
                size of the query.
            k_size (int):
                size of key k.
            rel_pos (`torch.Tensor`):
                relative position embeddings (L, channel).

        Returns:
            Extracted positional embeddings according to relative positions.
        """
        max_rel_dist = int(2 * max(q_size, k_size) - 1)
        # Interpolate rel pos if needed
        rel_pos_resized = F.interpolate(
            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
            size=max_rel_dist,
            mode="linear",
        )
        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)

        # Scale the coords with short length if shapes for q and k are different
        q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
        k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
        relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)

        return rel_pos_resized[relative_coords.long()]

    def add_decomposed_rel_pos(
        self,
        attn: torch.Tensor,
        query: torch.Tensor,
        rel_pos_h: torch.Tensor,
        rel_pos_w: torch.Tensor,
        q_size: Tuple[int, int],
        k_size: Tuple[int, int],
    ) -> torch.Tensor:
        """
        Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
        https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py

        Args:
            attn (`torch.Tensor`):
                attention map.
            query (`torch.Tensor`):
                query q in the attention layer with shape (batch_size, query_height * query_width, channel).
            rel_pos_h (`torch.Tensor`):
                relative position embeddings (Lh, channel) for height axis.
            rel_pos_w (`torch.Tensor`):
                relative position embeddings (Lw, channel) for width axis.
            q_size (tuple):
                spatial sequence size of query q with (query_height, query_width).
            k_size (tuple):
                spatial sequence size of key k with (key_height, key_width).

        Returns:
            attn (`torch.Tensor`):
                attention map with added relative positional embeddings.
        """
        query_height, query_width = q_size
        key_height, key_width = k_size
        relative_position_height = self.get_rel_pos(query_height, key_height, rel_pos_h)
        relative_position_width = self.get_rel_pos(query_width, key_width, rel_pos_w)

        batch_size, _, dim = query.shape
        reshaped_query = query.reshape(batch_size, query_height, query_width, dim)
        rel_h = torch.einsum("bhwc,hkc->bhwk", reshaped_query, relative_position_height)
        rel_w = torch.einsum("bhwc,wkc->bhwk", reshaped_query, relative_position_width)
        attn = attn.reshape(batch_size, query_height, query_width, key_height, key_width)
        attn = attn + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
        attn = attn.reshape(batch_size, query_height * query_width, key_height * key_width)
        return attn

    def forward(self, hidden_states: torch.Tensor, output_attentions=False) -> torch.Tensor:
        batch_size, height, width, _ = hidden_states.shape
        # qkv with shape (3, batch_size, nHead, height * width, channel)
        qkv = (
            self.qkv(hidden_states)
            .reshape(batch_size, height * width, 3, self.num_attention_heads, -1)
            .permute(2, 0, 3, 1, 4)
        )
        # q, k, v with shape (batch_size * nHead, height * width, channel)
        query, key, value = qkv.reshape(3, batch_size * self.num_attention_heads, height * width, -1).unbind(0)

        attn_weights = (query * self.scale) @ key.transpose(-2, -1)

        if self.use_rel_pos:
            attn_weights = self.add_decomposed_rel_pos(
                attn_weights, query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
            )

        attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype)

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1)
        attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1)

        attn_output = self.proj(attn_output)

        if output_attentions:
            outputs = (attn_output, attn_weights)
        else:
            outputs = (attn_output, None)

        return outputs


class SamVisionLayer(nn.Module):
    def __init__(self, config, window_size):
        super().__init__()
        self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.attn = SamVisionAttention(config, window_size)
        self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.mlp = SamMLPBlock(config)
        self.window_size = window_size

    def window_partition(self, hidden_states: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]:
        """
        Partition into non-overlapping windows with padding if needed.

        Args:
            hidden_states (tensor): input tokens with [batch_size, height, width, channel]. window_size (int): window
            size.

        Returns:
            windows: windows after partition with [batch_size * num_windows, window_size, window_size, channel].
            (pad_height, pad_width): padded height and width before partition
        """
        batch_size, height, width, channel = hidden_states.shape

        pad_h = (window_size - height % window_size) % window_size
        pad_w = (window_size - width % window_size) % window_size
        hidden_states = F.pad(hidden_states, (0, 0, 0, pad_w, 0, pad_h))
        pad_height, pad_width = height + pad_h, width + pad_w

        hidden_states = hidden_states.reshape(
            batch_size, pad_height // window_size, window_size, pad_width // window_size, window_size, channel
        )
        windows = (
            hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().reshape(-1, window_size, window_size, channel)
        )
        return windows, (pad_height, pad_width)

    def window_unpartition(
        self, windows: torch.Tensor, window_size: int, padding_shape: Tuple[int, int], original_shape: Tuple[int, int]
    ) -> torch.Tensor:
        """
        Window unpartition into original sequences and remove padding.

        Args:
            hidden_states (tensor):
                input tokens with [batch_size * num_windows, window_size, window_size, channel].
            window_size (int):
                window size.
            padding_shape (Tuple):
                padded height and width (pad_height, pad_width).
            original_shape (Tuple): original height and width (height, width) before padding.

        Returns:
            hidden_states: unpartitioned sequences with [batch_size, height, width, channel].
        """
        pad_height, pad_width = padding_shape
        height, width = original_shape
        batch_size = windows.shape[0] // (pad_height * pad_width // window_size // window_size)
        hidden_states = windows.reshape(
            batch_size, pad_height // window_size, pad_width // window_size, window_size, window_size, -1
        )
        hidden_states = (
            hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().reshape(batch_size, pad_height, pad_width, -1)
        )

        hidden_states = hidden_states[:, :height, :width, :].contiguous()
        return hidden_states

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        # Window partition
        if self.window_size > 0:
            height, width = hidden_states.shape[1], hidden_states.shape[2]
            hidden_states, padding_shape = self.window_partition(hidden_states, self.window_size)

        hidden_states, attn_weights = self.attn(
            hidden_states=hidden_states,
            output_attentions=output_attentions,
        )
        # Reverse window partition
        if self.window_size > 0:
            hidden_states = self.window_unpartition(hidden_states, self.window_size, padding_shape, (height, width))

        hidden_states = residual + hidden_states
        layernorm_output = self.layer_norm2(hidden_states)
        hidden_states = hidden_states + self.mlp(layernorm_output)

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class SamVisionNeck(nn.Module):
    def __init__(self, config: SamVisionConfig):
        super().__init__()
        self.config = config

        self.conv1 = nn.Conv2d(config.hidden_size, config.output_channels, kernel_size=1, bias=False)
        self.layer_norm1 = SamLayerNorm(config.output_channels, data_format="channels_first")
        self.conv2 = nn.Conv2d(config.output_channels, config.output_channels, kernel_size=3, padding=1, bias=False)
        self.layer_norm2 = SamLayerNorm(config.output_channels, data_format="channels_first")

    def forward(self, hidden_states):
        hidden_states = hidden_states.permute(0, 3, 1, 2)
        hidden_states = self.conv1(hidden_states)
        hidden_states = self.layer_norm1(hidden_states)

        hidden_states = self.conv2(hidden_states)
        hidden_states = self.layer_norm2(hidden_states)
        return hidden_states


class SamVisionEncoder(nn.Module):
    def __init__(self, config: SamVisionConfig):
        super().__init__()
        self.config = config
        self.image_size = config.image_size

        self.patch_embed = SamPatchEmbeddings(config)

        self.pos_embed = None
        if config.use_abs_pos:
            # Initialize absolute positional embedding with pretrain image size
            self.pos_embed = nn.Parameter(
                torch.zeros(
                    1,
                    config.image_size // config.patch_size,
                    config.image_size // config.patch_size,
                    config.hidden_size,
                )
            )

        self.layers = nn.ModuleList()
        for i in range(config.num_hidden_layers):
            layer = SamVisionLayer(
                config,
                window_size=config.window_size if i not in config.global_attn_indexes else 0,
            )
            self.layers.append(layer)

        self.neck = SamVisionNeck(config)

        self.gradient_checkpointing = False

    def get_input_embeddings(self):
        return self.patch_embed

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SamVisionEncoderOutput]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.patch_embed(pixel_values)

        if self.pos_embed is not None:
            hidden_states = hidden_states + self.pos_embed

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                )
            else:
                layer_outputs = layer_module(hidden_states, output_attentions=output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        hidden_states = self.neck(hidden_states)

        if not return_dict:
            outputs = (hidden_states,)
            if output_hidden_states:
                outputs = outputs + (all_hidden_states,)
            if output_attentions:
                outputs = outputs + (all_self_attentions,)
            return outputs

        return SamVisionEncoderOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class SamPreTrainedModel(PreTrainedModel):
    config_class = SamConfig
    base_model_prefix = "sam"
    main_input_name = "pixel_values"
    _no_split_modules = ["SamVisionAttention"]

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


SAM_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`SamConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


SAM_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`SamProcessor`]. See [`SamProcessor.__call__`] for
            details.
        input_points (`torch.FloatTensor` of shape `(batch_size, num_points, 2)`):
            Input 2D spatial points, this is used by the prompt encoder to encode the prompt. Generally yields much
            better results. The points can be obtained by passing a list of list of list to the processor that will
            create corresponding `torch` tensors of dimension 4. The first dimension is the image batch size, the
            second dimension is the point batch size (i.e. how many segmentation masks do we want the model to predict
            per input point), the third dimension is the number of points per segmentation mask (it is possible to pass
            multiple points for a single mask), and the last dimension is the x (vertical) and y (horizontal)
            coordinates of the point. If a different number of points is passed either for each image, or for each
            mask, the processor will create "PAD" points that will correspond to the (0, 0) coordinate, and the
            computation of the embedding will be skipped for these points using the labels.
        input_labels (`torch.LongTensor` of shape `(batch_size, point_batch_size, num_points)`):
            Input labels for the points, this is used by the prompt encoder to encode the prompt. According to the
            official implementation, there are 3 types of labels

            - `1`: the point is a point that contains the object of interest
            - `0`: the point is a point that does not contain the object of interest
            - `-1`: the point corresponds to the background

            We added the label:

            - `-10`: the point is a padding point, thus should be ignored by the prompt encoder

            The padding labels should be automatically done by the processor.
        input_boxes (`torch.FloatTensor` of shape `(batch_size, num_boxes, 4)`):
            Input boxes for the points, this is used by the prompt encoder to encode the prompt. Generally yields
            much better generated masks. The boxes can be obtained by passing a list of list of list to the processor,
            that will generate a `torch` tensor, with each dimension corresponding respectively to the image batch
            size, the number of boxes per image and the coordinates of the top left and bottom right point of the box.
            In the order (`x1`, `y1`, `x2`, `y2`):

            - `x1`: the x coordinate of the top left point of the input box
            - `y1`: the y coordinate of the top left point of the input box
            - `x2`: the x coordinate of the bottom right point of the input box
            - `y2`: the y coordinate of the bottom right point of the input box

        input_masks (`torch.FloatTensor` of shape `(batch_size, image_size, image_size)`):
            The SAM model also accepts segmentation masks as input. The mask will be embedded by the prompt encoder to
            generate a corresponding embedding that will later be fed to the mask decoder. These masks need to be
            fed manually by the user, and they must be of shape (`batch_size`, `image_size`, `image_size`).

        image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_channels, window_size, window_size)`):
            Image embeddings, used by the mask decoder to generate masks and IoU scores. For more memory
            efficient computation, users can first retrieve the image embeddings using the `get_image_embeddings`
            method, and then feed them to the `forward` method instead of feeding the `pixel_values`.
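
            A sketch of that caching pattern, assuming `model`, `processor` and `inputs` were created as in the
            usage example shown in [`SamModel.forward`]:

            ```python
            image_embeddings = model.get_image_embeddings(inputs["pixel_values"])
            # subsequent calls can reuse the cached embeddings and skip the vision encoder
            outputs = model(input_points=inputs["input_points"], image_embeddings=image_embeddings)
            ```
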
        multimask_output (`bool`, *optional*):
            In the original implementation and paper, the model always outputs 3 masks per image (or per point / per
            bounding box if relevant). However, it is possible to output just a single mask, which corresponds to the
            "best" mask, by specifying `multimask_output=False`.
        attention_similarity (`torch.FloatTensor`, *optional*):
            Attention similarity tensor, to be provided to the mask decoder for target-guided attention in case the
            model is used for personalization as introduced in [PerSAM](https://arxiv.org/abs/2305.03048).
        target_embedding (`torch.FloatTensor`, *optional*):
            Embedding of the target concept, to be provided to the mask decoder for target-semantic prompting in case
            the model is used for personalization as introduced in [PerSAM](https://arxiv.org/abs/2305.03048).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "Segment Anything Model (SAM) for generating segmentation masks, given an input image and",
    " optional 2D location and bounding boxes.",
    SAM_START_DOCSTRING,
)
class SamModel(SamPreTrainedModel):
    _tied_weights_keys = ["prompt_encoder.shared_embedding.positional_embedding"]

    def __init__(self, config):
        super().__init__(config)
        self.shared_image_embedding = SamPositionalEmbedding(config.vision_config)

        self.vision_encoder = SamVisionEncoder(config.vision_config)
        self.prompt_encoder = SamPromptEncoder(config.prompt_encoder_config, self.shared_image_embedding)
        self.mask_decoder = SamMaskDecoder(config.mask_decoder_config)

        self.post_init()

    def get_input_embeddings(self):
        return self.vision_encoder.get_input_embeddings()

    def get_image_wide_positional_embeddings(self):
        size = self.config.prompt_encoder_config.image_embedding_size
        target_device = self.shared_image_embedding.positional_embedding.device
        target_dtype = self.shared_image_embedding.positional_embedding.dtype
        grid = torch.ones((size, size), device=target_device, dtype=target_dtype)
        y_embed = grid.cumsum(dim=0) - 0.5
        x_embed = grid.cumsum(dim=1) - 0.5
        y_embed = y_embed / size
        x_embed = x_embed / size

        positional_embedding = self.shared_image_embedding(torch.stack([x_embed, y_embed], dim=-1))
        return positional_embedding.permute(2, 0, 1).unsqueeze(0)  # channel x height x width

    @torch.no_grad()
    def get_image_embeddings(
        self,
        pixel_values,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        r"""
        Returns the image embeddings by passing the pixel values through the vision encoder.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Input pixel values
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        """
        vision_output = self.vision_encoder(
            pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        image_embeddings = vision_output[0]
        return image_embeddings

    @torch.no_grad()
    def get_prompt_embeddings(
        self,
        input_points: Optional[torch.FloatTensor] = None,
        input_labels: Optional[torch.LongTensor] = None,
        input_boxes: Optional[torch.FloatTensor] = None,
        input_masks: Optional[torch.LongTensor] = None,
    ):
        r"""
        Returns the prompt embeddings by passing the input points, labels, boxes and masks through the prompt encoder.

        Args:
            input_points (`torch.FloatTensor` of shape `(batch_size, point_batch_size, num_points_per_image, 2)`):
                Optional input points for the prompt encoder. The padding of the point is automatically done by the
                processor. `point_batch_size` refers to the number of masks that we want the model to predict per
                point. The model will output `point_batch_size` times 3 masks in total.
            input_labels (`torch.LongTensor` of shape `(batch_size, point_batch_size, num_points_per_image)`):
                Optional input labels for the prompt encoder. The padding of the labels is automatically done by the
                processor, or can be fed by the user.
            input_boxes (`torch.FloatTensor` of shape `(batch_size, num_boxes_per_image, 4)`):
                Optional input boxes for the prompt encoder. The padding of the boxes is automatically done by the
                processor. Users can also pass the input boxes manually.
            input_masks (`torch.LongTensor` of shape `(batch_size, image_size, image_size)`):
                Optional input masks for the prompt encoder.
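
        A minimal sketch of calling this helper directly with hand-built tensors (the values are placeholders and
        `model` is assumed to be a loaded [`SamModel`]; in practice [`SamProcessor`] prepares these inputs and
        rescales point coordinates to the resized image):

        ```python
        import torch

        points = torch.tensor([[[[450.0, 600.0]]]])  # (batch_size, point_batch_size, num_points, 2)
        labels = torch.tensor([[[1]]])  # (batch_size, point_batch_size, num_points)
        prompt_embeddings = model.get_prompt_embeddings(input_points=points, input_labels=labels)
        ```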
        """
        prompt_output = self.prompt_encoder(
            input_points=input_points,
            input_labels=input_labels,
            input_boxes=input_boxes,
            input_masks=input_masks,
        )
        return prompt_output

    @add_start_docstrings_to_model_forward(SAM_INPUTS_DOCSTRING)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        input_points: Optional[torch.FloatTensor] = None,
        input_labels: Optional[torch.LongTensor] = None,
        input_boxes: Optional[torch.FloatTensor] = None,
        input_masks: Optional[torch.LongTensor] = None,
        image_embeddings: Optional[torch.FloatTensor] = None,
        multimask_output: bool = True,
        attention_similarity: Optional[torch.FloatTensor] = None,
        target_embedding: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> List[Dict[str, torch.Tensor]]:
        r"""
        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoModel, AutoProcessor

        >>> model = AutoModel.from_pretrained("facebook/sam-vit-base")
        >>> processor = AutoProcessor.from_pretrained("facebook/sam-vit-base")

        >>> img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-car.png"
        >>> raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
        >>> input_points = [[[400, 650]]]  # 2D location of a window on the car
        >>> inputs = processor(images=raw_image, input_points=input_points, return_tensors="pt")

        >>> # Get segmentation mask
        >>> outputs = model(**inputs)

        >>> # Postprocess masks
        >>> masks = processor.post_process_masks(
        ...     outputs.pred_masks, inputs["original_sizes"], inputs["reshaped_input_sizes"]
        ... )
        ```
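
        The scores of the predicted masks are returned as well; as a small follow-up sketch (continuing the
        example above), the highest-scoring candidate mask per prompt can be selected with:

        ```python
        scores = outputs.iou_scores  # one score per predicted mask
        best_mask_idx = scores.argmax(dim=-1)
        ```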
        Nz9Either pixel_values or image_embeddings must be provided.z>Only one of pixel_values and image_embeddings can be provided.r   zlThe input_points must be a 4D tensor. Of shape `batch_size`, `point_batch_size`, `nb_points_per_image`, `2`.z got {}.r   zMThe input_points must be a 3D tensor. Of shape `batch_size`, `nb_boxes`, `4`.r   zQYou should provide as many bounding boxes as input points per box. Got {} and {}.r   r  r   rF  zNThe batch size of the image embeddings and the input points must be the same. zGot {} and {} respectively.zS if you want to pass multiple points for the same image, make sure that you passed zS input_points of shape (batch_size, point_batch_size, num_points_per_image, 3) and zK input_labels of shape (batch_size, point_batch_size, num_points_per_image)r  )r   r   r   r  r  r   r   r   )r*   r+   r,   r-   r.   )rC   r   r   r   rI   lenrH   formatr  r  r  r"   	ones_liker   rD  r  r  r)   )rB   rK   rT  rU  rV  rW  r   r  r   r   r   r   r   kwargsr   box_batch_sizer   rL   r-   r,   vision_outputsrZ  r1  low_res_masksiou_predictionsr.   outputs                              r'   rP   zSamModel.forward  s   T 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]$4$<XYYY#(8(D]^^^#L,>(?(?1(D(D~!!,"455   "s;+<'='='B'B_!!+"344   #(?+1!4(.q1N>11 gnn(.    '+&O&O&Q&Q#.:.F\'**L\LbcdLe
&A&H&HUVXY[\&]&]# ##!00"3%9'	 1  N  .a0# 9'5a'8$  7$22$6!#(< ?<111aaa
+C59]i]pqqqL#(8(>q(A\EWXYEZ(Z(Z`-445E5KA5NP\PbcdPeffee]   /3.A.A%%##	 /B /
 /
++ CGBSBS-(C%6$4-!5-/ CT 	C
 	C
?(?  	%}5F# :#7"99  O#46M"NNM)&$!5/$;
 
 
 	
r&   r  r   )NNNNNNTNNNNN)r   r   r    _tied_weights_keysr6   r  r  r"   no_gradr   r   r  r#   
LongTensorr  r   SAM_INPUTS_DOCSTRINGr   r   strr	   rP   rQ   rR   s   @r'   r  r    s        QQ    : : :B B B U]__ -1/3&*    $D>  'tn	 
 d^      _ < U]__ 59373726 u01 u/0 e/0	
 e./   _@ +*+?@@ 59483737268<!%<@8<,0/3&*L
 L
u01L
 u01L
 u/0	L

 e/0L
 e./L
 #5#45L
 L
 'u'89L
 #5#45L
 $D>L
 'tnL
 d^L
 
d3$%	&L
 L
 L
 A@L
 L
 L
 L
 L
r&   r  )>r!   r<   dataclassesr   typingr   r   r   r   r   numpyr"  r"   torch.nn.functionalr
   rt   r   torch.utils.checkpointr	   activationsr   modeling_outputsr   modeling_utilsr   utilsr   r   r   r   configuration_samr   r   r   r   
get_loggerr   logger_CONFIG_FOR_DOC_CHECKPOINT_FOR_DOCr   r)   Moduler0   rT   rb   r   r   r   r   r   r  r*  r3  r^  r  r  r  r  SAM_START_DOCSTRINGr  r  r%   r&   r'   <module>r     s8         ! ! ! ! ! ! 5 5 5 5 5 5 5 5 5 5 5 5 5 5                           ! ! ! ! ! ! / / / / / / - - - - - - f f f f f f f f f f f f g g g g g g g g g g g g 
	H	%	%-  ? ? ? ? ?[ ? ? ?:  L  L  L  L  L  L  L  LF               F    ")       29   << < < < <29 < < <~T T T T Tbi T T TnD- D- D- D- D-29 D- D- D-N    RY   0{ { { { {RY { { {|S S S S SRY S S S.         ry      6m3 m3 m3 m3 m3ry m3 m3 m3`I I I I I I I IX\ \ \ \ \RY \ \ \~    BI   (\
 \
 \
 \
 \
ry \
 \
 \
~? ? ? ? ? ? ? ?$  C L _/ 
j
 j
 j
 j
 j
! j
 j
 
j
 j
 j
r&   