
"""PyTorch SuperPoint model."""

from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
from torch import nn

from transformers import PreTrainedModel
from transformers.modeling_outputs import BaseModelOutputWithNoAttention
from transformers.models.superpoint.configuration_superpoint import SuperPointConfig

from ...pytorch_utils import is_torch_greater_or_equal_than_1_13
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "SuperPointConfig"

_CHECKPOINT_FOR_DOC = "magic-leap-community/superpoint"


def remove_keypoints_from_borders(
    keypoints: torch.Tensor, scores: torch.Tensor, border: int, height: int, width: int
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Removes keypoints (and their associated scores) that are too close to the border"""
    mask_h = (keypoints[:, 0] >= border) & (keypoints[:, 0] < (height - border))
    mask_w = (keypoints[:, 1] >= border) & (keypoints[:, 1] < (width - border))
    mask = mask_h & mask_w
    return keypoints[mask], scores[mask]


def top_k_keypoints(keypoints: torch.Tensor, scores: torch.Tensor, k: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """Keeps the k keypoints with highest score"""
    if k >= len(keypoints):
        return keypoints, scores
    scores, indices = torch.topk(scores, k, dim=0)
    return keypoints[indices], scores


def simple_nms(scores: torch.Tensor, nms_radius: int) -> torch.Tensor:
    """Applies non-maximum suppression on scores"""
    if nms_radius < 0:
        raise ValueError("Expected positive values for nms_radius")

    def max_pool(x):
        return nn.functional.max_pool2d(x, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius)

    zeros = torch.zeros_like(scores)
    max_mask = scores == max_pool(scores)
    for _ in range(2):
        supp_mask = max_pool(max_mask.float()) > 0
        supp_scores = torch.where(supp_mask, zeros, scores)
        new_max_mask = supp_scores == max_pool(supp_scores)
        max_mask = max_mask | (new_max_mask & (~supp_mask))
    return torch.where(max_mask, scores, zeros)


@dataclass
class SuperPointKeypointDescriptionOutput(ModelOutput):
    """
    Base class for outputs of image point description models. Due to the nature of keypoint detection, the number of
    keypoints is not fixed and can vary from image to image, which makes batching non-trivial. In the batch of images,
    the maximum number of keypoints is set as the dimension of the keypoints, scores and descriptors tensors. The mask
    tensor is used to indicate which values in the keypoints, scores and descriptors tensors are keypoint information
    and which are padding.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*):
            Loss computed during training.
        keypoints (`torch.FloatTensor` of shape `(batch_size, num_keypoints, 2)`):
            Relative (x, y) coordinates of predicted keypoints in a given image.
        scores (`torch.FloatTensor` of shape `(batch_size, num_keypoints)`):
            Scores of predicted keypoints.
        descriptors (`torch.FloatTensor` of shape `(batch_size, num_keypoints, descriptor_size)`):
            Descriptors of predicted keypoints.
        mask (`torch.BoolTensor` of shape `(batch_size, num_keypoints)`):
            Mask indicating which values in keypoints, scores and descriptors are keypoint information.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or
        when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
            (also called feature maps) of the model at the output of each stage.
    """

    loss: Optional[torch.FloatTensor] = None
    keypoints: Optional[torch.IntTensor] = None
    scores: Optional[torch.FloatTensor] = None
    descriptors: Optional[torch.FloatTensor] = None
    mask: Optional[torch.BoolTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None


class SuperPointConvBlock(nn.Module):
    def __init__(
        self, config: SuperPointConfig, in_channels: int, out_channels: int, add_pooling: bool = False
    ) -> None:
        super().__init__()
        self.conv_a = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=3,
            stride=1,
            padding=1,
        )
        self.conv_b = nn.Conv2d(
            out_channels,
            out_channels,
            kernel_size=3,
            stride=1,
            padding=1,
        )
        self.relu = nn.ReLU(inplace=True)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2) if add_pooling else None

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.relu(self.conv_a(hidden_states))
        hidden_states = self.relu(self.conv_b(hidden_states))
        if self.pool is not None:
            hidden_states = self.pool(hidden_states)
        return hidden_states


class SuperPointEncoder(nn.Module):
    """
    SuperPoint encoder module. It is made of 4 convolutional layers with ReLU activation and max pooling, reducing the
    dimensionality of the image.
    """

    def __init__(self, config: SuperPointConfig) -> None:
        super().__init__()
        # SuperPoint works on 1-channel (grayscale) images
        self.input_dim = 1

        conv_blocks = []
        conv_blocks.append(
            SuperPointConvBlock(config, self.input_dim, config.encoder_hidden_sizes[0], add_pooling=True)
        )
        for i in range(1, len(config.encoder_hidden_sizes) - 1):
            conv_blocks.append(
                SuperPointConvBlock(
                    config, config.encoder_hidden_sizes[i - 1], config.encoder_hidden_sizes[i], add_pooling=True
                )
            )
        conv_blocks.append(
            SuperPointConvBlock(
                config, config.encoder_hidden_sizes[-2], config.encoder_hidden_sizes[-1], add_pooling=False
            )
        )
        self.conv_blocks = nn.ModuleList(conv_blocks)

    def forward(
        self,
        input,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, BaseModelOutputWithNoAttention]:
        all_hidden_states = () if output_hidden_states else None

        for conv_block in self.conv_blocks:
            input = conv_block(input)
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (input,)
        output = input
        if not return_dict:
            return tuple(v for v in [output, all_hidden_states] if v is not None)

        return BaseModelOutputWithNoAttention(
            last_hidden_state=output,
            hidden_states=all_hidden_states,
        )


class SuperPointInterestPointDecoder(nn.Module):
    """
    The SuperPointInterestPointDecoder uses the output of the SuperPointEncoder to compute the keypoints and their
    scores. The scores are first computed by a convolutional layer, then a softmax is applied to get a probability
    distribution over the 65 possible keypoint classes. The keypoints are then extracted from the scores by
    thresholding and non-maximum suppression. Post-processing is then applied to remove keypoints too close to the
    image borders as well as to keep only the k keypoints with the highest scores.
    """

    def __init__(self, config: SuperPointConfig) -> None:
        super().__init__()
        self.keypoint_threshold = config.keypoint_threshold
        self.max_keypoints = config.max_keypoints
        self.nms_radius = config.nms_radius
        self.border_removal_distance = config.border_removal_distance

        self.relu = nn.ReLU(inplace=True)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv_score_a = nn.Conv2d(
            config.encoder_hidden_sizes[-1],
            config.decoder_hidden_size,
            kernel_size=3,
            stride=1,
            padding=1,
        )
        self.conv_score_b = nn.Conv2d(
            config.decoder_hidden_size, config.keypoint_decoder_dim, kernel_size=1, stride=1, padding=0
        )

    def forward(self, encoded: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        scores = self._get_pixel_scores(encoded)
        keypoints, scores = self._extract_keypoints(scores)

        return keypoints, scores

    def _get_pixel_scores(self, encoded: torch.Tensor) -> torch.Tensor:
        """Based on the encoder output, compute the scores for each pixel of the image"""
        scores = self.relu(self.conv_score_a(encoded))
        scores = self.conv_score_b(scores)
        # Drop the "no keypoint" dustbin channel, then unfold each of the 64
        # remaining channels into an 8x8 pixel cell of the full-resolution map
        scores = nn.functional.softmax(scores, 1)[:, :-1]
        batch_size, _, height, width = scores.shape
        scores = scores.permute(0, 2, 3, 1).reshape(batch_size, height, width, 8, 8)
        scores = scores.permute(0, 1, 3, 2, 4).reshape(batch_size, height * 8, width * 8)
        scores = simple_nms(scores, self.nms_radius)
        return scores

    def _extract_keypoints(self, scores: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Based on their scores, extract the pixels that represent the keypoints that will be used for descriptors computation"""
        _, height, width = scores.shape

        # Threshold keypoints by score value
        keypoints = torch.nonzero(scores[0] > self.keypoint_threshold)
        scores = scores[0][tuple(keypoints.t())]

        # Discard keypoints near the image borders
        keypoints, scores = remove_keypoints_from_borders(
            keypoints, scores, self.border_removal_distance, height * 8, width * 8
        )

        # Keep the k keypoints with highest score
        if self.max_keypoints >= 0:
            keypoints, scores = top_k_keypoints(keypoints, scores, self.max_keypoints)

        # Convert (y, x) to (x, y)
        keypoints = torch.flip(keypoints, [1]).float()

        return keypoints, scores


class SuperPointDescriptorDecoder(nn.Module):
    """
    The SuperPointDescriptorDecoder uses the outputs of both the SuperPointEncoder and the
    SuperPointInterestPointDecoder to compute the descriptors at the keypoints locations.

    The descriptors are first computed by a convolutional layer, then normalized to have a norm of 1. The descriptors
    are then interpolated at the keypoints locations.
    """

    def __init__(self, config: SuperPointConfig) -> None:
        super().__init__()

        self.relu = nn.ReLU(inplace=True)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv_descriptor_a = nn.Conv2d(
            config.encoder_hidden_sizes[-1],
            config.decoder_hidden_size,
            kernel_size=3,
            stride=1,
            padding=1,
        )
        self.conv_descriptor_b = nn.Conv2d(
            config.decoder_hidden_size,
            config.descriptor_decoder_dim,
            kernel_size=1,
            stride=1,
            padding=0,
        )

    def forward(self, encoded: torch.Tensor, keypoints: torch.Tensor) -> torch.Tensor:
        """Based on the encoder output and the keypoints, compute the descriptors for each keypoint"""
        descriptors = self.conv_descriptor_b(self.relu(self.conv_descriptor_a(encoded)))
        descriptors = nn.functional.normalize(descriptors, p=2, dim=1)

        descriptors = self._sample_descriptors(keypoints[None], descriptors[0][None], 8)[0]

        # (descriptor_dim, num_keypoints) -> (num_keypoints, descriptor_dim)
        descriptors = torch.transpose(descriptors, 0, 1)

        return descriptors

    @staticmethod
    def _sample_descriptors(keypoints, descriptors, scale: int = 8) -> torch.Tensor:
        """Interpolate descriptors at keypoint locations"""
        batch_size, num_channels, height, width = descriptors.shape
        keypoints = keypoints - scale / 2 + 0.5
        divisor = torch.tensor([[(width * scale - scale / 2 - 0.5), (height * scale - scale / 2 - 0.5)]])
        divisor = divisor.to(keypoints)
        keypoints /= divisor
        keypoints = keypoints * 2 - 1  # normalize to (-1, 1)
        kwargs = {"align_corners": True} if is_torch_greater_or_equal_than_1_13 else {}
        # (batch_size, num_keypoints, 2) -> (batch_size, 1, num_keypoints, 2) for grid_sample
        keypoints = keypoints.view(batch_size, 1, -1, 2)
        descriptors = nn.functional.grid_sample(descriptors, keypoints, mode="bilinear", **kwargs)
        # (batch_size, num_channels, 1, num_keypoints) -> (batch_size, num_channels, num_keypoints)
        descriptors = descriptors.reshape(batch_size, num_channels, -1)
        descriptors = nn.functional.normalize(descriptors, p=2, dim=1)
        return descriptors
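

# A small usage sketch for `_sample_descriptors` (illustrative only, not part
# of the upstream file; `_sample_descriptors_example` is a hypothetical
# helper): keypoints given in full-image (x, y) pixel coordinates are mapped to
# the [-1, 1] range expected by `grid_sample` and bilinearly interpolated on
# the 8x-downsampled descriptor map.
def _sample_descriptors_example() -> torch.Tensor:
    descriptors = torch.rand(1, 256, 60, 80)  # (batch, channels, H / 8, W / 8)
    keypoints = torch.tensor([[[320.0, 240.0], [10.0, 10.0]]])  # two (x, y) points
    sampled = SuperPointDescriptorDecoder._sample_descriptors(keypoints, descriptors, 8)
    return sampled  # shape (1, 256, 2), L2-normalized along the channel dimension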


class SuperPointPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = SuperPointConfig
    base_model_prefix = "superpoint"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = False

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def extract_one_channel_pixel_values(self, pixel_values: torch.FloatTensor) -> torch.FloatTensor:
        """
        Assuming pixel_values has shape (batch_size, 3, height, width), and that all channel values are the same,
        extract the first channel value to get a tensor of shape (batch_size, 1, height, width) for SuperPoint. This is
        a workaround for the issue discussed in:
        https://github.com/huggingface/transformers/pull/25786#issuecomment-1730176446

        Args:
            pixel_values: torch.FloatTensor of shape (batch_size, 3, height, width)

        Returns:
            pixel_values: torch.FloatTensor of shape (batch_size, 1, height, width)
        """
        return pixel_values[:, 0, :, :][:, None, :, :]


SUPERPOINT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`SuperPointConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
    """

SUPERPOINT_INPUTS_DOCSTRING = r"""
Args:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Pixel values. Pixel values can be obtained using [`SuperPointImageProcessor`]. See
        [`SuperPointImageProcessor.__call__`] for details.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more
        detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
    """


@add_start_docstrings(
    "SuperPoint model outputting keypoints and descriptors.",
    SUPERPOINT_START_DOCSTRING,
)
class SuperPointForKeypointDetection(SuperPointPreTrainedModel):
    """
    SuperPoint model. It consists of a SuperPointEncoder, a SuperPointInterestPointDecoder and a
    SuperPointDescriptorDecoder. SuperPoint was proposed in `SuperPoint: Self-Supervised Interest Point Detection and
    Description <https://arxiv.org/abs/1712.07629>`__ by Daniel DeTone, Tomasz Malisiewicz, and Andrew Rabinovich. It
    is a fully convolutional neural network that extracts keypoints and descriptors from an image. It is trained in a
    self-supervised manner, using a combination of a photometric loss and a loss based on the homographic adaptation of
    keypoints. It is made of a convolutional encoder and two decoders: one for keypoints and one for descriptors.
    """

    def __init__(self, config: SuperPointConfig) -> None:
        super().__init__(config)

        self.config = config

        self.encoder = SuperPointEncoder(config)
        self.keypoint_decoder = SuperPointInterestPointDecoder(config)
        self.descriptor_decoder = SuperPointDescriptorDecoder(config)

        self.post_init()

    @add_start_docstrings_to_model_forward(SUPERPOINT_INPUTS_DOCSTRING)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SuperPointKeypointDescriptionOutput]:
        """
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, SuperPointForKeypointDetection
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint")
        >>> model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```"""
        loss = None
        if labels is not None:
            raise ValueError("SuperPoint does not support training for now.")

        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        pixel_values = self.extract_one_channel_pixel_values(pixel_values)

        batch_size = pixel_values.shape[0]

        encoder_outputs = self.encoder(
            pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]

        list_keypoints_scores = [
            self.keypoint_decoder(last_hidden_state[None, ...]) for last_hidden_state in last_hidden_state
        ]

        list_keypoints = [keypoints_scores[0] for keypoints_scores in list_keypoints_scores]
        list_scores = [keypoints_scores[1] for keypoints_scores in list_keypoints_scores]

        list_descriptors = [
            self.descriptor_decoder(last_hidden_state[None, ...], keypoints[None, ...])
            for last_hidden_state, keypoints in zip(last_hidden_state, list_keypoints)
        ]

        maximum_num_keypoints = max(keypoints.shape[0] for keypoints in list_keypoints)

        # Pad every image of the batch to the maximum number of keypoints found
        # in the batch; `mask` marks which slots hold real detections.
        keypoints = torch.zeros((batch_size, maximum_num_keypoints, 2), device=pixel_values.device)
        scores = torch.zeros((batch_size, maximum_num_keypoints), device=pixel_values.device)
        descriptors = torch.zeros(
            (batch_size, maximum_num_keypoints, self.config.descriptor_decoder_dim),
            device=pixel_values.device,
        )
        mask = torch.zeros((batch_size, maximum_num_keypoints), device=pixel_values.device, dtype=torch.int)

        for i, (_keypoints, _scores, _descriptors) in enumerate(zip(list_keypoints, list_scores, list_descriptors)):
            keypoints[i, : _keypoints.shape[0]] = _keypoints
            scores[i, : _scores.shape[0]] = _scores
            descriptors[i, : _descriptors.shape[0]] = _descriptors
            mask[i, : _keypoints.shape[0]] = 1

        hidden_states = encoder_outputs[1] if output_hidden_states else None
        if not return_dict:
            return tuple(v for v in [loss, keypoints, scores, descriptors, mask, hidden_states] if v is not None)

        return SuperPointKeypointDescriptionOutput(
            loss=loss,
            keypoints=keypoints,
            scores=scores,
            descriptors=descriptors,
            mask=mask,
            hidden_states=hidden_states,
        )
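

# End-to-end usage sketch (illustrative only; it mirrors the docstring example
# above, and downloading the checkpoint requires network access). The `mask`
# output marks which of the zero-padded keypoint slots hold real detections.
if __name__ == "__main__":
    import requests
    from PIL import Image

    from transformers import AutoImageProcessor

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint")
    model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")

    inputs = processor(image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # Keep only the valid (non-padding) detections of the first image.
    valid = outputs.mask[0].bool()
    print(outputs.keypoints[0][valid].shape)  # (num_keypoints, 2)
    print(outputs.scores[0][valid].shape)  # (num_keypoints,)
    print(outputs.descriptors[0][valid].shape)  # (num_keypoints, 256) with the default config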