
    gq                     v   d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ  ej        e          ZdZ e G d de                      Z! G d dej"                  Z# G d dej"                  Z$ G d dej"                  Z% G d dej"                  Z& G d dej"                  Z' G d dej"                  Z( G d dej"                  Z)dFd!Z* G d" d#ej"                  Z+ G d$ d%ej"                  Z, G d& d'ej"                  Z-e	j.        j/        dGd*e0d+e1fd,            Z2 G d- d.ej"                  Z3 G d/ d0ej"                  Z4 G d1 d2ej"                  Z5 G d3 d4ej"                  Z6 G d5 d6ej"                  Z7 G d7 d8ej"                  Z8 G d9 d:ej"                  Z9 G d; d<ej"                  Z: G d= d>ej"                  Z; G d? d@e          Z<dAZ=dBZ> edCe=           G dD dEe<                      Z?dS )HzPyTorch ZoeDepth model.    N)	dataclass)ListOptionalTupleUnion)nn   )ACT2FN)add_start_docstrings%add_start_docstrings_to_model_forwardreplace_return_docstrings)DepthEstimatorOutput)PreTrainedModel)ModelOutputlogging)load_backbone   )ZoeDepthConfigr   c                       e Zd ZU dZdZeej                 ed<   dZ	ej        ed<   dZ
ej        ed<   dZeeej        df                  ed<   dZeeej        df                  ed<   dS )	ZoeDepthDepthEstimatorOutputa  
    Extension of `DepthEstimatorOutput` to include domain logits (ZoeDepth specific).

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        predicted_depth (`torch.FloatTensor` of shape `(batch_size, height, width)`):
            Predicted depth for each pixel.

        domain_logits (`torch.FloatTensor` of shape `(batch_size, num_domains)`):
            Logits for each domain (e.g. NYU and KITTI) in case multiple metric heads are used.

        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlosspredicted_depthdomain_logits.hidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   r   r        j/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/zoedepth/modeling_zoedepth.pyr   r   ,   s          2 )-D(5$
%,,,)-OU&---'+M5$+++=AM8E%"3S"89:AAA:>Ju0#567>>>>>r$   r   c                   ^     e Zd ZdZ fdZdeej                 deej                 fdZ xZ	S )ZoeDepthReassembleStageaE  
    This class reassembles the hidden states of the backbone into image-like feature representations at various
    resolutions.

    This happens in 3 stages:
    1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to
       `config.readout_type`.
    2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
    3. Resizing the spatial dimensions (height, width).

    Args:
        config (`[ZoeDepthConfig]`):
            Model configuration class defining the model architecture.
    c           	      6   t                                                       |j        | _        t          j                    | _        t          |j        |j                  D ]/\  }}| j        	                    t          |||                     0|j        dk    r|t          j                    | _        |j        }|j        D ]W}| j        	                    t          j        t          j        d|z  |          t          |j                                      Vd S d S )N)channelsfactorproject   )super__init__readout_typer   
ModuleListlayerszipneck_hidden_sizesreassemble_factorsappendZoeDepthReassembleLayerreadout_projectsbackbone_hidden_size
SequentialLinearr
   
hidden_act)selfconfigneck_hidden_sizer*   hidden_size_	__class__s         r%   r.   z ZoeDepthReassembleStage.__init__^   s   "/moo(+F,DfF_(`(` 	j 	j$fK6vHXaghhhiiii)++$&MOOD! 5K-  %,,M")AO["I"I6RXRcKdee   	 ,+ r$   r   returnc                    |d         j         d         }t          j        |d          }|dddf         |ddddf         }}|j         \  }}}|                    ||||          }|                    dddd                                          }| j        dk    ri|                    d                              d          }|                    d          	                    |          }	t          j        ||	fd	          }n#| j        d
k    r||                    d	          z   }g }
t          |                    |d                    D ]~\  }}| j        dk    r | j        |         |          }|                    ddd                              |d	||          } | j        |         |          }|
                    |           |
S )z
        Args:
            hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
                List of hidden states from the backbone.
        r   dimNr   r	   r,   r+   )r   r,   r   add)shaper    catreshapepermute
contiguousr/   flatten	unsqueeze	expand_as	enumeratesplitr7   r1   r5   )r<   r   patch_heightpatch_width
batch_size	cls_tokentotal_batch_sizesequence_lengthnum_channelsreadoutout	stage_idxhidden_states                r%   forwardzZoeDepthReassembleStage.forwardo   s    #1%+A.
 	-Q777#0A#6aaae8L=	:G:M7/<%--.>k[ghh%--aAq99DDFF	)))11!44<<YGGM))a)00::=IIG "I}g&>CCMM%'')I,?,?,C,CCM'01D1DZUV1D1W1W'X'X 	% 	%#I| I--?t4Y?MM (//1a88@@RQ]_jkkL14;y1,??LJJ|$$$$
r$   
r   r   r   r   r.   r   r    Tensorr]   __classcell__rA   s   @r%   r'   r'   N   su             "&T%,%7 &W[\a\hWi & & & & & & & &r$   r'   c                   $     e Zd Z fdZd Z xZS )r6   c           	         t                                                       |j        }t          j        ||d          | _        |dk    r t          j        ||||d          | _        d S |dk    rt          j                    | _        d S |dk     r0t          j        ||dt          d|z            d          | _        d S d S )Nr   )in_channelsout_channelskernel_sizer   rf   stridepaddingr	   )
r-   r.   r8   r   Conv2d
projectionConvTranspose2dresizeIdentityint)r<   r=   r)   r*   r?   rA   s        r%   r.   z ZoeDepthReassembleLayer.__init__   s    1)(`abbb A::,XxV\blmnnnDKKKq[[+--DKKKaZZ)HhAcRSV\R\ooghiiiDKKK Zr$   c                 Z    |                      |          }|                     |          }|S N)rk   rm   r<   r\   s     r%   r]   zZoeDepthReassembleLayer.forward   s*    |44{{<00r$   r   r   r   r.   r]   r`   ra   s   @r%   r6   r6      sL        j j j j j       r$   r6   c                   $     e Zd Z fdZd Z xZS )ZoeDepthFeatureFusionStagec                    t                                                       t          j                    | _        t          t          |j                            D ])}| j                            t          |                     *d S rq   )
r-   r.   r   r0   r1   rangelenr3   r5   ZoeDepthFeatureFusionLayer)r<   r=   r@   rA   s      r%   r.   z#ZoeDepthFeatureFusionStage.__init__   sx    moos634455 	C 	CAK9&AABBBB	C 	Cr$   c                    |d d d         }g } | j         d         |d                   }|                    |           t          |dd          | j         dd                    D ]&\  }} |||          }|                    |           '|S )NrF   r   r   )r1   r5   r2   )r<   r   fused_hidden_statesfused_hidden_stater\   layers         r%   r]   z"ZoeDepthFeatureFusionStage.forward   s    %ddd+ +T[^M!,<==""#5666#&}QRR'8$+abb/#J#J 	; 	;L%!&'9<!H!H&&'9::::""r$   rs   ra   s   @r%   ru   ru      sL        C C C C C# # # # # # #r$   ru   c                   F     e Zd ZdZ fdZdej        dej        fdZ xZS )ZoeDepthPreActResidualLayerz
    ResidualConvUnit, pre-activate residual unit.

    Args:
        config (`[ZoeDepthConfig]`):
            Model configuration class defining the model architecture.
    c                 6   t                                                       |j        | _        |j        |j        n| j         }t          j                    | _        t          j        |j	        |j	        ddd|          | _
        t          j                    | _        t          j        |j	        |j	        ddd|          | _        | j        rLt          j        |j	        |j                  | _        t          j        |j	        |j                  | _        d S d S )Nr	   r   )rf   rh   ri   bias)eps)r-   r.   !use_batch_norm_in_fusion_residualuse_batch_normuse_bias_in_fusion_residualr   ReLUactivation1rj   fusion_hidden_sizeconvolution1activation2convolution2BatchNorm2dbatch_norm_epsbatch_norm1batch_norm2)r<   r=   r   rA   s      r%   r.   z$ZoeDepthPreActResidualLayer.__init__   s$   $F 1= ..(( 	$ 799I%%,
 
 
 799I%%,
 
 
  	d!~f.GVMbcccD!~f.GVMbcccD	d 	dr$   r\   rB   c                 (   |}|                      |          }|                     |          }| j        r|                     |          }|                     |          }|                     |          }| j        r|                     |          }||z   S rq   )r   r   r   r   r   r   r   r<   r\   residuals      r%   r]   z#ZoeDepthPreActResidualLayer.forward   s    ''55((66 	:++L99L''55((66 	:++L99Lh&&r$   )	r   r   r   r   r.   r    r_   r]   r`   ra   s   @r%   r   r      sn          d  d  d  d  dD'EL 'U\ ' ' ' ' ' ' ' 'r$   r   c                   ,     e Zd ZdZd fd	ZddZ xZS )ry   a8  Feature fusion layer, merges feature maps from different stages.

    Args:
        config (`[ZoeDepthConfig]`):
            Model configuration class defining the model architecture.
        align_corners (`bool`, *optional*, defaults to `True`):
            The align_corner setting for bilinear upsample.
    Tc                     t                                                       || _        t          j        |j        |j        dd          | _        t          |          | _        t          |          | _	        d S )Nr   T)rf   r   )
r-   r.   align_cornersr   rj   r   rk   r   residual_layer1residual_layer2)r<   r=   r   rA   s      r%   r.   z#ZoeDepthFeatureFusionLayer.__init__  si    *)F$=v?Xfgnrsss:6BB:6BBr$   Nc                 t   |c|j         |j         k    r;t          j                            ||j         d         |j         d         fdd          }||                     |          z   }|                     |          }t          j                            |dd| j                  }|                     |          }|S )Nr,   r	   bilinearFsizemoder   scale_factorr   r   )rH   r   
functionalinterpolater   r   r   rk   r   s      r%   r]   z"ZoeDepthFeatureFusionLayer.forward  s    !X^33=44L$6q$9<;Ma;P#QXbrw 5   ($*>*>x*H*HHL++L99}00qzI[ 1 
 
 |44r$   )Trq   r   r   r   r   r.   r]   r`   ra   s   @r%   ry   ry     sa         C C C C C C       r$   ry   c                   ^     e Zd ZdZ fdZdeej                 deej                 fdZ xZ	S )ZoeDepthNeckaO  
    ZoeDepthNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
    input and produces another list of tensors as output. For ZoeDepth, it includes 2 stages:

    * ZoeDepthReassembleStage
    * ZoeDepthFeatureFusionStage.

    Args:
        config (dict): config dict.
    c           
         t                                                       || _        |j        |j        j        dv rd | _        nt          |          | _        t          j                    | _	        |j
        D ]8}| j	                            t          j        ||j        ddd                     9t          |          | _        d S )N)swinv2r	   r   F)rf   ri   r   )r-   r.   r=   backbone_config
model_typereassemble_stager'   r   r0   convsr3   r5   rj   r   ru   fusion_stage)r<   r=   channelrA   s      r%   r.   zZoeDepthNeck.__init__9  s     !-&2H2SWa2a2a$(D!!$;F$C$CD!]__
/ 	s 	sGJbi1JXYcdkpqqqrrrr 7v>>r$   r   rB   c                 |    t          |t          t          f          st          d          t	          |          t	           j        j                  k    rt          d           j                             |||          } fdt          |          D             } 
                    |          }||d         fS )z
        Args:
            hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
                List of hidden states from the backbone.
        z2hidden_states should be a tuple or list of tensorszOThe number of hidden states should be equal to the number of neck hidden sizes.Nc                 B    g | ]\  }} j         |         |          S r#   )r   ).0ifeaturer<   s      r%   
<listcomp>z(ZoeDepthNeck.forward.<locals>.<listcomp>Z  s-    VVVzq'MDJqM'**VVVr$   rF   )
isinstancetuplelist	TypeErrorrx   r=   r3   
ValueErrorr   rP   r   )r<   r   rR   rS   featuresoutputs   `     r%   r]   zZoeDepthNeck.forwardJ  s     -%77 	RPQQQ}T[%B!C!CCCnooo  , 11-{[[MVVVVY}=U=UVVV ""8,,x|##r$   r^   ra   s   @r%   r   r   ,  su        	 	? ? ? ? ?"$T%,%7 $W[\a\hWi $ $ $ $ $ $ $ $r$   r   c                   R     e Zd ZdZ fdZdeej                 dej        fdZ xZ	S )#ZoeDepthRelativeDepthEstimationHeada  
    Relative depth estimation head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
    the predictions to the input resolution after the first convolutional layer (details can be found in DPT's paper's
    supplementary material).
    c                    t                                                       |j        | _        d | _        |j        rt          j        ddddd          | _        |j        }t          j        ||dz  ddd          | _        t          j	        ddd	
          | _
        t          j        |dz  |j        ddd          | _        t          j        |j        dddd          | _        d S )N   )r	   r	   )r   r   rg   r,   r	   r   r   Tr   r   )r-   r.   head_in_indexrk   add_projectionr   rj   r   conv1Upsampleupsamplenum_relative_featuresconv2conv3)r<   r=   r   rA   s      r%   r.   z,ZoeDepthRelativeDepthEstimationHead.__init__i  s    #1  	e iSfV]cdddDO,YxQAaYZ[[[
SWXXXYx1}f.JXYbcmnooo
Yv;QAVWabccc


r$   r   rB   c                    || j                  }| j        1|                     |          } t          j                    |          }|                     |          }|                     |          }|                     |          } t          j                    |          }|}|                     |          } t          j                    |          }|                    d          }||fS )Nr   rD   )	r   rk   r   r   r   r   r   r   squeeze)r<   r   r   r   s       r%   r]   z+ZoeDepthRelativeDepthEstimationHead.forwardx  s    %d&89?& OOM::M%BGIIm44M

=11m44

=11!		-00 

=11!		-00'//A/66((r$   r^   ra   s   @r%   r   r   b  sr         d d d d d)T%,%7 )EL ) ) ) ) ) ) ) )r$   r   Hz>c                     | |z   } ||z   }| t          j        |           z  |t          j        |          z  z
  | |z
  t          j        | |z
  |z             z  z
  S )z%log(nCk) using stirling approximation)r    log)nkr   s      r%   	log_binomr     sY    	CA	CAuy||a%)A,,..!a%59QUS[;Q;Q1QQQr$   c                   6     e Zd Zdej        f fd	ZddZ xZS )LogBinomialSoftmaxr   c           	      x   t                                                       || _        || _        |                     dt          j        d|                              dddd          d           |                     dt          j        | j        dz
  g                              dddd          d           dS )	a7  Compute log binomial distribution for n_classes

        Args:
            n_classes (`int`, *optional*, defaults to 256):
                Number of output classes.
            act (`torch.nn.Module`, *optional*, defaults to `torch.softmax`):
                Activation function to apply to the output.
        k_idxr   r   rF   F)
persistent	k_minus_1N)	r-   r.   r   actregister_bufferr    arangeviewr_   )r<   	n_classesr   rA   s      r%   r.   zLogBinomialSoftmax.__init__  s     	Wel1i&@&@&E&EaQPQ&R&R_deee[%,
|*D*D*I*I!RQRTU*V*Vchiiiiir$         ?-C6?c                    |j         dk    r|                    d          }t          j        d|z
  |d          }t          j        ||d          }t	          | j        | j                  | j        t          j        |          z  z   | j        | j        z
  t          j        |          z  z   }|                     ||z  d          S )a  Compute the log binomial distribution for probabilities.

        Args:
            probabilities (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
                Tensor containing probabilities of each class.
            temperature (`float` or `torch.Tensor` of shape `(batch_size, num_channels, height, width)`, *optional*, defaults to 1):
                Temperature of distribution.
            eps (`float`, *optional*, defaults to 1e-4):
                Small number for numerical stability.

        Returns:
            `torch.Tensor` of shape `(batch_size, num_channels, height, width)`:
                Log binomial distribution logbinomial(p;t).
        r	   r   rD   )	ndimrN   r    clampr   r   r   r   r   )r<   probabilitiestemperaturer   one_minus_probabilitiesys         r%   r]   zLogBinomialSoftmax.forward  s     "")33A66M"'+a-.?a"H"HM3::dndj11j59]3334~
*ei8O.P.PPQ 	

 xxKQx///r$   )r   r   )r   r   r   r    softmaxr.   r]   r`   ra   s   @r%   r   r     s^        !$%- j j j j j j0 0 0 0 0 0 0 0r$   r   c                   *     e Zd Z	 	 d fd	Zd Z xZS )%ZoeDepthConditionalLogBinomialSoftmaxr   r,   c                    t                                                       ||z   |z  }t          j        t          j        ||z   |ddd          t          j                    t          j        |dddd          t          j                              | _        d| _        |j	        | _	        |j
        | _
        t          |t          j                  | _        dS )a  Per-pixel MLP followed by a Conditional Log Binomial softmax.

        Args:
            in_features (`int`):
                Number of input channels in the main feature.
            condition_dim (`int`):
                Number of input channels in the condition feature.
            n_classes (`int`, *optional*, defaults to 256):
                Number of classes.
            bottleneck_factor (`int`, *optional*, defaults to 2):
                Hidden dim factor.

        r   r   rg      r   )r   N)r-   r.   r   r9   rj   GELUSoftplusmlpp_epsmax_tempmin_tempr   r    r   log_binomial_transform)r<   r=   in_featurescondition_dimr   bottleneck_factor
bottleneckrA   s          r%   r.   z.ZoeDepthConditionalLogBinomialSoftmax.__init__  s    * 	!M16GG
=IkM1:1UV`abbbGIIIj%Qq!LLLKMM
 
 
&8&V&V&V###r$   c                    |                      t          j        ||fd                    }|dddddf         |dddddf         }}|| j        z   }|ddddf         |ddddf         |ddddf         z   z  }|| j        z   }|ddddf         |ddddf         |ddddf         z   z  }|                    d          }| j        | j        z
  |z  | j        z   }|                     ||          S )az  
        Args:
            main_feature (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
                Main feature.
            condition_feature (torch.Tensor of shape `(batch_size, num_channels, height, width)`):
                Condition feature.

        Returns:
            `torch.Tensor`:
                Output log binomial distribution
        r   rD   Nr,   .r   )r   r    concatr   rN   r   r   r   )r<   main_featurecondition_featureprobabilities_and_temperaturer   r   s         r%   r]   z-ZoeDepthConditionalLogBinomialSoftmax.forward  s@    )-|M^>_ef1g1g1g(h(h%)!!!RaR*5)!!!QRR*5 #
 &
2%aaaCi0M!!!Q)4L}]^]^]^`acf]fOg4gh!DJ.!!!!Q),AAAq#I0FUVUVUVXY[^U^I_0_`!++A..}t}4CdmS**=+FFFr$   )r   r,   rs   ra   s   @r%   r   r     sb         #W #W #W #W #W #WJG G G G G G Gr$   r   c                   &     e Zd Zd fd	Zd Z xZS )ZoeDepthSeedBinRegressor   r   MbP?
   c                    t                                                       |j        | _        |j        | _        || _        || _        t          j        | j        |ddd          | _	        t          j
        d          | _        t          j        ||ddd          | _        | j        dk    rt          j
        d          nt          j                    | _        dS )ad  Bin center regressor network.

        Can be "normed" or "unnormed". If "normed", bin centers are bounded on the (min_depth, max_depth) interval.

        Args:
            config (`int`):
                Model configuration.
            n_bins (`int`, *optional*, defaults to 16):
                Number of bin centers.
            mlp_dim (`int`, *optional*, defaults to 256):
                Hidden dimension.
            min_depth (`float`, *optional*, defaults to 1e-3):
                Min depth value.
            max_depth (`float`, *optional*, defaults to 10):
                Max depth value.
        r   r   TinplacenormedN)r-   r.   bottleneck_featuresr   bin_centers_type	min_depth	max_depthr   rj   r   r   act1r   r   act2)r<   r=   n_binsmlp_dimr  r  rA   s         r%   r.   z!ZoeDepthSeedBinRegressor.__init__  s    " 	!5 & 7""Yt/!QBB
GD)))	Yw1a88
-1-Bh-N-NBGD))))TVT_TaTa			r$   c                    |                      |          }|                     |          }|                     |          }|                     |          }| j        dk    r|dz   }||                    dd          z  }| j        | j        z
  |z  }t          j	        
                    |dd| j                  }t          j        |d	          }d
|dddddf         |dddddf         z   z  }||fS ||fS )z]
        Returns tensor of bin_width vectors (centers). One vector b for every pixel
        r   r   r   TrE   keepdim)r   r   r   r   r   r   constant)r   valuerD   g      ?NrF   .)r   r  r   r  r   sumr  r  r   r   padr    cumsum)r<   xbin_centersbin_widths_normed
bin_widths	bin_edgess         r%   r]   z ZoeDepthSeedBinRegressor.forward"  s    JJqMMIIaLLJJqMMiill H,,%,K +koo!To.R.R R.4>9=NNJ**:7IPZbfbp*qqJZQ777I111crc3;!7)AAAqrr3J:O!OPK$k11 ++r$   )r   r   r   r   rs   ra   s   @r%   r   r     sR        b b b b b b:, , , , , , ,r$   r   ,  r,   alphagammac                 ^    |                      d||                     |          z  z             S )a:  Inverse attractor: dc = dx / (1 + alpha*dx^gamma), where dx = a - c, a = attractor point, c = bin center, dc = shift in bin center
    This is the default one according to the accompanying paper.

    Args:
        dx (`torch.Tensor`):
            The difference tensor dx = Ai - Cj, where Ai is the attractor point and Cj is the bin center.
        alpha (`float`, *optional*, defaults to 300):
            Proportional Attractor strength. Determines the absolute strength. Lower alpha = greater attraction.
        gamma (`int`, *optional*, defaults to 2):
            Exponential Attractor strength. Determines the "region of influence" and indirectly number of bin centers affected.
            Lower gamma = farther reach.

    Returns:
        torch.Tensor: Delta shifts - dc; New bin centers = Old bin centers + dc
    r   )divpow)dxr  r  s      r%   inv_attractorr  <  s*    " 66!ebffUmm++,,,r$   c                   0     e Zd Z	 	 	 	 d	 fd	Zd
dZ xZS )ZoeDepthAttractorLayerr   r   r   Fc                    t                                                       |j        | _        |j        | _        |j        | _        || _        || _	        || _
        || _        || _        |j        x}}t          j        ||ddd          | _        t          j        d          | _        t          j        ||dz  ddd          | _        t          j        d          | _        dS )zq
        Attractor layer for bin centers. Bin centers are bounded on the interval (min_depth, max_depth)
        r   r   Tr   r,   N)r-   r.   attractor_alphar  attractor_gammagemmaattractor_kindkindn_attractorsr  r  r  memory_efficientbin_embedding_dimr   rj   r   r   r  r   r  
r<   r=   r  r$  r  r  r%  r   r  rA   s
            r%   r.   zZoeDepthAttractorLayer.__init__Q  s     	+
+
)	("" 0 !' 88gY{GQ1==
GD)))	Ywq(8!QBB
GD)))			r$   NTc                 n   |7|r0t           j                            ||j        dd         dd          }||z   }|                     |          }|                     |          }|                     |          }|                     |          }|dz   }|j        \  }}}}	|                    || j	        d||	          }|dddddd	f         }
t           j                            |||	fdd          }| j
        sht          j        t          j        d
| j                 } |t          |
                    d          |                    d          z
            d          }nzt          j        ||j                  }t'          | j	                  D ]5}|t          |
dd|d	f                             d          |z
            z  }6| j        dk    r
|| j	        z  }||z   }| j        | j        z
  |z  | j        z   }t          j        |d          \  }}t          j        || j        | j                  }||fS )ao  
        The forward pass of the attractor layer. This layer predicts the new bin centers based on the previous bin centers
        and the attractor points (the latter are predicted by the MLP).

        Args:
            x (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
                Feature block.
            prev_bin (`torch.Tensor` of shape `(batch_size, prev_number_of_bins, height, width)`):
                Previous bin centers normed.
            prev_bin_embedding (`torch.Tensor`, *optional*):
                Optional previous bin embeddings.
            interpolate (`bool`, *optional*, defaults to `True`):
                Whether to interpolate the previous bin embeddings to the size of the input features.

        Returns:
            `Tuple[`torch.Tensor`, `torch.Tensor`]:
                New bin centers normed and scaled.
        Nr   Tr   r   r   r,   r   .meanr  r   rD   devicer,  )r   r   r   rH   r   r  r   r  r   r$  r%  r    r,  r  r#  r  rN   
zeros_liker.  rw   r  r  sortclip)r<   r  prev_binprev_bin_embeddingr   
attractorsrT   r@   heightwidthattractors_normedr  funcdelta_cr   bin_new_centerss                   r%   r]   zZoeDepthAttractorLayer.forwardp  sP   & ) %']%>%>&:UY &? & &" &&AJJqMMIIaLLJJqMMYYq\\
$&
'1'7$
Avu__Z1BAvuUU
 'qqq!!!Q|4m//65/PZjn/oo $ 	6!Juy99$)DDd=):)D)DQ)G)G+J_J_`aJbJb)bccijkkkGG&{;;MNNNG4,-- b b=):111a9)E)O)OPQ)R)RU`)`aaayF""!D$55%/~6/IDNZKQ777QjdndnMM++r$   )r   r   r   FNTrs   ra   s   @r%   r  r  P  s_        
 * * * * * *><, <, <, <, <, <, <, <,r$   r  c                   0     e Zd Z	 	 	 	 d fd	Zd	dZ xZS )
ZoeDepthAttractorLayerUnnormedr   r   r   Tc                    t                                                       || _        || _        || _        || _        |j        | _        |j        | _        |j	        | _
        || _        |j        x}}t          j        ||ddd          | _        t          j        d          | _        t          j        ||ddd          | _        t          j                    | _        dS )zL
        Attractor layer for bin centers. Bin centers are unbounded
        r   r   Tr   N)r-   r.   r$  r  r  r  r  r  r  r"  r#  r%  r&  r   rj   r   r   r  r   r   r  r'  s
            r%   r.   z'ZoeDepthAttractorLayerUnnormed.__init__  s     	(""+
+
)	 0 & 88gY{GQ1==
GD)))	YwaA>>
KMM			r$   Nc                 r   |7|r0t           j                            ||j        dd         dd          }||z   }|                     |          }|                     |          }|                     |          }|                     |          }|j        dd         \  }}t           j                            |||fdd          }| j        sht          j
        t          j        d| j                 }	 |	t          |                    d          |                    d          z
            d	          }
nzt          j        ||j        
          }
t#          | j                  D ]5}|
t          |dd|df                             d          |z
            z  }
6| j        dk    r
|
| j        z  }
||
z   }|}||fS )a  
        The forward pass of the attractor layer. This layer predicts the new bin centers based on the previous bin centers
        and the attractor points (the latter are predicted by the MLP).

        Args:
            x (`torch.Tensor` of shape (batch_size, num_channels, height, width)`):
                Feature block.
            prev_bin (`torch.Tensor` of shape (batch_size, prev_num_bins, height, width)`):
                Previous bin centers normed.
            prev_bin_embedding (`torch.Tensor`, *optional*):
                Optional previous bin embeddings.
            interpolate (`bool`, *optional*, defaults to `True`):
                Whether to interpolate the previous bin embeddings to the size of the input features.

        Returns:
            `Tuple[`torch.Tensor`, `torch.Tensor`]:
                New bin centers unbounded. Two outputs just to keep the API consistent with the normed version.
        Nr)  r   Tr*  r+  r,   r   rD   r-  .r,  )r   r   r   rH   r   r  r   r  r%  r    r,  r  r#  r  rN   r/  r.  rw   r$  )r<   r  r2  r3  r   r4  r5  r6  r  r8  r9  r   r:  s                r%   r]   z&ZoeDepthAttractorLayerUnnormed.forward  s   & ) %']%>%>&:UY &? & &" &&AJJqMMIIaLLJJqMMYYq\\
"(-m//65/PZjn/oo$ 	6!Juy99$)DDd=)=)=a)@)@;CXCXYZC[C[)[\\bcdddGG&{;;MNNNG4,-- [ [=AAAq#I)>)H)H)K)Kk)YZZZyF""!D$55%/%++r$   )r   r   r   Tr;  rs   ra   s   @r%   r=  r=    s_        
 " " " " " ":3, 3, 3, 3, 3, 3, 3, 3,r$   r=  c                   D     e Zd Zd fd	Zdej        dej        fdZ xZS )ZoeDepthProjector   c                     t                                                       t          j        ||ddd          | _        t          j        d          | _        t          j        ||ddd          | _        dS )a  Projector MLP.

        Args:
            in_features (`int`):
                Number of input channels.
            out_features (`int`):
                Number of output channels.
            mlp_dim (`int`, *optional*, defaults to 128):
                Hidden dimension.
        r   r   Tr   N)r-   r.   r   rj   r   r   r   r   )r<   r   out_featuresr  rA   s       r%   r.   zZoeDepthProjector.__init__  se     	Y{GQ1==
74(((YwaA>>


r$   r\   rB   c                     |                      |          }|                     |          }|                     |          }|S rq   )r   r   r   rr   s     r%   r]   zZoeDepthProjector.forward  s;    zz,//xx--zz,//r$   )rB  )r   r   r   r.   r    r_   r]   r`   ra   s   @r%   rA  rA    sc        ? ? ? ? ? ?"EL U\        r$   rA  c                        e Zd ZdZ fdZdej        dej        fdZ	 	 ddej        d	ej        d
ej        deej	                 dee
         deej                 fdZ xZS )ZoeDepthMultiheadAttentionzKEquivalent implementation of nn.MultiheadAttention with `batch_first=True`.c                 
   t                                                       ||z  dk    rt          d| d| d          || _        t	          ||z            | _        | j        | j        z  | _        t          j        || j                  | _	        t          j        || j                  | _
        t          j        || j                  | _        t          j        ||          | _        t          j        |          | _        d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ())r-   r.   r   num_attention_headsro   attention_head_sizeall_head_sizer   r:   querykeyr  out_projDropoutdropout)r<   r?   rJ  rQ  rA   s       r%   r.   z#ZoeDepthMultiheadAttention.__init__"  s    ,,111K 1 1-1 1 1  
 $7 #&{5H'H#I#I !58PPY{D,>??
9[$*<==Y{D,>??
	+{;;z'**r$   r  rB   c                     |                                 d d         | j        | j        fz   }|                    |          }|                    dddd          S )NrF   r   r,   r   r	   )r   rJ  rK  r   rK   )r<   r  new_x_shapes      r%   transpose_for_scoresz/ZoeDepthMultiheadAttention.transpose_for_scores6  sP    ffhhssmt'?AY&ZZFF;yyAq!$$$r$   NFquerieskeysvaluesattention_maskoutput_attentionsc                 (   |                      |                     |                    }|                      |                     |                    }|                      |                     |                    }t	          j        ||                    dd                    }	|	t          j        | j	                  z  }	||	|z   }	t          j                            |	d          }
|                     |
          }
t	          j        |
|          }|                    dddd                                          }|                                d d         | j        fz   }|                    |          }|                     |          }|r||
fn|f}|S )NrF   r)  rD   r   r,   r   r	   )rT  rM  rN  r  r    matmul	transposemathsqrtrK  r   r   r   rQ  rK   rL   r   rL  r   rO  )r<   rU  rV  rW  rX  rY  query_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                 r%   r]   z"ZoeDepthMultiheadAttention.forward;  sy    //

70C0CDD--dhhtnn==	//

60B0BCC !<Y5H5HR5P5PQQ+di8P.Q.QQ%/.@ -//0@b/II ,,77_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S%**+BCCm446G]=/22mM]r$   )NF)r   r   r   r   r.   r    r_   rT  r   r!   boolr   r]   r`   ra   s   @r%   rG  rG    s        UU+ + + + +(%el %u| % % % % 7;,1% %% l% 	%
 !!23% $D>% 
u|	% % % % % % % %r$   rG  c                   F     e Zd Zd fd	Z	 ddeej                 fdZ xZS )	ZoeDepthTransformerEncoderLayer皙?reluc                 &   t                                                       |j        }|j        }|j        }t          |||          | _        t          j        ||          | _	        t          j
        |          | _        t          j        ||          | _        t          j        |          | _        t          j        |          | _        t          j
        |          | _        t          j
        |          | _        t$          |         | _        d S )N)rQ  )r-   r.   patch_transformer_hidden_size#patch_transformer_intermediate_size%patch_transformer_num_attention_headsrG  	self_attnr   r:   linear1rP  rQ  linear2	LayerNormnorm1norm2dropout1dropout2r
   
activation)r<   r=   rQ  rx  r?   intermediate_sizerJ  rA   s          r%   r.   z(ZoeDepthTransformerEncoderLayer.__init__d  s    :"F$J3KAT^efffy.?@@z'**y!2K@@\+..
\+..

7++
7++ ,r$   Nsrc_maskc           	         |x}}|                      ||||          d         }||                     |          z   }|                     |          }|                     |                     |                     |                     |                                        }||                     |          z   }|                     |          }|S )N)rU  rV  rW  rX  r   )	rp  rv  rt  rr  rQ  rx  rq  rw  ru  )r<   srcrz  rU  rV  src2s         r%   r]   z'ZoeDepthTransformerEncoderLayer.forwardx  s    
 $~~gDU]~^^_`aDMM$'''jjoo||DLLc9J9J)K)KLLMMDMM$'''jjoo
r$   )rj  rk  rq   )	r   r   r   r.   r   r    r_   r]   r`   ra   s   @r%   ri  ri  c  sl        - - - - - -. ,0  5<(       r$   ri  c                   :     e Zd Z fdZdej        fdZd Z xZS )ZoeDepthPatchTransformerEncoderc                    t                                                       j        }t          j        fdt          j                  D                       | _        t          j        |j	        ddd          | _
        dS )zViT-like transformer block

        Args:
            config (`ZoeDepthConfig`):
                Model configuration class defining the model architecture.
        c                 .    g | ]}t                    S r#   )ri  )r   r@   r=   s     r%   r   z<ZoeDepthPatchTransformerEncoder.__init__.<locals>.<listcomp>  s"    iii,V44iiir$   r   r   rg   N)r-   r.   r   r   r0   rw   num_patch_transformer_layerstransformer_encoderrj   rm  embedding_convPxP)r<   r=   rd   rA   s    ` r%   r.   z(ZoeDepthPatchTransformerEncoder.__init__  s     	0#%=iiiieFDg>h>hiii$
 $
  "$=1UV`a"
 "
 "
r$   cpuc           	         t          j        d|||                              d          }t          j        d|d||                              d          }t          j        |t          j        t          j        d|                     |z  z            }||z  }	t          j        t          j        |	          t          j        |	          gd          }	|	                    d          	                    |dd          }	|	S )zGenerate positional encodings

        Args:
            sequence_length (int): Sequence length
            embedding_dim (int): Embedding dimension

        Returns:
            torch.Tensor: Positional encodings.
        r   )dtyper.  r   r,   g     @r-  rD   )
r    r   rN   expr   tensorrI   sincosrepeat)
r<   rT   rW   embedding_dimr.  r  positionindexdiv_termpos_encodings
             r%   positional_encoding_1dz6ZoeDepthPatchTransformerEncoder.positional_encoding_1d  s     <?%OOOYYZ[\\QqfMMMWWXYZZ9Uuygf1U1U1U'V'V&VYf&fghh(*y%)L"9"959\;R;R!SYZ[[[#--!-44;;J1MMr$   c                 x   |                      |                              d          }t          j                            |d          }|                    ddd          }|j        \  }}}||                     ||||j        |j	                  z   }t          d          D ]} | j        |         |          }|S )zForward pass

        Args:
            x (torch.Tensor - NCHW): Input feature tensor

        Returns:
            torch.Tensor - Transformer output embeddings of shape (batch_size, sequence_length, embedding_dim)
        r,   )r   r   r   r   )r.  r  r   )r  rM   r   r   r  rK   rH   r  r.  r  rw   r  )r<   r  
embeddingsrT   rW   r  r   s          r%   r]   z'ZoeDepthPatchTransformerEncoder.forward  s     ++A..66q99
]&&z6::
''1a00
5?5E2
O]$"="=z?PXbXh #> #
 #
 

 q 	A 	AA41!4Z@@JJr$   )	r   r   r   r.   r    float32r  r]   r`   ra   s   @r%   r  r    sh        
 
 
 
 
& Y^ejer    $      r$   r  c                   &     e Zd Zd fdZd Z xZS )ZoeDepthMLPClassifierrB   Nc                     t                                                       |}t          j        ||          | _        t          j                    | _        t          j        ||          | _        d S rq   )r-   r.   r   r:   rq  r   rx  rr  )r<   r   rD  hidden_featuresrA   s       r%   r.   zZoeDepthMLPClassifier.__init__  sU    %yo>>'))y,??r$   c                     |                      |          }|                     |          }|                     |          }|S rq   )rq  rx  rr  )r<   r\   r   s      r%   r]   zZoeDepthMLPClassifier.forward  s;    ||L11|44\22r$   )rB   Nrs   ra   s   @r%   r  r    sR        @ @ @ @ @ @      r$   r  c                   (     e Zd ZdZ fdZd Z xZS )*ZoeDepthMultipleMetricDepthEstimationHeadszn
    Multiple metric depth estimation heads. A MLP classifier is used to route between 2 different heads.
    c                 4   t                                                       j        j        j        | _        j        | _        j        }t          j        ||ddd          | _	        t                    | _        t          dd          | _        | j        dk    rt          n| j        dk    rt          t          j        fd	j        D                       | _        t%          |dz  
          | _        t          j        fdt+          d          D                       | _        t          j        fdj        D                       | _        j        t          j        fdj        D                       | _        d S )Nr   r   rg   rB  r,   r   rD  r   softplusc                 p    i | ]2}|d          t          |d         dz  |d         |d                   3S )namer  r,   r  r  )r  r  r  r  )r   )r   confr&  r=   s     r%   
<dictcomp>zGZoeDepthMultipleMetricDepthEstimationHeads.__init__.<locals>.<dictcomp>  sc     	 	 	  V6>-2";/";/  	 	 	r$   r   rD  r  c                 D    g | ]}t          j        d z            S )r,   r  rA  r   r   r@   r&  r=   s     r%   r   zGZoeDepthMultipleMetricDepthEstimationHeads.__init__.<locals>.<listcomp>  sK         " & 9!2-2    r$   r   c                     i | ]Fd          t          j        fdt          t                              D                       GS )r  c           	      R    g | ]#} |         d          d                   $S )r  r  r  r  r  r#   )r   r   	Attractorr=   configurationr$  s     r%   r   zRZoeDepthMultipleMetricDepthEstimationHeads.__init__.<locals>.<dictcomp>.<listcomp>  sV         "	"#/?&3K&@&3K&@	    r$   )r   r0   rw   rx   )r   r  r  r=   r$  s    @r%   r  zGZoeDepthMultipleMetricDepthEstimationHeads.__init__.<locals>.<dictcomp>  s        " f%r}       "'s<'8'8!9!9  
( 
(  r$   c           
      R    i | ]#}|d          t          |d         d          $S )r  r  r   )r   )r   )r   r  r&  r=   last_ins     r%   r  zGZoeDepthMultipleMetricDepthEstimationHeads.__init__.<locals>.<dictcomp>$  sV     	 	 	 " f%'L%!(+&'( ( (	 	 	r$   )r-   r.   r&  num_attractorsbin_configurationsr   r   r   rj   r   r  patch_transformerr  mlp_classifierr  r=  
ModuleDictseed_bin_regressorsrA  seed_projectorr0   rw   
projectorsr4  r   conditional_log_binomial)r<   r=   r   r  r&  r  r$  rA   s    ` @@@@r%   r.   z3ZoeDepthMultipleMetricDepthEstimationHeads.__init__  s   "4,"("; & 7 %8Y24GUV_`jklll
 "A!H!H3RSTTT  H,,.II"j006I $&=	 	 	 	 	 #5	 	 	$
 $
  0+:KUfjkUk
 
 
 -     q  	
 	
 -      &,%>  
 
" .(*	 	 	 	 	 	 &,%>	 	 	)
 )
%%%r$   c                    |                      |          }|                     |          d d dd d f         }|                     |          }t          j        |                    dd          d          }d | j        D             }	|	t          j        |d                                          	                                         	 fd| j        D             d         }
n!# t          $ r t          d d	          w xY w|
d
         }|
d         }| j                 } ||          \  }}| j        dv r||z
  ||z
  z  }n|}|                     |          }| j                 }t!          | j        ||          D ]'\  }}} ||          } ||||d          \  }}|}|}(|}t$          j                            ||j        dd          dd          }t$          j                            ||j        dd          dd          }| j                 } |||          }t          j        ||z  dd          }||fS )Nr   Tr  rF   rD   c                     g | ]
}|d          S r  r#   )r   r  s     r%   r   zFZoeDepthMultipleMetricDepthEstimationHeads.forward.<locals>.<listcomp><  s    TTT=v&TTTr$   c                 ,    g | ]}|d          k    |S r  r#   )r   r=   bin_configurations_names     r%   r   zFZoeDepthMultipleMetricDepthEstimationHeads.forward.<locals>.<listcomp>@  s(    nnnvF6NVmDmDmFDmDmDmr$   zbin_configurations_name z! not found in bin_configurationssr  r  r   hybrid2r   r)  r   r*  r   )r   r  r  r    r   r  r  argmaxr   item
IndexErrorr   r  r   r  r4  r2   r  r   r   r   rH   r  )r<   outconv_activationr   feature_blocksrelative_depthr  	embeddingr   domain_votenamesr  r  r  seed_bin_regressorr@   seed_bin_centersr2  r3  r4  	projector	attractorr   bin_embeddingbinr  lastr  rZ   r  s                               @r%   r]   z2ZoeDepthMultipleMetricDepthEstimationHeads.forward0  s   JJz"" **1--aaaAAAg6	 ++I66mM$5$5!T$5$J$JPRSSS UTD<STTT"'[b(I(I(I(Q(Q(S(S(X(X(Z(Z"[	tnnnn)@nnnopqDD 	t 	t 	tr8Orrrsss	t %	%	!56MN0033 $999(94Y9NOHH'H!0033_%<=
-0*n-]-] 	/ 	/)Iy'%Ig..M(yBTbfgggCH!.!m//TZ_S]mq/rr11-BCCWaqu1vv#'#@AX#Y $$T=99 iKQ===M!!s   C% %Dr   ra   s   @r%   r  r    sW         R
 R
 R
 R
 R
h1" 1" 1" 1" 1" 1" 1"r$   r  c                   $     e Zd Z fdZd Z xZS )!ZoeDepthMetricDepthEstimationHeadc                   	
 t                                                       j        d         }|d         |d         	|d         j        j        
j        }	| _        | _        || _        j        }t          j
        ||ddd          | _        | j        dk    rt          n| j        dk    rt          t          		          | _        t!          |
          | _        t          j        fdt'          d          D                       | _        t          j        	
fdt'          d          D                       | _        j        dz   }t/          |          | _        d S )Nr   r  r  r  r   rg   r   r  r  r  c                 <    g | ]}t          j                   S )r  r  r  s     r%   r   z>ZoeDepthMetricDepthEstimationHead.__init__.<locals>.<listcomp>  s:        "f.GVghhh  r$   r   c           	      <    g | ]} |                    S ))r  r$  r  r  r#   )r   r   r  r=   r  r  r$  r  s     r%   r   z>ZoeDepthMetricDepthEstimationHead.__init__.<locals>.<listcomp>  sO     	 	 	  	!!-a''  	 	 	r$   )r   )r-   r.   r  r&  r  r   r  r  r   r   rj   r   r  r=  r   r  rA  r  r0   rw   r  r4  r   r   r  )r<   r=   bin_configurationr   r   r  r  r&  r  r  r$  r  rA   s    `    @@@@@@r%   r.   z*ZoeDepthMetricDepthEstimationHead.__init__e  s   "5a8"8,%k2	%k2	"4,!2"" 0 %8Y24GUV_`jklll
  H,,.II"j006I":6Y)#
 #
 #
 0<O^oppp-    q  
 
 -	 	 	 	 	 	 	 	 	 q	 	 	
 
 .2 )N	)
 )
 )
%%%r$   c                    |                      |          }|                     |          \  }}| j        dv r|| j        z
  | j        | j        z
  z  }n|}|                     |          }	t          | j        | j        |          D ]K\  }
}} |
|          } ||||	d          \  }}|	                                }|	                                }	L|}|
                    d          }t          j                            ||j        dd          dd          }t          j        ||gd          }t          j                            ||j        d	d          dd
          }|                     ||          }t          j                            ||j        d	d          dd
          }t          j        ||z  dd          }|d fS )Nr  Tr  r   r,   r   r   rD   r)  r*  r  )r   r  r   r  r  r  r2   r  r4  clonerN   r   r   r   rH   r    rI   r  r  )r<   r  r   r  r  r  r@   r  r2  r3  r  r  r   r  r  r  r  relative_conditioningrZ   s                      r%   r]   z)ZoeDepthMetricDepthEstimationHead.forward  s   JJz"""55a88 $999(4>9dnt~>]^HH'H!0033 .1$/Sa-b-b 	7 	7)Iy'%Ig..M(yBTbfgggCyy{{H!.!4!4!6!6! !/ 8 8 ; ; " 9 9!
122ZW[ !: !
 !
 y$ 56A>>>11-BCCWaqu1vv))$>> m//QWRSS\PZjn/ooiKQ===Dyr$   rs   ra   s   @r%   r  r  d  sH        9
 9
 9
 9
 9
v" " " " " " "r$   r  c                   (    e Zd ZdZeZdZdZdZd Z	dS )ZoeDepthPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    zoedepthpixel_valuesTc                    t          |t          j        t          j        t          j        f          rT|j        j                            d| j        j	                   |j
         |j
        j                                         dS dS t          |t          j                  r?|j
        j                                         |j        j                            d           dS dS )zInitialize the weightsg        )r,  stdNr   )r   r   r:   rj   rl   weightdatanormal_r=   initializer_ranger   zero_rs  fill_)r<   modules     r%   _init_weightsz%ZoeDepthPreTrainedModel._init_weights  s    fry")R5GHII 	* M&&CT[5R&SSS{& &&((((( '&-- 	*K""$$$M$$S)))))	* 	*r$   N)
r   r   r   r   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointingr  r#   r$   r%   r  r    sE         
 "L"$O&*#
* 
* 
* 
* 
*r$   r  aE  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`ViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`]
            for details.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
zU
    ZoeDepth model with one or multiple metric depth estimation head(s) on top.
    c                        e Zd Z fdZ ee           eee          	 	 	 	 dde	j
        dee	j                 dee         dee         dee         d	eee	j                 ef         fd
                        Z xZS )ZoeDepthForDepthEstimationc                 @   t                                          |           t          |          | _        t	          | j        j        d          rGt	          | j        j        d          r-| j        j        j        |_        | j        j        j        | _        nt          d          t          |          | _        t          |          | _        t          |j                  dk    rt!          |          nt#          |          | _        |                                  d S )Nr?   
patch_sizezXZoeDepth assumes the backbone's config to have `hidden_size` and `patch_size` attributesr   )r-   r.   r   backbonehasattrr=   r?   r8   r  r   r   neckr   relative_headrx   r  r  r  metric_head	post_init)r<   r=   rA   s     r%   r.   z#ZoeDepthForDepthEstimation.__init__  s      %f--4='77 	GDMDXZf<g<g 	*.-*>*JF'"m2=DOOj   !((	@HH 6,--11 7v>>>26:: 	 	r$   )output_typer  Nr  labelsrY  output_hidden_statesreturn_dictrB   c                    d}|t          d          ||n| j        j        }||n| j        j        }||n| j        j        }| j                            |||          }|j        }|j        \  }	}	}
}| j	        }|
|z  }||z  }| 
                    |||          \  }}|g|z   }|                     |          \  }}|g|z   }|                     |d         |d         |dd         |          \  }}|                    d          }|s*|||f|dd         z   }n|f|dd         z   }||f|z   n|S t          ||||j        |j        	          S )
a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth depth estimation maps for computing the loss.

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, ZoeDepthForDepthEstimation
        >>> import torch
        >>> import numpy as np
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti")
        >>> model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # interpolate to original size
        >>> post_processed_output = image_processor.post_process_depth_estimation(
        ...     outputs,
        ...     source_sizes=[(image.height, image.width)],
        ... )

        >>> # visualize the prediction
        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
        >>> depth = predicted_depth * 255 / predicted_depth.max()
        >>> depth = depth.detach().cpu().numpy()
        >>> depth = Image.fromarray(depth.astype("uint8"))
        ```NzTraining is not implemented yet)r  rY  r   r   r,   )r  r   r  r  rD   )r   r   r   r   r   )NotImplementedErrorr=   use_return_dictr  rY  r  forward_with_filtered_kwargsfeature_mapsrH   r  r  r  r  r   r   r   r   )r<   r  r  rY  r  r  r   rf  r   r@   r5  r6  r  rR   rS   r   rZ   r  metric_depthr   r   s                        r%   r]   z"ZoeDepthForDepthEstimation.forward  s   ` %&GHHH%0%<kk$+B]$8$D  $+Jj 	 2C1N--TXT_Tq-<</CWh = 
 
  ,*01fe_
+z)"&))M<"U"Uxj=(#'#5#5m#D#D j3&*&6&6"1v#a&QRRao '7 '
 '
#m $+++22 	F(&6D&7122;6)-)9TGf$$vE+('!/)
 
 
 	
r$   )NNNN)r   r   r   r.   r   ZOEDEPTH_INPUTS_DOCSTRINGr   r   _CONFIG_FOR_DOCr    r!   r   
LongTensorrg  r   r   r_   r]   r`   ra   s   @r%   r  r    s           2 +*+DEE+?o^^^ .2,0/3&*]
 ]
']
 )*]
 $D>	]

 'tn]
 d^]
 
uU\"$88	9]
 ]
 ]
 _^ FE]
 ]
 ]
 ]
 ]
r$   r  )r   )r  r,   )@r   r]  dataclassesr   typingr   r   r   r   r    torch.utils.checkpointr   activationsr
   
file_utilsr   r   r   modeling_outputsr   modeling_utilsr   utilsr   r   utils.backbone_utilsr   configuration_zoedepthr   
get_loggerr   loggerr   r   Moduler'   r6   ru   r   ry   r   r   r   r   r   r   jitscriptfloatro   r  r  r=  rA  rG  ri  r  r  r  r  r  ZOEDEPTH_START_DOCSTRINGr  r  r#   r$   r%   <module>r     s      ! ! ! ! ! ! / / / / / / / / / / / /            ! ! ! ! ! !         
 5 4 4 4 4 4 - - - - - - ) ) ) ) ) ) ) ) 1 1 1 1 1 1 2 2 2 2 2 2 
	H	%	% # ? ? ? ? ?; ? ? ?BG G G G Gbi G G GT    bi   0# # # # # # # #0;' ;' ;' ;' ;'") ;' ;' ;'~" " " " " " " "J3$ 3$ 3$ 3$ 3$29 3$ 3$ 3$l)) )) )) )) ))") )) )) ))XR R R R)0 )0 )0 )0 )0 )0 )0 )0X@G @G @G @G @GBI @G @G @GF5, 5, 5, 5, 5,ry 5, 5, 5,p - -U - - - - -&\, \, \, \, \,RY \, \, \,~Q, Q, Q, Q, Q,RY Q, Q, Q,h    	   6B B B B B B B BJ! ! ! ! !bi ! ! !H< < < < <bi < < <~    BI   "J" J" J" J" J" J" J" J"Z^ ^ ^ ^ ^	 ^ ^ ^D* * * * *o * * *0	  "  	 y
 y
 y
 y
 y
!8 y
 y
 y
 y
 y
r$   