
    gw                     T   d Z ddlZddlZddlmZ ddlmZmZm	Z	m
Z
mZ ddlZddlZddlmZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZmZmZ ddlmZ ddlmZm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(  e#j)        e*          Z+dZ,dZ-g dZ.e G d de"                      Z/e G d de"                      Z0 G d dej1                  Z2 G d dej1                  Z3 G d dej1                  Z4 G d dej1                  Z5 G d  d!ej1                  Z6 G d" d#ej1                  Z7 G d$ d%ej1                  Z8 G d& d'ej1                  Z9 G d( d)ej1                  Z: G d* d+ej1                  Z; G d, d-ej1                  Z<d. Z= G d/ d0ej1                  Z> G d1 d2ej1                  Z? G d3 d4ej1                  Z@ G d5 d6ej1                  ZA G d7 d8e          ZBd9ZCd:ZD ed;eC           G d< d=eB                      ZE G d> d?ej1                  ZF G d@ dAej1                  ZG G dB dCej1                  ZH edDeC           G dE dFeB                      ZI G dG dHej1                  ZJ G dI dJej1                  ZK edKeC           G dL dMeB                      ZLdS )NzPyTorch DPT (Dense Prediction Transformers) model.

This implementation is heavily inspired by OpenMMLab's implementation, found here:
https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/dpt_head.py.

    N)	dataclass)ListOptionalSetTupleUnion)nn)CrossEntropyLoss   )ACT2FN)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardreplace_return_docstrings)BaseModelOutputDepthEstimatorOutputSemanticSegmenterOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputlogging	torch_int)load_backbone   )	DPTConfigr   zIntel/dpt-large)r   iA  i   c                   `    e Zd ZU dZdZej        ed<   dZe	e
ej        df                  ed<   dS )*BaseModelOutputWithIntermediateActivationsa#  
    Base class for model's outputs that also contains intermediate activations that can be used at later stages. Useful
    in the context of Vision models.:

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
            Intermediate activations that can be used to compute hidden states of the model at various layers.
    Nlast_hidden_states.intermediate_activations)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r    r   r        `/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/dpt/modeling_dpt.pyr   r   9   sU         	 	 -1)000HLhuU->-C'DELLLLLr)   r   c                       e Zd ZU dZdZej        ed<   dZej        ed<   dZ	e
eej        df                  ed<   dZe
eej        df                  ed<   dZe
eej        df                  ed<   dS )	4BaseModelOutputWithPoolingAndIntermediateActivationsa  
    Base class for model's outputs that also contains a pooling of the last hidden states as well as intermediate
    activations that can be used by the model at later stages.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token) after further processing
            through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
            the classification token after processing through a linear layer and a tanh activation function. The linear
            layer weights are trained from the next sentence prediction (classification) objective during pretraining.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
            Intermediate activations that can be used to compute hidden states of the model at various layers.
    Nlast_hidden_statepooler_output.hidden_states
attentionsr    )r!   r"   r#   r$   r-   r%   r&   r'   r.   r/   r   r   r0   r    r(   r)   r*   r,   r,   J   s          6 ,0u(///'+M5$+++=AM8E%"3S"89:AAA:>Ju0#567>>>HLhuU->-C'DELLLLLr)   r,   c            	       \     e Zd ZdZd fd	ZddZ	 ddej        ded	ed
ej        fdZ	 xZ
S )DPTViTHybridEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    Nc                 &   t                                                       |j        |j        }}|j        |j        }}t          |t          j        j	                  r|n||f}t          |t          j        j	                  r|n||f}|d         |d         z  |d         |d         z  z  }t          |          | _        | j        j        d         }t          | j        j                  dk    r)t          dt          | j        j                             ddg| _        ||j        }	|	dd          }|	d         }n7t          |t          j        j	                  r|n||f}| j        j        d         }|| _        |d         | _        || _        t#          j        ||d          | _        t#          j        t+          j        dd|j                            | _        t#          j        t+          j        d|dz   |j                            | _        d S )Nr   r   r   z1Expected backbone to have 3 output features, got kernel_size)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterabler   backbonechannelslen
ValueErrorresidual_feature_map_indexbackbone_featmap_shaper	   Conv2d
projection	Parameterr%   zeros	cls_tokenposition_embeddings)selfconfigfeature_sizer:   r;   r<   r=   num_patchesfeature_dimfeat_map_shape	__class__s             r*   r9   zDPTViTHybridEmbeddings.__init__u   s   !'!2F4EJ
$*$79Kk#-j+/:R#S#SqZZZdfpYq
#-j+/:R#S#SqZZZdfpYq
!!}
15*Q-:VW=:XY%f--m,R0t}%&&!++nQTUYUbUkQlQlnnooo+,a&'#:N)"##.L(+KK !+<9Q R RtYegsXt  -04K$$Q-()K!LLLek!Q8J&K&KLL#%<A{QPVPb0c0c#d#d   r)   r   c                    |d d d |f         }|d|d f         }t          t          |          dz            }|                    d||d                              dddd          }t          j                            |||fd          }|                    dddd                              d||z  d          }t          j        ||gd	          }|S 
Nr         ?r   r4   r      bilinear)sizemodedim)	r   rD   reshapepermuter	   
functionalinterpolater%   catrN   posembgrid_size_heightgrid_size_widthstart_index
posemb_tokposemb_gridold_grid_sizes           r*   _resize_pos_embedz(DPTViTHybridEmbeddings._resize_pos_embed   s    AAA||O,
Q_-!#k"2"2c"9::!))!]M2NNVVWXZ[]^`abbm//CSUdBelv/ww!))!Q155==aAQTcAceghhJ4!<<<r)   Fpixel_valuesinterpolate_pos_encodingreturn_dictreturnc                    |j         \  }}}}|| j        k    rt          d          |sT|| j        d         k    s|| j        d         k    r2t          d| d| d| j        d          d| j        d          d	          |                     | j        || j        z  || j        z            }|                     |          j        d         }	fd	| j	        D             }
| 
                    |	                              d
                              dd
          }| j                            |dd          }t          j        ||fd          }||z   }|s||
fS t#          ||
          S )NeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   zInput image size (*z) doesn't match model (z).r4   c                 *    g | ]}j         |         S r(   )feature_maps).0indexbackbone_outputs     r*   
<listcomp>z2DPTViTHybridEmbeddings.forward.<locals>.<listcomp>   s!    qqq <U Cqqqr)   rX   r\   )r   r    )shaper<   rE   r:   rk   rM   r;   rB   rt   rF   rI   flatten	transposerL   expandr%   rb   r   )rN   rl   rm   rn   
batch_sizer<   heightwidthrM   featuresoutput_hidden_states
embeddings
cls_tokensrw   s                @r*   forwardzDPTViTHybridEmbeddings.forward   s    3?2D/
L&%4,,,w   ( 	+++u8J/J/J E E E% E E+E E.2oa.@E E E  
 #44$f&?$/AY
 
 --55"/3  rqqqQUQpqqq__X..66q99CCAqII
^**:r2>>
Y
J7Q???
  "55
 	6 455 :)%9
 
 
 	
r)   Nr   )FF)r!   r"   r#   r$   r9   rk   r%   Tensorboolr   __classcell__rT   s   @r*   r2   r2   n   s          e  e  e  e  e  eD    gl)
 )
!L)
DH)
_c)
	)
 )
 )
 )
 )
 )
 )
 )
r)   r2   c                   2     e Zd ZdZ fdZddZddZ xZS )	DPTViTEmbeddingszB
    Construct the CLS token, position and patch embeddings.

    c                    t                                                       t          j        t	          j        dd|j                            | _        t          |          | _	        | j	        j
        }t          j        t	          j        d|dz   |j                            | _        t          j        |j                  | _        || _        d S )Nr   )r8   r9   r	   rJ   r%   rK   r=   rL   DPTViTPatchEmbeddingspatch_embeddingsrQ   rM   Dropouthidden_dropout_probdropoutrO   )rN   rO   rQ   rT   s      r*   r9   zDPTViTEmbeddings.__init__   s    ek!Q8J&K&KLL 5f = =+7#%<A{QPVPb0c0c#d#d z&"<==r)   r   c                    |d d d |f         }|d|d f         }t          |                    d          dz            }|                    d||d                              dddd          }t          j                            |||fd          }|                    dddd                              d||z  d          }t          j        ||gd	          }|S rV   )	r   rZ   r^   r_   r	   r`   ra   r%   rb   rc   s           r*   rk   z"DPTViTEmbeddings._resize_pos_embed   s    AAA||O,
Q_-!+"2"21"5"5"<==!))!]M2NNVVWXZ[]^`abbm//CSUdBelv/ww!))!Q155==aAQTcAceghhJ4!<<<r)   Fc                    |j         \  }}}}| j        j        }|                     | j        ||z  ||z            }|                     |          }	|	                                \  }}
}| j                            |dd          }t          j
        ||	fd          }	|	|z   }	|                     |	          }	|s|	fS t          |	          S )Nr4   r   r\   )r   )ry   rO   r;   rk   rM   r   rZ   rL   r|   r%   rb   r   r   )rN   rl   rn   r}   r<   r~   r   r;   rM   r   seq_len_r   s                r*   r   zDPTViTEmbeddings.forward   s    2>2D/
L&% [+
"44$f
&:EZ<O
 
 **<88
!+!2!2
GQ ^**:r2>>
Y
J7Q???
  "55
\\*--
 	!= 9ZXXXXr)   r   )F)r!   r"   r#   r$   r9   rk   r   r   r   s   @r*   r   r      sr         
       Y Y Y Y Y Y Y Yr)   r   c                   (     e Zd ZdZ fdZd Z xZS )r   z$
    Image to Patch Embedding.

    c                    t                                                       |j        |j        }}|j        |j        }}t          |t          j        j	                  r|n||f}t          |t          j        j	                  r|n||f}|d         |d         z  |d         |d         z  z  }|| _        || _        || _        || _
        t          j        ||||          | _        d S )Nr   r   )r7   stride)r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rQ   r	   rH   rI   )rN   rO   r:   r;   r<   r=   rQ   rT   s          r*   r9   zDPTViTPatchEmbeddings.__init__  s    !'!2F4EJ
$*$79Kk#-j+/:R#S#SqZZZdfpYq
#-j+/:R#S#SqZZZdfpYq
!!}
15*Q-:VW=:XY$$(&)L+:^hiiir)   c                     |j         \  }}}}|| j        k    rt          d          |                     |                              d                              dd          }|S )Nrq   rX   r   )ry   r<   rE   rI   rz   r{   )rN   rl   r}   r<   r~   r   r   s          r*   r   zDPTViTPatchEmbeddings.forward   sm    2>2D/
L&%4,,,w   __\22::1==GG1MM
r)   r!   r"   r#   r$   r9   r   r   r   s   @r*   r   r     sV         
j j j j j      r)   r   c            
            e Zd Zdeddf fdZdej        dej        fdZ	 ddeej                 d	e	de
eej        ej        f         eej                 f         fd
Z xZS )DPTViTSelfAttentionrO   ro   Nc                    t                                                       |j        |j        z  dk    r1t	          |d          s!t          d|j        f d|j         d          |j        | _        t          |j        |j        z            | _        | j        | j        z  | _        t          j
        |j        | j        |j                  | _        t          j
        |j        | j        |j                  | _        t          j
        |j        | j        |j                  | _        t          j        |j                  | _        d S )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .)bias)r8   r9   r=   num_attention_headshasattrrE   intattention_head_sizeall_head_sizer	   Linearqkv_biasquerykeyvaluer   attention_probs_dropout_probr   rN   rO   rT   s     r*   r9   zDPTViTSelfAttention.__init__,  s1    ::a??PVXhHiHi?76#5"7 7 737 7 7  
 $*#= #&v'9F<V'V#W#W !58PPYv143EFO\\\
9V/1C&/ZZZYv143EFO\\\
z&"EFFr)   xc                     |                                 d d         | j        | j        fz   }|                    |          }|                    dddd          S )Nr4   r   rX   r   r   )rZ   r   r   viewr_   )rN   r   new_x_shapes      r*   transpose_for_scoresz(DPTViTSelfAttention.transpose_for_scores>  sP    ffhhssmt'?AY&ZZFF;yyAq!$$$r)   F	head_maskoutput_attentionsc                    |                      |          }|                     |                     |                    }|                     |                     |                    }|                     |          }t	          j        ||                    dd                    }|t          j        | j	                  z  }t          j                            |d          }	|                     |	          }	||	|z  }	t	          j        |	|          }
|
                    dddd                                          }
|
                                d d         | j        fz   }|
                    |          }
|r|
|	fn|
f}|S )Nr4   r5   r\   r   rX   r   r   )r   r   r   r   r%   matmulr{   mathsqrtr   r	   r`   softmaxr   r_   
contiguousrZ   r   r   )rN   r/   r   r   mixed_query_layer	key_layervalue_layerquery_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                r*   r   zDPTViTSelfAttention.forwardC  sr    !JJ}55--dhh}.E.EFF	//

=0I0IJJ//0ABB !<Y5H5HR5P5PQQ+di8P.Q.QQ -//0@b/II ,,77  -	9O_kBB%--aAq99DDFF"/"4"4"6"6ss";t?Q>S"S%**+BCC6G]=/22mM]r)   NF)r!   r"   r#   r   r9   r%   r   r   r   r   r   r   r   r   r   s   @r*   r   r   +  s        Gy GT G G G G G G$%el %u| % % % % bg! !(0(>!Z^!	uU\5</0%2EE	F! ! ! ! ! ! ! !r)   r   c                   ^     e Zd ZdZdeddf fdZdej        dej        dej        fdZ xZ	S )	DPTViTSelfOutputz
    The residual connection is defined in DPTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    rO   ro   Nc                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _        d S r   )	r8   r9   r	   r   r=   denser   r   r   r   s     r*   r9   zDPTViTSelfOutput.__init__n  sJ    Yv163EFF
z&"<==r)   r/   input_tensorc                 Z    |                      |          }|                     |          }|S r   r   r   rN   r/   r   s      r*   r   zDPTViTSelfOutput.forwards  s*    

=11]33r)   )
r!   r"   r#   r$   r   r9   r%   r   r   r   r   s   @r*   r   r   h  s         
>y >T > > > > > >
U\  RWR^        r)   r   c                        e Zd Zdeddf fdZdee         ddfdZ	 	 ddej	        d	e
ej	                 d
edeeej	        ej	        f         eej	                 f         fdZ xZS )DPTViTAttentionrO   ro   Nc                     t                                                       t          |          | _        t	          |          | _        t                      | _        d S r   )r8   r9   r   	attentionr   outputsetpruned_headsr   s     r*   r9   zDPTViTAttention.__init__{  sI    ,V44&v..EEr)   headsc                    t          |          dk    rd S t          || j        j        | j        j        | j                  \  }}t          | j        j        |          | j        _        t          | j        j        |          | j        _        t          | j        j	        |          | j        _	        t          | j
        j        |d          | j
        _        | j        j        t          |          z
  | j        _        | j        j        | j        j        z  | j        _        | j                            |          | _        d S )Nr   r   r\   )rD   r   r   r   r   r   r   r   r   r   r   r   r   union)rN   r   rv   s      r*   prune_headszDPTViTAttention.prune_heads  s   u::??F74>5t~7Y[_[l
 
u
  2$.2FNN/0BEJJ1$.2FNN.t{/@%QOOO .2^-ORUV[R\R\-\*'+~'IDNLn'n$ -33E::r)   Fr/   r   r   c                     |                      |||          }|                     |d         |          }|f|dd          z   }|S )Nr   r   )r   r   )rN   r/   r   r   self_outputsattention_outputr   s          r*   r   zDPTViTAttention.forward  sM     ~~mY@QRR;;|AFF#%QRR(88r)   r   )r!   r"   r#   r   r9   r   r   r   r%   r   r   r   r   r   r   r   r   s   @r*   r   r   z  s        "y "T " " " " " ";S ;d ; ; ; ;, -1"'	 | EL)  	
 
uU\5</0%2EE	F       r)   r   c                   L     e Zd Zdeddf fdZdej        dej        fdZ xZS )DPTViTIntermediaterO   ro   Nc                    t                                                       t          j        |j        |j                  | _        t          |j        t                    rt          |j                 | _        d S |j        | _        d S r   )r8   r9   r	   r   r=   intermediate_sizer   r>   
hidden_actstrr   intermediate_act_fnr   s     r*   r9   zDPTViTIntermediate.__init__  sn    Yv163KLL
f'-- 	9'-f.?'@D$$$'-'8D$$$r)   r/   c                 Z    |                      |          }|                     |          }|S r   )r   r   )rN   r/   s     r*   r   zDPTViTIntermediate.forward  s,    

=1100??r)   	r!   r"   r#   r   r9   r%   r   r   r   r   s   @r*   r   r     sq        9y 9T 9 9 9 9 9 9U\ el        r)   r   c                   Z     e Zd Zdeddf fdZdej        dej        dej        fdZ xZS )DPTViTOutputrO   ro   Nc                     t                                                       t          j        |j        |j                  | _        t          j        |j                  | _	        d S r   )
r8   r9   r	   r   r   r=   r   r   r   r   r   s     r*   r9   zDPTViTOutput.__init__  sJ    Yv79KLL
z&"<==r)   r/   r   c                 d    |                      |          }|                     |          }||z   }|S r   r   r   s      r*   r   zDPTViTOutput.forward  s4    

=11]33%4r)   r   r   s   @r*   r   r     s|        >y >T > > > > > >
U\  RWR^        r)   r   c                        e Zd ZdZdeddf fdZ	 	 ddej        deej                 d	e	de
eej        ej        f         eej                 f         fd
Z xZS )DPTViTLayerz?This corresponds to the Block class in the timm implementation.rO   ro   Nc                 z   t                                                       |j        | _        d| _        t	          |          | _        t          |          | _        t          |          | _	        t          j        |j        |j                  | _        t          j        |j        |j                  | _        d S )Nr   eps)r8   r9   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r	   	LayerNormr=   layer_norm_epslayernorm_beforelayernorm_afterr   s     r*   r9   zDPTViTLayer.__init__  s    '-'E$(00.v66"6** "V-?VEZ [ [ [!|F,>FDYZZZr)   Fr/   r   r   c                    |                      |                     |          ||          }|d         }|dd          }||z   }|                     |          }|                     |          }|                     ||          }|f|z   }|S )N)r   r   r   )r   r   r   r   r   )rN   r/   r   r   self_attention_outputsr   r   layer_outputs           r*   r   zDPTViTLayer.forward  s     "&!!-00/ "0 "
 "

 2!4(, )=8 ++M::((66 {{<??/G+r)   r   )r!   r"   r#   r$   r   r9   r%   r   r   r   r   r   r   r   r   s   @r*   r   r     s        II[y [T [ [ [ [ [ [ -1"'	 | EL)  	
 
uU\5</0%2EE	F       r)   r   c                        e Zd Zdeddf fdZ	 	 	 	 ddej        deej                 d	ed
edede	e
ef         fdZ xZS )DPTViTEncoderrO   ro   Nc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S r(   )r   )ru   r   rO   s     r*   rx   z*DPTViTEncoder.__init__.<locals>.<listcomp>  s!    #a#a#aAK$7$7#a#a#ar)   F)	r8   r9   rO   r	   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   s    `r*   r9   zDPTViTEncoder.__init__  s`    ]#a#a#a#avG_A`A`#a#a#abb
&+###r)   FTr/   r   r   r   rn   c                    |rdnd }|rdnd }t          | j                  D ]h\  }}	|r||fz   }|||         nd }
| j        r%| j        r|                     |	j        ||
|          }n |	||
|          }|d         }|r||d         fz   }i|r||fz   }|st          d |||fD                       S t          |||          S )Nr(   r   r   c              3      K   | ]}||V  	d S r   r(   )ru   vs     r*   	<genexpr>z(DPTViTEncoder.forward.<locals>.<genexpr>  s(      mmq_`_l_l_l_l_lmmr)   )r-   r/   r0   )	enumerater   r   training_gradient_checkpointing_func__call__tupler   )rN   r/   r   r   r   rn   all_hidden_statesall_self_attentionsilayer_modulelayer_head_masklayer_outputss               r*   r   zDPTViTEncoder.forward  sI    #7@BBD$5?bb4(44 	P 	POA|# I$58H$H!.7.CillO* `t} ` $ A A )!#%	! ! !-]OM^ _ _)!,M  P&9]1=M<O&O# 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r)   )NFFT)r!   r"   r#   r   r9   r%   r   r   r   r   r  r   r   r   r   s   @r*   r   r     s        ,y ,T , , , , , , -1"'%* )
 )
|)
 EL))
  	)

 #)
 )
 
uo%	&)
 )
 )
 )
 )
 )
 )
 )
r)   r   c                   l     e Zd ZdZ fdZd Zd Zd	deej	                 deej	                 fdZ
 xZS )
DPTReassembleStagea@  
    This class reassembles the hidden states of the backbone into image-like feature representations at various
    resolutions.

    This happens in 3 stages:
    1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to
       `config.readout_type`.
    2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
    3. Resizing the spatial dimensions (height, width).

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    c                    t                                                       || _        t          j                    | _        |j        r|                     |           n|                     |           |j	        | _	        d S r   )
r8   r9   rO   r	   r   layers	is_hybrid_init_reassemble_dpt_hybrid_init_reassemble_dptneck_ignore_stagesr   s     r*   r9   zDPTReassembleStage.__init__3  st    moo 	.,,V4444%%f---"(";r)   c           	      j   t          t          t          |j                            |j                  D ]r\  }}|dk    r,| j                            t          j                               7|dk    r5| j                            t          ||j        |         |                     s|j
        dk    rt          d|j
         d          t          j                    | _        t          |          }t          t          |j                            D ]}|dk    r>| j                            t          j        t          j                                         F|dk    rS| j                            t          j        t          j        d|z  |          t"          |j                                      dS )a   "
        For DPT-Hybrid the first 2 reassemble layers are set to `nn.Identity()`, please check the official
        implementation: https://github.com/isl-org/DPT/blob/f43ef9e08d70a752195028a51be5e1aff227b913/dpt/vit.py#L438
        for more details.
        r   rC   factorprojectzReadout type z! is not supported for DPT-Hybrid.rX   N)zipr   rD   neck_hidden_sizesreassemble_factorsr  appendr	   IdentityDPTReassembleLayerreadout_typerE   r   readout_projects_get_backbone_hidden_size
Sequentialr   r   r   )rN   rO   r  r  r=   s        r*   r  z.DPTReassembleStage._init_reassemble_dpt_hybrid?  s    U3v'?#@#@AA6C\]] 	t 	tIAvAvv""2;==1111Q""#5fvG_`aGbkq#r#r#rsss)++cV-@cccddd !#/77s634455 	 	AAvv%,,R]2;==-I-IJJJJQ%,,M")AO["I"I6RXRcKdee  		 	r)   c           	      :   t          t          t          |j                            |j                  D ]:\  }}| j                            t          ||j        |         |                     ;|j        dk    rt          j
                    | _        t          |          }t          t          |j                            D ]W}| j                            t          j        t          j        d|z  |          t          |j                                      Vd S d S )Nr  r  rX   )r  r   rD   r  r  r  r  r   r!  r	   r   r"  r#  r$  r   r   r   )rN   rO   r  r  r=   r   s         r*   r  z'DPTReassembleStage._init_reassemble_dptY  s   U3v'?#@#@AA6C\]] 	p 	pIAvK1&6C[\]C^gmnnnoooo)++$&MOOD!3F;;K3v78899  %,,M")AO["I"I6RXRcKdee   	 ,+ r)   Nr/   ro   c                    g }t          |          D ]\  }}|| j        vr|dddf         |ddddf         }}|j        \  }}	}
|||                    ||||
          }n*t	          |	dz            }|                    ||||
          }|                    dddd                                          }|j        }| j        j        dk    r|	                    d                              d          }|
                    d                              |          } | j        |         t          j        ||fd	                    }|                    ddd                              |          }nP| j        j        d
k    r@|	                    d          |
                    d	          z   }|                    |          } | j        |         |          }|                    |           |S )z
        Args:
            hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
                List of hidden states from the backbone.
        Nr   r   rW   r   rX   r  )r   rX   r   r4   add)r  r  ry   r^   r   r_   r   rO   r!  rz   	unsqueeze	expand_asr"  r%   rb   r  r  )rN   r/   patch_heightpatch_widthoutr  hidden_staterL   r}   sequence_lengthr<   rZ   feature_shapereadouts                 r*   r   zDPTReassembleStage.forwarde  s    (77 	% 	%OA|///*6qqq!t*<l111abb5>Q<	<H<N9
O\+0G#/#7#7
LR]_k#l#lLL$_c%9::D#/#7#7
D$P\#]#]L+33Aq!Q??JJLL , 2;+y88#/#7#7#:#:#B#B9#M#ML'11!44>>|LLG#;4#8#;EI|U\F]_a<b<b#c#cL#/#7#71a#@#@#H#H#W#WLL[-66#/#7#7#:#:Y=P=PQS=T=T#TL#/#7#7#F#FL-t{1~l;;JJ|$$$$
r)   NN)r!   r"   r#   r$   r9   r  r  r   r%   r   r   r   r   s   @r*   r  r  #  s         
< 
< 
< 
< 
<  4
 
 
# #T%,%7 #aefkfras # # # # # # # #r)   r  c                 H    | j         | j        du r| j         j        S | j        S r   )backbone_configr  r=   )rO   s    r*   r#  r#    s-    )f.>%.G.G%11!!r)   c                   $     e Zd Z fdZd Z xZS )r   c           	         t                                                       t          |          }t          j        ||d          | _        |dk    r t          j        ||||d          | _        d S |dk    rt          j                    | _        d S |dk     r0t          j        ||dt          d|z            d          | _        d S d S )Nr   )in_channelsout_channelsr7   r   r7   r   paddingr   )
r8   r9   r#  r	   rH   rI   ConvTranspose2dresizer  r   )rN   rO   rC   r  r=   rT   s        r*   r9   zDPTReassembleLayer.__init__  s    /77)(`abbb A::,XxV\blmnnnDKKKq[[+--DKKKaZZ)HhAcRSV\R\ooghiiiDKKK Zr)   c                 Z    |                      |          }|                     |          }|S r   )rI   r;  )rN   r-  s     r*   r   zDPTReassembleLayer.forward  s*    |44{{<00r)   r!   r"   r#   r9   r   r   r   s   @r*   r   r     sL        j j j j j      r)   r   c                   $     e Zd Z fdZd Z xZS )DPTFeatureFusionStagec                    t                                                       t          j                    | _        t          t          |j                            D ])}| j                            t          |                     *d S r   )
r8   r9   r	   r   r  r   rD   r  r  DPTFeatureFusionLayer)rN   rO   r   rT   s      r*   r9   zDPTFeatureFusionStage.__init__  st    moos634455 	> 	>AK4V<<====	> 	>r)   c                    |d d d         }g } | j         d         |d                   }|                    |           t          |dd          | j         dd                    D ]&\  }} |||          }|                    |           '|S )Nr4   r   r   )r  r  r  )rN   r/   fused_hidden_statesfused_hidden_stater-  r   s         r*   r   zDPTFeatureFusionStage.forward  s    %ddd+ +T[^M!,<==""#5666#&}QRR'8$+abb/#J#J 	; 	;L%!&'9<!H!H&&'9::::""r)   r=  r   s   @r*   r?  r?    sG        > > > > ># # # # # # #r)   r?  c                   F     e Zd ZdZ fdZdej        dej        fdZ xZS )DPTPreActResidualLayerz
    ResidualConvUnit, pre-activate residual unit.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    c                    t                                                       |j        | _        |j        |j        n| j         }t          j                    | _        t          j        |j	        |j	        ddd|          | _
        t          j                    | _        t          j        |j	        |j	        ddd|          | _        | j        r>t          j        |j	                  | _        t          j        |j	                  | _        d S d S )Nr   r   )r7   r   r9  r   )r8   r9   !use_batch_norm_in_fusion_residualuse_batch_normuse_bias_in_fusion_residualr	   ReLUactivation1rH   fusion_hidden_sizeconvolution1activation2convolution2BatchNorm2dbatch_norm1batch_norm2)rN   rO   rJ  rT   s      r*   r9   zDPTPreActResidualLayer.__init__  s   $F 1= ..(( 	$ 799I%%,
 
 
 799I%%,
 
 
  	I!~f.GHHD!~f.GHHD	I 	Ir)   r-  ro   c                 (   |}|                      |          }|                     |          }| j        r|                     |          }|                     |          }|                     |          }| j        r|                     |          }||z   S r   )rL  rN  rI  rR  rO  rP  rS  rN   r-  residuals      r*   r   zDPTPreActResidualLayer.forward  s    ''55((66 	:++L99L''55((66 	:++L99Lh&&r)   )	r!   r"   r#   r$   r9   r%   r   r   r   r   s   @r*   rF  rF    sn          I  I  I  I  ID'EL 'U\ ' ' ' ' ' ' ' 'r)   rF  c                   ,     e Zd ZdZd fd	ZddZ xZS )rA  a3  Feature fusion layer, merges feature maps from different stages.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
        align_corners (`bool`, *optional*, defaults to `True`):
            The align_corner setting for bilinear upsample.
    Tc                     t                                                       || _        t          j        |j        |j        dd          | _        t          |          | _        t          |          | _	        d S )Nr   T)r7   r   )
r8   r9   align_cornersr	   rH   rM  rI   rF  residual_layer1residual_layer2)rN   rO   rY  rT   s      r*   r9   zDPTFeatureFusionLayer.__init__  si    *)F$=v?Xfgnrsss5f==5f==r)   Nc                 t   |c|j         |j         k    r;t          j                            ||j         d         |j         d         fdd          }||                     |          z   }|                     |          }t          j                            |dd| j                  }|                     |          }|S )NrX   r   rY   FrZ   r[   rY  scale_factorr[   rY  )ry   r	   r`   ra   rZ  r[  rY  rI   rU  s      r*   r   zDPTFeatureFusionLayer.forward  s    !X^33=44L$6q$9<;Ma;P#QXbrw 5   ($*>*>x*H*HHL++L99}00qzI[ 1 
 
 |44r)   Tr   r   r   s   @r*   rA  rA    s[         > > > > > >       r)   rA  c                   (    e Zd ZdZeZdZdZdZd Z	dS )DPTPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    dptrl   Tc                    t          |t          j        t          j        t          j        f          rT|j        j                            d| j        j	                   |j
         |j
        j                                         dS dS t          |t          j                  r?|j
        j                                         |j        j                            d           dS dS )zInitialize the weightsg        )meanstdNg      ?)r>   r	   r   rH   r:  weightdatanormal_rO   initializer_ranger   zero_r   fill_)rN   modules     r*   _init_weightsz DPTPreTrainedModel._init_weights,  s    fry")R5GHII 	* M&&CT[5R&SSS{& &&((((( '&-- 	*K""$$$M$$S)))))	* 	*r)   N)
r!   r"   r#   r$   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointingrn  r(   r)   r*   rb  rb  !  sE         
 L$O&*#
* 
* 
* 
* 
*r)   rb  aE  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`ViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
aP  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`]
            for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
z]The bare DPT Model transformer outputting raw hidden-states without any specific head on top.c                        e Zd Zd fd	Zd Zd Z ee           ee	e
ede          	 	 	 	 ddej        d	eej                 d
ee         dee         dee         deee
f         fd                        Z xZS )DPTModelTc                    t                                          |           || _        |j        rt	          |          | _        nt          |          | _        t          |          | _        t          j
        |j        |j                  | _        |rt          |          nd | _        |                                  d S )Nr   )r8   r9   rO   r  r2   r   r   r   encoderr	   r   r=   r   	layernormDPTViTPoolerpooler	post_init)rN   rO   add_pooling_layerrT   s      r*   r9   zDPTModel.__init__`  s         	74V<<DOO.v66DO$V,,f&8f>STTT.?Il6***T 	r)   c                 @    | j         j        r| j        S | j        j        S r   )rO   r  r   r   )rN   s    r*   get_input_embeddingszDPTModel.get_input_embeddingsq  s"    ;  	4?"?33r)   c                     |                                 D ]/\  }}| j        j        |         j                            |           0dS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrv  r   r   r   )rN   heads_to_pruner   r   s       r*   _prune_headszDPTModel._prune_headsw  sU    
 +0022 	C 	CLE5Lu%/;;EBBBB	C 	Cr)   vision)
checkpointoutput_typero  modalityexpected_outputNrl   r   r   r   rn   ro   c                 (   ||n| j         j        }||n| j         j        }||n| j         j        }|                     || j         j                  }|                     ||          }|s|d         n|j        }|                     |||||          }|d         }	| 	                    |	          }	| j
        | 
                    |	          nd }
|s!|
|	|
fn|	f}||dd          z   |dd          z   S t          |	|
|j        |j        |j                  S )N)rn   r   r   r   r   rn   r   )r-   r.   r/   r0   r    )rO   r   r   use_return_dictget_head_maskr   r   r   rv  rw  ry  r,   r/   r0   r    )rN   rl   r   r   r   rn   embedding_outputembedding_last_hidden_statesencoder_outputssequence_outputpooled_outputhead_outputss               r*   r   zDPTModel.forward  sk     2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] &&y$+2OPP	??<[?QQBM'v'7':':ScSv$,,(/!5# ' 
 
 *!,..998<8OO444UY 	M?L?XO];;_n^pL/!"""558H8LLLC-')7&1%5%N
 
 
 	
r)   r`  )NNNN)r!   r"   r#   r9   r}  r  r   DPT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr,   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr%   r&   r   r   r   r   r   r   r   s   @r*   rt  rt  [  s)       
     "4 4 4C C C +*+?@@&H$.   26,0/3&*/
 /
'/
 E-./
 $D>	/

 'tn/
 d^/
 
uJJ	K/
 /
 /
  A@/
 /
 /
 /
 /
r)   rt  c                   *     e Zd Zdef fdZd Z xZS )rx  rO   c                     t                                                       t          j        |j        |j                  | _        t          j                    | _        d S r   )r8   r9   r	   r   r=   r   Tanh
activationr   s     r*   r9   zDPTViTPooler.__init__  sC    Yv163EFF
'))r)   c                 r    |d d df         }|                      |          }|                     |          }|S )Nr   )r   r  )rN   r/   first_token_tensorr  s       r*   r   zDPTViTPooler.forward  s@     +111a40

#56666r)   )r!   r"   r#   r   r9   r   r   r   s   @r*   rx  rx    sS        $y $ $ $ $ $ $
      r)   rx  c                   `     e Zd ZdZ fdZddeej                 deej                 fdZ xZ	S )DPTNecka;  
    DPTNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
    input and produces another list of tensors as output. For DPT, it includes 2 stages:

    * DPTReassembleStage
    * DPTFeatureFusionStage.

    Args:
        config (dict): config dict.
    c           
         t                                                       || _        |j        |j        j        dv rd | _        nt          |          | _        t          j                    | _	        |j
        D ]8}| j	                            t          j        ||j        ddd                     9t          |          | _        d S )N)swinv2r   r   Fr7   r9  r   )r8   r9   rO   r3  
model_typereassemble_stager  r	   r   convsr  r  rH   rM  r?  fusion_stage)rN   rO   channelrT   s      r*   r9   zDPTNeck.__init__  s     !-&2H2SWa2a2a$(D!!$6v$>$>D!]__
/ 	s 	sGJbi1JXYcdkpqqqrrrr 2&99r)   Nr/   ro   c                 l    t          |t          t          f          st          d          t	          |          t	           j        j                  k    rt          d           j                             |||          } fdt          |          D             } 
                    |          }|S )z
        Args:
            hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
                List of hidden states from the backbone.
        z2hidden_states should be a tuple or list of tensorszOThe number of hidden states should be equal to the number of neck hidden sizes.Nc                 B    g | ]\  }} j         |         |          S r(   )r  )ru   r  featurerN   s      r*   rx   z#DPTNeck.forward.<locals>.<listcomp>  s-    VVVzq'MDJqM'**VVVr)   )r>   r  list	TypeErrorrD   rO   r  rE   r  r  r  )rN   r/   r*  r+  r   r   s   `     r*   r   zDPTNeck.forward  s     -%77 	RPQQQ}T[%B!C!CCCnooo  , 11-{[[MVVVVY}=U=UVVV ""8,,r)   r1  
r!   r"   r#   r$   r9   r   r%   r   r   r   r   s   @r*   r  r    sz        	 	: : : : :" T%,%7 aefkfras        r)   r  c                   R     e Zd ZdZ fdZdeej                 dej        fdZ xZ	S )DPTDepthEstimationHeada	  
    Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
    the predictions to the input resolution after the first convolutional layer (details can be found in the paper's
    supplementary material).
    c                    t                                                       || _        d | _        |j        rt          j        ddddd          | _        |j        }t          j        t          j        ||dz  ddd          t          j	        ddd	
          t          j        |dz  dddd          t          j
                    t          j        ddddd          t          j
                              | _        d S )N   )r   r   )r   r   r8  rX   r   r   rY   Tr^      r   )r8   r9   rO   rI   add_projectionr	   rH   rM  r$  UpsamplerK  headrN   rO   r   rT   s      r*   r9   zDPTDepthEstimationHead.__init__  s      	e iSfV]cdddDO,MIhA1QPQRRRKQZtLLLIh!mRQq!LLLGIIIb!1a@@@GII
 
			r)   r/   ro   c                     || j         j                 }| j        1|                     |          } t          j                    |          }|                     |          }|                    d          }|S )Nr   r\   )rO   head_in_indexrI   r	   rK  r  squeeze)rN   r/   predicted_depths      r*   r   zDPTDepthEstimationHead.forward  sl    %dk&?@?& OOM::M%BGIIm44M))M22)11a188r)   r  r   s   @r*   r  r    sm         
 
 
 
 
&T%,%7 EL        r)   r  zu
    DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2.
    c                       e Zd Z fdZ ee           eee          	 	 	 	 	 dde	j
        dee	j
                 dee	j                 dee         dee         d	ee         d
eee	j                 ef         fd                        Z xZS )DPTForDepthEstimationc                 T   t                                          |           d | _        |j        du r#|j        |j        t          |          | _        nt          |d          | _        t          |          | _	        t          |          | _        |                                  d S NF)r{  )r8   r9   rB   r  r3  r   rt  rc  r  neckr  r  rz  r   s     r*   r9   zDPTForDepthEstimation.__init__.  s       u$$&*@*LPVP_Pk)&11DMM%@@@DH FOO	 +622	 	r)   r  ro  Nrl   r   labelsr   r   rn   ro   c                     d}|t          d          ||n j        j        }||n j        j        }||n j        j        } j        % j                            |||          }|j        }	n                     |||d|          }|r|j	        n|d         }	 j        j
        s$ fdt          |	dd                   D             }	nV|r|j        nt          |d                   }
|
                     fd	t          |	dd                   D                        |
}	d
\  }} j        j        5 j        j
        du r'|j        \  }}}} j        j        j        }||z  }||z  }                     |	||          }	                     |	          }|s)|r|f|dd         z   }n|f|dd         z   }||f|z   n|S t)          |||r|j	        nd|j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth depth estimation maps for computing the loss.

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DPTForDepthEstimation
        >>> import torch
        >>> import numpy as np
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large")
        >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # interpolate to original size
        >>> post_processed_output = image_processor.post_process_depth_estimation(
        ...     outputs,
        ...     target_sizes=[(image.height, image.width)],
        ... )

        >>> # visualize the prediction
        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
        >>> depth = predicted_depth * 255 / predicted_depth.max()
        >>> depth = depth.detach().cpu().numpy()
        >>> depth = Image.fromarray(depth.astype("uint8"))
        ```NzTraining is not implemented yet)r   r   Tr  r   c                 6    g | ]\  }}|j         j        v |S r(   rO   backbone_out_indicesru   idxr  rN   s      r*   rx   z1DPTForDepthEstimation.forward.<locals>.<listcomp>  s6     ! ! ! ,WPSW[WbWwPwPwGPwPwPwr)   r4   c              3   N   K   | ]\  }}|j         j        d d         v |V   dS rX   Nr  r  s      r*   r  z0DPTForDepthEstimation.forward.<locals>.<genexpr>  sL       . .$Wdk>qrrBBB BBBB. .r)   r1  FrX   )lossr  r/   r0   )NotImplementedErrorrO   r  r   r   rB   forward_with_filtered_kwargsrt   rc  r/   r  r  r    r  extendr3  ry   r;   r  r  r   r0   )rN   rl   r   r  r   r   rn   r  r   r/   backbone_hidden_statesr*  r+  r   r~   r   r;   r  r   s   `                  r*   r   zDPTForDepthEstimation.forward@  s   b %&GHHH%0%<kk$+B]$8$D  $+Jj 	 2C1N--TXT_Tq=$m@@3G[l A  G $0MMhh#"3%)'   G 6APG11gajM ;( 7! ! ! !09-:K0L0L! ! ! NY)o)I)I^bcjkmcn^o^o&&-- . . . .(1-2C(D(D. . .    !7$.!k;&2t{7LPU7U7U"."4Aq&%4?J!Z/L:-K		-{KK))M22 	F# :)+gabbk9)+gabbk9)-)9TGf$$vE#+3GQ'//T)	
 
 
 	
r)   )NNNNN)r!   r"   r#   r9   r   r  r   r   r  r%   r&   r   
LongTensorr   r   r   r   r   r   r   s   @r*   r  r  '  s           $ +*+?@@+?o^^^ 26-1,0/3&*n
 n
'n
 E-.n
 )*	n

 $D>n
 'tnn
 d^n
 
uU\"$88	9n
 n
 n
 _^ A@n
 n
 n
 n
 n
r)   r  c                   N     e Zd Z fdZdeej                 dej        fdZ xZS )DPTSemanticSegmentationHeadc                    t                                                       || _        |j        }t	          j        t	          j        ||ddd          t	          j        |          t	          j                    t	          j	        |j
                  t	          j        ||j        d          t	          j        ddd	                    | _        d S )
Nr   r   Fr  r6   rX   rY   Tr^  )r8   r9   rO   rM  r	   r$  rH   rQ  rK  r   semantic_classifier_dropout
num_labelsr  r  r  s      r*   r9   z$DPTSemanticSegmentationHead.__init__  s    ,MIhaOOON8$$GIIJv9::Ih 1qAAAKQZtLLL
 
			r)   r/   ro   c                 T    || j         j                 }|                     |          }|S r   )rO   r  r  rN   r/   logitss      r*   r   z#DPTSemanticSegmentationHead.forward  s'    %dk&?@=))r)   )	r!   r"   r#   r9   r   r%   r   r   r   r   s   @r*   r  r    sc        
 
 
 
 
T%,%7 EL        r)   r  c                   $     e Zd Z fdZd Z xZS )DPTAuxiliaryHeadc                 ^   t                                                       |j        }t          j        t          j        ||ddd          t          j        |          t          j                    t          j        dd          t          j        ||j	        d                    | _
        d S )Nr   r   Fr  g?r6   )r8   r9   rM  r	   r$  rH   rQ  rK  r   r  r  r  s      r*   r9   zDPTAuxiliaryHead.__init__  s    ,MIhaOOON8$$GIIJsE""Ih 1qAAA
 
			r)   c                 0    |                      |          }|S r   )r  r  s      r*   r   zDPTAuxiliaryHead.forward  s    =))r)   r=  r   s   @r*   r  r    sG        

 

 

 

 

      r)   r  zY
    DPT Model with a semantic segmentation head on top e.g. for ADE20k, CityScapes.
    c                   $    e Zd Z fdZ ee           eee          	 	 	 	 	 	 dde	e
j                 de	e
j                 de	e
j                 de	e         de	e         d	e	e         d
eee
j                 ef         fd                        Z xZS )DPTForSemanticSegmentationc                 (   t                                          |           t          |d          | _        t	          |          | _        t          |          | _        |j        rt          |          nd | _
        |                                  d S r  )r8   r9   rt  rc  r  r  r  r  use_auxiliary_headr  auxiliary_headrz  r   s     r*   r9   z#DPTForSemanticSegmentation.__init__  s       Fe<<< FOO	 077	:@:S].v666Y] 	r)   r  Nrl   r   r  r   r   rn   ro   c                 r    ||n j         j        }||n j         j        }| j         j        dk    rt	          d                               |||d|          }|r|j        n|d         } j         j        s$ fdt          |dd                   D             }nV|r|j	        nt          |d                   }	|	                     fdt          |dd                   D                        |	}                     |	          }                     |          }
d} j                             |d                   }d}|t          j                            |
|j        d
d         dd          }|0t          j                            ||j        d
d         dd          }t'           j         j                  } |||          } |||          }| j         j        |z  z   }|s)|r|
f|dd         z   }n|
f|dd         z   }||f|z   n|S t-          ||
|r|j        nd|j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DPTForSemanticSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large-ade")
        >>> model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr  c                 6    g | ]\  }}|j         j        v |S r(   r  r  s      r*   rx   z6DPTForSemanticSegmentation.forward.<locals>.<listcomp>-  s5       (CCSWS^SsLsLsLsLsLsr)   r4   c              3   N   K   | ]\  }}|j         j        d d         v |V   dS r  r  r  s      r*   r  z5DPTForSemanticSegmentation.forward.<locals>.<genexpr>2  sM       * *(CCSWS^SstutvtvSwLwLwLwLwLwLw* *r)   )r/   r5   rY   Fr]  )ignore_indexrX   )r  r  r/   r0   )rO   r  r   r  rE   rc  r/   r  r  r    r  r  r  r  r  r	   r`   ra   ry   r
   semantic_loss_ignore_indexauxiliary_loss_weightr   r0   )rN   rl   r   r  r   r   rn   r   r/   r  r  auxiliary_logitsr  upsampled_logitsupsampled_auxiliary_logitsloss_fct	main_lossauxiliary_lossr   s   `                  r*   r   z"DPTForSemanticSegmentation.forward  s   F &1%<kk$+B]$8$D  $+Jj 	 $+"8A"="=NOOO((/!%#  
 
 2=L--'!* {$ 
	3   ,5mABB6G,H,H  MM JU%kW%E%EZ^_fgi_jZkZk"")) * * * *,5mABB6G,H,H* * *    3M			>>=))*#22=3DEE!}88V\"##.Zu  9      +-/]-F-F$6<+<:]b .G . .* (T[5[\\\H !16::I%X&@&IINt{@>QQD 	F# 1 WQRR[0 WQRR[0)-)9TGf$$vE&3GQ'//T)	
 
 
 	
r)   )NNNNNN)r!   r"   r#   r9   r   r  r   r   r  r   r%   r&   r  r   r   r   r   r   r   r   s   @r*   r  r    s'            +*+?@@+BQ`aaa 5915-1,0/3&*e
 e
u01e
 E-.e
 )*	e

 $D>e
 'tne
 d^e
 
uU\"$;;	<e
 e
 e
 ba A@e
 e
 e
 e
 e
r)   r  )Mr$   collections.abcr?   r   dataclassesr   typingr   r   r   r   r   r%   torch.utils.checkpointr	   torch.nnr
   activationsr   
file_utilsr   r   r   r   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   utils.backbone_utilsr   configuration_dptr   
get_loggerr!   loggerr  r  r  r   r,   Moduler2   r   r   r   r   r   r   r   r   r   r  r#  r   r?  rF  rA  rb  DPT_START_DOCSTRINGr  rt  rx  r  r  r  r  r  r  r(   r)   r*   <module>r     s         ! ! ! ! ! ! 4 4 4 4 4 4 4 4 4 4 4 4 4 4            % % % % % % ! ! ! ! ! !            _ ^ ^ ^ ^ ^ ^ ^ ^ ^ - - - - - - Q Q Q Q Q Q Q Q 4 4 4 4 4 4 4 4 4 4 1 1 1 1 1 1 ( ( ( ( ( ( 
	H	%	%  ( '  M M M M M M M M   M  M  M  M  M;  M  M  MF`
 `
 `
 `
 `
RY `
 `
 `
F7Y 7Y 7Y 7Y 7Yry 7Y 7Y 7Yt    BI   @9 9 9 9 9") 9 9 9z    ry   $& & & & &bi & & &T       "    29    ' ' ' ' '") ' ' 'V0
 0
 0
 0
 0
BI 0
 0
 0
fe e e e e e e eP" " "       ,# # # # #BI # # #.:' :' :' :' :'RY :' :' :'z" " " " "BI " " "J* * * * * * * *0	  . c W
 W
 W
 W
 W
! W
 W
	 W
v    29   2 2 2 2 2bi 2 2 2j& & & & &RY & & &R  	 C
 C
 C
 C
 C
. C
 C
 C
L    ")   2    ry   &  	 w
 w
 w
 w
 w
!3 w
 w
 w
 w
 w
r)   