
    gMf                       d Z ddlZddlmZ ddlmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZmZ ddlmZ  ej        e          ZdZ G d de	j                  Z G d de	j                  Z 	 	 	 d^dej!        de"de#de$de%f
dZ&	 	 d_dej!        dee#e%f         de#de%fdZ' G d de	j                  Z( G d de	j                  Z) G d  d!e	j                  Z* G d" d#e          Z+ G d$ d%e	j                  Z, G d& d'e	j                  Z- G d( d)e+          Z.d*Z/e G d+ d,e                      Z0e G d- d.e                      Z1e G d/ d0e                      Z2e G d1 d2e                      Z3e G d3 d4e                      Z4e G d5 d6e                      Z5d7ej6        j7        d8ej!        d9ej!        fd:Z8d`d;ej!        d<eej!                 d9ej!        fd=Z9 G d> d?e	j                  Z: G d@ dAe	j                  Z; G dB dCe	j                  Z< G dD dEe	j                  Z= edFe/           G dG dHe+                      Z> G dI dJe	j                  Z? edKe/           G dL dMe+                      Z@ G dN dOe	j                  ZA edPe/           G dQ dRe+                      ZB edSe/           G dT dUe	j                              ZC edVe/           G dW dXe+                      ZD G dY dZe	j                  ZE ed[e/           G d\ d]e+                      ZFdS )azPyTorch PatchTST model.    N)	dataclass)OptionalTupleUnion)nn   )ACT2CLS)BaseModelOutput)PreTrainedModel)NegativeBinomialOutputNormalOutputStudentTOutput)ModelOutputadd_start_docstringslogging   )PatchTSTConfigr   c                   h    e Zd ZdZ	 	 	 	 	 ddededed	ed
ededee         f fdZ	de
j        dedefdZ	 	 	 	 	 dde
j        dee
j                 deee
j                          dee
j                 dee
j                 dedee
j        ee
j                 eee
j                          f         fdZ xZS )PatchTSTAttentionz=Multi-headed attention from 'Attention Is All You Need' paper        FTN	embed_dim	num_headsdropout
is_decoderbias	is_causalconfigc                 
   t                                                       || _        || _        || _        ||z  | _        || _        | j        |z  | j        k    rt          d| j         d| d          | j        dz  | _        || _	        || _
        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        t          j        |||          | _        d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: ).g      ࿩r   )super__init__r   r   r   head_dimr   
ValueErrorscalingr   r   r   Lineark_projv_projq_projout_proj)	selfr   r   r   r   r   r   r   	__class__s	           j/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/patchtst/modeling_patchtst.pyr"   zPatchTSTAttention.__init__)   s    	""!Y.MI%$.883dn 3 3%.3 3 3   }d*$"i	94@@@i	94@@@i	94@@@	)YTBBB    tensorseq_lenbszc                     |                     ||| j        | j                                      dd                                          S )Nr      )viewr   r#   	transpose
contiguous)r+   r/   r0   r1   s       r-   _shapezPatchTSTAttention._shapeH   s<    {{3GGQQRSUVWWbbdddr.   hidden_stateskey_value_statespast_key_valueattention_masklayer_head_maskoutput_attentionsreturnc                 Z
   |du}|                                 \  }}	}
|                     |          | j        z  }|r6|4|d         j        d         |j        d         k    r|d         }|d         }n>|rU|                     |                     |          d|          }|                     |                     |          d|          }n||                     |                     |          d|          }|                     |                     |          d|          }t          j        |d         |gd          }t          j        |d         |gd          }nT|                     |                     |          d|          }|                     |                     |          d|          }| j	        r||f}|| j
        z  d| j        f} |                     ||	|          j        | } |j        | } |j        | }|                     d          }t          j        ||                    dd                    }|                                 || j
        z  |	|fk    r2t!          d|| j
        z  |	|f d|                                            ||                                 |d|	|fk    r+t!          d	|d|	|f d|                                            |                    || j
        |	|          |z   }|                    || j
        z  |	|          }t"          j                            |d          }||                                 | j
        fk    r-t!          d
| j
        f d|                                            |                    dddd          |                    || j
        |	|          z  }|                    || j
        z  |	|          }|r=|                    || j
        |	|          }|                    || j
        z  |	|          }nd}t"          j                            || j        | j                  }t          j        ||          }|                                 || j
        z  |	| j        fk    r7t!          d|| j
        z  |	| j        f d|                                            |                    || j
        |	| j                  }|                    dd          }|                    ||	| j                  }|                     |          }|||fS )z#Input shape: Batch x Time x ChannelNr   r3   r   dimz$Attention weights should be of size z	, but is z!Attention mask should be of size z/Head mask for a single layer should be of size )ptrainingz `attn_output` should be of size )sizer)   r%   shaper7   r'   r(   torchcatr   r   r#   r4   reshapebmmr5   r$   r   
functionalsoftmaxr   rD   r   r*   )r+   r8   r9   r:   r;   r<   r=   is_cross_attentionr1   tgt_len_query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                       r-   forwardzPatchTSTAttention.forwardK   s    .T9',,..Wa {{=11DL@ 	L*q!'*.>.DQ.GGG (*J)!,LL 	LT[[1A%B%BBLLJ;;t{{3C'D'Db#NNLL'T[[%?%?SIIJ;;t{{='A'A2sKKLN1$5z#BJJJJ 9nQ&7%FANNNLL T[[%?%?SIIJ;;t{{='A'A2sKKL? 	8 ),7NDN*B>
Ct{{<#>>CZP'Z'4
+|+Z8//!$$yz/C/CAq/I/IJJ3#7'"JJJ*dn8LgW^7_ * * %%''* *  
 %""$$a'(BBB ta'8Rtt]k]p]p]r]rtt   (,,S$.'7SSVddL',,S4>-A7GTTL},,\r,BB&##%%$.)::: 1t~FW 1 1',,..1 1   +//2q!<<|?P?PQTVZVdfmov?w?wwL',,S4>-A7GTTL 	)
 %1$5$5c4>7T[$\$\!055cDN6JGU\]]LL$(!]**<4<RVR_*``
i
L99#"6!OOO)C$.4H'SWS`3a ) )$$&&) )  
 "&&sDNGT]SS!++Aq11 "))#wGGmmK001>AAr.   )r   FTFN)NNNNF)__name__
__module____qualname____doc__intfloatboolr   r   r"   rG   Tensorr7   r   rY   __classcell__r,   s   @r-   r   r   &   s       GG  +/C CC C 	C
 C C C (C C C C C C>eU\ eC ec e e e e 488<1526"'vB vB|vB #5<0vB !u|!45	vB
 !.vB "%,/vB  vB 
u|Xel3XeEL>Q5RR	SvB vB vB vB vB vB vB vBr.   r   c                   >     e Zd ZdZdef fdZdej        fdZ xZ	S )PatchTSTBatchNormzP
    Compute batch normalization over the sequence length (time) dimension.
    r   c                     t                                                       t          j        |j        |j                  | _        d S )Neps)r!   r"   r   BatchNorm1dd_modelnorm_eps	batchnormr+   r   r,   s     r-   r"   zPatchTSTBatchNorm.__init__   s7    FOLLLr.   inputsc                     |                     dd          }|                     |          }|                     dd          S )a  
        Parameters:
            inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`):
                input for Batch norm calculation
        Returns:
            `torch.Tensor` of shape `(batch_size, sequence_length, d_model)`
        r   r3   )r5   rl   )r+   rn   outputs      r-   rY   zPatchTSTBatchNorm.forward   s@     !!!Q''''1%%%r.   
rZ   r[   r\   r]   r   r"   rG   ra   rY   rb   rc   s   @r-   re   re      sr         M~ M M M M M M
&el 
& 
& 
& 
& 
& 
& 
& 
&r.   re   Frn   
mask_ratiounmasked_channel_indiceschannel_consistent_masking
mask_valuec                    |dk     s|dk    rt          d| d          | j        \  }}}}| j        }	t          |d|z
  z            }
|r0t	          j        |d||	          }|                    d|d          }nt	          j        ||||	          }t	          j        ||||	          }d|ddddd|
f<   t	          j        |d          }t	          j        |d          }t	          j	        |d|	          }|
                    d                              ddd|          }|d|dd|ddddf<   |                     |                                |          }||d
         fS )a  random_masking: Mask the input considering the control variables.

    Args:
        inputs (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, num_features)`):
            The input tensor to mask.
        mask_ratio (`float`):
            Masking ratio applied to mask the input data during random pretraining. It is the number between 0 and 1.
        unmasked_channel_indices (list, *optional*):
            Indices of channels that will not be masked.
        channel_consistent_masking (bool, *optional*, defaults to `False`):
            When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary
            across channels.
        mask_value (int, *optional*, defaults to 0):
            Define the value of masked patches for pretraining.

    Returns:
        `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as input Tensor and mask tensor of shape [bs x c x
        n]
    r   r   zMask ratio z has to be between 0 and 1.deviceNr@   rA   )rB   index.r   )r$   rF   rx   r^   rG   randrepeatonesargsortgather	unsqueezemasked_fillr`   )rn   rr   rs   rt   ru   
batch_sizenum_channelssequence_lengthnum_featuresrx   len_keepnoisemaskids_shuffleids_restoreinputs_masks                   r-   random_maskingr      s   4 A~~qNzNNNOOO>Dl;Jo|]F?a*n566H! U
:q/&IIIQa00 
:|_VTTT :j,OOODDAAAyy -2...K-444K<"K888D>>"$$Q1l;;D+23QQQ(!!!QQQ./$$TYY[[*==KV$$r.   num_forecast_mask_patchesc                    t          |t                    r|g}d |D             }| j        \  }}}}t          j        |||| j                  }	g }
d}t          |          }t          ||          D ]V\  }}|dk    s||k    rt          d| d          t          ||z  |z            }|
	                    |||g           ||z  }Wt          |
d           }
||k     r|
d         d         ||z
  z   |
d         d<   n#||k    r|
d	         d         ||z
  z   |
d	         d<   d}|
D ]\  }}}||z   }d
|	||dd| df<   |}t          j        |	j        d                   }|	|         }	|	                    d	                              d
d
d
|          }	|d|	dd|ddddf<   |                     |	                                |          }||	d         fS )a  Forecast masking that masks the last K patches where K is from the num_forecast_mask_patches.
    If num_forecast_mask_patches is a list, samples in the batch will be randomly masked by numbers defined in the list.

    Parameters:
        inputs (`torch.Tensor`):
            Input of shape `(bs, num_channels, num_patch, patch_length)`
        num_forecast_mask_patches (`list`):
            Number of patches to be masked at the end of each batch sample. e.g. 4 or [3, 5].
        unmasked_channel_indices (`list`, *optional*):
            Indices of channels that are not masked.
        mask_value (`int`, *optional*, defaults to 0):
            Values in the masked patches will be filled by `mask_value`.

    Returns:
        `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape `(bs,
        num_channels , num_patch)` or `(bs, tsg1, tsg2, num_channels, num_patch)`
    c                     g | ]}d S )r    ).0rO   s     r-   
<listcomp>z$forecast_masking.<locals>.<listcomp>.  s    AAA!AAAAr.   rw   r   znum_forecast_mask_patches z6 should be greater than 0 and less than total patches.c                     | d         S )Nr3   r   )xs    r-   <lambda>z"forecast_masking.<locals>.<lambda>@  s
    !A$ r.   )keyr3   r@   r   Nrz   )
isinstancer^   rF   rG   zerosrx   sumzipr$   appendsortedrandpermr   r|   r   r`   )rn   r   rs   ru   forecast_mask_ratiosr   r   r   r   r   t_listtotal_lengthtotal_ratiopatch_lengthratiotemp_lenbatch1	patch_lenrO   batch2permr   s                         r-   forecast_maskingr     sU   0 +S11 @%>$?!AA'@AAA>Dl;Jo|;z<WWWDFL*++K"#<>RSS ! !e1 ? ?q\qqq   zE)K788|UH5666 F///Fj  ay|zL'@Aq	!	
	"	"r
1
)BCr
1F"(  	1h("./VF]AAA	z{{*+>$*Q-((D:D>>"$$Q1l;;D+23QQQ(!!!QQQ./$$TYY[[*==KV$$r.   c                   >     e Zd ZdZdef fdZdej        fdZ xZ	S )PatchTSTPatchifyz
    A class to patchify the time series sequence into different patches

    Returns:
        `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
    r   c                    t                                                       |j        | _        |j        | _        |j        | _        | j        | j        k    r t          d| j         d| j         d          t          | j        | j                  | j        z
  | j        z  dz   | _        | j        | j        | j        dz
  z  z   }| j        |z
  | _	        d S )NzSequence length (z+) has to be greater than the patch length ()r   )
r!   r"   context_lengthr   r   patch_strider$   maxnum_patchessequence_start)r+   r   new_sequence_lengthr,   s      r-   r"   zPatchTSTPatchify.__init__`  s    %4"/"/4#444yD$8yyeievyyy  
   4d6GHH4K\\aearruvv"/$2CtGWZ[G[2\\"25HHr.   past_valuesc                 ,   |j         d         }|| j        k    rt          d| d| j         d          |dd| j        dddf         }|                    d| j        | j                  }|                    dd                                          }|S )a!  
        Parameters:
            past_values (`torch.Tensor` of shape `(batch_size, sequence_length, num_channels)`, *required*):
                Input for patchification

        Returns:
            `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
        zInput sequence length (z%) doesn't match model configuration (r   N)	dimensionrE   step)	rF   r   r$   r   unfoldr   r   r5   r6   )r+   r   r   rp   s       r-   rY   zPatchTSTPatchify.forwardq  s     &+B/d222x/xx`d`txxx   QQQ 3 5 5qqq89$2C$J[\\!!"b))4466r.   rq   rc   s   @r-   r   r   X  sr         I~ I I I I I I"5<        r.   r   c                   >     e Zd ZdZdef fdZdej        fdZ xZ	S )PatchTSTMaskinga  
    Class to perform random or forecast masking.

    Parameters:
        config (`PatchTSTConfig`): model config
    Returns:
        x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
            Masked patched input
        mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
            Bool tensor indicating True on masked points
    r   c                    t                                                       |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        | j        t          | j                  | _        d S d S N)	r!   r"   random_mask_ratiort   	mask_typer   rs   ru   r   rm   s     r-   r"   zPatchTSTMasking.__init__  s    !'!9*0*K'))/)I&(.(G% +(4,243P,Q,QD))) 54r.   patch_inputc                 2   | j         dk    r,t          || j        | j        | j        | j                  \  }}nI| j         dk    r&t          || j        | j        | j                  \  }}nt          d| j          d          |	                                }||fS )a  
        Parameters:
            patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
                Patch input

        Return:
            masked_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
                Masked patched input
            mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
                Bool tensor indicating True on masked points

        random)rn   rr   rs   rt   ru   forecast)rn   r   rs   ru   zInvalid mask type .)
r   r   r   rs   rt   ru   r   r   r$   r`   )r+   r   masked_inputr   s       r-   rY   zPatchTSTMasking.forward  s     >X%%!/"1)-)F+/+J?" " "L$$ ^z))!1"*.*H)-)F?	" " "L$$ C$.CCCDDD yy{{T!!r.   rq   rc   s   @r-   r   r     sr        
 
	R~ 	R 	R 	R 	R 	R 	R!"5< !" !" !" !" !" !" !" !"r.   r   c                   P     e Zd ZdZdef fdZddej        dee	         fdZ
 xZS )	PatchTSTEncoderLayerz 
    PatchTST encoder layer
    r   c           
      
   t                                                       |j        | _        t          |j        |j        |j                  | _        |j        dk    rt          j
        |j                  nt          j                    | _        |j        dk    rt          |          | _        nH|j        dk    r&t          j        |j        |j                  | _        nt%          |j         d          | j        r|j        dk    rt          j
        |j                  nt          j                    | _        |j        dk    rt          |          | _        nH|j        dk    r&t          j        |j        |j                  | _        nt%          |j         d          t          j        t          j        |j        |j        |j                  t3          |j                             |j        dk    rt          j
        |j                  nt          j                    t          j        |j        |j        |j                            | _        |j        dk    rt          j
        |j                  nt          j                    | _        |j        dk    rt          |          | _        nH|j        dk    r&t          j        |j        |j                  | _        nt%          |j         d          |j        | _        d S )N)r   r   r   r   rl   	layernormrg   z$ is not a supported norm layer type.r    ) r!   r"   channel_attentionr   rj   num_attention_headsattention_dropout	self_attnpath_dropoutr   DropoutIdentitydropout_path1	norm_typere   norm_sublayer1	LayerNormrk   r$   dropout_path2norm_sublayer2
Sequentialr&   ffn_dimr   r	   activation_function
ff_dropoutffdropout_path3norm_sublayer3pre_normrm   s     r-   r"   zPatchTSTEncoderLayer.__init__  s   !'!9*n0,
 
 
 AG@SVW@W@WRZ(;<<<]_]h]j]j{**"3F";";D,,"$,v~6?"S"S"SD 0VVVWWW ! 	\DJDWZ[D[D[F,?!@!@!@acalananD;..&7&?&?##![00&(l6>v&W&W&W## F$4!Z!Z!Z[[[ -Ifnfn6;GGGF./11-3->-B-BBJv()))Ifnfn6;GGG	
 
 AG@SVW@W@WRZ(;<<<]_]h]j]j{**"3F";";D,,"$,v~6?"S"S"SD 0VVVWWWr.   Nhidden_stater=   c                 ~   |j         \  }}}}|                    ||z  ||          }| j        rG|                     |                     |          |          \  }}}	||                     |          z   }nF|                     ||          \  }}}	|                     ||                     |          z             }|                    ||||          }| j        r|                    dd          	                                }|                    ||z  ||          }| j        rG|                     | 
                    |          |          \  }}
}	||                     |          z   }nF|                     ||          \  }}
}	| 
                    ||                     |          z             }|                    ||||          }|                    dd          	                                }|                    ||z  ||          }| j        r?||                     |                     |                     |                              z   }n>|                     ||                     |                     |                    z             }|                    ||||          }|f}|r|| j        r||
fn|fz  }|S )a  
        Parameters:
            hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*):
                Past values of the time series
            output_attentions (`bool`, *optional*):
                Whether or not to return the output attention of all layers
        Return:
            `torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`

        )r8   r=   r3   r   )rF   r4   r   r   r   r   rI   r   r5   r6   r   r   r   r   r   )r+   r   r=   r   num_input_channelsr   rj   rX   rU   rO   channel_attn_weightsoutputss               r-   rY   zPatchTSTEncoderLayer.forward  s%    DPCU@
& $((6H)H/[bcc= 	_+/>>"11,??Sd ,: , ,(Kq ($*<*<[*I*IILL ,0>>*>O ,: , ,(Kq  ..|d>P>PQ\>]>]/]^^L $++J8JO]dee ! 	E'11!Q77BBDDL',,Z/-IK]_fggL} c7;~~"&"5"5l"C"CWh 8F 8 8411  ,d.@.@.M.MM 8<~~".BS 8F 8 8411  $22<$BTBTU`BaBa3abb (//
OM_ahiiL'11!Q77BBDDL $((6H)H/[bcc= 	i ($*<*<TWWTEXEXYeEfEf=g=g*h*hhLL  ..|d>P>PQUQXQXYeQfQf>g>g/ghhL $++J8JO]dee/ 	kt?Uj&:;;\h[jjGr.   r   )rZ   r[   r\   r]   r   r"   rG   ra   r   r`   rY   rb   rc   s   @r-   r   r     s         /(~ /( /( /( /( /( /(bQ QEL QXd^ Q Q Q Q Q Q Q Qr.   r   c                   ,    e Zd ZeZdZdZdZd ZddZ	dS )PatchTSTPreTrainedModelmodelr   Fc                 l   t          |t                    rm| j        j        r&t          j                            |j        d           | j        j        dk    r)t          j                            |j	        dd           dS dS t          |t          j
                  r?|j        j                                         |j        j                            d           dS t          |t                     rI|j        j        j                                         |j        j        j                            d           dS t          |t          j        t          j        f          rR|j        j                            d| j        j                   |j        "|j        j                                         dS dS dS )	z$
        Initialize weights
        g{Gz?)stdr   r   g?)meanr         ?N)r   PatchTSTPositionalEncodingr   use_cls_tokenr   initnormal_	cls_tokenpositional_encoding_typeposition_encr   r   datazero_weightfill_re   rl   r&   Conv1dinit_std)r+   modules     r-   _init_weightsz%PatchTSTPreTrainedModel._init_weightsT  s    f899 	){( < 0d;;;{3x?? 3#3GGGGG @?-- 		)K""$$$M$$S))))) 122 	)!&,,...#(..s33333BI 677 	)M&&CT[5I&JJJ{& &&(((((	) 	)&&r.   c                 B    t          |t                    r	||_        d S d S r   )r   PatchTSTEncodergradient_checkpointing)r+   r   values      r-   _set_gradient_checkpointingz3PatchTSTPreTrainedModel._set_gradient_checkpointingj  s,    f00 	2,1F)))	2 	2r.   N)F)
rZ   r[   r\   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointingr   r   r   r.   r-   r   r   N  sM        !L#O&+#) ) ),2 2 2 2 2 2r.   r   c                   :     e Zd Zdef fdZdej        fdZ xZS )PatchTSTEmbeddingr   c                    t                                                       |j        | _        |j        | _        | j        r&t	          j        |j        |j                  | _        d S t	          j	                    | _        t          |j                  D ]9}| j                            t	          j        |j        |j                             :d S r   )r!   r"   r   share_embeddingr   r&   r   rj   input_embedding
ModuleListranger   )r+   r   rO   r,   s      r-   r"   zPatchTSTEmbedding.__init__p  s    "(";%5 	\#%9V-@&.#Q#QD   #%=??D 6455 \ \$++BIf6I6>,Z,Z[[[[\ \r.   r   c                     j         d         }| j        k    rt          d j         d| d           j        r                               }n2 fdt          |          D             }t          j        |d          }|S )a%  
        Parameters:
            patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
                Patch input for embedding
        return:
            `torch.Tensor` of shape `(batch_size, num_channels, num_patches, d_model)`
        r   z&The defined number of input channels (zQ) in the config has to be the same as the number of channels in the batch input (r   c           
      \    g | ](} j         |         d d |d d d d f                   )S r   )r  )r   ir   r+   s     r-   r   z-PatchTSTEmbedding.forward.<locals>.<listcomp>  sE    nnnq1$.q1+aaaAAAqqqj2IJJnnnr.   rA   )rF   r   r$   r  r  r
  rG   stack)r+   r   r   
embeddingss   ``  r-   rY   zPatchTSTEmbedding.forward|  s     ).q1!888j9P j jTfj j j    	8--k::JJnnnnnTYZlTmTmnnnJZQ777Jr.   	rZ   r[   r\   r   r"   rG   ra   rY   rb   rc   s   @r-   r  r  o  sh        
\~ 
\ 
\ 
\ 
\ 
\ 
\5<        r.   r  c                   p     e Zd ZdZdedef fdZedededej	        fd            Z
dej        fdZ xZS )	r   z'
    Class for positional encoding
    r   r   c                    t                                                       |j        | _        |j        | _        |j        r8t	          j        t          j        ddd|j                            | _	        |dz  }| 
                    ||          | _        |j        dk    rt	          j        |j                  nt	          j                    | _        d S )Nr   r   )r!   r"   r   r   r   	ParameterrG   r   rj   r   _init_per   positional_dropoutr   r   r+   r   r   r,   s      r-   r"   z#PatchTSTPositionalEncoding.__init__  s    #1"("; 	\%+aAv~*N*NOODN1K MM&+>> 6<5NQR5R5RBJv0111XZXcXeXe 	r.   r>   c                    | j         dk    r0t          j        t          j        || j                  d          }n:| j         dk    rt          j        || j                  }t          j        d|                              d          }t          j	        t          j        d| j        d          t          j        d          | j        z   z            }t          j        ||z            |d d dd df<   t          j        ||z            |d d dd df<   ||                                z
  }||                                d	z  z  }t          j        |d
          }nt!          | j          d          |S )Nr   Trequires_gradsincosr   r   r3   g     @
   FzN is not a valid positional encoder. Available types are 'random' and 'sincos'.)r   r   r  rG   randnrj   r   aranger   expmathlogsincosr   r   r$   )r   r   r   positiondiv_terms        r-   r  z#PatchTSTPositionalEncoding._init_pe  sy    *h66<K(P(P`deeeLL,88 ;{FNCCL|A{33==a@@Hya!C!CQXHYHY\b\jHjFk!kllH$)Ih.A$B$BLADqD!$)Ih.A$B$BLADqD!',*;*;*=*==L'<+;+;+=+=+BCL<EJJJLL2  C  C  C   r.   r   c                 X   | j         r|                     || j        dd d d f         z             }| j        | j        d dd d f         z   }|                    |j        d         | j        dd          }t          j        ||fd          }n|                     || j        z             }|S )Nr   r   r@   r3   rA   )	r   r  r   r   expandrF   r   rG   rH   )r+   r   r   
cls_tokensr   s        r-   rY   z"PatchTSTPositionalEncoding.forward  s     	T11+@QRSRTRTVWVWVWRW@X2XYYK):2A2qqq5)AAI"))+*;A*>@WY[]_``J 9j+%>AFFFLL  22;AR3RSSLr.   )rZ   r[   r\   r]   r   r^   r"   staticmethodr   r  r  rG   ra   rY   rb   rc   s   @r-   r   r     s         
~ 
C 
 
 
 
 
 
  c bl    \&5<        r.   r   c            	       l     e Zd ZdZdedef fdZ	 	 ddej        de	e
         de	e
         d	efd
Z xZS )r   z
    PatchTST Encoder
    r   r   c                 B   t                                                     d| _        t                    | _        t          |          | _        t          j        fdt          j
                  D                       | _        |                                  d S )NFc                 .    g | ]}t                    S r   )r   )r   r  r   s     r-   r   z,PatchTSTEncoder.__init__.<locals>.<listcomp>  s"    $k$k$ka%9&%A%A$k$k$kr.   )r!   r"   r   r  embedderr   positional_encoderr   r	  r
  num_hidden_layerslayers	post_initr  s    ` r-   r"   zPatchTSTEncoder.__init__  s       &+# *&11"<V["Q"Qm$k$k$k$k5QWQiKjKj$k$k$kll 	r.   Nr   output_hidden_statesr=   r>   c                 <   ||n| j         j        }||n| j         j        }|                     |          }|                     |          }|rdnd}|rdnd}| j        D ]-}|r||fz   } |||          }|d         }|r||d         fz   }.t          |||          S )a  
        Parameters:
            patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
                Past values of the time series
            output_hidden_states (bool, optional): Indicates if hidden states should be outputted.
            output_attentions (bool, optional): Indicates if attentions should be outputted.

        return:
            `BaseModelOutput`
        Nr   )r   r=   r   r   )last_hidden_stater8   
attentions)r   r=   r1  r,  r-  r/  r
   )	r+   r   r1  r=   r   encoder_statesall_attentionsencoder_layerlayer_outputss	            r-   rY   zPatchTSTEncoder.forward  s      2C1N--TXT_Tq$8$D  $+Jj 	
 mmK00..{;;3=0:d![ 
	F 
	FM# B!/</!A)M|WhiiiM )+L  F!/=3C2E!E^hvwwwwr.   NN)rZ   r[   r\   r]   r   r^   r"   rG   ra   r   r`   r
   rY   rb   rc   s   @r-   r   r     s         ~ C      " 04,0	(x (x\(x 'tn(x $D>	(x
 
(x (x (x (x (x (x (x (xr.   r   aM  
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`PatchTSTConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
c                       e Zd ZU dZdZej        ed<   dZe	e
ej                          ed<   dZe	e
ej                          ed<   dZej        ed<   dZej        ed<   dZej        ed<   dZej        ed	<   dS )
PatchTSTModelOutputa  
    Base class for model's outputs, with potential hidden states.

    Parameters:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of
            the model at the output of each layer plus the optional initial embedding outputs.
        mask: (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`, *optional*)
            Bool masked tensor indicating which patches are masked
        loc: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*)
            Mean of the input data (batch_size, sequence_length, num_channels) over the sequence_length
        scale: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*)
            Std of the input data (batch_size, sequence_length, num_channels) over the sequence_length
        patch_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`):
            Patched input to the Transformer
    Nr3  r8   r4  r   locscaler   )rZ   r[   r\   r]   r3  rG   FloatTensor__annotations__r8   r   r   r4  r   r<  r=  r   r   r.   r-   r;  r;    s          ( ,0u(///8<M8E%"345<<<59Ju012999"D%
"""!C	!!!#E5###%)K")))))r.   r;  c                       e Zd ZU dZdZeej                 ed<   dZ	ej        ed<   dZ
eeej                          ed<   dZeeej                          ed<   dS )PatchTSTForPretrainingOutputa  
    Output type of [`PatchTSTForPretraining`].

    Parameters:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            MSE loss.
        prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction outputs of the time series modeling heads.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlossprediction_outputr8   r4  )rZ   r[   r\   r]   rB  r   rG   r>  r?  rC  r8   r   r4  r   r.   r-   rA  rA  9  s          * )-D(5$
%,,,+/u(///8<M8E%"345<<<59Ju01299999r.   rA  c                       e Zd ZU dZdZeej                 ed<   dZ	ej        ed<   dZ
eeej                          ed<   dZeeej                          ed<   dS )PatchTSTForRegressionOutputa  
    Output type of [`PatchTSTForRegression`].

    Parameters:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            MSE loss.
        regression_outputs (`torch.FloatTensor` of shape `(batch_size, num_targets)`):
            Regression outputs of the time series modeling heads.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    NrB  regression_outputsr8   r4  )rZ   r[   r\   r]   rB  r   rG   r>  r?  rF  r8   r   r4  r   r.   r-   rE  rE  V  s          * )-D(5$
%,,,,0)0008<M8E%"345<<<59Ju01299999r.   rE  c                       e Zd ZU dZdZeej                 ed<   dZ	ej        ed<   dZ
eeej                          ed<   dZeeej                          ed<   dZej        ed<   dZej        ed<   dS )	PatchTSTForPredictionOutputaR  
    Output type of [`PatchTSTForPrediction`].

    Parameters:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            MSE loss.
        prediction_outputs (`torch.FloatTensor` of shape `(batch_size, prediction_length, -1)`):
            Prediction outputs of the time series modeling heads.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        loc: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*)
            Mean of the input data (batch_size, sequence_length, num_channels) over the sequence_length
        scale: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*)
            Std of the input data (batch_size, sequence_length, num_channels) over the sequence_length
    NrB  prediction_outputsr8   r4  r<  r=  )rZ   r[   r\   r]   rB  r   rG   r>  r?  rI  r8   r   r4  r<  r=  r   r.   r-   rH  rH  s  s          2 )-D(5$
%,,,,0)0008<M8E%"345<<<59Ju012999!C	!!!#E5#####r.   rH  c                       e Zd ZU dZdZeej                 ed<   dZ	ej        ed<   dZ
eeej                          ed<   dZeeej                          ed<   dS )PatchTSTForClassificationOutputaR  
    Output type of [`PatchTSTForClassification`].

    Parameters:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`torch.FloatTensor` of shape `(batch_size, num_targets)`):
            Prediction scores of the PatchTST modeling head (scores before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    NrB  prediction_logitsr8   r4  )rZ   r[   r\   r]   rB  r   rG   r>  r?  rL  r8   r   r4  r   r.   r-   rK  rK    s          , )-D(5$
%,,,+/u(///8<M8E%"345<<<59Ju01299999r.   rK  c                   ,    e Zd ZU dZdZej        ed<   dS )SamplePatchTSTOutputa!  
    Base class for time series model's predictions outputs that contains the sampled values from the chosen
    distribution.

    Parameters:
        sequences `(batch_size, num_samples, prediction_length, num_targets)`):
                Sampled values from the chosen distribution.
    N	sequences)rZ   r[   r\   r]   rO  rG   r>  r?  r   r.   r-   rN  rN    s1           $(Iu '''''r.   rN  inputtargetr>   c                 .    |                      |           S )zc
    Computes the negative log likelihood loss from input distribution with respect to target.
    )log_prob)rP  rQ  s     r-   nllrT    s     NN6""""r.   input_tensorweightsc                 n   |t          j        |dk    | |z  t          j        |                     }t          j        |r|                    |          n|                                d          }|r|                    |          n|                                |z  S |                     |          S )aj  
    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.

    Args:
        input_tensor (`torch.FloatTensor`):
            Input tensor, of which the average must be computed.
        weights (`torch.FloatTensor`, *optional*):
            Weights tensor, of the same shape as `input_tensor`.
        dim (`int`, *optional*):
            The dim along which to average `input_tensor`.

    Returns:
        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
    Nr   rA   r   min)rG   where
zeros_likeclampr   r   )rU  rV  rB   weighted_tensorsum_weightss        r-   weighted_averager_    s      +glL74JEL\]iLjLjkkk#"P'++#+"6"6"67;;==VYZZZ03N###,,,9L9L9N9NR]]]  S )))r.   c            	            e Zd ZdZdef fdZdej        dej        deej        ej        ej        f         fdZ	 xZ
S )PatchTSTStdScalerz
    Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by
    subtracting from the mean and dividing by the standard deviation.
    r   c                     t                                                       t          |d          r|j        nd| _        t          |d          r|j        nd| _        t          |d          r|j        nd| _        d S )Nscaling_dimr   keepdimTminimum_scalegh㈵>)r!   r"   hasattrrc  rB   rd  re  rm   s     r-   r"   zPatchTSTStdScaler.__init__  sy    )0)G)GN6%%Q)0)C)CMv~~5<V_5U5U_V11[_r.   r   observed_indicatorr>   c                 d   |                     | j        | j                  }|                    d          }||z                       | j        | j                  |z  }||z
  |z  dz                       | j        | j                  |z  }t	          j        || j        z             }||z
  |z  ||fS )C  
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                input for Batch norm calculation
            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Calculating the scale on the observed indicator.
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, num_input_channels)`)
        rd  r   r3   )r   rB   rd  	clamp_minrG   sqrtre  )r+   r   rg  denominatorr<  variancer=  s          r-   rY   zPatchTSTStdScaler.forward  s     ),,TXt|,LL!++C00((--dh-MMP[[Sj$661<AA$(TXT`Aaadoo
8d&8899s
e#S%//r.   rZ   r[   r\   r]   r   r"   rG   ra   r   rY   rb   rc   s   @r-   ra  ra    s         
`~ ` ` ` ` ` `0L06;l0	u|U\5<7	80 0 0 0 0 0 0 0r.   ra  c            	            e Zd ZdZdef fdZdej        dej        deej        ej        ej        f         fdZ	 xZ
S )PatchTSTMeanScalerz
    Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
    accordingly.
    r   c                 8   t                                                       t          |d          r|j        nd| _        t          |d          r|j        nd| _        t          |d          r|j        nd| _        t          |d          r|j        nd | _        d S )Nrc  r   rd  Tre  绽|=default_scale)r!   r"   rf  rc  rB   rd  re  rt  rm   s     r-   r"   zPatchTSTMeanScaler.__init__  s    )0)G)GN6%%Q)0)C)CMv~~5<V_5U5U`V11[`5<V_5U5U_V11[_r.   r   rg  r>   c                    ||z                                                       | j        d          }|                    | j        d          }|t          j        |d          z  }| j        W|                    d          }t          j        |                    d          d          }t          j        ||z            }n| j        t          j        |          z  }t          j        |dk    ||          }t          j        || j	                  }||z  }	| j
        s|                    | j                  }|	t          j        |          |fS )ri  Trj  r   rX  Nr   rA   )absr   rB   rG   r\  rt  squeeze	ones_likerZ  re  rd  r[  )
r+   r   rg  ts_sumnum_observedr=  	batch_sumbatch_observationsrt  scaled_datas
             r-   rY   zPatchTSTMeanScaler.forward  sE    ++002266tx6NN)--dh-EE\q9999 %

q
))I!&\-=-=a-@-@a!H!H!H!M)6H*HIIMM .1G1GGM L1,e]CC Et'9:::Ul| 	0MMdhM//EE,U33U::r.   ro  rc   s   @r-   rq  rq    s         
`~ ` ` ` ` ` `&;L&;6;l&;	u|U\5<7	8&; &; &; &; &; &; &; &;r.   rq  c            
            e Zd ZdZdef fdZ	 d	dej        dej        deej        ej        ej        f         fdZ	 xZ
S )
PatchTSTNOPScalerz|
    Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
    r   c                     t                                                       t          |d          r|j        nd| _        t          |d          r|j        nd| _        d S )Nrc  r   rd  T)r!   r"   rf  rc  rB   rd  rm   s     r-   r"   zPatchTSTNOPScaler.__init__D  sW    )0)G)GN6%%Q)0)C)CMv~~r.   Nr   rg  r>   c                     t          j        |d                              | j        | j                  }t          j        |d                              | j        | j                  }|||fS )a  
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                input for Batch norm calculation
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, num_input_channels)`)
        Fr  )rB   rd  )rG   rx  r   rB   rd  r[  )r+   r   rg  r=  r<  s        r-   rY   zPatchTSTNOPScaler.forwardI  sl     E:::??DHVZVb?cct5999>>48UYUa>bbS%r.   r   ro  rc   s   @r-   r  r  ?  s         N~ N N N N N N FJ   L 6;l 	u|U\5<7	8               r.   r  c            	       |     e Zd Zdef fdZdej        dej        deej        ej        ej        f         fdZ xZ	S )PatchTSTScalerr   c                    t                                                       |j        dk    s	|j        du rt          |          | _        d S |j        dk    rt          |          | _        d S t          |          | _        d S )Nr   Tr   )r!   r"   r%   rq  scalerra  r  rm   s     r-   r"   zPatchTSTScaler.__init__[  sx    >V##v~'='=,V44DKKK^u$$+F33DKKK+F33DKKKr.   r   rg  r>   c                 @    |                      ||          \  }}}|||fS )a>  
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Input for scaler calculation
            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Calculating the scale on the observed indicator.
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, um_input_channels)`)
        )r  )r+   r   rg  r<  r=  s        r-   rY   zPatchTSTScaler.forwardd  s,      ;;t-?@@c5S%r.   )
rZ   r[   r\   r   r"   rG   ra   r   rY   rb   rc   s   @r-   r  r  Z  s        4~ 4 4 4 4 4 4 L 6;l 	u|U\5<7	8               r.   r  zOThe bare PatchTST Model outputting raw hidden-states without any specific head.c                        e Zd Zdef fdZ	 	 	 	 	 ddej        deej                 deej                 dee         dee         d	ee         d
e	e
ef         fdZ xZS )PatchTSTModelr   c                    t                                          |           t          |          | _        t	          |          | _        |j        | _        | j        j        }| j        rt          |          | _	        nt          j                    | _	        t          ||          | _        |                                  d S )N)r   )r!   r"   r  r  r   
patchifierdo_mask_inputr   r   maskingr   r   r   encoderr0  r  s      r-   r"   zPatchTSTModel.__init__{  s       $V,,*622#1o1 	)*622DLL;==DL&v;GGG 	r.   Nr   past_observed_maskfuture_valuesr1  r=   return_dictr>   c           	      D   ||n| j         j        }||n| j         j        }||n| j         j        }|t	          j        |          }|                     ||          \  }}}	|                     |          }
| j        r| 	                    |
          \  }}n| 	                    |
          d}}| 
                    |||          }|s6|j        |j        |j        f}||||	|
fz   }t          d |D                       S t          |j        |j        |j        |||	|
          S )a  
        Parameters:
            past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
                Input sequence to the model
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
            future_values (`torch.BoolTensor` of shape `(batch_size, prediction_length, num_input_channels)`, *optional*):
                Future target values associated with the `past_values`
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers
            output_attentions (`bool`, *optional*):
                Whether or not to return the output attention of all layers
            return_dict (`bool`, *optional*):
                Whether or not to return a `ModelOutput` instead of a plain tuple.

        Returns:
            `PatchTSTModelOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False)

        Examples:

        ```python
        >>> from huggingface_hub import hf_hub_download
        >>> import torch
        >>> from transformers import PatchTSTModel

        >>> file = hf_hub_download(
        ...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
        ... )
        >>> batch = torch.load(file)

        >>> model = PatchTSTModel.from_pretrained("namctin/patchtst_etth1_pretrain")

        >>> # during training, one provides both past and future values
        >>> outputs = model(
        ...     past_values=batch["past_values"],
        ...     future_values=batch["future_values"],
        ... )

        >>> last_hidden_state = outputs.last_hidden_state
        ```N)r   r1  r=   c              3      K   | ]}||V  	d S r   r   )r   vs     r-   	<genexpr>z(PatchTSTModel.forward.<locals>.<genexpr>  s"      ==qq}}}}}==r.   )r3  r8   r4  r   r<  r=  r   )r   use_return_dictr=   r1  rG   rx  r  r  r  r  r  r3  r8   r4  tupler;  )r+   r   r  r  r1  r=   r  scaled_past_valuesr<  r=  patched_valuesmasked_valuesr   encoder_outputr   s                  r-   rY   zPatchTSTModel.forward  sq   l &1%<kk$+B]1B1N--TXT_Tq$8$D  $+Jj 	 %!&!=!= *.[BT)U)U&C );<< 	E"&,,~">">M44"&,,~">">4M%<Pdu & 
 
  	>%79UWeWpqGsE> BBG==G======",>(6%0&
 
 
 	
r.   NNNNN)rZ   r[   r\   r   r"   rG   ra   r   r`   r   r   r;  rY   rb   rc   s   @r-   r  r  v  s        
~      * 6:04/3,0&*Z
 Z
\Z
 %U\2Z
  -	Z

 'tnZ
 $D>Z
 d^Z
 
u))	*Z
 Z
 Z
 Z
 Z
 Z
 Z
 Z
r.   r  c                   L     e Zd ZdZdef fdZdej        dej        fdZ xZ	S )PatchTSTMaskPretrainHeadz-
    Pretraining head for mask modelling
    r   c                     t                                                       |j        dk    rt          j        |j                  nt          j                    | _        t          j        |j        |j	                  | _
        |j        | _        d S Nr   )r!   r"   head_dropoutr   r   r   r   r&   rj   r   linearr   rm   s     r-   r"   z!PatchTSTMaskPretrainHead.__init__  ss    :@:MPQ:Q:Qrz&"5666WYWbWdWdi0CDD#1r.   	embeddingr>   c                     |                      |                     |                    }| j        r|ddddddddf         }|S )a  
        Parameters:
            embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                    `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
                Embedding from the model
        Returns:
            `torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                            `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True

        Nr   )r  r   r   )r+   r  s     r-   rY   z PatchTSTMaskPretrainHead.forward  sT     KKY 7 788	 	/!!!!QQQAAA+.Ir.   rq   rc   s   @r-   r  r    st         2~ 2 2 2 2 2 2 %,        r.   r  z The PatchTST for pretrain model.c                        e Zd Zdef fdZ	 	 	 	 ddej        deej                 dee         dee         dee         d	e	e
ef         fd
Z xZS )PatchTSTForPretrainingr   c                     t                                          |           d|_        t          |          | _        t          |          | _        |                                  d S )NT)r   )r!   r"   r  r  r   r  headr0  rm   s     r-   r"   zPatchTSTForPretraining.__init__  s\       #"&111
,V44	 	r.   Nr   r  r1  r=   r  r>   c                    ||n| j         j        }|                     ||||d          }|                     |j                  }t          j        d          } |||j                  }	|	                    d          |j	        z  
                                |j	        
                                dz   z  }
|j        }|s|f|d	d
         z   }|
|
f|z   n|}|S t          |
|||j                  S )a	  
        Parameters:
            past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
                Input sequence to the model
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers
            output_attentions (`bool`, *optional*):
                Whether or not to return the output attention of all layers
            return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple.

        Returns:
            `PatchTSTForPretrainingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or
            `config.return_dict`=False)

        Examples:

        ```python
        >>> from huggingface_hub import hf_hub_download
        >>> import torch
        >>> from transformers import PatchTSTConfig, PatchTSTForPretraining

        >>> file = hf_hub_download(
        ...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
        ... )
        >>> batch = torch.load(file)

        >>> # Config for random mask pretraining
        >>> config = PatchTSTConfig(
        ...     num_input_channels=7,
        ...     context_length=512,
        ...     patch_length=12,
        ...     stride=12,
        ...     mask_type='random',
        ...     random_mask_ratio=0.4,
        ...     use_cls_token=True,
        ... )
        >>> # Config for forecast mask pretraining
        >>> config = PatchTSTConfig(
        ...     num_input_channels=7,
        ...     context_length=512,
        ...     patch_length=12,
        ...     stride=12,
        ...     mask_type='forecast',
        ...     num_forecast_mask_patches=5,
        ...     use_cls_token=True,
        ... )
        >>> model = PatchTSTForPretraining(config)

        >>> # during training, one provides both past and future values
        >>> outputs = model(past_values=batch["past_values"])

        >>> loss = outputs.loss
        >>> loss.backward()
        ```NTr   r  r1  r=   r  none	reductionr@   rA   rs  r   )rB  rC  r8   r4  )r   r  r   r  r3  r   MSELossr   r   r   r   r8   rA  r4  )r+   r   r  r1  r=   r  model_outputx_hatrB  loss_valmasked_lossr5  r   s                r-   rY   zPatchTSTForPretraining.forward  s$   J &1%<kk$+B] zz#1!5/ " 
 
 		,899 zF+++4|788}}},,|/@@EEGG<K\K`K`KbKbejKjk%3 	had!33G2=2I{nw..wGN+^`l`w
 
 
 	
r.   )NNNN)rZ   r[   r\   r   r"   rG   ra   r   r`   r   r   rA  rY   rb   rc   s   @r-   r  r    s        
~       6:/3,0&*a
 a
\a
 %U\2a
 'tn	a

 $D>a
 d^a
 
u22	3a
 a
 a
 a
 a
 a
 a
 a
r.   r  c                   :     e Zd Zdef fdZdej        fdZ xZS )PatchTSTClassificationHeadr   c                 |   t                                                       |j        | _        |j        | _        t	          j        d          | _        |j        dk    rt	          j        |j                  nt	          j	                    | _
        t	          j        |j        |j        z  |j                  | _        d S Nr   	start_dimr   )r!   r"   r   pooling_typer   Flattenflattenr  r   r   r   r&   r   rj   num_targetsr  rm   s     r-   r"   z#PatchTSTClassificationHead.__init__z  s    #1"/zA...:@:MPQ:Q:Qrz&"5666WYWbWdWdi 9FN JFL^__r.   r  c                 v   | j         r|dddddddf         }na| j        dk    r|                    d          }n?| j        dk    r|                    d          j        }nt          d| j         d          |                     |          }|                     |                     |                    }|S )	a[  
        Parameters:
            embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                     `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
                Embedding from the model
        Returns:
            `torch.Tensor` of shape `(bs, num_targets)`

        Nr   r   r3   rA   r   pooling operator  is not implemented yet)	r   r  r   r   valuesr$   r  r  r   r+   r  pooled_embeddingrp   s       r-   rY   z"PatchTSTClassificationHead.forward  s      
	](AAAq!!!4&(((~~!~44%''(}}}33:[1B[[[\\\<<(899T\\*:;;<<r.   r  rc   s   @r-   r  r  y  sh        `~ ` ` ` ` ` `        r.   r  z&The PatchTST for classification model.c                        e Zd Zdef fdZ	 	 	 	 	 ddej        dej        dee         dee         dee         d	ee         d
e	e
ef         fdZ xZS )PatchTSTForClassificationr   c                    t                                          |           |j        r!t                              d           d|_        t          |          | _        t          |          | _        | 	                                 d S )N+Setting `do_mask_input` parameter to False.F)
r!   r"   r  loggerwarningr  r   r  r  r0  rm   s     r-   r"   z"PatchTSTForClassification.__init__  sy         	)NNHIII#(F "6**
.v66	 	r.   Nr   target_valuesr  r1  r=   r  r>   c                 B   ||n| j         j        }|                     ||||d          }|                     |j                  }d}	|t          j                    }
 |
||          }	|s|f|dd         z   }|	|	f|z   n|}|S t          |	||j        |j	                  S )a  
        Parameters:
            past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
                Input sequence to the model
            target_values (`torch.Tensor`, *optional*):
                Labels associates with the `past_values`
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers
            output_attentions (`bool`, *optional*):
                Whether or not to return the output attention of all layers
            return_dict (`bool`, *optional*):
                Whether or not to return a `ModelOutput` instead of a plain tuple.

        Returns:
            `PatchTSTForClassificationOutput` or tuple of `torch.Tensor` (if `return_dict`=False or
            `config.return_dict`=False)

        Examples:

        ```python
        >>> from transformers import PatchTSTConfig, PatchTSTForClassification

        >>> # classification task with two input channel2 and 3 classes
        >>> config = PatchTSTConfig(
        ...     num_input_channels=2,
        ...     num_targets=3,
        ...     context_length=512,
        ...     patch_length=12,
        ...     stride=12,
        ...     use_cls_token=True,
        ... )
        >>> model = PatchTSTForClassification(config=config)

        >>> # during inference, one only provides past values
        >>> past_values = torch.randn(20, 512, 2)
        >>> outputs = model(past_values=past_values)
        >>> labels = outputs.prediction_logits
        ```NTr  r   r   )rB  rL  r8   r4  )
r   r  r   r  r3  r   CrossEntropyLossrK  r8   r4  )r+   r   r  r  r1  r=   r  r  y_hatr  rB  r   s               r-   rY   z!PatchTSTForClassification.forward  s    l &1%<kk$+B]zz#1!5/ " 
 
 		,899$&((DtE=11H 	had!33G/7/CxkG++GN.#&4#.	
 
 
 	
r.   r  )rZ   r[   r\   r   r"   rG   ra   r   r`   r   r  rK  rY   rb   rc   s   @r-   r  r    s        
~      " '+-1/3,0&*O
 O
\O
 |O
 %TN	O

 'tnO
 $D>O
 d^O
 
u55	6O
 O
 O
 O
 O
 O
 O
 O
r.   r  z"The PatchTST for regression Model.c                   <     e Zd Zddef fdZdej        fdZ xZS )PatchTSTPredictionHeadNr   c                    t                                                       |j        | _        |j        | _        |j        | _        |j        | _        | j        s| j        r|j        }n
|j        |z  }| j        s?t          j                    | _	        t          j                    | _
        t          j                    | _        t          | j                  D ]}| j                            t          j        d                     |3| j	                            t          j        ||j                             n-| j	                            |                    |                     | j
                            |j        dk    rt          j        |j                  nt          j                               d S t          j        d          | _        | t          j        ||j                  | _        n|                    |          | _        |j        dk    rt          j        |j                  nt          j                    | _        d S )Nr3   r  r   )r!   r"   share_projectionr   r   r  rj   r   r	  projectionsdropoutsflattensr
  r   r  r&   prediction_lengthget_parameter_projectionr  r   r   r  
projectionr   )r+   r   r   distribution_outputr#   r  r,   s         r-   r"   zPatchTSTPredictionHead.__init__  s	    & 7"(";#1"/ 	4 2 	4~HH~3H$ 	i!}DMOODMMOODM4233 t t$$RZ!%<%<%<===&.$++BIh@X,Y,YZZZZ $++,?,X,XYa,b,bccc$$H[^_H_H_RZ0C%D%D%Degeperersssst t :222DL"*"$)Hf6N"O"O #6"N"Nx"X"X>D>QTU>U>U2:f&9:::[][f[h[hDLLLr.   r  c                    | j         r|dddddddf         }nK| j        dk    r|                    d          }n)| j        dk    r|                    d          j        }n|}| j        sg }t          | j                  D ]f} | j        |         |dd|ddf                   } | j	        |         |          } | j
        |         |          }|                    |           gt          j        |d          }n?|                     |          }|                     |          }|                     |          }t#          |t$                    rt%          d |D                       }n|                    dd          }|S )	aj  
        Parameters:
            embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                     `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
                Embedding from the model
        Returns:
            `torch.Tensor` of shape `(bs, forecast_len, num_channels)`

        Nr   r   r3   rA   r   r   c              3   B   K   | ]}|                     d d          V  dS )r3   r   N)r5   )r   zs     r-   r  z1PatchTSTPredictionHead.forward.<locals>.<genexpr>[  s0      ==1;;q!,,======r.   )r   r  r   r   r  r  r
  r   r  r  r  r   rG   r  r  r   r  r   r  r5   )r+   r  r  rp   r  s        r-   rY   zPatchTSTPredictionHead.forward-  s     	-(AAAq!!!4 F**#,>>a>#8#8  "e++#,==Q=#7#7#>   $- $ 	7F4233 0 0#34=#34DQQQ111W4M#N#N #34=#34D#E#E  $74#3A#67G#H#H .////[Q///FF  $||,<==#||,<== __%566Ffe$$ 	,==f=====FF%%a++Fr.   r   r  rc   s   @r-   r  r    so        
#i #i~ #i #i #i #i #i #iJ1 1 1 1 1 1 1 1 1r.   r  z"The PatchTST for prediction model.c                        e Zd Zdef fdZ	 	 	 	 	 ddej        deej                 deej                 dee         dee         d	ee         d
e	e
ef         fdZ	 ddej        deej                 d
efdZ xZS )PatchTSTForPredictionr   c                 x   t                                          |           |j        r!t                              d           d|_        t          |          | _        |j        dk    rd | _        n|j        dk    rt          |j
                  | _        nc|j        dk    rt          |j
                  | _        n=|j        dk    rt          |j
                  | _        nt          d|j                   t          || j        j        j        | j        	          | _        |                                  d S )
Nr  Fmse	student_trA   normalnegative_binomialUnknown distribution output )r  )r!   r"   r  r  r  r  r   rB  r  r   r  r   r   r$   r  r  r   r  r0  rm   s     r-   r"   zPatchTSTForPrediction.__init__f  s:        	)NNHIII#(F "6**
;%'+D$$)[88+9f>V+W+W+W((+x77+7F<T+U+U+U((+/BBB+AfF^+_+_+_(( !\@Z!\!\]]]*DJ)54Kc
 
 
	
 	r.   Nr   r  r  r1  r=   r  r>   c                 :   ||n| j         j        }|                     ||||d          }|                     |j                  }d}	| j        r|}
n||j        z  |j        z   }
|o| j        rG| j                            ||j        |j                  }t          ||          }	t          |	          }	n!t          j        d          } ||
|          }	|j        }|j        }|s|
f|dd         z   }|	|	f|z   n|}|S t          |	|
|j        |j        ||	          S )
aV	  
        Parameters:
            past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
                Input sequence to the model
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
            future_values (`torch.Tensor` of shape `(bs, forecast_len, num_input_channels)`, *optional*):
                Future target values associated with the `past_values`
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers
            output_attentions (`bool`, *optional*):
                Whether or not to return the output attention of all layers
            return_dict (`bool`, *optional*):
                Whether or not to return a `ModelOutput` instead of a plain tuple.

        Returns:
            `PatchTSTForPredictionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or
            `config.return_dict`=False)

        Examples:

        ```python
        >>> from huggingface_hub import hf_hub_download
        >>> import torch
        >>> from transformers import PatchTSTConfig, PatchTSTForPrediction

        >>> file = hf_hub_download(
        ...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
        ... )
        >>> batch = torch.load(file)

        >>> # Prediction task with 7 input channels and prediction length is 96
        >>> model = PatchTSTForPrediction.from_pretrained("namctin/patchtst_etth1_forecast")

        >>> # during training, one provides both past and future values
        >>> outputs = model(
        ...     past_values=batch["past_values"],
        ...     future_values=batch["future_values"],
        ... )

        >>> loss = outputs.loss
        >>> loss.backward()

        >>> # during inference, one only provides past values, the model outputs future values
        >>> outputs = model(past_values=batch["past_values"])
        >>> prediction_outputs = outputs.prediction_outputs
        ```NTr  r<  r=  r   r  r   r@   )rB  rI  r8   r4  r<  r=  )r   r  r   r  r3  r  r=  r<  distributionrT  r_  r   r  rH  r8   r4  )r+   r   r  r  r1  r=   r  r  r  r  	y_hat_outr  rB  r<  r=  r   s                   r-   rY   zPatchTSTForPrediction.forward  sz   z &1%<kk$+B] zz#1!5/ " 
 
 		,899# 	FII 22\5EEI$' 	:#7DD|/|7I  E     |];;+H55zF3334	=99" 	 l\!B$%77G/7/CxkG++GN*(&4#.
 
 
 	
r.   c                 X   | j         j        } | |d|d          }| j        r^| j                            |j        |j        |j                  fdt          |          D             }t          j	        |d          }n|j        
                    d          }t          |          S )	a   
        Generate sequences of sample predictions from a model with a probability distribution head.

        Parameters:
            past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Past values of the time series that serves as context in order to predict the future.
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

        Return:
            [`SamplePatchTSTOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of
            samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)`
            for multivariate predictions.
        NF)r   r  r  r1  r  c                 8    g | ]}                                 S r   sampler   rO   r  s     r-   r   z2PatchTSTForPrediction.generate.<locals>.<listcomp>  s%    RRR|**,,RRRr.   r   rA   rO  )r   num_parallel_samplesr  r  rI  r<  r=  r
  rG   r  r   rN  r+   r   r  r  r   samplesr  s         @r-   generatezPatchTSTForPrediction.generate  s    0  ${? $#1!&	
 
 
 # 
	>3@@*7= A  L SRRRe<P6Q6QRRRGk'q111GG0::1==G#g6666r.   r  r   )rZ   r[   r\   r   r"   rG   ra   r   r`   r   r   rH  rY   rN  r  rb   rc   s   @r-   r  r  a  s*       
~      @ 6:04/3,0&*k
 k
\k
 %U\2k
  -	k

 'tnk
 $D>k
 d^k
 
u11	2k
 k
 k
 k
` 6:-7 -7\-7 %U\2-7 
	-7 -7 -7 -7 -7 -7 -7 -7r.   r  c                   @     e Zd ZdZddef fdZdej        fdZ xZ	S )PatchTSTRegressionHeadz
    Regression head
    Nr   c                    t                                                       |j        | _        |j        | _        |j        | _        || _        |j        |j        z  }t          j
        d          | _        |j        dk    rt          j        |j                  nt          j                    | _        |!t          j        ||j                  | _        d S |                    |          | _        d S r  )r!   r"   output_rangey_ranger   r  r  r   rj   r   r  r  r  r   r   r   r&   r  r  r  )r+   r   r  r#   r,   s       r-   r"   zPatchTSTRegressionHead.__init__%  s    *#1"/#6 ,v~=zA...:@:MPQ:Q:Qrz&"5666WYWbWdWd& i&2DEEDOOO1JJ8TTDOOOr.   r  c                    | j         r|dddddddf         }na| j        dk    r|                    d          }n?| j        dk    r|                    d          j        }nt          d| j         d          |                     |                     |                    }|                     |          }| j	        du | j
        duz  r>t          j        |          | j
        d	         | j
        d         z
  z  | j
        d         z   }|S )
aY  
        Parameters:
            embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                    `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
                Embedding from the model
        Returns:
            `torch.Tensor` of shape `(bs, output_dim)`

        Nr   r   r3   rA   r   r  r  r   )r   r  r   r   r  r$   r   r  r  r  r  rG   sigmoidr  s       r-   rY   zPatchTSTRegressionHead.forward6  s'     
	](AAAq!!!4&(((~~!~44%''(}}}33:[1B[[[\\\  <<5E(F(FGG !122$,T1IJ 	c]6**dl1oQ.OPSWS_`aSbbFr.   r   rq   rc   s   @r-   r  r     sx         U U~ U U U U U U"        r.   r  z"The PatchTST for regression model.c                        e Zd Zdef fdZ	 	 	 	 	 ddej        dej        deej                 dee         dee         d	ee         d
e	e
ef         fdZ	 ddej        deej                 d
efdZ xZS )PatchTSTForRegressionr   c                 V   t                                          |           |j        r!t                              d           d|_        t          |          | _        |j        dk    rd | _        n|j        dk    rt          |j
                  | _        nc|j        dk    rt          |j
                  | _        n=|j        dk    rt          |j
                  | _        nt          d|j                   t          || j                  | _        |                                  d S )	Nr  Fr  r  rA   r  r  r  )r!   r"   r  r  r  r  r   rB  r  r   r  r   r   r$   r  r  r0  rm   s     r-   r"   zPatchTSTForRegression.__init__\  s&        	)NNHIII#(F "6**
;%'+D$$)[88+9f>P+Q+Q+Q((+x77+7F<N+O+O+O((+/BBB+AfFX+Y+Y+Y(( !\@Z!\!\]]]*643KLL	 	r.   Nr   r  r  r1  r=   r  r>   c                      ||n j         j        }                     ||||d          }                     |j                  }d}	|} j        rU j                            |          }
t           fd|D                       }t          |
|          }	t          |	          }	n!t          j        d          }	 |	||          }	|s|f|dd         z   }|	|	f|z   n|}|S t          |	||j        |j        	          S )
a'  
        Parameters:
            past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
                Input sequence to the model
            target_values (`torch.Tensor` of shape `(bs, num_input_channels)`):
                Target values associates with the `past_values`
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers
            output_attentions (`bool`, *optional*):
                Whether or not to return the output attention of all layers
            return_dict (`bool`, *optional*):
                Whether or not to return a `ModelOutput` instead of a plain tuple.

        Returns:
            `PatchTSTForRegressionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or
            `config.return_dict`=False)

        Examples:

        ```python
        >>> from transformers import PatchTSTConfig, PatchTSTForRegression

        >>> # Regression task with 6 input channels and regress 2 targets
        >>> model = PatchTSTForRegression.from_pretrained("namctin/patchtst_etth1_regression")

        >>> # during inference, one only provides past values, the model outputs future values
        >>> past_values = torch.randn(20, 512, 6)
        >>> outputs = model(past_values=past_values)
        >>> regression_outputs = outputs.regression_outputs
        ```NTr  c                 P    g | ]"}|                     d j        j                  #S )r@   )r4   r   r  )r   itemr+   s     r-   r   z1PatchTSTForRegression.forward.<locals>.<listcomp>  s,    XXX$tyyT[-DEEXXXr.   r   r  r   r   )rB  rF  r8   r4  )r   r  r   r  r3  r  r  r  rT  r_  r   r  rE  r8   r4  )r+   r   r  r  r1  r=   r  r  r  rB  r  r   s   `           r-   rY   zPatchTSTForRegression.forwardv  sG   \ &1%<kk$+B]zz#1!5/ " 
 
 		,899$' 	2#7DDUKKXXXXRWXXXYY<77'--zF333tE=11 	had!33G+/+;tg''GN*$&4#.	
 
 
 	
r.   c                 8   | j         j        } | |d|d          }| j                            |j                  fdt          |          D             }t          j        |d                              d|| j         j	                  }t          |          S )	a  
        Generate sequences of sample predictions from a model with a probability distribution head.

        Parameters:
            past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Past values of the time series that serves as context in order to predict the future.
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

        Return:
            [`SamplePatchTSTOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of
            samples, num_targets)`.
        NF)r   r  r  r1  c                 8    g | ]}                                 S r   r  r  s     r-   r   z2PatchTSTForRegression.generate.<locals>.<listcomp>  s%    NNNQ<&&((NNNr.   r   rA   r@   r  )r   r  r  r  rF  r
  rG   r  r4   r  rN  r  s         @r-   r  zPatchTSTForRegression.generate  s    .  ${? $#1!&	
 
 
 /<<W=WXXNNNN%8L2M2MNNN+g1---2227KT[Mdee#g6666r.   r  r   )rZ   r[   r\   r   r"   rG   ra   r   r`   r   r  rE  rY   rN  r  rb   rc   s   @r-   r  r  W  s$       
~      : '+59/3,0&*Q
 Q
\Q
 |Q
 %U\2	Q

 'tnQ
 $D>Q
 d^Q
 
u11	2Q
 Q
 Q
 Q
l 6:'7 '7\'7 %U\2'7 
	'7 '7 '7 '7 '7 '7 '7 '7r.   r  )NFr   r  r9  )Gr]   r  dataclassesr   typingr   r   r   rG   r   activationsr	   modeling_outputsr
   modeling_utilsr   time_series_utilsr   r   r   utilsr   r   r   configuration_patchtstr   
get_loggerrZ   r  _CONFIG_FOR_DOCModuler   re   ra   r_   listr`   r^   r   r   r   r   r   r   r  r   r   PATCHTST_START_DOCSTRINGr;  rA  rE  rH  rK  rN  distributionsDistributionrT  r_  ra  rq  r  r  r  r  r  r  r  r  r  r  r  r   r.   r-   <module>r     sa      ! ! ! ! ! ! ) ) ) ) ) ) ) ) ) )        " " " " " " / / / / / / - - - - - - U U U U U U U U U U ? ? ? ? ? ? ? ? ? ? 2 2 2 2 2 2 
	H	%	%"[B [B [B [B [B	 [B [B [B|& & & & &	 & & &2 &*',7% 7%L7%7% #7% !%	7%
 7% 7% 7% 7%z &*	A% A%LA%$T3Y/A% #A% 	A% A% A% A%H- - - - -ry - - -`9" 9" 9" 9" 9"bi 9" 9" 9"xG G G G G29 G G GT2 2 2 2 2o 2 2 2B! ! ! ! !	 ! ! !H5 5 5 5 5 5 5 5p;x ;x ;x ;x ;x- ;x ;x ;x| " * * * * *+ * * *< : : : : :; : : :8 : : : : :+ : : :8 $ $ $ $ $+ $ $ $D : : : : :k : : :: 
( 
( 
( 
( 
(; 
( 
( 
(#u"/ # #%, # # # #* *5< *(5<:P *fkfr * * * *2 0  0  0  0  0	  0  0  0H3; 3; 3; 3; 3; 3; 3; 3;n         	      6         RY      8 U m
 m
 m
 m
 m
+ m
 m
	 m
`    ry   8 & l
 l
 l
 l
 l
4 l
 l
	 l
^" " " " " " " "J , ^
 ^
 ^
 ^
 ^
 7 ^
 ^
	 ^
B ( W W W W WRY W W	 Wt ( x7 x7 x7 x7 x73 x7 x7	 x7v4 4 4 4 4RY 4 4 4n ( U7 U7 U7 U7 U73 U7 U7	 U7 U7 U7r.   