
    Χg7}              *       2   d dl mZmZmZmZmZ d dlZd dlmZ ddlm	Z	m
Z
mZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ ddgZ G d de          Zd	d
e de de	 de de dz   e_        dee         dee         dee         dee         dee         dee         dee         dee         dededededeeef         dededededef$dZdee         dee         dee         dee         dee         dee         dee         dee         dededededeeef         dededededef$d Z dee         dee         dee         dee         dee         dee         dee         dee         dededededeeef         dededededed!df&d"Z! ee#          	 	 	 	 	 	 	 d(dee         dee         dee         dee         dee         dee         d%ee         deded&ee         dee         dee         dededededeeef         dededef(d'            Z"dS ))    )castListOptionalTupleUnionN)Tensor   )_capturable_doc_default_to_fused_or_foreach_device_dtype_check_for_fused_differentiable_doc_disable_dynamo_if_unsupported_foreach_doc
_fused_doc!_get_capturable_supported_devices_get_scalar_dtype
_get_value_maximize_doc_stack_if_compiling_use_grad_for_differentiable_view_as_real
DeviceDict	OptimizerParamsTAdamadamc                        e Zd Z	 	 	 	 	 dddddddded	eeef         d
eeef         dededede	e         dededede	e         f fdZ
 fdZd Zedd            Z xZS )r   MbP?g?g+?:0yE>r   FN)foreachmaximize
capturabledifferentiablefusedparamslrbetasepsweight_decayamsgradr!   r"   r#   r$   r%   c                   t          |t                    r:|r|	st          d          |                                dk    rt          d          d|k    st          d|           d|k    st          d|           d|d         cxk    rdk     sn t          d	|d                    d|d         cxk    rdk     sn t          d
|d                    d|k    st          d|           t	          ||||||||	|
|
  
        }t                                          ||           |r)|
rt          d          d| _        |rt          d          d S d S )NElr as a Tensor is not supported for capturable=False and foreach=Truer	   zTensor lr must be 1-element        zInvalid learning rate: zInvalid epsilon value: r         ?z#Invalid beta parameter at index 0: z#Invalid beta parameter at index 1: zInvalid weight_decay value: )
r'   r(   r)   r*   r+   r"   r!   r#   r$   r%   z)`fused` does not support `differentiable`Tz0`fused` and `foreach` cannot be `True` together.)	
isinstancer   
ValueErrornumeldictsuper__init__RuntimeError_step_supports_amp_scaling)selfr&   r'   r(   r)   r*   r+   r!   r"   r#   r$   r%   defaults	__class__s                L/var/www/html/ai-engine/env/lib/python3.11/site-packages/torch/optim/adam.pyr5   zAdam.__init__!   s    b&!! 	@ z  [   xxzzQ !>???byy;r;;<<<czz<s<<===eAh$$$$$$$$M58MMNNNeAh$$$$$$$$M58MMNNNl""JLJJKKK%!)
 
 
 	*** 		W P"#NOOO.2D+
  W"#UVVV		W 		WW W    c                    t                                          |           | j        D ]N}|                    dd           |                    dd           |                    dd            |                    dd           |                    dd           |                    dd           }|d         D ]}| j                            |g           }t          |          d	k    rt          j        |d
                   stt          |d
                   }|d         s|d         r*t          j
        |t          |          |j                  n!t          j
        |t                                |d
<   Pd S )Nr+   Fr"   r!   r#   r$   r%   r&   r   stepis_fuseddtypedevicerB   )r4   __setstate__param_groups
setdefaultstategetlentorch	is_tensorfloattensorr   rC   )r8   rH   groupr%   pp_statestep_valr:   s          r;   rE   zAdam.__setstate__[   s   U###& 	 	EY...Z///Y---\5111-u555$$Wd33E8_  *..B//w<<1$$U_WV_-M-M$$WV_55H !.O
 38.O$"3U"C"C"C#$8    #\(:K:M:MNNN FO		 	r<   c                    d}|d         D ]J}	|	j         ?|t          j        |	          z  }|                    |	           |	j         j        rt          d          |                    |	j                    | j        |	         }
t          |
          dk    r|d         rt          |	           |d         s|d         r0t          j	        dt          |d                   |	j        	          n!t          j        d
t                                |
d<   t          j        |	t          j                  |
d<   t          j        |	t          j                  |
d<   |d         r#t          j        |	t          j                  |
d<   |                    |
d                    |                    |
d                    |d         r|                    |
d                    |d         r|
d         j        rt          d          |d         r1t          j        |d                   r|d         st          d          |                    |
d                    L|S )NFr&   zJAdam does not support sparse gradients, please consider SparseAdam insteadr   r%   r#    r?   rA   r.   rD   r>   )memory_formatexp_avg
exp_avg_sqr+   max_exp_avg_sqr$   zB`requires_grad` is not supported for `step` in differentiable moder!   r'   r-   )gradrK   
is_complexappend	is_sparser6   rH   rJ   r   zerosr   rC   rN   
zeros_likepreserve_formatrequires_gradrL   )r8   rO   params_with_gradgradsexp_avgsexp_avg_sqsmax_exp_avg_sqsstate_stepshas_complexrP   rH   s              r;   _init_groupzAdam._init_groupr   s    x =	2 =	2Av!u/222 ''***6# &d   QV$$$
1u::??W~ 95a888 !.J
 38.J"3U7^"L"L"L#$8    #\#5F5H5HIII &M (-'7)>( ( (E)$ +0*:)>+ + +E,' Y' 272BU-B3 3 3./ i 0111""5#6777# D#**51A+BCCC)* uV}/J &\   )$d44 ",/
 '_   ""5=111r<   c                    |                                   d}|5t          j                    5   |            }ddd           n# 1 swxY w Y   | j        D ]}g }g }g }g }g }g }	|d         \  }
}|                     |||||||	          }t          ||||||	f|d         ||
||d         |d         |d         |d         |d         |d	         |d
         |d         t          | dd          t          | dd          d |S )zPerform a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr(   r+   r'   r*   r)   r"   r!   r#   r$   r%   
grad_scale	found_inf)r+   rg   beta1beta2r'   r*   r)   r"   r!   r#   r$   r%   rj   rk   ) _cuda_graph_capture_health_checkrK   enable_gradrF   rh   r   getattr)r8   closurelossrO   ra   rb   rc   rd   re   rf   rl   rm   rg   s                r;   r>   z	Adam.step   s    	--///"$$ ! !wyy! ! ! ! ! ! ! ! ! ! ! ! ! ! ! & (	 (	E-/"$E%'H(*K,.O(*K >LE5**  K   i(';">2%Lz*i( .$%56Gn"4t<<!$T::)    . s   AA
A)r   r   r    r   FN)__name__
__module____qualname__r   r   rM   r   r   boolr   r5   rE   rh   r   r>   __classcell__)r:   s   @r;   r   r       sV        $(%18W #' $ $8W 8W 8W8W %- 8W UE\"	8W
 8W 8W 8W $8W 8W 8W 8W ~8W 8W 8W 8W 8W 8Wt    .I I IV "8 8 8 "!8 8 8 8 8r<   a  Implements Adam algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma \text{ (lr)}, \beta_1, \beta_2
                \text{ (betas)},\theta_0 \text{ (params)},f(\theta) \text{ (objective)}          \\
            &\hspace{13mm}      \lambda \text{ (weight decay)},  \: \textit{amsgrad},
                \:\textit{maximize}                                                              \\
            &\textbf{initialize} :  m_0 \leftarrow 0 \text{ ( first moment)},
                v_0\leftarrow 0 \text{ (second moment)},\: \widehat{v_0}^{max}\leftarrow 0\\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\

            &\hspace{5mm}\textbf{if} \: \textit{maximize}:                                       \\
            &\hspace{10mm}g_t           \leftarrow   -\nabla_{\theta} f_t (\theta_{t-1})         \\
            &\hspace{5mm}\textbf{else}                                                           \\
            &\hspace{10mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})          \\
            &\hspace{5mm}\textbf{if} \: \lambda \neq 0                                           \\
            &\hspace{10mm} g_t \leftarrow g_t + \lambda  \theta_{t-1}                            \\
            &\hspace{5mm}m_t           \leftarrow   \beta_1 m_{t-1} + (1 - \beta_1) g_t          \\
            &\hspace{5mm}v_t           \leftarrow   \beta_2 v_{t-1} + (1-\beta_2) g^2_t          \\
            &\hspace{5mm}\widehat{m_t} \leftarrow   m_t/\big(1-\beta_1^t \big)                   \\
            &\hspace{5mm}\widehat{v_t} \leftarrow   v_t/\big(1-\beta_2^t \big)                   \\
            &\hspace{5mm}\textbf{if} \: amsgrad                                                  \\
            &\hspace{10mm}\widehat{v_t}^{max} \leftarrow \mathrm{max}(\widehat{v_t}^{max},
                \widehat{v_t})                                                                   \\
            &\hspace{10mm}\theta_t \leftarrow \theta_{t-1} - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}^{max}} + \epsilon \big)                                 \\
            &\hspace{5mm}\textbf{else}                                                           \\
            &\hspace{10mm}\theta_t \leftarrow \theta_{t-1} - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}} + \epsilon \big)                                       \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Adam: A Method for Stochastic Optimization`_.
    a  
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, Tensor, optional): learning rate (default: 1e-3). A tensor LR
            is not yet supported for all our implementations. Please use a float
            LR if you are not also specifying fused=True or capturable=True.
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (bool, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)
        z	
        a=  
    .. Note::
        A prototype implementation of Adam and AdamW for MPS supports `torch.float32` and `torch.float16`.
    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ

    r&   rb   rc   rd   re   rf   rj   rk   r+   rg   rl   rm   r'   r*   r)   r"   r#   r$   c       
            ||J t           j                                        rt          |t                    sJ t          |           D ]\  }}|s||         n||          }||         }||         }||         }t           j                                        sF|rDt                      }|j	        j
        |j	        j
        k    r|j	        j
        |v sJ d| d            |dz  }|dk    r|                    ||          }t          j        |          rot          j        |          }t          j        |          }t          j        |          }|rt          j        ||                   ||<   t          j        |          }|                    |d|
z
             |                    |                              ||                                d|z
             |s|r|}d|
|z  z
  }d||z  z
  }||z  }|                                }|                                }|r|r||                                         }n||         }||                             t          j        ||                     ||                                         ||z  z                      ||z            } n0|                                ||z  z                      ||z            } |                    ||            nt3          |          }d|
|z  z
  }d||z  z
  }||z  }|dz  }|rTt          j        ||         |||                    ||                                         |z                      |          } n*|                                |z                      |          } |                    || |            |r7t          j        | |                   rt          j        ||                   ||<   d S )	NIIf capturable=True, params and state_steps must be on supported devices: .r	   r   alpha)value      ?)out)rK   jitis_scriptingr0   rM   	enumerate_utilsis_compilingr   rC   typeaddrZ   view_as_reallerp_mul_addcmul_conjnegsqrtclonecopy_maximumadd_addcdiv_r   view_as_complex)!r&   rb   rc   rd   re   rf   rj   rk   r+   rg   rl   rm   r'   r*   r)   r"   r#   r$   iparamrY   rV   rW   step_tcapturable_supported_devicesr>   bias_correction1bias_correction2	step_sizestep_size_negbias_correction2_sqrtrX   denoms!                                    r;   _single_tensor_adamr   @  sK   * )"3"3"3y % "e$$$$$f%% WK WK5'6uQxxeAhY1+ ^
Q |((** 	{z 	{+L+N+N(!V]%777L%)EEEEz[wzzz FEE 	!188E866DE"" 	.%d++D(11G+J77J L%*%78J%K%K"&u--E 	dAI&&&''diikkU'KKK 3	= 3	=D 5$; 5$;--I%MMOOM$4$9$9$;$;! ,! 8%4Q%7%=%=%?%?NN%4Q%7N"((~z)R)RSSS $A&++--1F1VW$s]*++ 
 OO%%)>)NO$s]*++  NN7E****f%%D 5$; 5$;--I$4c$9! Noa0*/RSBTUUUU )+00225JJPPQTUU#**-BBHHMMNN7E)N<<<  	Ku'q	22 	K!&!6q7I!J!JOAoWK WKr<   c       
         V
  
% t          |           dk    rd S t          t                    r|st          d          t          j                                        sI|rGt          d          %t          %fdt          | |          D                       sJ d% d            ||J |r
J d            t          j        | |||||g          }|                                D ]@\  \  }}}}}}}t          t          t                   |          }t          t          t                   |          }t          t          t                   |          }t          t          t                   |          }t          t          t                   |          }|	rH|r4t          t          t                   |          }t          |||||           nt          ||||           |rt	          j        |          }t          j                                        s9|d         j        r,t	          j        |t	          j        d	d
          d	           nt	          j        |d           |dk    r1|rt	          j        |||           nt	          j        |||          }t	          j        ||d
z
             t	          j        |           t	          j        |||dz
             ~|r`t	          j        
|          } t	          j        |          }!t	          j        | d           t	          j        |!d           t	          j        |!           t	          j        |            t	          j        |            t	          j        |!           | }"|!}#|rJt          t          t                   |          }t	          j        ||           t	          j        |          }$nt	          j        |          }$t	          j        |$|#           t	          j        |$|           t	          j        |$|"           t	          j         |||$           \
fd|D             } fd|D             }!tC          fd| D                       }"d |!D             }#|rJt          t          t                   |          }t	          j        ||           t	          j        |          }$nt	          j        |          }$t	          j        |$|#           t	          j        |$|           t	          j         |||$|"           Bd S )Nr   r-   F)supports_xlac              3   n   K   | ]/\  }}|j         j        |j         j        k    o|j         j        v V  0d S rs   )rC   r   ).0rP   r>   r   s      r;   	<genexpr>z%_multi_tensor_adam.<locals>.<genexpr>  s]       
 
 4 HMT[-- >!==
 
 
 
 
 
r<   rz   r{   z#_foreach ops don't support autogradr/   cpu)rC   r|   r	   c                 :    g | ]}d t          |          z  z
  S r	   r   )r   r>   rl   s     r;   
<listcomp>z&_multi_tensor_adam.<locals>.<listcomp>P  8          26EZ----     r<   c                 :    g | ]}d t          |          z  z
  S r   r   )r   r>   rm   s     r;   r   z&_multi_tensor_adam.<locals>.<listcomp>S  r   r<   c                      g | ]
}|z  d z  S )rT   )r   bcr'   s     r;   r   z&_multi_tensor_adam.<locals>.<listcomp>W  s!    ,W,W,Wb2g^,W,W,Wr<   c                     g | ]}|d z  S )r   rT   )r   r   s     r;   r   z&_multi_tensor_adam.<locals>.<listcomp>Y  s    $H$H$HRW$H$H$Hr<   )"rJ   r0   r   r6   rK   r   r   r   allzipr   "_group_tensors_by_device_and_dtypevaluesr   r   r   _foreach_negis_cpu_foreach_add_rN   _foreach_add_foreach_lerp__foreach_mul__foreach_addcmul__foreach_pow_foreach_sub__foreach_neg__foreach_div__foreach_reciprocal__foreach_sqrt__foreach_maximum__foreach_sqrt_foreach_addcdiv_r   )&r&   rb   rc   rd   re   rf   rj   rk   r+   rg   rl   rm   r'   r*   r)   r"   r#   r$   grouped_tensorsdevice_params_device_grads_device_exp_avgs_device_exp_avg_sqs_device_max_exp_avg_sqs_device_state_steps__device_paramsdevice_gradsdevice_exp_avgsdevice_exp_avg_sqsdevice_state_stepsdevice_max_exp_avg_sqsr   r   r   r   exp_avg_sq_sqrtr   s&             ```                        @r;   _multi_tensor_adamr     s   * 6{{a"f 
j 
S
 
 	

 <$$&& w: w'H(
 (
 (
$  
 
 
 
 v{33
 
 
 
 
 	w 	w wWsvvv		w 	w 	w )"3"3"3DDDDDDB	+L O ""$$C C 		 	T&\>::DL-88tF|-=>>!$v,0CDD!$v,0CDD  	 )-d6l<S)T)T&! #&*    !<BT    	< -l;;L |((** 	7/A!/D/K 	7"ELU$C$C$C3      2A6661 #L-|TTTTT$1 -|     
 	_lAIFFF.666lAI	
 	
 	

   ?	$1%9KLL$1%9KLL 0!444 0!444 0111  0"555&'7888 !1222
 )I$4! J)-d6l<S)T)T&'(>@RSSS #("56L"M"M"'"56H"I"I1FGGG555;;; #M?OTTTT       :L            :L      ,,W,W,W,WFV,W,W,WXXI$H$H7G$H$H$H! J)-d6l<S)T)T&'(>@RSSS #("56L"M"M"'"56H"I"I1FGGG555#   CC Cr<   returnc       
            | sd S |rt          d          |	|j        |ini }|	|j        |ini }t          |t                    r!t	          |j                  dk    r	|j        |ind }t          j        | |||||g          }|                                D ]\  \  }}\  \  }}}}}}}t          t          t                   |          }t          t          t                   |          }t          t          t                   |          } t          t          t                   |          }!t          t          t                   |          }"|j
        dk    r||J d\  }#}$|+|                    ||                    |d                    }#|+|                    ||                    |d                    }$|&||vr"|                    |d          ||<   ||         }t          j        |"d           t          j        ||| |!||"|||
|||||#|$	           |$&t          j        |"|$gt#          |"          z             d S )
Nz9Adam with fused=True does not support differentiable=Truer   mps)NNT)non_blocking)rC   r   r	   )	r+   r'   rl   rm   r*   r)   r"   rj   rk   )r6   rC   r0   r   strr   r   itemsr   r   r   rG   torK   r   _fused_adam_r   rJ   )%r&   rb   rc   rd   re   rf   rj   rk   r+   rg   rl   rm   r'   r*   r)   r"   r#   r$   grad_scale_dictfound_inf_dictlr_dictr   rC   r   r   r   r   r   r   r   r   r   r   r   r   device_grad_scaledevice_found_infs%                                        r;   _fused_adamr   l  s   *   XVWWW ,6+A	J''r  *3)>	9%%B  &b&11Wc")nn6M6MBSW   B	+L O 
			 	 5 5 
	 
	
"	T&\>::DL-88tF|-=>>!$v,0CDD!$v,0CDD;%$);););.8++! / : :
f4@@! !  -88	V$??    6#8#8 ee6eEEGFOB.222"%(&	
 	
 	
 	
" '"%5$6=O9P9P$P  g5 5r<   )single_tensor_fnFr!   r%   c                   |	2|0t          | |d          \  }}|rt          |t                    r|sd}|	d}	|d}t          j                                        s(t          d |D                       st          d          |r-t          j        	                                rt          d          |	r-t          j        	                                rt          d          |	r&t          j        	                                st          }n/|r&t          j        	                                st          }nt          } || ||||||||||||||||
|           dS )	znFunctional API that performs Adam algorithm computation.

    See :class:`~torch.optim.Adam` for details.
    NF)	use_fusedc              3   J   K   | ]}t          |t          j                  V  d S rs   )r0   rK   r   )r   ts     r;   r   zadam.<locals>.<genexpr>  s?       3 3()
1el##3 3 3 3 3 3r<   zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsz6torch.jit.script not supported with foreach optimizersz4torch.jit.script not supported with fused optimizers)r+   rg   rl   rm   r'   r*   r)   r"   r#   r$   rj   rk   )r   r0   r   rK   r   r   r   r6   r   r   r   r   r   )r&   rb   rc   rd   re   rf   r!   r#   r$   r%   rj   rk   rg   r+   rl   rm   r'   r*   r)   r"   r   funcs                         r;   r   r     s   D }1Ne
 
 

7  	z"f-- 	j 	G} <$$&& 
s 3 3-83 3 3 0 0 
 ^
 
 	
  U59))++ USTTT S'')) SQRRR #UY++-- #	 #//11 #!"D!%%     r<   )NFFNNNF)#typingr   r   r   r   r   rK   r   	optimizerr
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   __all__r   __doc__rw   rM   r   r   r   r   rT   r<   r;   <module>r      sO   6 5 5 5 5 5 5 5 5 5 5 5 5 5                                            * 6
V V V V V9 V V Vt&N 
   
! " 
# $ 
% & 
'  OB NtKLtK<tK 6ltK f	tK
 &\tK ftK  tK tK tK tK tK tK 	eVmtK tK  
!tK" #tK$ %tK& 'tK tK tK tKnrLr<r 6lr f	r
 &\r fr  r r r r r r 	eVmr r  
!r" #r$ %r& 'r r r rj^L^<^ 6l^ f	^
 &\^ f^  ^ ^ ^ ^ ^ ^ 	eVm^ ^  
!^" #^$ %^& '^( 
)^ ^ ^ ^B  1DEEE #  #'"&U ULU<U 6lU f	U
 &\U fU d^U U U D>U  U U U" #U$ %U& 'U( 	eVm)U* +U, 
-U. /U U U FEU U Ur<   