
    Χgy              *       2   d dl mZmZmZmZmZ d dlZd dlmZ ddlm	Z	m
Z
mZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ ddgZ G d de          Zd	d
e de de	 de de dz   e_        dee         dee         dee         dee         dee         dee         dee         dee         dedededeeef         dedededededef$dZdee         dee         dee         dee         dee         dee         dee         dee         dedededeeef         dedededededef$d Z dee         dee         dee         dee         dee         dee         dee         dee         dedededeeef         dedededededed!df&d"Z! ee#          	 	 	 	 	 	 	 d(dee         dee         dee         dee         dee         dee         d%ee         deded&ee         dee         dee         dededededeeef         dededef(d'            Z"dS ))    )castListOptionalTupleUnionN)Tensor   )_capturable_doc_default_to_fused_or_foreach_device_dtype_check_for_fused_differentiable_doc_disable_dynamo_if_unsupported_foreach_doc
_fused_doc!_get_capturable_supported_devices_get_scalar_dtype
_get_value_maximize_doc_stack_if_compiling_use_grad_for_differentiable_view_as_real
DeviceDict	OptimizerParamsTAdamWadamwc                        e Zd Z	 	 	 	 	 dddddddded	eeef         d
eeef         dedededede	e         dedede	e         f fdZ
 fdZd Zedd            Z xZS )r   MbP?g?g+?:0yE>{Gz?FN)maximizeforeach
capturabledifferentiablefusedparamslrbetasepsweight_decayamsgradr"   r#   r$   r%   r&   c                   t          |t                    r:|r|	st          d          |                                dk    rt          d          d|k    st          d|           d|k    st          d|           d|d         cxk    rdk     sn t          d	|d                    d|d         cxk    rdk     sn t          d
|d                    d|k    st          d|           t	          ||||||||	|
|
  
        }t                                          ||           |r)|
rt          d          d| _        |rt          d          d S d S )NElr as a Tensor is not supported for capturable=False and foreach=Truer	   zTensor lr must be 1-element        zInvalid learning rate: zInvalid epsilon value: r         ?z#Invalid beta parameter at index 0: z#Invalid beta parameter at index 1: zInvalid weight_decay value: )
r(   r)   r*   r+   r,   r#   r"   r$   r%   r&   z)`fused` does not support `differentiable`Tz0`fused` and `foreach` cannot be `True` together.)	
isinstancer   
ValueErrornumeldictsuper__init__RuntimeError_step_supports_amp_scaling)selfr'   r(   r)   r*   r+   r,   r"   r#   r$   r%   r&   defaults	__class__s                M/var/www/html/ai-engine/env/lib/python3.11/site-packages/torch/optim/adamw.pyr6   zAdamW.__init__!   s    b&!! 	@ z  [   xxzzQ !>???byy;r;;<<<czz<s<<===eAh$$$$$$$$M58MMNNNeAh$$$$$$$$M58MMNNNl""JLJJKKK%!)
 
 
 	*** 	W P"#NOOO.2D+ W"#UVVV	W 	WW W    c                    t                                          |           | j        D ]N}|                    dd           |                    dd           |                    dd            |                    dd           |                    dd           |                    dd           }|d         D ]}| j                            |g           }t          |          d	k    rt          j        |d
                   stt          |d
                   }|d         s|d         r*t          j
        |t          |          |j                  n!t          j
        |t                                |d
<   Pd S )Nr,   Fr"   r#   r$   r%   r&   r'   r   stepis_fuseddtypedevicerC   )r5   __setstate__param_groups
setdefaultstategetlentorch	is_tensorfloattensorr   rD   )r9   rI   groupr&   pp_statestep_valr;   s          r<   rF   zAdamW.__setstate__V   s   U###& 	 	EY...Z///Y---\5111-u555$$Wd33E8_  *..B//w<<1$$U_WV_-M-M$$WV_55H !.O
 38.O$"3U"C"C"C#$8    #\(:K:M:MNNN FO		 	r=   c	                    d}	|d         D ]E}
|
j         |	t          j        |
          z  }	|                    |
           |
j         j        rt          d          |                    |
j                    | j        |
         }t          |          dk    r|d         rt          |
           |d         s|d         r0t          j	        dt          |d                   |
j        	          n!t          j        d
t                                |d<   t          j        |
t          j                  |d<   t          j        |
t          j                  |d<   |r#t          j        |
t          j                  |d<   |                    |d                    |                    |d                    |d         r|                    |d                    |d         r|d         j        rt          d          |d         r2t!          |d         t"                    r|d         st          d          |                    |d                    G|	S )NFr'   z'AdamW does not support sparse gradientsr   r&   r$    r@   rB   r/   rE   r?   )memory_formatexp_avg
exp_avg_sqmax_exp_avg_sqr,   r%   zB`requires_grad` is not supported for `step` in differentiable moder#   r(   r.   )gradrL   
is_complexappend	is_sparser7   rI   rK   r   zerosr   rD   rO   
zeros_likepreserve_formatrequires_gradr1   r   )r9   rP   params_with_gradgradsr,   exp_avgsexp_avg_sqsmax_exp_avg_sqsstate_stepshas_complexrQ   rI   s               r<   _init_groupzAdamW._init_groupm   s    x <	. <	.Av~5+A...K##A&&&v N"#LMMMLL   JqME 5zzQ> 51!444 \*F
 /4GnFEK/wHHH x    c1B1D1DEEE f $)#3U%:$ $ $i  ',&6U%:' ' 'l#  .3.>)>/ / /E*+ OOE),---u\2333Y @&&u-='>???%& 5=+F "X   i uT{F33 l+
 #[   uV}----r=   c                 N   |                                   d}|5t          j                    5   |            }ddd           n# 1 swxY w Y   | j        D ]}g }g }g }g }g }g }	|d         }
t	          t
          t          t          f         |d                   \  }}|                     ||||
||||	          }t          ||||||	f|
|||d         |d         |d         |d         |d         |d	         |d
         |d         t          | dd          t          | dd          |d |S )zPerform a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr,   r)   r(   r+   r*   r"   r#   r$   r%   r&   
grad_scale	found_inf)r,   beta1beta2r(   r+   r*   r"   r#   r$   r%   r&   rk   rl   rh   )
 _cuda_graph_capture_health_checkrL   enable_gradrG   r   r   rN   ri   r   getattr)r9   closurelossrP   rb   rc   rd   re   rf   rg   r,   rm   rn   rh   s                 r<   r?   z
AdamW.step   s    	--///"$$ ! !wyy! ! ! ! ! ! ! ! ! ! ! ! ! ! ! & *	 *	E-/"$E%'H(*K,.O(*K!),GeUl 3U7^DDLE5** 	 	K    ;">2%Lz*i( .$%56Gn"4t<<!$T::')    . s   AA
A)r   r   r    r!   FN)__name__
__module____qualname__r   r   rN   r   r   boolr   r6   rF   ri   r   r?   __classcell__)r;   s   @r<   r   r       sV        $(%1"3W "& $ $3W 3W 3W3W %- 3W UE\"	3W
 3W 3W 3W 3W $3W 3W 3W ~3W 3W 3W 3W 3W 3Wj    .I I IV ": : : "!: : : : :r=   a  Implements AdamW algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma \text{(lr)}, \: \beta_1, \beta_2
                \text{(betas)}, \: \theta_0 \text{(params)}, \: f(\theta) \text{(objective)},
                \: \epsilon \text{ (epsilon)}                                                    \\
            &\hspace{13mm}      \lambda \text{(weight decay)},  \: \textit{amsgrad},
                \: \textit{maximize}                                                             \\
            &\textbf{initialize} : m_0 \leftarrow 0 \text{ (first moment)}, v_0 \leftarrow 0
                \text{ ( second moment)}, \: \widehat{v_0}^{max}\leftarrow 0              \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\

            &\hspace{5mm}\textbf{if} \: \textit{maximize}:                                       \\
            &\hspace{10mm}g_t           \leftarrow   -\nabla_{\theta} f_t (\theta_{t-1})          \\
            &\hspace{5mm}\textbf{else}                                                           \\
            &\hspace{10mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})           \\
            &\hspace{5mm} \theta_t \leftarrow \theta_{t-1} - \gamma \lambda \theta_{t-1}         \\
            &\hspace{5mm}m_t           \leftarrow   \beta_1 m_{t-1} + (1 - \beta_1) g_t          \\
            &\hspace{5mm}v_t           \leftarrow   \beta_2 v_{t-1} + (1-\beta_2) g^2_t          \\
            &\hspace{5mm}\widehat{m_t} \leftarrow   m_t/\big(1-\beta_1^t \big)                   \\
            &\hspace{5mm}\widehat{v_t} \leftarrow   v_t/\big(1-\beta_2^t \big)                   \\
            &\hspace{5mm}\textbf{if} \: amsgrad                                                  \\
            &\hspace{10mm}\widehat{v_t}^{max} \leftarrow \mathrm{max}(\widehat{v_t}^{max},
                \widehat{v_t})                                                                   \\
            &\hspace{10mm}\theta_t \leftarrow \theta_t - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}^{max}} + \epsilon \big)                                 \\
            &\hspace{5mm}\textbf{else}                                                           \\
            &\hspace{10mm}\theta_t \leftarrow \theta_t - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}} + \epsilon \big)                                       \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Decoupled Weight Decay Regularization`_.
    a  
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, Tensor, optional): learning rate (default: 1e-3). A tensor LR
            is not yet supported for all our implementations. Please use a float
            LR if you are not also specifying fused=True or capturable=True.
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay coefficient (default: 1e-2)
        amsgrad (bool, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)
        z	
        a8  
    .. Note::
        A prototype implementation of Adam and AdamW for MPS supports `torch.float32` and `torch.float16`.
    .. _Decoupled Weight Decay Regularization:
        https://arxiv.org/abs/1711.05101
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ

    r'   rc   rd   re   rf   rg   rk   rl   r,   rm   rn   r(   r+   r*   r"   r$   r%   rh   c       
            ||J t           j                                        rt          |t                    sJ t          |           D ]\  }}|s||         n||          }||         }||         }||         }t           j                                        sF|rDt                      }|j	        j
        |j	        j
        k    r|j	        j
        |v sJ d| d            t          j        |          rot          j        |          }t          j        |          }t          j        |          }|rt          j        ||                   ||<   t          j        |          }|dz  }|                    d||z  z
             |                    |d|	z
             |                    |
                              ||d|
z
             |s|r|}d|	|z  z
  }d|
|z  z
  }||z  }|                                }|                                }|r|r||                                         }n||         }||                             t          j        ||                     ||                                         ||z  z                      ||z            } n0|                                ||z  z                      ||z            } |                    ||            nt/          |          }d|	|z  z
  }d|
|z  z
  }||z  }|dz  }|rTt          j        ||         |||                    ||                                         |z                      |          } n*|                                |z                      |          } |                    || |            |r7t          j        | |                   rt          j        ||                   ||<   d S )NIIf capturable=True, params and state_steps must be on supported devices: .r	   )value      ?)out)rL   jitis_scriptingr1   rN   	enumerate_utilsis_compilingr   rD   typer[   view_as_realmul_lerp_addcmul_negsqrtclonecopy_maximumadd_addcdiv_r   view_as_complex)!r'   rc   rd   re   rf   rg   rk   rl   r,   rm   rn   r(   r+   r*   r"   r$   r%   rh   iparamrZ   rW   rX   step_tcapturable_supported_devicesr?   bias_correction1bias_correction2	step_sizestep_size_negbias_correction2_sqrtrY   denoms!                                    r<   _single_tensor_adamwr   =  sC   * )"3"3"3y % "e$$$$$f%% WK WK5'6uQxxeAhY1+ ^
Q |((** 	{z 	{+L+N+N(!V]%777L%)EEEEz[wzzz FEE E"" 	.%d++D(11G+J77J L%*%78J%K%K"&u--E 	! 	

1rL(())) 	dAI&&&''d!e)'DDD 3	= 3	=D 5$; 5$;--I%MMOOM$4$9$9$;$;! ,! 8%4Q%7%=%=%?%?NN%4Q%7N"((~z)R)RSSS $A&++--1F1VW$s]*++ 
 OO%%)>)NO$s]*++  NN7E****f%%D 5$; 5$;--I$4c$9! Noa0*/RSBTUUUU )+00225JJPPQTUU#**-BBHHMMNN7E)N<<<  	Ku'q	22 	K!&!6q7I!J!JOAoWK WKr=   c       
         *
  	
% t          |           dk    rd S t          t                    r|st          d          t          j                                        sI|rGt          d          %t          %fdt          | |          D                       sJ d% d            |r
J d            ||J t          j        | |||||g          }|                                D ]*\  \  }}}}}}}t          t          t                   |          }t          t          t                   |          }t          t          t                   |          }t          t          t                   |          }t          t          t                   |          }|rH|r4t          t          t                   |          }t          |||||           nt          ||||           |rt	          j        |          }t          j                                        s9|d         j        r,t	          j        |t	          j        d	d
          d	           nt	          j        |d           |dk    rt	          j        |d|z  z
             t	          j        ||d	z
             t	          j        |
           t	          j        |||d
z
             ~|r`t	          j        	|          } t	          j        
|          }!t	          j        | d           t	          j        |!d           t	          j        |!           t	          j        |            t	          j        |            t	          j        |!           | }"|!}#|rJt          t          t                   |          }t	          j        ||           t	          j        |          }$nt	          j        |          }$t	          j        |$|#           t	          j        |$|           t	          j        |$|"           t	          j        |||$           F	fd|D             } 
fd|D             }!tA          fd| D                       }"d |!D             }#|rJt          t          t                   |          }t	          j        ||           t	          j        |          }$nt	          j        |          }$t	          j        |$|#           t	          j        |$|           t	          j        |||$|"           ,d S )Nr   r.   F)supports_xlac              3   n   K   | ]/\  }}|j         j        |j         j        k    o|j         j        v V  0d S rt   )rD   r   ).0rQ   r?   r   s      r<   	<genexpr>z&_multi_tensor_adamw.<locals>.<genexpr>  s]       
 
 4 HMT[-- >!==
 
 
 
 
 
r=   r{   r|   z#_foreach ops don't support autogradr0   cpu)rD   )alphar	   c                 :    g | ]}d t          |          z  z
  S r	   r   )r   r?   rm   s     r<   
<listcomp>z'_multi_tensor_adamw.<locals>.<listcomp>H  8          26EZ----     r=   c                 :    g | ]}d t          |          z  z
  S r   r   )r   r?   rn   s     r<   r   z'_multi_tensor_adamw.<locals>.<listcomp>K  r   r=   c                      g | ]
}|z  d z  S )rU   )r   bcr(   s     r<   r   z'_multi_tensor_adamw.<locals>.<listcomp>O  s!    ,W,W,Wb2g^,W,W,Wr=   c                     g | ]}|d z  S )r~   rU   )r   r   s     r<   r   z'_multi_tensor_adamw.<locals>.<listcomp>Q  s)     % % %C% % %r=   )!rK   r1   r   r7   rL   r   r   r   allzipr   "_group_tensors_by_device_and_dtypevaluesr   r   r   _foreach_negis_cpu_foreach_add_rO   _foreach_mul__foreach_lerp__foreach_addcmul__foreach_pow_foreach_sub__foreach_neg__foreach_div__foreach_reciprocal__foreach_sqrt__foreach_maximum__foreach_sqrt_foreach_addcdiv_r   )&r'   rc   rd   re   rf   rg   rk   rl   r,   rm   rn   r(   r+   r*   r"   r$   r%   rh   grouped_tensorsdevice_params_device_grads_device_exp_avgs_device_exp_avg_sqs_device_max_exp_avg_sqs_device_state_steps__device_paramsdevice_gradsdevice_exp_avgsdevice_exp_avg_sqsdevice_state_stepsdevice_max_exp_avg_sqsr   r   r   r   exp_avg_sq_sqrtr   s&            ```                         @r<   _multi_tensor_adamwr     s   * 6{{a"f 
j 
S
 
 	

 <$$&& w: w'H(
 (
 (
$  
 
 
 
 v{33
 
 
 
 
 	w 	w wWsvvv		w 	w 	w DDDDDD)"3"3"3B	+L O ""$$D D 		 	T&\>::DL-88tF|-=>>!$v,0CDD!$v,0CDD 	 )-d6l<S)T)T&! #&*    !<BT    	< -l;;L |((** 	7/A!/D/K 	7"ELU$C$C$C3      2A666 1q23D/DEEE 	_lAIFFF.666lAI	
 	
 	

   F	$1%9KLL$1%9KLL 0!444 0!444 0111  0"555&'7888 !1222
 )I$4! 	J)-d6l<S)T)T& '(>@RSSS #("56L"M"M"'"56H"I"I1FGGG555;;; #M?OTTTT       :L            :L      ,,W,W,W,WFV,W,W,WXXI% %"2% % %!  	J)-d6l<S)T)T& '(>@RSSS #("56L"M"M"'"56H"I"I1FGGG555#	   D Dr=   returnc       
            | sd S |rt          d          |	|j        |ini }|	|j        |ini }t          |t                    r!t	          |j                  dk    r	|j        |ind }t          j        | |||||g          }|                                D ]\  \  }}\  \  }}}}}}}t          t          t                   |          }t          t          t                   |          }t          t          t                   |          } t          t          t                   |          }!t          t          t                   |          }"|j
        dk    r||J d\  }#}$|+|                    ||                    |d                    }#|+|                    ||                    |d                    }$|/||vr+|                    ||                    |d                    }t          j        |"d           t          j        ||| |!||"|||	|
||||#|$	           |$&t          j        |"|$gt#          |"          z             d S )
Nz9Adam with fused=True does not support differentiable=Truer   mps)NNT)non_blocking)rD   r   r	   )	r,   r(   rm   rn   r+   r*   r"   rk   rl   )r7   rD   r1   r   strr   r   itemsr   r   r   rH   torL   r   _fused_adamw_r   rK   )%r'   rc   rd   re   rf   rg   rk   rl   r,   rm   rn   r(   r+   r*   r"   r$   r%   rh   grad_scale_dictfound_inf_dictlr_dictr   rD   r   r   r   r   r   r   r   r   r   r   r   r   device_grad_scaledevice_found_infs%                                        r<   _fused_adamwr   j  s   *   XVWWW ,6+A	J''r  *3)>	9%%B  &b&11Wc")nn6M6MBSW   B	+L O 
			 	 6 6 
	 
	
"	T&\>::DL-88tF|-=>>!$v,0CDD!$v,0CDD;%$);););.8++! / : :
f4@@! !  -88	V$??    6#8#8##V$?? B 	.222"%(&	
 	
 	
 	
" '"%5$6=O9P9P$P  i6 6r=   )single_tensor_fnFr#   r&   c                   t           j                                        s(t          d |D                       st	          d          |	2|0t          | |d          \  }}|rt          |t                    r|sd}|	d}	|d}|r-t           j        	                                rt	          d          |	r-t           j        	                                rt	          d          |	r&t           j        	                                st          }n/|r&t           j        	                                st          }nt          } || |||||||||||||||
||           dS )	zpFunctional API that performs AdamW algorithm computation.

    See :class:`~torch.optim.AdamW` for details.
    c              3   J   K   | ]}t          |t          j                  V  d S rt   )r1   rL   r   )r   ts     r<   r   zadamw.<locals>.<genexpr>  s?       3 3()
1el##3 3 3 3 3 3r=   zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsNF)	use_fusedz6torch.jit.script not supported with foreach optimizersz4torch.jit.script not supported with fused optimizers)r,   rm   rn   r(   r+   r*   r"   r$   r%   rk   rl   rh   )rL   r   r   r   r7   r   r1   r   r   r   r   r   r   )r'   rc   rd   re   rf   rg   r#   r$   r%   r&   rk   rl   rh   r,   rm   rn   r(   r+   r*   r"   r   funcs                         r<   r   r     s   < <$$&& 
s 3 3-83 3 3 0 0 
 ^
 
 	
 }1Ne
 
 

7  	z"f-- 	j 	G} U59))++ USTTT S'')) SQRRR $UY++-- $	 $//11 $"#D!%%     r=   )NFFNNNF)#typingr   r   r   r   r   rL   r   	optimizerr
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   __all__r   __doc__rx   rN   r   r   r   r   rU   r=   r<   <module>r      sO   6 5 5 5 5 5 5 5 5 5 5 5 5 5                                            * G
S S S S SI S S Sn&N 
   
! " 
# $ 
% & 
'  OB NtKLtK<tK 6ltK f	tK
 &\tK ftK  tK tK tK tK tK 	femtK tK 
tK  !tK" #tK$ %tK& 'tK tK tK tKnsLs<s 6ls f	s
 &\s fs  s s s s s 	fems s 
s  !s" #s$ %s& 's s s sl`L`<` 6l` f	`
 &\` f`  ` ` ` ` ` 	fem` ` 
`  !`" #`$ %`& '`( 
)` ` ` `F  1EFFF #  #'"&S SLS<S 6lS f	S
 &\S fS d^S S S D>S  S S S" #S$ %S& 'S( 	eVm)S* +S, 
-S. /S S S GFS S Sr=   