""" Adafactor Optimizer

Lifted from https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py

Original header/copyright below.

"""
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
import math


class Adafactor(torch.optim.Optimizer):
    """Implements Adafactor algorithm.
    This implementation is based on: `Adafactor: Adaptive Learning Rates with Sublinear Memory Cost`
    (see https://arxiv.org/abs/1804.04235)

    Note that this optimizer internally adjusts the learning rate depending on the
    *scale_parameter*, *relative_step* and *warmup_init* options.

    To use a manual (external) learning rate schedule you should set `scale_parameter=False` and
    `relative_step=False`.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining parameter groups
        lr (float, optional): external learning rate (default: None)
        eps (tuple[float, float]): regularization constants for square gradient
            and parameter scale respectively (default: (1e-30, 1e-3))
        clip_threshold (float): threshold of root mean square of final gradient update (default: 1.0)
        decay_rate (float): coefficient used to compute running averages of square gradient (default: -0.8)
        beta1 (float): coefficient used for computing running averages of gradient (default: None)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        scale_parameter (bool): if True, learning rate is scaled by root mean square of parameter (default: True)
        warmup_init (bool): time-dependent learning rate computation depends on
            whether warm-up initialization is being used (default: False)
    """

    def __init__(self, params, lr=None, eps=1e-30, eps_scale=1e-3, clip_threshold=1.0,
                 decay_rate=-0.8, betas=None, weight_decay=0.0, scale_parameter=True, warmup_init=False):
        relative_step = not lr
        if warmup_init and not relative_step:
            raise ValueError('warmup_init requires relative_step=True')

        # compatibility with the standard betas argument: only beta1 is used
        beta1 = None if betas is None else betas[0]
        defaults = dict(
            lr=lr, eps=eps, eps_scale=eps_scale, clip_threshold=clip_threshold, decay_rate=decay_rate,
            beta1=beta1, weight_decay=weight_decay, scale_parameter=scale_parameter,
            relative_step=relative_step, warmup_init=warmup_init)
        super(Adafactor, self).__init__(params, defaults)

    @staticmethod
    def _get_lr(param_group, param_state):
        if param_group['relative_step']:
            min_step = 1e-6 * param_state['step'] if param_group['warmup_init'] else 1e-2
            lr_t = min(min_step, 1.0 / math.sqrt(param_state['step']))
            param_scale = 1.0
            if param_group['scale_parameter']:
                param_scale = max(param_group['eps_scale'], param_state['RMS'])
            param_group['lr'] = lr_t * param_scale
        return param_group['lr']

    @staticmethod
    def _get_options(param_group, param_shape):
        factored = len(param_shape) >= 2
        use_first_moment = param_group['beta1'] is not None
        return factored, use_first_moment

    @staticmethod
    def _rms(tensor):
        return tensor.norm(2.0) / (tensor.numel() ** 0.5)

    def _approx_sq_grad(self, exp_avg_sq_row, exp_avg_sq_col):
        r_factor = (exp_avg_sq_row / exp_avg_sq_row.mean(dim=-1, keepdim=True)).rsqrt_().unsqueeze(-1)
        c_factor = exp_avg_sq_col.unsqueeze(-2).rsqrt()
        return torch.mul(r_factor, c_factor)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                if grad.dtype in {torch.float16, torch.bfloat16}:
                    grad = grad.float()
                if grad.is_sparse:
                    raise RuntimeError('Adafactor does not support sparse gradients.')

                state = self.state[p]

                factored, use_first_moment = self._get_options(group, grad.shape)
                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    if use_first_moment:
                        # Exponential moving average of gradient values
                        state['exp_avg'] = torch.zeros_like(grad)
                    if factored:
                        # Factored second moment: one row vector and one column vector
                        state['exp_avg_sq_row'] = torch.zeros(grad.shape[:-1]).to(grad)
                        state['exp_avg_sq_col'] = torch.zeros(grad.shape[:-2] + grad.shape[-1:]).to(grad)
                    else:
                        state['exp_avg_sq'] = torch.zeros_like(grad)
                    state['RMS'] = 0
                else:
                    if use_first_moment:
                        state['exp_avg'] = state['exp_avg'].to(grad)
                    if factored:
                        state['exp_avg_sq_row'] = state['exp_avg_sq_row'].to(grad)
                        state['exp_avg_sq_col'] = state['exp_avg_sq_col'].to(grad)
                    else:
                        state['exp_avg_sq'] = state['exp_avg_sq'].to(grad)

                p_fp32 = p
                if p.dtype in {torch.float16, torch.bfloat16}:
                    p_fp32 = p_fp32.float()

                state['step'] += 1
                state['RMS'] = self._rms(p_fp32)
                lr_t = self._get_lr(group, state)

                beta2t = 1.0 - math.pow(state['step'], group['decay_rate'])
                update = grad ** 2 + group['eps']
                if factored:
                    exp_avg_sq_row = state['exp_avg_sq_row']
                    exp_avg_sq_col = state['exp_avg_sq_col']

                    exp_avg_sq_row.mul_(beta2t).add_(update.mean(dim=-1), alpha=1.0 - beta2t)
                    exp_avg_sq_col.mul_(beta2t).add_(update.mean(dim=-2), alpha=1.0 - beta2t)

                    # Approximation of the exponential moving average of the squared gradient
                    update = self._approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col)
                    update.mul_(grad)
                else:
                    exp_avg_sq = state['exp_avg_sq']

                    exp_avg_sq.mul_(beta2t).add_(update, alpha=1.0 - beta2t)
                    update = exp_avg_sq.rsqrt().mul_(grad)

                update.div_((self._rms(update) / group['clip_threshold']).clamp_(min=1.0))
                update.mul_(lr_t)

                if use_first_moment:
                    exp_avg = state['exp_avg']
                    exp_avg.mul_(group['beta1']).add_(update, alpha=1 - group['beta1'])
                    update = exp_avg

                if group['weight_decay'] != 0:
                    p_fp32.add_(p_fp32, alpha=-group['weight_decay'] * lr_t)

                p_fp32.add_(-update)
                if p.dtype in {torch.float16, torch.bfloat16}:
                    p.copy_(p_fp32)

        return loss