
import math
import torch
from torch.optim.optimizer import Optimizer


class AdaBelief(Optimizer):
    r"""Implements AdaBelief algorithm. Modified from Adam in PyTorch

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-16)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False)
        decoupled_decay (boolean, optional): (default: True) If set as True, then
            the optimizer uses decoupled weight decay as in AdamW
        fixed_decay (boolean, optional): (default: False) This is used when decoupled_decay
            is set as True.
            When fixed_decay == True, the weight decay is performed as
            $W_{new} = W_{old} - W_{old} \times decay$.
            When fixed_decay == False, the weight decay is performed as
            $W_{new} = W_{old} - W_{old} \times decay \times lr$. Note that in this case, the
            weight decay ratio decreases with learning rate (lr).
        rectify (boolean, optional): (default: True) If set as True, then perform the rectified
            update similar to RAdam
        degenerated_to_sgd (boolean, optional): (default: True) If set as True, then perform SGD update
            when variance of gradient is high
    reference: AdaBelief Optimizer, adapting stepsizes by the belief in observed gradients, NeurIPS 2020
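
    In simplified form, the update described in the reference above is:
        $m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t$
        $s_t = \beta_2 s_{t-1} + (1 - \beta_2) (g_t - m_t)^2 + \epsilon$
        $\theta_t = \theta_{t-1} - lr \cdot \hat{m}_t / (\sqrt{\hat{s}_t} + \epsilon)$
    where $\hat{m}_t$ and $\hat{s}_t$ are the bias-corrected first and second ("belief") moments.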

    For a complete table of recommended hyperparameters, see https://github.com/juntang-zhuang/Adabelief-Optimizer
    For example train/args for EfficientNet, see these gists:
      - link to train_script: https://gist.github.com/juntang-zhuang/0a501dd51c02278d952cf159bc233037
      - link to args.yaml: https://gist.github.com/juntang-zhuang/517ce3c27022b908bb93f78e4f786dc3
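
    A minimal usage sketch (`model`, `criterion` and `loader` below are stand-ins for the
    user's own objects, not part of this module):

        optimizer = AdaBelief(model.parameters(), lr=1e-3, eps=1e-16, weight_decay=1e-2)
        for input, target in loader:
            optimizer.zero_grad()
            loss = criterion(model(input), target)
            loss.backward()
            optimizer.step()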
    """

    def __init__(
            self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, weight_decay=0, amsgrad=False,
            decoupled_decay=True, fixed_decay=False, rectify=True, degenerated_to_sgd=True):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))

        if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict):
            for param in params:
                if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]):
                    param['buffer'] = [[None, None, None] for _ in range(10)]

        defaults = dict(
            lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad,
            degenerated_to_sgd=degenerated_to_sgd, decoupled_decay=decoupled_decay, rectify=rectify,
            fixed_decay=fixed_decay, buffer=[[None, None, None] for _ in range(10)])
        super(AdaBelief, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(AdaBelief, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    @torch.no_grad()
    def reset(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                amsgrad = group['amsgrad']

                # State initialization
                state['step'] = 0
                # Exponential moving average of gradient values
                state['exp_avg'] = torch.zeros_like(p)
                # Exponential moving average of squared gradient values
                state['exp_avg_var'] = torch.zeros_like(p)
                if amsgrad:
                    # Maintains max of all exp. moving avg. of sq. grad. values
                    state['max_exp_avg_var'] = torch.zeros_like(p)
   d}|5t          j                    5   |            }ddd           n# 1 swxY w Y   | j        D ]}|d         D ]}|j        |j        }|j        t           j        t           j        hv r|                                }|j        rt          d          |}|j        t           j        t           j        hv r|                                }|d         }|d         \  }}	| j
        |         }
t          |
          dk    rLd|
d<   t          j        |          |
d<   t          j        |          |
d	<   |rt          j        |          |
d
<   |d         rO|d         s(|                    d|d         |d         z  z
             nH|                    d|d         z
             n)|d         dk    r|                    ||d                    |
d         |
d	         }}|
dxx         dz  cc<   d||
d         z  z
  }d|	|
d         z  z
  }|                    |                              |d|z
             ||z
  }|                    |	                              ||d|	z
             |r{|
d
         }t          j        ||                    |d                   |           |                                t%          j        |          z                      |d                   }n[|                    |d                                                   t%          j        |          z                      |d                   }|d         s&|d         |z  }|                    |||            nm|d         t)          |
d         dz                     }|
d         |d         k    r|d         |d         }}n|
d         |d<   |	|
d         z  }dd|	z
  z  dz
  }|d|
d         z  |z  d|z
  z  z
  }||d<   |dk    rEt%          j        d|z
  |dz
  z  |dz
  z  |dz
  z  |z  |z  |dz
  z            d||
d         z  z
  z  }n|d         rdd||
d         z  z
  z  }nd}||d<   |dk    rP|                                                    |d                   }|                    ||| |d         z             n'|dk    r!|                    || |d         z             |j        t           j        t           j        hv r|                    |           Ԑ|S )zPerforms a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr,   zOAdaBelief does not support sparse gradients, please consider SparseAdam insteadr   r   r   r7   r8   r9   r:   r   r    r
   r   r   )alphar   )valuer   )outr   r   r            r   )r;   enable_gradr2   graddtypefloat16bfloat16float	is_sparseRuntimeErrorr4   r&   r<   mul_add_addcmul_maxsqrtmathaddcdiv_intcopy_)r+   closurelossr5   r=   rH   p_fp32r   beta1beta2r4   r8   r9   bias_correction1bias_correction2grad_residualr:   denom	step_sizebufferednum_smabeta2_tnum_sma_maxs                          r   r7   zAdaBelief.stepY   s    "$$ ! !wyy! ! ! ! ! ! ! ! ! ! ! ! ! ! ! & b	$ b	$E8_ a$ a$6>v:%-!@@@::<<D> k&ik k k 7u}en===#\\^^F	*$W~u
1u::??$%E&M','7'?'?E)$+0+;F+C+CE-( L383CF3K3K/0 *+ G / AC%+n8M*M$MNNNNC%*?$?@@@@^,11		&n0E	FFF (-Y'7}9Mf"#$uf'=#= #$uf'=#=  U##((QY(??? $w  ''00UVY^U^0___ u&+,=&>OIo{/?/?e/M/MSbcccc -1133di@P6Q6QQWWX]^cXdeeEE(--eEl;;@@BBTYO_E`E``ffglmrgsttE Y' !M %d.> >IOOGU9*OEEEE  %Xs5=23E/F/FGHV}33-5a[(1+&+Fm"'5="8&'1u9o&9"-E&M0AG0KqSZ{0["[&- #a<<(,	!"W!(1!.1<q!B!(1!.07!8 !,!, 0;Q!@)A )A EFQVW]Q^H^D^	)`II
 ##78 +(+q5E&M3I/I(JII(*I&/!|| + 0 0 2 2 7 7e E Eyj5QU;>VWWWW"QGI:d3KLLL7u}en===GGFOOOCa$F s   /33)	r   r   r   r   FTFTT)N)__name__