
    Ng#                     B    d Z ddlZddlZddlmZ  G d de          ZdS )aP   PyTorch Lamb optimizer w/ behaviour similar to NVIDIA FusedLamb

This optimizer code was adapted from the following (starting with latest)
* https://github.com/HabanaAI/Model-References/blob/2b435114fe8e31f159b1d3063b8280ae37af7423/PyTorch/nlp/bert/pretraining/lamb.py
* https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/Transformer-XL/pytorch/lamb.py
* https://github.com/cybertronai/pytorch-lamb

Use FusedLamb if you can (GPU). The reason for including this variant of Lamb is to have a version that is
similar in behaviour to APEX FusedLamb if you aren't using NVIDIA GPUs or cannot install/use APEX.

In addition to some cleanup, this Lamb impl has been modified to support PyTorch XLA and has been tested on TPU.

Original copyrights for above sources are below.

Modifications Copyright 2021 Ross Wightman
    N)	Optimizerc                   Z     e Zd ZdZ	 	 d fd		Z ej                    dd            Z xZS )Lamba  Implements a pure pytorch variant of FuseLAMB (NvLamb variant) optimizer from apex.optimizers.FusedLAMB
    reference: https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/Transformer-XL/pytorch/lamb.py

    LAMB was proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its norm. (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        grad_averaging (bool, optional): whether apply (1-beta2) to grad when
            calculating running averages of gradient. (default: True)
        max_grad_norm (float, optional): value used to clip global grad norm (default: 1.0)
        trust_clip (bool): enable LAMBC trust ratio clipping (default: False)
        always_adapt (boolean, optional): Apply adaptive learning rate to 0.0
            weight decay parameter (default: False)

    .. _Large Batch Optimization for Deep Learning - Training BERT in 76 minutes:
        https://arxiv.org/abs/1904.00962
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    MbP?Tg?g+?ư>{Gz?      ?Fc                 |    t          ||||||||	|
	  	        }t                                          ||           d S )N)	lrbias_correctionbetasepsweight_decaygrad_averagingmax_grad_norm
trust_clipalways_adapt)dictsuper__init__)selfparamsr   r   r   r   r   r   r   r   r   defaults	__class__s               K/var/www/html/ai-engine/env/lib/python3.11/site-packages/timm/optim/lamb.pyr   zLamb.__init__W   sQ     ?%SWc)!> > > 	*****    Nc           
         d}|5t          j                    5   |            }ddd           n# 1 swxY w Y   | j        d         d         d         j        }t          j        d|          }t          j        d|          }| j        D ]l}|d         D ]a}|j        
|j        }|j        rt          d          |	                    |
                    d                                                     bmt          j        |          }t          j        | j        d	         |          }	t          j        ||	k    ||	z  |          }
| j        D ]}|d
         rdnd}|d         \  }}|d         rdnd}|rd|z
  nd}d|v r|dxx         dz  cc<   nd|d<   |rd||d         z  z
  }d||d         z  z
  }nd\  }}|d         D ]}|j        |j                            |
          }| j        |         }t#          |          dk    r.t          j        |          |d<   t          j        |          |d<   |d         |d         }}|                    |          	                    ||           |                    |                              ||d|z
             |                                t+          j        |          z  	                    |d                   }||z                      |          }|d         }|dk    r|	                    ||           |dk    s|d         r|                    d          }|                    d          }t          j        |dk    t          j        |dk    ||z  |          |          }|d         rt          j        ||          }|                    |           |	                    ||d                     |S )zPerforms a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr   r   r
   )device   zCLamb does not support sparse gradients, consider SparseAdam instad.   r   r   r   r   step)r
   r
   exp_avg
exp_avg_sq)alpha)valuer   r   r   g       @r   r   )torchenable_gradparam_groupsr   tensorzerosgrad	is_sparseRuntimeErroradd_powsumsqrtr   wherediv_statelen
zeros_likemul_addcmul_mathnormminimum)r   closurelossr   
one_tensorglobal_grad_normgrouppr,   r   clip_global_grad_normr   beta1beta2r   beta3bias_correction1bias_correction2r5   r#   r$   denomupdater   w_normg_normtrust_ratios                              r   r"   z	Lamb.step`   s{    "$$ ! !wyy! ! ! ! ! ! ! ! ! ! ! ! ! ! ! "1%h/29\#f555
 ;q888& 	9 	9E8_ 9 96>v> n&'lmmm %%dhhqkkoo&7&788889 !:&677 T]?%CFSSS %},},! !
 & =	3 =	3E#():#;BaaO >LE5"'(8"9@QQqN!/8AIISE f" !f >#$uf'=#= #$uf'=#=  5=2 "28_ *3 *36>v{{#899
1 u::??','7':':E)$*/*:1*=*=E,'&+I&6l8K U##((U(;;;&&//d!e)/LLL#**TY7G-H-HHNNuUZ|\\!$44::5AA$^41$$KKK6661$$n(=$ VVC[[F#[[--F"'+
FQJLL"# #K
 \* M&+mK&L&LKK,,,veDk\2222U*3X s   /33)	r   Tr   r   r	   Tr
   FF)N)	__name__
__module____qualname____doc__r   r'   no_gradr"   __classcell__)r   s   @r   r   r   <   s         6 RVfk+ + + + + + U]___ _ _ __ _ _ _ _r   r   )rQ   r:   r'   torch.optimr   r    r   r   <module>rV      st    j   ! ! ! ! ! !D D D D D9 D D D D Dr   