
"""Functions and classes related to optimization (weight updates)."""

import re
from typing import Callable, List, Optional, Union

import tensorflow as tf


try:
    from tf_keras.optimizers.legacy import Adam
except (ImportError, ModuleNotFoundError):
    from tensorflow.keras.optimizers.legacy import Adam

from .modeling_tf_utils import keras


# Keras has moved the learning rate schedule classes between versions; resolve the module that holds them.
if hasattr(keras.optimizers.schedules, "learning_rate_schedule"):
    schedules = keras.optimizers.schedules.learning_rate_schedule
else:
    schedules = keras.optimizers.schedules


class WarmUp(schedules.LearningRateSchedule):
    """
    Applies a warmup schedule on a given learning rate decay schedule.
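
    Example (a minimal sketch, not part of the original documentation; the learning rate, decay schedule, and step
    counts below are illustrative placeholders):

    ```python
    import tensorflow as tf

    from transformers import WarmUp

    decay_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=2e-5, decay_steps=9000, end_learning_rate=0.0
    )
    schedule = WarmUp(initial_learning_rate=2e-5, decay_schedule_fn=decay_fn, warmup_steps=1000)
    optimizer = tf.keras.optimizers.Adam(learning_rate=schedule)
    ```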

    Args:
        initial_learning_rate (`float`):
            The initial learning rate for the schedule after the warmup (so this will be the learning rate at the end
            of the warmup).
        decay_schedule_fn (`Callable`):
            The schedule function to apply after the warmup for the rest of training.
        warmup_steps (`int`):
            The number of steps for the warmup part of training.
        power (`float`, *optional*, defaults to 1.0):
            The power to use for the polynomial warmup (the default is a linear warmup).
        name (`str`, *optional*):
            Optional name prefix for the returned tensors during the schedule.
    """

    def __init__(
        self,
        initial_learning_rate: float,
        decay_schedule_fn: Callable,
        warmup_steps: int,
        power: float = 1.0,
        name: str = None,
    ):
        super().__init__()
        self.initial_learning_rate = initial_learning_rate
        self.warmup_steps = warmup_steps
        self.power = power
        self.decay_schedule_fn = decay_schedule_fn
        self.name = name

    def __call__(self, step):
        with tf.name_scope(self.name or "WarmUp") as name:
            # Implements polynomial warmup: while global_step < warmup_steps, the learning rate is
            # `(global_step / warmup_steps) ** power * initial_learning_rate`.
            global_step_float = tf.cast(step, tf.float32)
            warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
            warmup_percent_done = global_step_float / warmup_steps_float
            warmup_learning_rate = self.initial_learning_rate * tf.math.pow(warmup_percent_done, self.power)
            return tf.cond(
                global_step_float < warmup_steps_float,
                lambda: warmup_learning_rate,
                lambda: self.decay_schedule_fn(step - self.warmup_steps),
                name=name,
            )

    def get_config(self):
        return {
            "initial_learning_rate": self.initial_learning_rate,
            "decay_schedule_fn": self.decay_schedule_fn,
            "warmup_steps": self.warmup_steps,
            "power": self.power,
            "name": self.name,
        }


def create_optimizer(
    init_lr: float,
    num_train_steps: int,
    num_warmup_steps: int,
    min_lr_ratio: float = 0.0,
    adam_beta1: float = 0.9,
    adam_beta2: float = 0.999,
    adam_epsilon: float = 1e-8,
    adam_clipnorm: Optional[float] = None,
    adam_global_clipnorm: Optional[float] = None,
    weight_decay_rate: float = 0.0,
    power: float = 1.0,
    include_in_weight_decay: Optional[List[str]] = None,
):
    """
    Creates an optimizer with a learning rate schedule using a warmup phase followed by a linear decay.
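
    Example (a minimal usage sketch; the hyperparameter values below are illustrative placeholders):

    ```python
    from transformers import create_optimizer

    optimizer, lr_schedule = create_optimizer(
        init_lr=2e-5,
        num_train_steps=10000,
        num_warmup_steps=1000,
        weight_decay_rate=0.01,
    )
    # `optimizer` can then be passed to `model.compile(...)` or used in a custom training loop.
    ```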

    Args:
        init_lr (`float`):
            The desired learning rate at the end of the warmup phase.
        num_train_steps (`int`):
            The total number of training steps.
        num_warmup_steps (`int`):
            The number of warmup steps.
        min_lr_ratio (`float`, *optional*, defaults to 0):
            The final learning rate at the end of the linear decay will be `init_lr * min_lr_ratio`.
        adam_beta1 (`float`, *optional*, defaults to 0.9):
            The beta1 to use in Adam.
        adam_beta2 (`float`, *optional*, defaults to 0.999):
            The beta2 to use in Adam.
        adam_epsilon (`float`, *optional*, defaults to 1e-8):
            The epsilon to use in Adam.
        adam_clipnorm (`float`, *optional*, defaults to `None`):
            If not `None`, clip the gradient norm for each weight tensor to this value.
        adam_global_clipnorm (`float`, *optional*, defaults to `None`):
            If not `None`, clip gradient norm to this value. When using this argument, the norm is computed over all
            weight tensors, as if they were concatenated into a single vector.
        weight_decay_rate (`float`, *optional*, defaults to 0):
            The weight decay to use.
        power (`float`, *optional*, defaults to 1.0):
            The power to use for PolynomialDecay.
        include_in_weight_decay (`List[str]`, *optional*):
            List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
            applied to all parameters except bias and layer norm parameters.
    """
    # Implements a linear (polynomial, with the given power) decay of the learning rate after the warmup.
    lr_schedule = schedules.PolynomialDecay(
        initial_learning_rate=init_lr,
        decay_steps=num_train_steps - num_warmup_steps,
        end_learning_rate=init_lr * min_lr_ratio,
        power=power,
    )
    if num_warmup_steps:
        lr_schedule = WarmUp(
            initial_learning_rate=init_lr,
            decay_schedule_fn=lr_schedule,
            warmup_steps=num_warmup_steps,
        )
    if weight_decay_rate > 0.0:
        optimizer = AdamWeightDecay(
            learning_rate=lr_schedule,
            weight_decay_rate=weight_decay_rate,
            beta_1=adam_beta1,
            beta_2=adam_beta2,
            epsilon=adam_epsilon,
            clipnorm=adam_clipnorm,
            global_clipnorm=adam_global_clipnorm,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
            include_in_weight_decay=include_in_weight_decay,
        )
    else:
        optimizer = keras.optimizers.Adam(
            learning_rate=lr_schedule,
            beta_1=adam_beta1,
            beta_2=adam_beta2,
            epsilon=adam_epsilon,
            clipnorm=adam_clipnorm,
            global_clipnorm=adam_global_clipnorm,
        )
    # Return the schedule alongside the optimizer so the learning rate can be tracked during training.
    return optimizer, lr_schedule


class AdamWeightDecay(Adam):
    """
    Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the
    loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact
    with the m and v parameters in strange ways as shown in [Decoupled Weight Decay
    Regularization](https://arxiv.org/abs/1711.05101).

    Instead we want to decay the weights in a manner that doesn't interact with the m/v parameters. This is equivalent
    to adding the square of the weights to the loss with plain (non-momentum) SGD.
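
    Example (a minimal sketch; the hyperparameters below are illustrative placeholders, not defaults):

    ```python
    from transformers import AdamWeightDecay

    optimizer = AdamWeightDecay(
        learning_rate=3e-5,
        weight_decay_rate=0.01,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
    )
    ```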

    Args:
        learning_rate (`Union[float, LearningRateSchedule]`, *optional*, defaults to 0.001):
            The learning rate to use or a schedule.
        beta_1 (`float`, *optional*, defaults to 0.9):
            The beta1 parameter in Adam, which is the exponential decay rate for the 1st momentum estimates.
        beta_2 (`float`, *optional*, defaults to 0.999):
            The beta2 parameter in Adam, which is the exponential decay rate for the 2nd momentum estimates.
        epsilon (`float`, *optional*, defaults to 1e-07):
            The epsilon parameter in Adam, which is a small constant for numerical stability.
        amsgrad (`bool`, *optional*, defaults to `False`):
            Whether to apply AMSGrad variant of this algorithm or not, see [On the Convergence of Adam and
            Beyond](https://arxiv.org/abs/1904.09237).
        weight_decay_rate (`float`, *optional*, defaults to 0.0):
            The weight decay to apply.
        include_in_weight_decay (`List[str]`, *optional*):
            List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
            applied to all parameters by default (unless they are in `exclude_from_weight_decay`).
        exclude_from_weight_decay (`List[str]`, *optional*):
            List of the parameter names (or re patterns) to exclude from applying weight decay to. If
            `include_in_weight_decay` is passed, the names in it will supersede this list.
        name (`str`, *optional*, defaults to `"AdamWeightDecay"`):
            Optional name for the operations created when applying gradients.
        kwargs (`Dict[str, Any]`, *optional*):
            Keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, `decay`}. `clipnorm` clips gradients by
            norm; `clipvalue` clips gradients by value; `decay` is included for backward compatibility to allow time
            inverse decay of the learning rate. `lr` is included for backward compatibility; it is recommended to use
            `learning_rate` instead.
    """

    def __init__(
        self,
        learning_rate: Union[float, schedules.LearningRateSchedule] = 0.001,
        beta_1: float = 0.9,
        beta_2: float = 0.999,
        epsilon: float = 1e-7,
        amsgrad: bool = False,
        weight_decay_rate: float = 0.0,
        include_in_weight_decay: Optional[List[str]] = None,
        exclude_from_weight_decay: Optional[List[str]] = None,
        name: str = "AdamWeightDecay",
        **kwargs,
    ):
        super().__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
        self.weight_decay_rate = weight_decay_rate
        self._include_in_weight_decay = include_in_weight_decay
        self._exclude_from_weight_decay = exclude_from_weight_decay

    @classmethod
    def from_config(cls, config):
        """Creates an optimizer from its config with WarmUp custom object."""
        custom_objects = {"WarmUp": WarmUp}
        return super(AdamWeightDecay, cls).from_config(config, custom_objects=custom_objects)

    def _prepare_local(self, var_device, var_dtype, apply_state):
        super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype, apply_state)
        apply_state[(var_device, var_dtype)]["weight_decay_rate"] = tf.constant(
            self.weight_decay_rate, name="adam_weight_decay_rate"
        )

    def _decay_weights_op(self, var, learning_rate, apply_state):
        do_decay = self._do_use_weight_decay(var.name)
        if do_decay:
            return var.assign_sub(
                learning_rate * var * apply_state[(var.device, var.dtype.base_dtype)]["weight_decay_rate"],
                use_locking=self._use_locking,
            )
        return tf.no_op()

    def apply_gradients(self, grads_and_vars, name=None, **kwargs):
        grads, tvars = list(zip(*grads_and_vars))
        return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars), name=name, **kwargs)

    def _get_lr(self, var_device, var_dtype, apply_state):
        """Retrieves the learning rate with the given state."""
        if apply_state is None:
            return self._decayed_lr_t[var_dtype], {}

        apply_state = apply_state or {}
        coefficients = apply_state.get((var_device, var_dtype))
        if coefficients is None:
            coefficients = self._fallback_apply_state(var_device, var_dtype)
            apply_state[(var_device, var_dtype)] = coefficients

        return coefficients["lr_t"], {"apply_state": apply_state}

    def _resource_apply_dense(self, grad, var, apply_state=None):
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_dense(grad, var, **kwargs)

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_sparse(grad, var, indices, **kwargs)

    def get_config(self):
        config = super().get_config()
        config.update({"weight_decay_rate": self.weight_decay_rate})
        return config

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if self.weight_decay_rate == 0:
            return False

        if self._include_in_weight_decay:
            for r in self._include_in_weight_decay:
                if re.search(r, param_name) is not None:
                    return True

        if self._exclude_from_weight_decay:
            for r in self._exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True


class GradientAccumulator:
    """
    Gradient accumulation utility. When used with a distribution strategy, the accumulator should be called in a
    replica context. Gradients will be accumulated locally on each replica and without synchronization. Users should
    then call `.gradients`, scale the gradients if required, and pass the result to `apply_gradients`.
    """

    # The ON_READ synchronization policy is used so that no cross-replica synchronization
    # happens on assignment; `.value()` returns the value on the current replica.

    def __init__(self):
        """Initializes the accumulator."""
        self._gradients = []
        self._accum_steps = None

    @property
    def step(self):
        """Number of accumulated steps."""
        if self._accum_steps is None:
            self._accum_steps = tf.Variable(
                tf.constant(0, dtype=tf.int64),
                trainable=False,
                synchronization=tf.VariableSynchronization.ON_READ,
                aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
            )

        return self._accum_steps.value()

    @property
    def gradients(self):
        """The accumulated gradients on the current replica."""
        if not self._gradients:
            raise ValueError("The accumulator should be called first to initialize the gradients")
        return [gradient.value() if gradient is not None else gradient for gradient in self._gradients]

    def __call__(self, gradients):
        """Accumulates `gradients` on the current replica."""
        if not self._gradients:
            _ = self.step  # Create the step variable.
            self._gradients.extend(
                [
                    tf.Variable(
                        tf.zeros_like(gradient),
                        trainable=False,
                        synchronization=tf.VariableSynchronization.ON_READ,
                        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
                    )
                    if gradient is not None
                    else gradient
                    for gradient in gradients
                ]
            )
        if len(gradients) != len(self._gradients):
            raise ValueError(f"Expected {len(self._gradients)} gradients, but got {len(gradients)}")

        for accum_gradient, gradient in zip(self._gradients, gradients):
            if accum_gradient is not None and gradient is not None:
                accum_gradient.assign_add(gradient)

        self._accum_steps.assign_add(1)

    def reset(self):
        """Resets the accumulated gradients on the current replica."""
        if not self._gradients:
            return
        self._accum_steps.assign(0)
        for gradient in self._gradients:
            if gradient is not None:
                gradient.assign(tf.zeros_like(gradient))
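

# Usage sketch for `GradientAccumulator` (illustrative only; `model`, `dataset`, `loss_fn`, `optimizer`, and
# `accumulation_steps` below are placeholders for user code, not symbols defined in this module):
#
#     accumulator = GradientAccumulator()
#     for step, (features, labels) in enumerate(dataset):
#         with tf.GradientTape() as tape:
#             loss = loss_fn(labels, model(features, training=True))
#         accumulator(tape.gradient(loss, model.trainable_variables))
#         if (step + 1) % accumulation_steps == 0:
#             optimizer.apply_gradients(zip(accumulator.gradients, model.trainable_variables))
#             accumulator.reset()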