
    g                     8    d dl Z d dlmZmZ  G d d          ZdS )    N)AcceleratorDistributedTypec            	       X    e Zd ZdZd Zd Zddedej        j	        de
defd	Zd
 Zd ZdS )LocalSGDu  
    A helper class to support local SGD on top of Accelerator. It simply runs a given number of updates independently
    on each device, and averages model weights every K synchronization step.

    It should be used only in the multi-GPU (or multi-CPU) setup without extensions such as DeepSpeed. In particular,
    this is a simple implementation that cannot support scenarios such as model parallelism.


    Although we are not aware of the true origins of this simple approach, the idea of local SGD is quite old and goes
    back to at least:

    Zhang, J., De Sa, C., Mitliagkas, I., & Ré, C. (2016). [Parallel SGD: When does averaging help?. arXiv preprint
    arXiv:1606.07365.](https://arxiv.org/abs/1606.07365)

    We credit the term Local SGD to the following paper (but there might be earlier references we are not aware of).

    Stich, Sebastian Urban. ["Local SGD Converges Fast and Communicates Little." ICLR 2019-International Conference on
    Learning Representations. No. CONF. 2019.](https://arxiv.org/abs/1805.09767)

    c                     | j         r7| j                                        | _        | j                                         | S N)enabledmodelno_syncmodel_sync_obj	__enter__selfs    P/var/www/html/ai-engine/env/lib/python3.11/site-packages/accelerate/local_sgd.pyr   zLocalSGD.__enter__)   s<    < 	,"&*"4"4"6"6D))+++    c                 x    | j         r2|                                  | j                            |||           d S d S r   )r	   _sync_and_avg_model_paramsr   __exit__)r   typevaluetbs       r   r   zLocalSGD.__exit__0   sI    < 	:++---((ub99999	: 	:r   Tacceleratorr
   local_sgd_stepsr	   c                 <   |j         t          j        t          j        t          j        t          j        t          j        t          j        fvrt          d          |o|j         t          j        k    | _	        d| _
        | j	        r|| _        || _        || _        dS dS )a  
        Constructor.

        Args:
            model (`torch.nn.Module):
                The model whose parameters we need to average.
            accelerator (`Accelerator`):
                Accelerator object.
            local_sgd_steps (`int`):
                A number of local SGD steps (before model parameters are synchronized).
            enabled (`bool):
                Local SGD is disabled if this parameter set to `False`.
        zILocalSGD is supported only for CPUs and GPUs (no DeepSpeed or MegatronLM)r   N)distributed_typer   NO	MULTI_CPU	MULTI_GPU	MULTI_MLU
MULTI_MUSA	MULTI_NPUNotImplementedErrorr	   	num_stepsr   r
   r   )r   r   r
   r   r	   s        r   __init__zLocalSGD.__init__6   s     '%%%&%0
 
 
 &&qrrrU;#??CU#U< 	3*DDJ#2D   	3 	3r   c                     | xj         dz  c_         | j        sdS | j         | j        z  dk    r|                                  dS dS )z^
        This function makes a "step" and synchronizes model parameters if necessary.
           Nr   )r#   r	   r   r   r   s    r   stepzLocalSGD.stepT   sX     	!| 	F>D00A55++----- 65r   c                 "   | j                                          | j                                         5  | j                                        D ](}| j                             |j        d          |_        )	 ddd           dS # 1 swxY w Y   dS )zH
        Synchronize + Average model parameters across all GPUs
        mean)	reductionN)r   wait_for_everyoneautocastr
   
parametersreducedata)r   params     r   r   z#LocalSGD._sync_and_avg_model_params_   s    
 	**,,,&&(( 	S 	S..00 S S!-44UZ64RR

S	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	Ss   ABBBN)T)__name__
__module____qualname____doc__r   r   r   torchnnModuleintboolr$   r'   r    r   r   r   r      s         *  : : :3 3K 3 3Z] 3hl 3 3 3 3<	. 	. 	.S S S S Sr   r   )r5   
accelerater   r   r   r:   r   r   <module>r<      sn     3 3 3 3 3 3 3 3TS TS TS TS TS TS TS TS TS TSr   