import logging
import warnings
from copy import deepcopy
from typing import (
    Any,
    Callable,
    Collection,
    Dict,
    List,
    Mapping,
    Optional,
    overload,
    Union,
)

import torch
import torch.nn as nn
from torch import optim
from torch.distributed._shard.sharded_tensor import ShardedTensor
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

__all__: List[str] = []

logger = logging.getLogger(__name__)


class _NamedOptimizer(optim.Optimizer):
    """
    ``_NamedOptimizer`` takes a dict of parameters and exposes ``state_dict`` by parameter key.

    We replace the original numeric keys in the wrapped optimizer with
    fully qualified name (FQN) strings. Users can initialize the optim as they
    initialize a PyTorch optim; the only difference is that they also need to
    pass in the FQN of each parameter.

    Args:
        named_parameters (Mapping[str, Union[torch.Tensor, ShardedTensor]]):
            Mapping from FQN to parameter.
        optimizer_class (optim.Optimizer):
            The class of optimizer to instantiate.
        param_groups (Collection[Mapping[str, Any]]):
            `param_groups` to pass to optimizer if specified.
            The key of the inner map needs to be FQNs.
            Default: None
        module (nn.Module): the module whose parameters are updated
            by the optimizer.
        args: arguments to pass to the optimizer constructor.
        kwargs: arguments to pass to the optimizer constructor.

    Example::
        >>> # xdoctest: +SKIP("distributed")
        >>> from torch import optim
        >>> from torch.distributed.optim import _NamedOptimizer
        >>>
        >>> # Define the named optimizer.
        >>> m = Model(...)
        >>> named_optim = _NamedOptimizer(m.named_parameters(), optim.SGD)
        >>> # Forward pass + backward pass.
        >>> named_optim.step()
        >>> ...
        >>> # Calling state_dict on the named optimizer returns an FQN-keyed state_dict.
        >>> named_optim.state_dict()
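        >>> # Illustrative sketch of the result (assumed parameter names, not real output):
        >>> # the dict is keyed by parameter FQN instead of by integer index, e.g.
        >>> # {"state": {"net.weight": {...}}, "param_groups": [{"params": ["net.weight"], ...}]}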

    Warning: This API is still in development and subject to change.

    TODO: Add tutorial for _NamedOptimizer.
    TODO: Add documentation in the docstring for the public attributes
          like self.param_groups and self.named_parameters.
    """

    def __init__(
        self,
        named_parameters: Mapping[str, Union[torch.Tensor, ShardedTensor]],
        optimizer_class: optim.Optimizer,
        param_groups: Optional[Collection[Mapping[str, Any]]] = None,
        module: Optional[nn.Module] = None,
        *args,
        **kwargs,
    ) -> None:
        torch._C._log_api_usage_once("torch.distributed.optim._NamedOptimizer")
        self.param_groups: Collection[Mapping[str, Any]] = param_groups
        self._param_groups_check()
        self.named_parameters = dict(named_parameters)
        params_for_optimizer = (
            self.named_parameters.values() if param_groups is None else param_groups
        )
        self._optimizer = optimizer_class(
            params_for_optimizer,
            *args,
            **kwargs,
        )
        self.module = module
        if param_groups is None:
            self.ordered_param_keys = list(self.named_parameters.keys())
        else:
            warnings.warn(
                "Since we pass in param_groups, we will use param_groups to "
                "initialize the optimizer, not all parameters of the module."
            )
            param_to_key = {param: key for key, param in self.named_parameters.items()}
            ordered_param_keys = []
            for group in param_groups:
                for param in group["params"]:
                    if param not in param_to_key:
                        raise ValueError(
                            f"Expect param name {param} found in param group but is missing."
                        )
                    ordered_param_keys.append(param_to_key[param])
            self.ordered_param_keys = ordered_param_keys
        # Update param_groups from the wrapped optimizer.
        self.param_groups = self._optimizer.param_groups

    def _param_groups_check(self):
        if self.param_groups is not None:
            for param_group in self.param_groups:
                assert isinstance(param_group, dict), "param group must be a dict"
                assert "params" in param_group, "param group must contain key params"
                params = param_group["params"]
                if isinstance(params, torch.Tensor):
                    params = [params]
                params = list(params)
                for param in params:
                    if not isinstance(param, torch.Tensor):
                        raise TypeError(
                            "optimizer can only optimize Tensors, "
                            "but one of the params is " + torch.typename(param)
                        )
                param_group["params"] = params

    def state_dict(self) -> Dict[str, Any]:
        """
        Return the ``state_dict`` of the optimizer.

        Instead of using numbers to index parameters,
        we will use the module fully qualified name (FQN) as the key.
        """
        state_dict = self._optimizer.state_dict()
        param_groups = state_dict["param_groups"]

        ret_state = {
            self.ordered_param_keys[st_key]: state_val
            for st_key, state_val in state_dict["state"].items()
        }

        ret_groups = []
        for group in param_groups:
            param_keys = []
            for param in group["params"]:
                param_keys.append(self.ordered_param_keys[param])
            ret_group = {"params": sorted(param_keys)}
            for k, v in group.items():
                if k != "params":
                    ret_group[k] = deepcopy(v)
            ret_groups.append(ret_group)

        return self._post_state_dict({"state": ret_state, "param_groups": ret_groups})

    @overload
    def step(self, closure: None = ...) -> None:
        ...

    @overload
    def step(self, closure: Callable[[], float]) -> float:
        ...

    def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]:
        """
        Perform a single optimization step.

        This will call :meth:`torch.optim.Optimizer.step` on the wrapped
        optimizer.
        """
        return self._optimizer.step(closure=closure)

    @property
    def state(self) -> Mapping[torch.Tensor, Any]:
        return self._optimizer.state

    def load_state_dict(self, state_dict: Mapping[str, Any]) -> None:
        """
        Define the default behavior to load a state_dict for ``_NamedOptimizer``.

        Sample Code
        ```
            my_model = MyModule()
            optimizer = _NamedOptimizer(my_model.named_parameters(), Adagrad)
            ...

            optim_state_dict = optimizer.state_dict()
            ...
            ...

            optimizer.load_state_dict(optim_state_dict)
            ...
        ```
        Args:
            state_dict (Dict[str, Any]) : A ``state_dict`` to load into the optimizer.
                Note that this state dict update is performed in place.

        .. note:: PyTorch is using lazy init to initialize the optim states.
            So it is possible that there is no optim state when the user calls
            ``load_state_dict``, and for ``_NamedOptimizer`` we are stricter:
            users can only call ``load_state_dict`` after the state is initialized.
            By doing this, we can validate the optim ``state_dict`` to be loaded.
        """
        new_state_dict = self._optimizer.state_dict()
        state_dict = self._pre_load_state_dict(state_dict)
        state = state_dict["state"]
        new_state = new_state_dict["state"]
        if len(new_state) == 0:
            raise ValueError(
                "Expects the optim to be initialized before load but found not initialized."
            )

        for idx, param_key in enumerate(self.ordered_param_keys):
            # Skip parameters that are not present in the incoming state_dict.
            if param_key not in state.keys():
                continue
            if len(state[param_key]) != len(new_state[idx]):
                raise ValueError(
                    f"Expects equal length as {len(new_state[idx])} "
                    f"for parameter {param_key} but found: {len(state[param_key])}"
                )
            # Iterate through all optimizer states for this parameter.
            for state_key, state_val in new_state[idx].items():
                if state_key not in state[param_key]:
                    raise ValueError(
                        f"Expects state {state_key} for parameter {param_key} but not found."
                    )

                src_state_val = state[param_key][state_key]
                if isinstance(state_val, ShardedTensor):
                    assert isinstance(src_state_val, ShardedTensor)
                    num_shards = len(state_val.local_shards())
                    num_new_shards = len(src_state_val.local_shards())
                    if num_shards != num_new_shards:
                        raise ValueError(
                            f"Expects equal number of shards as {num_new_shards} "
                            f"but found {num_shards} for {param_key}/{state_key}"
                        )
                    for shard, src_shard in zip(
                        state_val.local_shards(), src_state_val.local_shards()
                    ):
                        shard.tensor.detach().copy_(src_shard.tensor)
                elif isinstance(state_val, torch.Tensor):
                    assert isinstance(src_state_val, torch.Tensor)
                    state_val.detach().copy_(src_state_val)
                else:
                    new_state[idx][state_key] = deepcopy(src_state_val)

        # Load the param_groups of the state_dict.
        src_param_groups = state_dict["param_groups"]
        new_param_groups = new_state_dict["param_groups"]

        src_group_map = {}
        for group in src_param_groups:
            param_keys = list(group["params"])
            src_group_map[_gen_param_group_key(param_keys)] = group
        new_group_map = {}
        for new_group in new_param_groups:
            param_keys = []
            for param_key in new_group["params"]:
                param_keys.append(self.ordered_param_keys[param_key])
            new_group_map[_gen_param_group_key(param_keys)] = new_group
        for group_key, new_group in new_group_map.items():
            # Skip groups that are not present in the incoming state_dict.
            if group_key not in src_group_map:
                continue
            src_group = src_group_map[group_key]
            if len(src_group) != len(new_group):
                raise ValueError(
                    f"Expects equal param_group size as {len(new_group)} "
                    f"for group {group_key} but found {len(src_group)}."
                )
            for k in src_group:
                if k not in new_group:
                    raise ValueError(
                        f"Expects group key {k} to be in group {group_key} "
                        f"in `state_dict` but is missing."
                    )
                if k != "params":
                    new_group[k] = deepcopy(src_group[k])

        self._optimizer.load_state_dict(new_state_dict)

    def add_param_group(self, param_group: Mapping[str, Any]) -> None:
        """
        Add a param group to the :class:`_NamedOptimizer`'s `param_groups`.

        Warning: This API is still in development and subject to change.
        """
        assert isinstance(param_group, dict), "param group must be a dict"

        params = param_group["params"]
        if isinstance(params, torch.Tensor):
            param_group["params"] = [params]
        else:
            param_group["params"] = list(params)

        param_to_key = {param: key for key, param in self.named_parameters.items()}
        for param in param_group["params"]:
            if param not in param_to_key:
                raise ValueError("some parameters are not in the module")
            self.ordered_param_keys.append(param_to_key[param])

        self._optimizer.add_param_group(param_group)
        # Update param_groups from the wrapped optimizer.
        self.param_groups = self._optimizer.param_groups

    def init_state(self) -> None:
        """
        Run a dummy optimizer step, which allows initializing optimizer state because we do lazy init for most optimizers.

        This allows doing in-place loading of optimizer state from a checkpoint.
        """
        for param in self.named_parameters.values():
            if param.requires_grad:
                t = torch.zeros_like(param)
                param.grad = torch.autograd.Variable(t)
        # Calling ``step`` will materialize the initial optimizer state for all parameters.
        self.step(closure=None)

    def _pre_load_state_dict(self, state_dict) -> Dict[str, Any]:
        if isinstance(self.module, FSDP):
            return FSDP.optim_state_dict_to_load(
                self.module, self._optimizer, state_dict, is_named_optimizer=True
            )
        return state_dict

    def _post_state_dict(self, state_dict) -> Dict[str, Any]:
        if isinstance(self.module, FSDP):
            FSDP.optim_state_dict(self.module, self._optimizer, state_dict)
        return state_dict


def _gen_param_group_key(param_keys: List[str]) -> str:
    """Concatenate all param keys as a unique identifier for one param group."""
    return "/".join(sorted(param_keys))