
    קgX3                        d dl Z d dlmZmZmZmZmZmZmZ d dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZm Z m!Z!mZm"Z" d dl#m$Z$m%Z% d dl&m'Z'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8 ee9eeee:                  ee:         f         f         Z;dgZ<d*de:de9de9fdZ=	 d+deej>                 defdZ?de	j@        deAfdZB	 d*dedee:         de9de	j@        fd ZCd!e!dee;eej>                 f         fd"ZD G d# d$e          ZE	 d+d%e!d&e9d'e,d(ee%         de!f
d)ZFdS ),    N)castDictListOptionalSequenceTupleUnion)_get_device_module)ShardedTensor)TensorProperties)Shard)ChunkShardingSpec)unflatten_state_dict)DefaultLoadPlanner)BytesStorageMetadataChunkStorageMetadataMetadataMetadataIndexSTATE_DICT_TYPEr   TensorStorageMetadata)LoadPlanLoadPlanner)_create_read_items create_read_items_for_chunk_list)load_state_dict)StorageReader)_element_wise_add_element_wise_sub_normalize_device_info)_get_default_group)_create_chunk_sharded_tensor)_remote_device)DTensor!load_sharded_optimizer_state_dictcudaglobal_rankdevice_typereturnc                     |dk    rdS t          |          }|                                r%t          || |                                z            S dS )Ncpu)r
   is_availabler   device_count)r&   r'   device_modules      b/var/www/html/ai-engine/env/lib/python3.11/site-packages/torch/distributed/checkpoint/optimizer.py_gen_rank_devicer/   5   sb    eu&{33M!!## 
%}'A'A'C'CC
 
 	
 5    pgc           	          t           j                                       j         -fdt	          t          j                              D             }n. fdt	                                                     D             }t          dt          t          t          t          t          f                  |                    S )Nc           	      <    g | ]}d | dt          |           S rank:/)r/   ).0idxpg_device_types     r.   
<listcomp>z(_create_colwise_spec.<locals>.<listcomp>E   sE     
 
 
 BCAA*3??AA
 
 
r0   c                 b    g | ]+}d | dt          t          j        |                     ,S r4   )r/   distget_global_rank)r7   r8   r1   r9   s     r.   r:   z(_create_colwise_spec.<locals>.<listcomp>J   sR     
 
 
 \C[[*4+?C+H+H.YY[[
 
 
r0   r   dim
placements)r<   distributed_c10d_get_pg_default_devicetyperangeget_world_sizesizer   r   r   r	   r"   str)r1   r@   r9   s   ` @r.   _create_colwise_specrH   @   s     *AA"EEJN	z
 
 
 
T02233
 
 




 
 
 
 
RWWYY''
 
 

 U>3#678*EE   r0   valc                 &   t          |           t          u rt          |                                           dk    rdS t          |                                 d         j                  t          u rdS t          |                                 d         j                  t
          u rt          d          n[t          |           t
          u rEt          | j                  t
          u st          | j                  t          u rt          d          dS )Nr   FTz2Cannot handle DTensor nested insided ShardedTensorzCannot handle nested DTensor)rC   r   lenlocal_shardstensorr#   
ValueError_local_tensor)rI   s    r.   _is_nested_tensorrP   T   s    CyyM!!s!!""a''5  ""1%,-->>4  ""1%,--88QRRR 9	cg		S7**d33D.E.E.V.V78885r0   propsrF   c                 F   |dk    r:t          t          j        t          |                                                    }n4t          j        |t          |                                                    }t          j        || j        | j        | j        | j	        |          S )Nr*   )rF   dtypelayoutrequires_grad
pin_memorydevice)
r   torchrW   r
   current_deviceemptyrS   rT   rU   rV   )rQ   rF   r'   rW   s       r.   _alloc_tensorr[   c   s     eel$6{$C$C$R$R$T$TUU+K88GGII
 
 ;k|)#   r0   
state_dictc                    i }d}|                                  D ]\  }}d|                                f||<   t          |          rt          |                                          dk    s
J d            t          |t                    s
J d            |                                d         }|j        j        |j        j	        f||<   |j
        j        }||fS )a+  
    Load the right TP slice of the optimizer state.

    This is not easy since the per-tensor slicing can't be inferred from checkpoint metadata.
    We take advantage of the model state_dict producing a sliced ST to figure out what we need to load.
    This is pretty fragile and it might be easier for FSDP to compute this info for us.
    Returns a dictionary where keys are the same of the state_dict and the value is a tuple of
    (offset, size) for the current rank TP slice.
    N.B. The state_dict *MUST* come from FSDP.sharded_state_dict.
    N   z%Cannot handle ST with multiple shardsz$Can only handle nested ShardedTensorr   )itemsrF   rP   rK   rL   
isinstancer   metadatashard_offsetsshard_sizesrM   _process_group)r\   specsdp_pgkeyvalueshards         r.   _get_state_dict_2d_layoutrj   w   s    #%E)-E &&(( 0 0
UEJJLL)c
U## 	0E&&(())Q...6 /..}  6 656 6 6 &&((+E,*E#J L/E 	 r0   c                        e Zd ZU eeef         ed<   eed<   eed<   deee	e
         f         ddf fdZdefdZd	edej        f fd
Z xZS )_ReaderWithOffsettranslationr\   ra   fqn_to_offsetr(   Nc                     t                                                       || _        t          i           | _        i | _        i | _        d S N)super__init__rn   r   ra   r\   rm   )selfrn   	__class__s     r.   rr   z_ReaderWithOffset.__init__   sC    * r0   c           	         g }i | _         | j                                        D ]\  }}| j        j        |         }t          |t                    s|t          |||          z  }B|| j        vr|t          |||          z  }`| j        |         }t          |
                                          dk    sJ |
                                d         }t          t          j        t          |j        j        |                    t          j        |j        j                            g}t#          |t%          t&          |          |          }|D ]f}	|	j        j        J t-          |	j        j        |          }
t/          j        |	j        t          j        |
                    }|| j         |	j        <   g||z  }t3          |          S )Nr^   r   )offsetssizes)offset)rm   r\   r_   ra   state_dict_metadatar`   r   r   rn   rK   rL   r   rX   Sizer   rb   rc   r   r   r   
dest_indexrx   r   dataclassesreplacer   )rs   requestsfqnobjmdrx   original_shardlocal_chunksreqsrioriginal_offsetoriginal_indexs               r.   create_local_planz#_ReaderWithOffset.create_local_plan   s   --// $	 $	HC237Bc=11 .sB<<<$,,,.sB<<<',Fs''))**a//// --//2N$!J).*A*OQWXX   *^%<%HII	  L 4T/44l D
  A A}+777"3BM4H&"Q"Q!,!4M%*_*E*E" " " 3A //HH!!!r0   indexc                 x    t                                          | j                            ||                    S rp   )rq   lookup_tensorrm   get)rs   r   rt   s     r.   r   z_ReaderWithOffset.lookup_tensor   s.    ww$$T%5%9%9%%G%GHHHr0   )__name__
__module____qualname__r   r   __annotations__r   r   rG   r   intrr   r   r   rX   Tensorr   __classcell__)rt   s   @r.   rl   rl      s         m]23333d3+=&> 4      ("8 (" (" (" ("TI= IU\ I I I I I I I I I Ir0   rl   model_state_dictoptimizer_keystorage_readerplannerc                 R   |                                 }t          |           \  }}t          j                            |          j        }t          |          }|wg }	t          t          j                              D ]B}
t          ||
|
                                z            }|	                    d|
 d|            Ct          d|	          }nt          |          }i }i }|j                                        D ]n\  }}|j        |         }|d         |k    r t#          |t$                    rd||<   ;|j                                        dk    rt+          |j        |j        |          ||<   w|qt/          t+          |j        |j        |          t          j                    t          j                    |
                                t3                                ||<   |d	         }|                    |d|j        f          d         }t7          |j        j        |j        j        |j        j        |j        j        |j        j         
          }|!                    tE          j#        |          |          }g }t          j        |          }|j$        D ]p}tK          tL          |j'                  (                                |k    r3|                    tS          t+          |j        |j*        |          |                     qtW          j,        |||          }||v r=||         d         /tK          tZ          t\                   ||         d                   ||<   |||<   pt_          |||ta          |          n|           tc          ||j                  }|S )a  
    Load a state_dict in conjunction with FSDP sharded optimizer state.

    This is the current recommended way to checkpoint FSDP.
    >>> # xdoctest: +SKIP
    >>> import torch.distributed.checkpoint as dist_cp
    >>> # Save
    >>> model: torch.nn.Model
    >>> optim_params = model.parameters()
    >>> optim = torch.optim.SGD(optim_params, lr=0.01)
    >>> # Save
    >>> with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
    >>>     state_dict = {
    >>>         "optimizer": FSDP.optim_state_dict(model, optim),
    >>>         "model": model.state_dict()
    >>>     }
    >>>     dist_cp.save_state_dict(
    >>>         state_dict=optim_state,
    >>>         storage_writer=dist_cp.FileSystemWriter("checkpoint"),
    >>>         planner=dist_cp.DefaultSavePlanner(),
    >>>     )
    >>>
    >>> # Load
    >>> with FSDP.state_dict_type(model_tp, StateDictType.SHARDED_STATE_DICT):
    >>>     model_state_dict = model_tp.state_dict()
    >>>     checkpoint = {
    >>>         "model": model_state_dict
    >>>     }
    >>>     dist_cp.load_state_dict(
    >>>         state_dict=checkpoint,
    >>>         storage_reader=dist_cp.FileSystemReader(checkpoint_file),
    >>>         planner=dist_cp.DefaultLoadPlanner(),
    >>>     )
    >>>     model.load_state_dict(checkpoint["model_state"])
    >>>
    >>>     optim_state = dist_cp.load_sharded_optimizer_state_dict(
    >>>         model_state_dict,
    >>>         optimizer_key="optimizer",
    >>>         storage_reader=dist_cp.FileSystemReader("checkpoint"),
    >>>     )
    >>>
    >>>     flattened_osd = FSDP.optim_state_dict_to_load(
    >>>        model, optim, optim_state["optimizer"]
    >>>     )
    >>>
    >>>     optim.load_state_dict(flattened_osd)
    Nr5   r6   r   r>   z
<bytes_io>r^   )rank
world_sizenum_devices_per_noder1      )rS   rT   rU   memory_formatrV   )rM   ra   )process_group)r\   r   r   )2read_metadatarj   r<   rA   rB   rC   r
   rD   rE   r   r,   appendr   rH   ry   r_   planner_datar`   r   rF   numelr[   
propertiesr!   get_rankr    r   ShardTensorPropertiesrS   rT   rU   r   rV   build_metadatarX   rz   shards_metadatar   r"   	placementr   r   rc   r   +_init_from_local_shards_and_global_metadatar   r   r   rl   r   )r   r   r   r   ra   layout_specsrf   dp_pg_device_typer-   r@   idevice_infosharding_specr\   rn   rg   rh   key_pathspec_key
alloc_sizer   st_mdrL   current_rankshard_mdsts                             r.   r$   r$      s   j ++--H34DEEL%-DDUKKP&'899M}
t*,,-- 	9 	9A0!1}'A'A'C'C#C K 7a77+778888)aJGGG,U33 #%J.0M288:: 8! 8!
U(-A;-''e122 	*JsO :""+ %*.? JsOO ]:e.
<MNN]__.00%2%?%?%A%A%''  JsOO  {H%))(T5:4FGGJJ.&,'.#.<#.< +6  J "00J1G1GTTEL=//L!1 
 
(:;;@@BBlRR##,!,h.BDU    "*	      Je5  B <''L,B1,E,Q%)(3-h9OPQ9R%S%Sc" JsOO %494E!-0007	    &j(2GHHJr0   )r%   rp   )Gr|   typingr   r   r   r   r   r   r	   rX   torch.distributeddistributedr<   torch._utilsr
   +torch.distributed._shard.sharded_tensor.apir   0torch.distributed._shard.sharded_tensor.metadatar   r   -torch.distributed._shard.sharded_tensor.shardr   :torch.distributed._shard.sharding_spec.chunk_sharding_specr   )torch.distributed.checkpoint._nested_dictr   ,torch.distributed.checkpoint.default_plannerr   %torch.distributed.checkpoint.metadatar   r   r   r   r   r   $torch.distributed.checkpoint.plannerr   r   ,torch.distributed.checkpoint.planner_helpersr   r   .torch.distributed.checkpoint.state_dict_loaderr   $torch.distributed.checkpoint.storager   "torch.distributed.checkpoint.utilsr   r   r   "torch.distributed.distributed_c10dr    #torch.distributed.fsdp._shard_utilsr!   torch.distributed.remote_devicer"   torch.distributed.tensorr#   rG   r   STATE_DICT_2D_LAYOUT__all__r/   ProcessGrouprH   r   boolrP   r[   rj   rl   r$    r0   r.   <module>r      s       E E E E E E E E E E E E E E E E E E              + + + + + + E E E E E E      @ ? ? ? ? ? X X X X X X J J J J J J K K K K K K                  G F F F F F F F        K J J J J J > > > > > >         
 B A A A A A L L L L L L : : : : : : , , , , , , Cx'>'M!NNO 
 (
 # C S     '+ "#   (5< D      FL #+C=?B
\   (""
$*;!<<=" " " "J7I 7I 7I 7I 7I* 7I 7I 7I| &*	N N%NN "N k"	N
 N N N N N Nr0   