
    קg1                        d dl Z d dlmZmZ d dlmZ d dlmZmZm	Z	m
Z
mZ d dlmZ d dlmZ g dZ G d d	e          Z G d
 de          Z G d de          Z G d de          Z G d de          Z G d de          Ze G d d                      Z G d d          Z G d de          Z G d d          Zeegef         Z G d d          Z e            ZdS )    N)ABCabstractmethod)	dataclass)AnyCallableClassVarDictOptional)Store)get_free_port)RendezvousClosedErrorRendezvousConnectionErrorRendezvousErrorRendezvousGracefulExitErrorRendezvousHandlerRendezvousHandlerCreatorRendezvousHandlerRegistryRendezvousInfoRendezvousParametersRendezvousStateErrorRendezvousStoreInfoRendezvousTimeoutErrorrendezvous_handler_registryc                       e Zd ZdZdS )r   z/Represents the base type for rendezvous errors.N__name__
__module____qualname____doc__     d/var/www/html/ai-engine/env/lib/python3.11/site-packages/torch/distributed/elastic/rendezvous/api.pyr   r   "   s        9999r!   r   c                       e Zd ZdZdS )r   z#Raised when a rendezvous is closed.Nr   r    r!   r"   r   r   &   s        ----r!   r   c                       e Zd ZdZdS )r   z2Raised when a rendezvous did not complete on time.Nr   r    r!   r"   r   r   *   s        <<<<r!   r   c                       e Zd ZdZdS )r   z>Raised when the connection to a rendezvous backend has failed.Nr   r    r!   r"   r   r   .   s        HHHHr!   r   c                       e Zd ZdZdS )r   z1Raised when the state of a rendezvous is corrupt.Nr   r    r!   r"   r   r   2   s        ;;;;r!   r   c                       e Zd ZdZdS )r   zRaised when node wasn't not included in rendezvous and gracefully exits.

    Exception is a mechanism to exit the stack, however does not mean a failure.
    Nr   r    r!   r"   r   r   6   s           r!   r   c            	           e Zd ZU dZdZee         ed<   dZee         ed<   eed<   e	ed<   e
de	d	ed
ee         dd fd            ZdS )r   zNStore address and port that can be used to bootstrap trainer distributed commsMASTER_ADDRMASTER_ADDR_KEYMASTER_PORTMASTER_PORT_KEYmaster_addrmaster_portrankstore
local_addrreturnc                 F   | dk    r|pt          j                    }t                      }|                    t          j        |                    d                     |                    t          j        t          |                              d                     |	                    t          j                  
                    d          }t          |	                    t          j                  
                    d                    }t	          ||          S )a  Factory method, finds unused new port on rank0 host and addr/port info with all ranks.

        If master_addr/master_port is knowns (useful when sharing existing tcp store server) use the constructor.

        Args:
            rank: rank of the current node
            store: store to use for rendezvous
            local_addr: address of the current node, if not provided will be resolved from hostname
        r   zUTF-8)encoding)r-   r.   )socketgetfqdn_get_free_portsetr   r*   encoder,   strgetdecodeint)r/   r0   r1   addrports        r"   buildzRendezvousStoreInfo.buildF   s     1991!1!1D!##DII)94;;PW;;X;XYYYII)93t99;K;KU\;K;];]^^^yy,<==DDgDVVII)9::AA7ASS
 
 #tFFFFr!   N)r   r   r   r   r*   r   r:   __annotations__r,   r=   staticmethodr   r
   r@   r    r!   r"   r   r   =   s         XX%2OXc]222%2OXc]222GGG-5c]G	G G G \G G Gr!   r   c                       e Zd ZdZdedededefdZedefd            Z	edefd	            Z
edefd
            Zedee         fd            ZdS )r   z+Holds the information about the rendezvous.r0   r/   
world_sizebootstrap_store_infoc                 >    || _         || _        || _        || _        d S N)_store_rank_world_size_bootstrap_store_info)selfr0   r/   rD   rE   s        r"   __init__zRendezvousInfo.__init__d   s(     
%%9"""r!   r2   c                     | j         S )z(Store used by torchelastic control plane)rH   rL   s    r"   r0   zRendezvousInfo.storep   s     {r!   c                     | j         S )zRank within a group)rI   rO   s    r"   r/   zRendezvousInfo.ranku   s     zr!   c                     | j         S )zGlobal group size)rJ   rO   s    r"   rD   zRendezvousInfo.world_sizez   s     r!   c                     | j         S )zOStore information that can used by trainer code to bootstrap distributed comms.)rK   rO   s    r"   rE   z#RendezvousInfo.bootstrap_store_info   s     ))r!   N)r   r   r   r   r   r=   r   rM   propertyr0   r/   rD   r
   rE   r    r!   r"   r   r   a   s        55
:
: 
: 	
:
 2
: 
: 
: 
: u    X c    X  C       X  *h/B&C * * * X* * *r!   r   c                       e Zd ZdZedefd            Zedefd            Z	ede
fd            Zedefd            Zed             Zedefd            Zedefd	            Zedefd
            ZdS )r   zMain rendezvous interface.

    Note:
        Distributed Torch users normally **do not** need to implement their own
        ``RendezvousHandler``. An implementation based on C10d Store is already
        provided, and is recommended for most users.
    r2   c                     dS )z*Return the name of the rendezvous backend.Nr    rO   s    r"   get_backendzRendezvousHandler.get_backend         r!   c                     dS )a  Indicates that store reference returned by :py:meth:`next_rendezvous` can be shared with user
        applications and will be available during application lifecyle.

        Rendezous handler impl will share store details as instance of :py:class:`RendezvousStoreInfo`.
        Applications as a convention use `MASTER_ADDR`/`MASTER_PORT` env variables to lookup the store.
        Fr    rO   s    r"   use_agent_storez!RendezvousHandler.use_agent_store   s	     ur!   c                     dS )a  Main entry-point into the rendezvous barrier.

        Blocks until the rendezvous is complete and the current process is
        included in the formed worker group, or a timeout occurs, or the
        rendezvous was marked closed.

        Returns:
            Instance of :py:class:`RendezvousInfo`.

        Raises:
            RendezvousClosedError:
                The rendezvous is closed.
            RendezvousConnectionError:
                The connection to the rendezvous backend has failed.
            RendezvousStateError:
                The rendezvous state is corrupt.
            RendezvousTimeoutError:
                The rendezvous did not complete on time.
        Nr    rO   s    r"   next_rendezvousz!RendezvousHandler.next_rendezvous   rW   r!   c                     dS )a  Check whether the rendezvous has been closed.

        A closed rendezvous means all future attempts to re-rendezvous within
        same job will fail.

        ``is_closed()`` and :py:meth:`set_closed` have semantics of eventual
        propagation and should not be used for synchronization. The intention is
        that if at least one node decides the job is finished, it will close the
        rendezvous, and other nodes will soon observe this and stop running as
        well.
        Nr    rO   s    r"   	is_closedzRendezvousHandler.is_closed   rW   r!   c                     dS )zMark the rendezvous as closed.Nr    rO   s    r"   
set_closedzRendezvousHandler.set_closed   rW   r!   c                     dS )aW  Return the number of nodes who arrived late at the rendezvous
        barrier, hence were not included in the current worker group.

        Callers should periodically call this method to check whether new
        nodes are waiting to join the job and if so admit them by calling
        :py:meth:`next_rendezvous()` (re-rendezvous).
        Nr    rO   s    r"   num_nodes_waitingz#RendezvousHandler.num_nodes_waiting   rW   r!   c                     dS )a  Return the run id of the rendezvous.

        The run id is a user-defined id that uniquely identifies an instance of
        a distributed application. It typically maps to a job id and is used to
        allow nodes to join the correct distributed application.
        Nr    rO   s    r"   
get_run_idzRendezvousHandler.get_run_id   rW   r!   c                     dS )a	  Close all resources that were open for the rendezvous.

        Example::

            rdzv_handler = ...
            try:
                store, rank, world_size = rdzv_handler.next_rendezvous()
            finally:
                rdzv_handler.shutdown()
        Nr    rO   s    r"   shutdownzRendezvousHandler.shutdown   rW   r!   N)r   r   r   r   r   r:   rV   rS   boolrY   r   r[   r]   r_   r=   ra   rc   re   r    r!   r"   r   r      sJ         9S 9 9 9 ^9     X     ^* 4    ^ - - ^- 3    ^ C    ^ 
$ 
 
 
 ^
 
 
r!   r   c                       e Zd ZdZ	 ddedededededee         fd	Zdd
ededefdZ	dd
edee
         dee
         fdZdd
edee         dee         fdZdS )r   a|  Hold the parameters to construct a :py:class:`RendezvousHandler`.

    Args:
        backend:
            The name of the backend to use to handle the rendezvous.
        endpoint:
            The endpoint of the rendezvous, usually in form <hostname>[:<port>].
        run_id:
            The id of the rendezvous.
        min_nodes:
            The minimum number of nodes to admit to the rendezvous.
        max_nodes:
            The maximum number of nodes to admit to the rendezvous.
        local_addr:
            The address of the local node.
        **kwargs:
            Additional parameters for the specified backend.
    Nbackendendpointrun_id	min_nodes	max_nodesr1   c                     |st          d          |dk     rt          d| d          ||k     rt          d| d| d          || _        || _        || _        || _        || _        || _        || _        d S )N7The rendezvous backend name must be a non-empty string.   z(The minimum number of rendezvous nodes (z) must be greater than zero.z(The maximum number of rendezvous nodes (zK) must be greater than or equal to the minimum number of rendezvous nodes (z).)
ValueErrorrh   ri   rj   rk   rl   configr1   )rL   rh   ri   rj   rk   rl   r1   kwargss           r"   rM   zRendezvousParameters.__init__   s      	XVWWWq==b9bbb   y  R9 R RDMR R R  
  ""$r!   keydefaultr2   c                 8    | j                             ||          S )zAReturn the value for ``key`` if ``key`` exists, else ``default``.)rq   r;   )rL   rs   rt   s      r"   r;   zRendezvousParameters.get  s    {sG,,,r!   c                 \   |                      ||          }|t          |t                    r|S t          |t                    r|dk    rdS |dk    rdS nEt          |t                    r0|                                dv rdS |                                dv rdS t          d| d	          )
z+Return the value for ``key`` as a ``bool``.Nro   Tr   F)1truetyesy)0falsefnon%The rendezvous configuration option 'z+' does not represent a valid boolean value.)r;   
isinstancerf   r=   r:   lowerrp   )rL   rs   rt   values       r"   get_as_boolz RendezvousParameters.get_as_bool  s    g&&=Jud33=LeS!! 		zztzzu s## 	{{}} >>>t{{}} >>>udCddd
 
 	
r!   c                     |                      ||          }||S 	 t          |          S # t          $ r}t          d| d          |d}~ww xY w)z+Return the value for ``key`` as an ``int``.Nr   z+' does not represent a valid integer value.)r;   r=   rp   )rL   rs   rt   r   es        r"   
get_as_intzRendezvousParameters.get_as_int/  s|    g&&=L	u:: 	 	 	     	s   + 
AA		ArG   )r   r   r   r   r:   r=   r
   rM   r   r;   rf   r   r   r    r!   r"   r   r      s        4 %)% %% % 	%
 % % SM% % % %>- -s -S -C - - - -
 
s 
Xd^ 
xPT~ 
 
 
 
& c HSM Xc]      r!   r   c                   Z    e Zd ZU dZeeef         ed<   ddZdededdfdZ	d	e
defd
ZdS )r   z?Represent a registry of :py:class:`RendezvousHandler` backends.	_registryr2   Nc                     i | _         d S rG   )r   rO   s    r"   rM   z"RendezvousHandlerRegistry.__init__E  s    r!   rh   creatorc           	          |st          d          	 | j        |         }n# t          $ r d}Y nw xY w|||k    rt          d| d| d| d          || j        |<   dS )zRegister a new rendezvous backend.

        Args:
            backend:
                The name of the backend.
            creator:
                The callback to invoke to construct the
                :py:class:`RendezvousHandler`.
        rn   NThe rendezvous backend 'z' cannot be registered with 'z$' as it is already registered with ''.)rp   r   KeyError)rL   rh   r   current_creators       r"   registerz"RendezvousHandlerRegistry.registerH  s      	XVWWW	#"nW5OO 	# 	# 	#"OOO	# &?g+E+EC7 C CQX C C/>C C C  
 #*ws   ! 00paramsc                 H   	 | j         |j                 }n8# t          $ r+}t          d|j         d| j        j         d          |d}~ww xY w ||          }|                                |j        k    r-t          d|                                 d|j         d          |S )z+Create a new :py:class:`RendezvousHandler`.r   z-' is not registered. Did you forget to call `z`?Nz(' does not match the requested backend 'r   )r   rh   r   rp   r   r   rV   RuntimeError)rL   r   r   r   handlers        r"   create_handlerz(RendezvousHandlerRegistry.create_handlerc  s    	nV^4GG 	 	 	76> 7 7 M27 7 7  	 '&//   FN22/7+>+>+@+@ / /"N/ / /  
 s    
A
&AA
)r2   N)r   r   r   r   r	   r:   r   rA   rM   r   r   r   r   r    r!   r"   r   r   @  s         IIC112222   * *.F *4 * * * *6%9 >O      r!   r   ) r5   abcr   r   dataclassesr   typingr   r   r   r	   r
   torch.distributedr   +torch.distributed.elastic.utils.distributedr   r7   __all__	Exceptionr   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   <module>r      s    # # # # # # # # ! ! ! ! ! ! : : : : : : : : : : : : : : # # # # # # W W W W W W  ": : : : :i : : :. . . . .O . . .= = = = =_ = = =I I I I I I I I< < < < <? < < <    /     G  G  G  G  G  G  G  GF!* !* !* !* !* !* !* !*H] ] ] ] ] ] ] ]@U U U U U U U Up $%9$:<M$MN 6 6 6 6 6 6 6 6v 8799   r!   