
    קgك                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dl mZmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZ d dl mZ! d d	l"m#Z#m$Z$ d d
l%m&Z&m'Z' d dl(m)Z)m*Z* d dl+m,Z, ej-        dk    Z.ej-        dk    Z/ ej0        e1          Z2g dZ3 G d de4          Z5de6dee         ddfdZ7dej8        fdZ9dej8        fdZ:dee6ef         de6de;fdZ<dZ=dZ> G d de          Z?d ee?ee6e?f         f         d!e6dee6e?f         fd"Z@e G d# d$                      ZA G d% d&e          ZB G d' d(eB          ZCe G d) d*                      ZD G d+ d,e j                  ZEd-e;fd.ZFd/e6d0ed1ee6ef         d2ee6ee;e;f         f         d3ee6e;f         d4ee6e;f         d5ee6e!jG        f         d6ejH        ddfd7ZI G d8 d9eE          ZJ G d: d;eE          ZKdS )<    N)ABCabstractmethod)nullcontext)	dataclassfield)IntFlag)synchronize)	FrameType)AnyCallableDictOptionalSetTupleUnion)ProcessFailurerecord)redirect_stderrredirect_stdout)get_subprocess_handlerSubprocessHandler)TailLogwin32darwin)DefaultLogsSpecsSignalExceptionStdto_mapRunProcsResultPContext
get_std_cmMultiprocessContextSubprocessContextLogsDest	LogsSpecsc                   :     e Zd ZdZdedej        ddf fdZ xZS )r   z
    Exception is raised inside the torchelastic agent process by the termination handler
    if the death signal got received by the process.
    msgsigvalreturnNc                 X    t                                          |           || _        d S N)super__init__r(   )selfr'   r(   	__class__s      i/var/www/html/ai-engine/env/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/api.pyr-   zSignalException.__init__E   s&        )	__name__
__module____qualname____doc__strsignalSignalsr-   __classcell__r/   s   @r0   r   r   ?   s^         
C  D          r1   r   signumframer)   c                 z    t          j        |           }t          dt          j                     d| |          )a  Termination handler that raises exceptions on the main process.

    When the process receives death signal(SIGTERM, SIGINT), this termination handler will
    be invoked. It raises the ``SignalException`` exception that should be processed by the
    user code. Python does not terminate process after the termination handler is finished,
    so the exception should not be silently ignored, otherwise the process will never
    be terminated.
    zProcess z got signal: )r(   )r7   r8   r   osgetpid)r;   r<   r(   s      r0   _terminate_process_handlerr@   J   s>     ^F##F
GRY[[GGvGGPV
W
W
WWr1   c                  @    t           rt          j        S t          j        S )z@Get the kill signal. SIGKILL for unix, CTRL_C_EVENT for windows.)
IS_WINDOWSr7   CTRL_C_EVENTSIGKILL r1   r0   _get_kill_signalrF   W        ""~r1   c                  @    t           rt          j        S t          j        S )zOGet the default termination signal. SIGTERM for unix, CTRL_C_EVENT for windows.)rB   r7   rC   SIGTERMrE   r1   r0   _get_default_signalrJ   _   rG   r1   dnprocswhatc                     t          |                                           }t          t          |                    }||k    rt          | d| d|           d S )Nz), local rank mapping mismatch, expected: z
, actual: )setkeysrangeRuntimeError)rK   rL   rM   actual_keysexpected_keyss        r0   _validate_full_rankrU   g   sz    affhh--Kf&&Mm## A A'A A3>A A
 
 	
 $#r1   z^(\d:[0123],)*(\d:[0123])$z^[0123]$c            	       d    e Zd ZdZdZdZeez  Zedede	d e
ed f         f         fd            ZdS )r   r         vmr)   c                    dt           dt          fd}t          j        t          |          r ||          S t          j        t
          |          rOi }|                    d          D ]5}|                    d          \  }} ||          |t          |          <   6|S t          | dt           dt
           d          )	z
        Example:
        ::

         from_str("0") -> Std.NONE
         from_str("1") -> Std.OUT
         from_str("0:3,1:0,2:1,3:2") -> {0: Std.ALL, 1: Std.NONE, 2: Std.OUT, 3: Std.ERR}

        Any other input raises an exception
        vr)   c                 T    t          t          |                     }|t           v r|S d S r+   )r   int)r[   ss     r0   to_stdzStd.from_str.<locals>.to_std   s(    CFFACxx xr1   ,:z does not match: <z> or <>)	r6   r   rematch_VALUE_REGEX_MAPPING_REGEXsplitr]   
ValueError)clsrY   r_   rK   mir[   s          r0   from_strzStd.from_str|   s    	c 	c 	 	 	 	 8L"%% 	6"::Xnb)) 		 "AXXc]] & &wws||1"F1II#a&&		HNNNN^NNN  r1   N)r2   r3   r4   NONEOUTERRALLclassmethodr6   r   r   r]   rl   rE   r1   r0   r   r   v   sk        D
C
C
)C# %tCJ/?(?"@    [  r1   r   
val_or_maplocal_world_sizec                     t          | t                    r(t                              t	          |          |           S i }t	          |          D ]%}|                     |t          j                  ||<   &|S )a   
    Certain APIs take redirect settings either as a single value (e.g. apply to all
    local ranks) or as an explicit user-provided mapping. This method is a convenience
    method that converts a value or mapping into a mapping.

    Example:
    ::

     to_map(Std.OUT, local_world_size=2) # returns: {0: Std.OUT, 1: Std.OUT}
     to_map({1: Std.OUT}, local_world_size=2) # returns: {0: Std.NONE, 1: Std.OUT}
     to_map({0: Std.OUT, 1: Std.OUT}, local_world_size=2) # returns: {0: Std.OUT, 1: Std.OUT}
    )
isinstancer   dictfromkeysrQ   getrm   )rr   rs   maprk   s       r0   r   r      sp     *c"" }}U#344jAAA'(( 	1 	1A^^Asx00CFF
r1   c                      e Zd ZU dZ ee          Zeee	f         e
d<    ee          Zeee	f         e
d<    ee          Zeee	f         e
d<    ee          Zeee	f         e
d<    ee          Zeee	f         e
d<   dS )	r$   zK
    For each log type, holds mapping of local rank ids to file paths.
    default_factorystdoutsstderrstee_stdoutstee_stderrserror_filesN)r2   r3   r4   r5   r   rv   r}   r   r]   r6   __annotations__r~   r   r   r   rE   r1   r0   r$   r$      s           $eD999GT#s(^999#eD999GT#s(^999"'%"="="=Kc3h==="'%"="="=Kc3h==="'%"="="=Kc3h=====r1   r$   c                      e Zd ZdZdej        ej        dfdee         deee	e
ef         f         deee	e
ef         f         deee
                  ddf
dZed	e	e
e	eef         f         defd
            Zeedefd                        ZdS )r%   a8  
    Defines logs processing and redirection for each worker process.

    Args:
        log_dir:
            Base directory where logs will be written.
        redirects:
            Streams to redirect to files. Pass a single ``Std``
            enum to redirect for all workers, or a mapping keyed
            by local_rank to selectively redirect.
        tee:
            Streams to duplicate to stdout/stderr.
            Pass a single ``Std`` enum to duplicate streams for all workers,
            or a mapping keyed by local_rank to selectively duplicate.
    Nlog_dir	redirectsteelocal_ranks_filterr)   c                 >    || _         || _        || _        || _        d S r+   _root_log_dir
_redirects_tee_local_ranks_filter)r.   r   r   r   r   s        r0   r-   zLogsSpecs.__init__   s(     %#	#5   r1   envsc                     dS )aS  
        Given the environment variables, builds destination of log files for each of the local ranks.

        Envs parameter contains env variables dict for each of the local ranks, where entries are defined in:
        :func:`~torchelastic.distributed.elastic.agent.server.local_elastic_agent.LocalElasticAgent._start_workers`.
        NrE   )r.   r   s     r0   reifyzLogsSpecs.reify   s      r1   c                     d S r+   rE   r.   s    r0   root_log_dirzLogsSpecs.root_log_dir   s	     	r1   )r2   r3   r4   r5   r   rm   r   r6   r   r   r]   r   r-   r   r$   r   propertyr   rE   r1   r0   r%   r%      s        $ "&03*-(15
6 
6#
6 d38n,-
6 3S#X&'	
6
 %SX.
6 

6 
6 
6 
6 	3S#X&'	 
	 	 	 ^	 c    ^ X  r1   r%   c                   >    e Zd ZdZdej        ej        dfdee         deee	e
ef         f         deee	e
ef         f         deee
                  ddf
 fdZedefd	            Zdee         d
efdZde	e
e	eef         f         defdZdefdZdedefdZ xZS )r   z
    Default LogsSpecs implementation:

    - `log_dir` will be created if it doesn't exist
    - Generates nested folders for each attempt and rank.
    Nr   r   r   r   r)   c                 ~   |t           j        k    r|st          j        d          }nht           j                            |          st          j        |d           n2t           j                            |          rt          d| d          t                      
                    ||||           d | _        d S )Ntorchelastic_prefixTexist_okz	log_dir: z
 is a file)r>   devnulltempfilemkdtemppathexistsmakedirsisfileNotADirectoryErrorr,   r-   _run_log_dir)r.   r   r   r   r   r/   s        r0   r-   zDefaultLogsSpecs.__init__   s     bj   N"*/BBBW^^G,, NGd333337>>'** N,-L-L-L-LMMM)S2DEEE r1   c                 *    t          | j                  S r+   )r6   r   r   s    r0   r   zDefaultLogsSpecs.root_log_dir  s    4%&&&r1   rdzv_run_idc                     |pt          j        d          }t          j        |d           t          j        | d|          }t                              d|           |S )Nr   r   Tr   _)r   dirzlog directory set to: %s)r   r   r>   r   loggerinfo)r.   r   r   base_log_dirr   s        r0   _make_log_dirzDefaultLogsSpecs._make_log_dir  sg    J("2/"J"J"J
L40000&7&7&7\JJJ.444
r1   r   c                 6   t          |          }i }|dk    r	|d         }nt                              d           |                    dd          }|                    dd          }d}| j        t
          j        k    ry| j        s |                     | j        |          | _        t
          j	        
                    | j        d|           }t          j        |d	
           t          j        |           | j        t
          j        k    rt
          j        }t          | j        |          }t          | j        |          }|                                D ]\  }	}
||	         }||
z  ||	<   d}t$                              t)          |          |          }t$                              t)          |          |          }i }i }i }t)          |          D ]3}	|t
          j        k    r9t
          j        ||	<   t
          j        ||	<   t
          j        ||	<   d||	         d<   Lt
          j	        
                    |t+          |	                    }t          j        |           ||	         }|t.          j        z  t.          j        k    r#t
          j	        
                    |d          ||	<   |t.          j        z  t.          j        k    r#t
          j	        
                    |d          ||	<   ||	         }|t.          j        z  t.          j        k    r||	         ||	<   |t.          j        z  t.          j        k    r||	         ||	<   | j        rs|	| j        vrj|	|v r|                    |	d           |	|v r|                    |	d           ||	         |k    rt
          j        ||	<   ||	         |k    rt
          j        ||	<   t
          j	        
                    |d          }|||	<   t                              d|	|           |||	         d<   5t;          |||||          S )a   
        Uses following scheme to build log destination paths:

        - `<log_dir>/<rdzv_run_id>/attempt_<attempt>/<rank>/stdout.log`
        - `<log_dir>/<rdzv_run_id>/attempt_<attempt>/<rank>/stderr.log`
        - `<log_dir>/<rdzv_run_id>/attempt_<attempt>/<rank>/error.json`
        r   z;Empty envs map provided when defining logging destinations.TORCHELASTIC_RUN_IDtest_run_idTORCHELASTIC_RESTART_COUNT0 attempt_T)ignore_errorsTORCHELASTIC_ERROR_FILEz
stdout.logz
stderr.logNz
error.jsonz"Setting worker%s reply file to: %s)lenr   warningrx   r   r>   r   r   r   r   joinshutilrmtreer   r   r   r   itemsrv   rw   rQ   r6   mkdirr   rn   ro   r   popr   r$   )r.   r   rL   
global_envrun_idrestart_countattempt_log_dirredirsts
local_ranktee_stdredirect_std
SYS_STREAMr}   r~   r   r   r   clogdirrdt
error_files                         r0   r   zDefaultLogsSpecs.reify  s    T
A::aJJNNM    5}EE"'CSII!++$ S$($6$6t7I6$R$R! gll4+<>X>X>XYYOM/>>>>K(((++ jO 00DIv&& $&88:: 	8 	8J!*-L!-!7F:
--fz::--fz::&(&(-- +	I +	IJ"*,,*,*J'*,*J'*,*J'>@Z !:;;',,JHH!!!J'LSW,,*,',,w*M*MGJ'LSW,,*,',,w*M*MGJ'zNsw;#')).5j.AK
+sw;#')).5j.AK
+ ,9"$*BBB "[00#
D999![00#
D999 z*j88.0j
+z*j88.0j
+W\\'<@@
*4J'8*j   ?IZ !:;;+{KPPPr1   c           	      H    d| j          d| j         d| j         d| j         d	S )NzDefaultLogsSpecs(root_log_dir=z, redirects=z, tee=z, local_ranks_filter=)r   r   s    r0   __repr__zDefaultLogsSpecs.__repr__  sW    OT-? O OT_ O O9O O373KO O O	
r1   otherc                     t          |t                    sdS | j        |j        k    o/| j        |j        k    o| j        |j        k    o| j        |j        k    S )NF)ru   r   r   r   r   r   )r.   r   s     r0   __eq__zDefaultLogsSpecs.__eq__  sj    %!122 	5 %"55 F5#33F	UZ'F (E,EE		
r1   )r2   r3   r4   r5   r   rm   r   r6   r   r   r]   r   r-   r   r   r   r$   r   r   objectboolr   r9   r:   s   @r0   r   r      s         "&03*-(15! !#! d38n,-! 3S#X&'	!
 %SX.! 
! ! ! ! ! !& 'c ' ' ' X'Xc]     hQ3S#X&'hQ 
hQ hQ hQ hQT
# 
 
 
 
	
F 	
t 	
 	
 	
 	
 	
 	
 	
 	
r1   r   c                       e Zd ZU dZ ee          Zeee	f         e
d<    ee          Zeeef         e
d<    ee          Zeeef         e
d<    ee          Zeeef         e
d<   defdZd	S )
r   a  
    Results of a completed run of processes started with ``start_processes()``. Returned by ``PContext``.

    Note the following:

    1. All fields are mapped by local rank
    2. ``return_values`` - only populated for functions (not the binaries).
    3. ``stdouts`` - path to stdout.log (empty string if no redirect)
    4. ``stderrs`` - path to stderr.log (empty string if no redirect)

    r{   return_valuesfailuresr}   r~   r)   c                 2    t          | j                  dk    S )Nr   )r   r   r   s    r0   	is_failedzRunProcsResult.is_failed  s    4=!!A%%r1   N)r2   r3   r4   r5   r   rv   r   r   r]   r   r   r   r   r}   r6   r~   r   r   rE   r1   r0   r   r     s         
 
 %*E$$?$?$?M4S>???*/%*E*E*EHd3&'EEE#eD999GT#s(^999#eD999GT#s(^999&4 & & & & & &r1   r   c                      e Zd ZdZ	 ddedeeef         deee	f         deeeeef         f         de
deeeef                  fd	ZddZej        dd            Zej        d
ee         fd            Zddeded
ee         fdZej        d
eeef         fd            Zej        ddej        ded
dfd            Z	 ddeej                 ded
dfdZdS )r    a  
    The base class that standardizes operations over a set of processes that are launched via different mechanisms.

    The name ``PContext`` is intentional to disambiguate with ``torch.multiprocessing.ProcessContext``.

    .. warning:: stdouts and stderrs should ALWAYS be a superset of
                 tee_stdouts and tee_stderrs (respectively) this is b/c
                 tee is implemented as a redirect + tail -f <stdout/stderr.log>
    Nname
entrypointargsr   
logs_specslog_line_prefixesc                    || _         t          |          }|                    |          }t          |j        |d           t          |j        |d           || _        || _        || _        |j        | _        |j        | _        |j	        | _	        || _
        t          ||j        t          j        |          | _        t          ||j        t          j        |          | _        d S )Nr}   r~   )r   r   r   rU   r}   r~   r   r   r   r   rL   r   r   sysstdout_stdout_tailr   stderr_stderr_tail)	r.   r   r   r   r   r   r   rL   	logs_dests	            r0   r-   zPContext.__init__  s     	 T $$T**	I-vyAAAI-vyAAA$		 ( ($0#)'5F
 
 $)'5F
 
r1   r)   c                 B   t          j                    t          j                    u rt          j        t          j        t
                     t          j        t          j        t
                     t          sHt          j        t          j        t
                     t          j        t          j	        t
                     nt                              d           |                                  | j                                         | j                                         dS )z<Start processes using parameters defined in the constructor.zFailed to register signal handlers since torchelastic is running on a child thread. This could lead to orphaned worker processes if the torchrun is terminated.N)	threadingcurrent_threadmain_threadr7   rI   r@   SIGINTrB   SIGHUPSIGQUITr   r   _startr   startr   r   s    r0   r   zPContext.start  s    #%%)>)@)@@@M&.*DEEEM&-)CDDD Jfm-GHHHfn.HIIINN^   	!!!!!!!!r1   c                     t           )z?Start processes using strategy defined in a particular context.NotImplementedErrorr   s    r0   r   zPContext._start  
     "!r1   c                     t           )aE  
        Poll the run status of the processes running under this context.
        This method follows an "all-or-nothing" policy and returns
        a ``RunProcessResults`` object if either all processes complete
        successfully or any process fails. Returns ``None`` if
        all processes are still running.
        r   r   s    r0   _pollzPContext._poll  s
     "!r1   rW   timeoutperiodc                 >   |dk    r|                                  S |dk     rt          j        }t          j                    |z   }t          j                    |k     rC|                                  }|r|S t          j        |           t          j                    |k     CdS )a  
        Wait for the specified ``timeout`` seconds, polling every ``period`` seconds
        for the processes to be done. Returns ``None`` if the processes are still running
        on timeout expiry. Negative timeout values are interpreted as "wait-forever".
        A timeout value of zero simply queries the status of the processes (e.g. equivalent
        to a poll).

        ..note: Multiprocessing library registers SIGTERM and SIGINT signal handlers that raise
                ``SignalException`` when the signals received. It is up to the consumer of the code
                to properly handle the exception. It is important not to swallow the exception otherwise
                the process would not terminate. Example of the typical workflow can be:

        .. code-block:: python
            pc = start_processes(...)
            try:
                pc.wait(1)
                .. do some other work
            except SignalException as e:
                pc.shutdown(e.sigval, timeout=30)

        If SIGTERM or SIGINT occurs, the code above will try to shutdown child processes by propagating
        received signal. If child processes will not terminate in the timeout time, the process will send
        the SIGKILL.
        r   N)r   r   maxsizetimesleep)r.   r   r   expiryprs        r0   waitzPContext.wait  s    2 a<<::<<Q;;kGw&ikkF""B 	Jv	 ikkF"" tr1   c                     t           )z@Return pids of processes mapped by their respective local_ranks.r   r   s    r0   pidszPContext.pids!  r   r1      	death_sigc                     t           )z
        Terminates all processes managed by this context and cleans up any
        meta resources (e.g. redirect, error_file files).
        r   r.   r  r   s      r0   _closezPContext._close&  s
     "!r1   c                     |st                      }|                     ||           | j        r| j                                         | j        r| j                                         dS dS )ar  
        Terminates all processes managed by this context and cleans up any
        meta resources (e.g. redirect, error_file files).

        Args:
            death_sig: Death signal to terminate processes.
            timeout: Time to wait for processes to finish, if process is
                still alive after this time, it will be terminated via SIGKILL.
        )r  r   N)rJ   r  r   stopr   r  s      r0   closezPContext.close.  s}      	.+--Ii999 	%""$$$ 	%""$$$$$	% 	%r1   r+   )r)   N)r   rW   r  )Nr  )r2   r3   r4   r5   r6   r   r   r   r]   r   r%   r   r-   r   abcr   r   r   r   floatr   r  r7   r8   r  r	  rE   r1   r0   r    r      s        " 7;!
 !
!
 (C-(!
 3:	!

 3S#X&'!
 !
 $DcN3!
 !
 !
 !
F" " " "" 	" " " " 	"x/ " " " "& &E & &h~>V & & & &P 	"d38n " " " " 	" " " "d " " " " JL% %!&.1%CF%	% % % % % %r1   r    std_rdc                 T    t           s	t          s| st                      S  ||           S r+   )rB   IS_MACOSr   )r  redirect_fns     r0   r!   r!   C  s2     #X #V #}}{6"""r1   r   fnr   r   stdout_redirectsstderr_redirectsret_valsqueue_finished_reading_eventc                    ||          }||          }	||          }
||          }||          }t          |t                    }t          |t                    }|	                                D ]\  }}|t          j        |<   |5  |5   t          |          | }d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   |
                    |           |                                 d S r+   )	r!   r   r   r   r>   environr   putr   )r   r  r   r   r  r  r  r  args_env_ret_val_	stdout_rd	stderr_rd	stdout_cm	stderr_cmkr[   rets                     r0   _wrapr"  J  sm    E
D
#H ,I ,I9o66I9o66I

  1
1	 ! !I ! !fRjj% ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !LL %%'''''s6   >B7B B7 B$	$B7'B$	(B77B;>B;c                        e Zd ZdZ	 ddededeeef         deeeeef         f         dede	d	e
eeef                  f fd
Zd ZdefdZde
e         fdZdeeef         fdZddej        deddfdZ xZS )r"   z<``PContext`` holding worker processes invoked as a function.Nr   r   r   r   start_methodr   r   c                 *    t                                          ||||||           | _         fdt           j                  D              _        i  _        d  _        t          j	         j                  
                                 _        d S )Nc                 h    i | ].}|t          j        j                                                  /S rE   )mpget_contextr$  SimpleQueue.0r   r.   s     r0   
<dictcomp>z0MultiprocessContext.__init__.<locals>.<dictcomp>  sD     
 
 
 t'899EEGG
 
 
r1   )r,   r-   r$  rQ   rL   	_ret_vals_return_values_pcr'  r(  Event_worker_finished_event)	r.   r   r   r   r   r$  r   r   r/   s	   `       r0   r-   zMultiprocessContext.__init__k  s     		
 	
 	
 )
 
 
 
#DK00
 
 
 /104 ')nT5F&G&G&M&M&O&O###r1   c           
          | j         rt          d          t          j        t          | j        | j        | j        | j        | j	        | j
        | j        f| j        dd| j                  | _         d S )NzWThe process context already initialized. Most likely the start method got called twice.F)r  r   rL   r   daemonr$  )r/  rh   r'  start_processesr"  r   r   r   r}   r~   r-  r1  rL   r$  r   s    r0   r   zMultiprocessContext._start  s    8 	B   %		+ ;*
 
 
r1   r)   c                 <    t          | j                  | j        k    S r+   )r   r.  rL   r   s    r0   _is_donezMultiprocessContext._is_done  s    4&''4;66r1   c           
      4   | j         J 	 | j                             d           t          d| j                  D ]?}| j        |         }|                                s|                                | j        |<   @|                                 r| j	        
                                 | j                                         s3t                              d           | j                                         3t          | j        | j        d           |                                  t          | j        | j        | j                  S d S # t$          j        t$          j        f$ r}|j        }| j        j        }| j         j        |         }| j        |         }t                              d|j        ||j        || j                   |                                  t          |t=          ||j        |j        |          i| j        | j                  cY d }~S d }~ww xY w)	Nr   r   z>entrypoint fn finished, waiting for all child procs to exit...zreturn_value queue)r   r}   r~   zKfailed (exitcode: %s) local_rank: %s (pid: %s) of fn: %s (start_method: %s)r   pidexitcoder   r   r}   r~   )r/  r   rQ   rL   r-  emptyrx   r.  r6  r1  rO   r   debugrU   r	  r   r}   r~   r'  ProcessRaisedExceptionProcessExitedExceptionerror_indexr   r4   	processesr   	exceptionr:  r9  r$  r   )r.   r   return_queueefailed_local_rankfn_namefailed_procerror_filepaths           r0   r   zMultiprocessContext._poll  sQ   x###J	 HMM" $At{33 I I
#~j9#))++ I6B6F6F6H6HD'
3}} +//111
 (--// LLX   (--// 
 $'6J   

%"&"5 L L    t)2+DE 	 	 	 ! o2G(,->?K!-.?@N0 $!!	 	 	 JJLLL!%~#4E!,!5#1	( ( (         )	s   D7E H B,HHHc                 z    | j         J t          t          | j                                                             S r+   )r/  rv   	enumerater  r   s    r0   r  zMultiprocessContext.pids  s0    x###Idhmmoo..///r1   r  r  r   c                    | j         sd S | j         j        D ]h}|                                rRt                              d|j        |j                   	 t          j        |j        |           X# t          $ r Y dw xY wit          j                    |z   }| j         j        D ]5}|t          j                    z
  }|dk    r n|                    |           6| j         j        D ]}|                                rft                              d|j        |t                                 	 t          j        |j        t                                 n# t          $ r Y nw xY w|                                 d S )Nz Closing process %s via signal %sr   ?Unable to shutdown process %s via %s, forcefully exiting via %s)r/  rA  is_aliver   r   r9  r   r>   killProcessLookupErrorr   	monotonicr   rF   )r.   r  r   procendtime_to_waits         r0   r  zMultiprocessContext._close  s   x 	FH& 
	 
	D}} 	6).  GDHi0000)    D	 n(H& 	$ 	$D!1!11Lq  IIl####H& 	 	D}} UH$&&	  GDH&6&8&89999)    D IIKKKK	 	s$   A..
A;:A;(&E
EEr+   r
  )r2   r3   r4   r5   r6   r   r   r]   r   r%   r   r-   r   r   r6  r   r   r  r7   r8   r  r9   r:   s   @r0   r"   r"   h  sk       FF 7;P PP P 3:	P
 3S#X&'P P P $DcN3P P P P P PB
 
 
.7$ 7 7 7 7Mx/ M M M M^0d38n 0 0 0 0" " " "d " " " " " " " "r1   r"   c                        e Zd ZdZ	 ddededeeef         deeeeef         f         dede	eeef                  f fd	Z
d
 Zde	e         fdZdeeef         fdZddej        deddfdZ xZS )r#   z:``PContext`` holding worker processes invoked as a binary.Nr   r   r   r   r   r   c                     t                                          ||||||           t          t          | j                            | _        i | _        i | _        d S r+   )r,   r-   rO   rQ   rL   _running_local_ranks	_failuressubprocess_handlers)r.   r   r   r   r   r   r   r/   s          r0   r-   zSubprocessContext.__init__!  sd     		
 	
 	
 /2%2D2D.E.E!46AC   r1   c                 ~      j         rt          d           fdt           j                  D              _         d S )Nz[The subprocess handlers already initialized. Most likely the start method got called twice.c                     i | ]I}|t          j        j        |         j        |         j        |         j        |         |           JS ))r   r   envr   r   local_rank_id)r   r   r   r   r}   r~   r*  s     r0   r,  z,SubprocessContext._start.<locals>.<dictcomp>=  sm     
$
 
$
 
$
  .?Yz*Ij)|J/|J/(  
$
 
$
 
$
r1   )rX  rh   rQ   rL   r   s   `r0   r   zSubprocessContext._start8  s`    # 	m  
$
 
$
 
$
 
$
 $DK00
$
 
$
 
$
   r1   r)   c                    t                      }| j        D ]u}| j        |         }|j                                        }|K|                    |           |dk    r0t          ||j        j        || j        |                   | j	        |<   v| j        
                    |           | j        r| j	        r|                                  t          | j	        | j        | j                  }|                                r\t!          |j                                        d           }t&                              d|j        |j        |j        | j                   n1t0                              t5          | j                            |_        |S d S )Nr   r8  r;  c                     | j         S r+   )	timestamp)fs    r0   <lambda>z)SubprocessContext._poll.<locals>.<lambda>d  s    AK r1   )keyz<failed (exitcode: %s) local_rank: %s (pid: %s) of binary: %s)rO   rV  rX  rQ  polladdr   r9  r   rW  difference_updater	  r   r}   r~   r   minr   valuesr   errorr:  r   r   rv   rw   rQ   rL   r   )r.   done_local_ranksr   handlerr:  resultfirst_failures          r0   r   zSubprocessContext._pollI  s   553 	 	J.z:G|((**H# $$Z000q==1?#-#L,!)#'#3J#?	2 2 2DN:. 	!334DEEE ( 	DN 	JJLLL#  F
 !! I #FO$:$:$<$<BWBW X X X% "*!,!%O    (,}}U4;5G5G'H'H$M4r1   c                 H    d | j                                         D             S )Nc                 .    i | ]\  }}||j         j        S rE   )rQ  r9  )r+  r   shs      r0   r,  z*SubprocessContext.pids.<locals>.<dictcomp>w  s2     
 
 

B 
 
 
r1   )rX  r   r   s    r0   r  zSubprocessContext.pidsv  s2    
 
"&":"@"@"B"B
 
 
 	
r1   r  r  r   c                 J   | j         sd S | j                                         D ]\}|j                                        At                              d|j        j        |j                   |                    |           ]t          j
                    |z   }| j                                         D ]P}|t          j
                    z
  }|dk    r n1	 |j                            |           <# t          j        $ r Y Mw xY w| j                                         D ]}|j                                        nt                              d|j        j        |t                                 |                    t                                 |j                                         d S )Nz$Sending process %s closing signal %s)r  r   rL  )rX  rg  rQ  rc  r   r   r9  r   r	  r   rP  r   
subprocessTimeoutExpiredrF   )r.   r  r   rj  rR  rS  s         r0   r  zSubprocessContext._close|  s   ' 	F/6688 	3 	3G|  ""*:L$N  
 	222n(/6688 		 		G!1!11Lq  !!,////,     /6688 		$ 		$G|  ""*UL$$&&	   (8(:(:;;;!!###		$ 		$s   C++C=<C=r+   r
  )r2   r3   r4   r5   r6   r   r]   r   r%   r   r-   r   r   r   r  r7   r8   r  r9   r:   s   @r0   r#   r#     s?       DD 7;D DD D 3:	D
 3S#X&'D D $DcN3D D D D D D.
 
 
"+x/ + + + +Z
d38n 
 
 
 
$ $ $ $d $ $ $ $ $ $ $ $r1   r#   )Lr  loggingr>   rc   r   r7   rq  r   r   r   r   r   r   
contextlibr   dataclassesr   r   enumr   multiprocessingr	   typesr
   typingr   r   r   r   r   r   r   torch.multiprocessingr'  0torch.distributed.elastic.multiprocessing.errorsr   r   3torch.distributed.elastic.multiprocessing.redirectsr   r   <torch.distributed.elastic.multiprocessing.subprocess_handlerr   r   2torch.distributed.elastic.multiprocessing.tail_logr   platformrB   r  	getLoggerr2   r   __all__	Exceptionr   r]   r@   r8   rF   rJ   r6   rU   rf   re   r   r   r$   r%   r   r   r    r!   r)  r0  r"  r"   r#   rE   r1   r0   <module>r     s^   


  				 				       



       # # # # # # # # " " " " " " ( ( ( ( ( ( ( (       ' ' ' ' ' '       C C C C C C C C C C C C C C C C C C " " " " " " S S S S S S S S               G F F F F F \W$
<8# 
	8	$	$      i   
Xs 
X8I3F 
X4 
X 
X 
X 
X&.    V^    
4S> 
3 
c 
 
 
 
 /$ $ $ $ $' $ $ $Nc4S>)*>A	#s(^   0 	> 	> 	> 	> 	> 	> 	> 	>, , , , , , , ,^_
 _
 _
 _
 _
y _
 _
 _
D & & & & & & & &,V% V% V% V% V%sw V% V% V%r#s # # # #((( sEz
( sDcN"
#	(
 38n( 38n( 3&'( #."3( 
( ( ( (<s s s s s( s s sl}$ }$ }$ }$ }$ }$ }$ }$ }$ }$r1   