
    קg*              
          d dl Z d dlZd dlmZmZ d dlmZmZmZm	Z	m
Z
mZmZ d dlmc mc mc mZ d dlmZmZ d dlmZ d dlmZ d dlmZmZmZ d dlmZ d d	l m!Z! d d
l"m#Z# d dl$m%Z% g dZ& e%e'          Z(e G d d                      Z) G d d          Z*deee+df         de	e         de+fdZ,de!dee
e+         e
e-         f         fdZ.de)deee+df         de	e         dee-ef         fdZ/dS )    N)	dataclassfield)AnyCallableDictListOptionalTupleUnion)eventsmetrics)
WorkerSpec)LocalElasticAgent)DefaultLogsSpecs	LogsSpecsSignalException)ChildFailedError)RendezvousParameters)parse_rendezvous_endpoint)
get_logger)LaunchConfigelastic_launchlaunch_agentc                   Z   e Zd ZU dZeed<   eed<   eed<   dZee         ed<   dZ	e
ed<   d	Ze
ed
<   dZe
ed<   dZe
ed<    ee          Zee
ef         ed<   dZeed<   dZeed<   dZeed<   dZe
ed<   dZee
         ed<    ee          Zee
e
f         ed<   dZee
         ed<   d ZdS )r   aM  
    Creates a rendezvous config.

    Args:
        min_nodes: Minimum amount of nodes that the user function will
                        be launched on. Elastic agent ensures that the user
                        function start only when the min_nodes amount enters
                        the rendezvous.
        max_nodes: Maximum amount of nodes that the user function
                        will be launched on.
        nproc_per_node: On each node the elastic agent will launch
                            this amount of workers that will execute user
                            defined function.
        rdzv_backend: rdzv_backend to use in the rendezvous (zeus-adapter, etcd).
        rdzv_endpoint: The endpoint of the rdzv sync. storage.
        rdzv_configs: Key, value pair that specifies rendezvous specific configuration.
        rdzv_timeout: Legacy argument that specifies timeout for the rendezvous. It is going
            to be removed in future versions, see the note below. The default timeout is 900 seconds.
        run_id: The unique run id of the job (if not passed a unique one will be
                deduced from run environment - flow workflow id in flow - or auto generated).
        role: User defined role of the worker (defaults to "trainer").
        max_restarts: The maximum amount of restarts that elastic agent will conduct
                    on workers before failure.
        monitor_interval: The interval in seconds that is used by the elastic_agent
                        as a period of monitoring workers.
        start_method: The method is used by the elastic agent to start the
                    workers (spawn, fork, forkserver).
        metrics_cfg: configuration to initialize metrics.
        local_addr: address of the local node if any. If not set, a lookup on the local
                machine's FQDN will be performed.
        local_ranks_filter: ranks for which to show logs in console. If not set, show from all.
    ..note:
        `rdzv_timeout` is a legacy argument that will be removed in future.
        Set the timeout via `rdzv_configs['timeout']`

    	min_nodes	max_nodesnproc_per_nodeN
logs_specs run_iddefault_rolerolerdzv_endpointetcdrdzv_backend)default_factoryrdzv_configsrdzv_timeout   max_restartsg?monitor_intervalspawnstart_methodlog_line_prefix_templatemetrics_cfg
local_addrc                     d}| j         dk    r| j         | j        d<   nd| j        vr
|| j        d<   | j        t                      | _        d S d S )Ni  r(   timeout)r)   r'   r   r   )selfdefault_timeouts     Z/var/www/html/ai-engine/env/lib/python3.11/site-packages/torch/distributed/launcher/api.py__post_init__zLaunchConfig.__post_init__Z   sf    ""+/+<Di((d///+:Di( ?".00DOOO #"    )__name__
__module____qualname____doc__int__annotations__r   r	   r   r    strr"   r#   r%   r   dictr'   r   r   r)   r+   r,   floatr.   r/   r0   r1   r7    r8   r6   r   r   "   sY        # #J NNNNNN&*J#***FCD#M3L##(5#>#>#>L$sCx.>>>L#L#!e!!!L#.2hsm222"'%"="="=Kc3h=== $J$$$	1 	1 	1 	1 	1r8   r   c                   :    e Zd ZdZdedeeedf         fdZd Z	dS )r   a  
    Launches an torchelastic agent on the container that invoked the entrypoint.

        1. Pass the ``entrypoint`` arguments as non ``kwargs`` (e.g. no named parameters)/
           ``entrypoint`` can be a function or a command.
        2. The return value is a map of each worker's output mapped
           by their respective global rank.

    Usage

    ::

    def worker_fn(foo):
        # ...

    def main():
        # entrypoint is a function.
        outputs = elastic_launch(LaunchConfig, worker_fn)(foo)
        # return rank 0's output
        return outputs[0]

        # entrypoint is a command and ``script.py`` is the python module.
        outputs = elastic_launch(LaunchConfig, "script.py")(args)
        outputs = elastic_launch(LaunchConfig, "python")("script.py")
    config
entrypointNc                 "    || _         || _        d S N)_config_entrypoint)r4   rD   rE   s      r6   __init__zelastic_launch.__init__   s    
 %r8   c                 R    t          | j        | j        t          |                    S rG   )r   rH   rI   list)r4   argss     r6   __call__zelastic_launch.__call__   s    DL$*:DJJGGGr8   )
r9   r:   r;   r<   r   r   r   r?   rJ   rN   rB   r8   r6   r   r   f   sd         4&& (C-.& & & &H H H H Hr8   r   rE   rM   returnc                     t          | t                    r| j        S t          | t                    r,| t          j        k    rt          d |D             d          S | S dS )a  Retrieve entrypoint name with the rule:
    1. If entrypoint is a function, use ``entrypoint.__qualname__``.
    2. If entrypoint is a string, check its value:
        2.1 if entrypoint equals to ``sys.executable`` (like "python"), use the first element from ``args``
            which does not start with hifen letter (for example, "-u" will be skipped).
        2.2 otherwise, use ``entrypoint`` value.
    3. Otherwise, return empty string.
    c              3   2   K   | ]}|d          dk    |V  dS )r   -NrB   ).0args     r6   	<genexpr>z'_get_entrypoint_name.<locals>.<genexpr>   s*      >>A#>>r8   r   )
isinstancer   r9   r?   sys
executablenext)rE   rM   s     r6   _get_entrypoint_namerZ      sh     *h'' ""	J	$	$ ''>>>>>CCCrr8   rdzv_parametersc                     | j         dk    rdS | j        }|                                }|st          d          t	          |d          \  }}|dk    rt          d| d          ||fS )Nstatic)NNzKEndpoint is missing in endpoint. Try to add --master-addr and --master-portr(   )default_portzport is missing in endpoint: z. Try to specify --master-port)backendendpointstrip
ValueErrorr   )r[   r`   master_addrmaster_ports       r6   _get_addr_and_portre      s     (**|'H~~H 
Y
 
 	
  9PRSSSKbTHTTT
 
 	
 %%r8   rD   c                    | j         sGt          t          j                    j                  }t
                              d|           || _         t          ||          }t
                              d|| j	        | j
        | j        | j         | j        | j        | j        | j        | j        | j        j        | j        d           t)          d
| j        | j        | j         | j	        | j
        | j        d| j        }t-          |          \  }}t/          | j        | j        |t3          |          t5          j        |          | j        | j        ||| j        
  
        }t9          || j        | j        | j                  }	d}
	 t?          j         t?          j!        | j                             |	"                                }tG          j$        |	%                                           |&                                rtO          ||j(                  |j)        |
r|j*        +                                 S S # tN          $ r  tX          $ r* d	}
tG          j$        |	-                                            t\          $ r( tG          j$        |	-                                            w xY w# |
r|j*        +                                 w w xY w)Nz3config has no run_id, generated a random run_id: %sa  Starting elastic_operator with launch configs:
  entrypoint       : %(entrypoint)s
  min_nodes        : %(min_nodes)s
  max_nodes        : %(max_nodes)s
  nproc_per_node   : %(nproc_per_node)s
  run_id           : %(run_id)s
  rdzv_backend     : %(rdzv_backend)s
  rdzv_endpoint    : %(rdzv_endpoint)s
  rdzv_configs     : %(rdzv_configs)s
  max_restarts     : %(max_restarts)s
  monitor_interval : %(monitor_interval)s
  log_dir          : %(log_dir)s
  metrics_cfg      : %(metrics_cfg)s
)rE   r   r   r   r    r%   r#   r'   r+   r,   log_dirr0   )r_   r`   r    r   r   r1   )
r"   local_world_sizerE   rM   rdzv_handlerr+   r,   rc   rd   r1   )specr   r.   r/   T)namefailuresFrB   )/r    r?   uuiduuid4r=   loggerwarningrZ   infor   r   r   r%   r#   r'   r+   r,   r   root_log_dirr0   r   r1   re   r   r"   tuplerdzv_registryget_rendezvous_handlerr   r.   r/   r   initialize_metricsMetricsConfigrunr   recordget_event_succeeded	is_failedr   rl   return_valuesri   shutdownr   get_event_failed	Exception)rD   rE   rM   r    entrypoint_namer[   rc   rd   rj   agentshutdown_rdzvresults               r6   r   r      s   
 = TZ\\%&&LfUUU*:t<<O
KK	1 *))$3m"/#1"/"/ & 7(5!-	
 	
  < + #%}""$  
 O  2/BBK[.4[["9/JJ(0$  D $(!'!@	  E M )"7#89K#L#LMMMe//11222 	
 #$   
 #  	)&&((((	)        e,,..///   e,,..///  	)&&((((	)s   BH3 3A.J!!J$ $K)0rW   rm   dataclassesr   r   typingr   r   r   r   r	   r
   r   -torch.distributed.elastic.rendezvous.registrydistributedelastic
rendezvousregistryrt   torch.distributed.elasticr   r   *torch.distributed.elastic.agent.server.apir   :torch.distributed.elastic.agent.server.local_elastic_agentr   )torch.distributed.elastic.multiprocessingr   r   r   0torch.distributed.elastic.multiprocessing.errorsr   $torch.distributed.elastic.rendezvousr   *torch.distributed.elastic.rendezvous.utilsr   'torch.distributed.elastic.utils.loggingr   __all__r9   ro   r   r   r?   rZ   r=   re   r   rB   r8   r6   <module>r      s   


  ( ( ( ( ( ( ( ( D D D D D D D D D D D D D D D D D D E E E E E E E E E E E E E E E 5 5 5 5 5 5 5 5 A A A A A A X X X X X X         
 N M M M M M E E E E E E P P P P P P > > > > > > =
<
<	H		 @1 @1 @1 @1 @1 @1 @1 @1F$H $H $H $H $H $H $H $HNhT)*26s)   ,&)&
8C=(3-'(& & & &&k)k)hT)*k) s)k) 
#s(^	k) k) k) k) k) k)r8   