
    קgT\                    6   d dl mZ d dlZd dlZd dlZd dlmZ d dlmZm	Z	m
Z
mZ d dlZddlmZmZ ddlmZ ddlmZmZmZmZmZmZmZ ej                            ed	          Zerdd
lmZ d*dZ d*dZ!d*dZ"d+dZ#d,dZ$d-dZ%d Z&d Z'd*dZ(d.d"Z)d# Z*d/d)Z+dS )0    )annotationsN)defaultdict)DictListSetTYPE_CHECKING   )configir)WeakDep)contains_collectivecontains_waitfind_recursive_deps_of_nodefind_recursive_users_of_nodeis_collectiveis_fallback_opis_waitoverlap)BaseSchedulerNodesnodesList[BaseSchedulerNode]returnc                (    t          | ddd          S )z7
    Greedily schedules waits as late as possible.
    FTraise_comms
sink_waitsreorder_for_overlap_schedule_for_commr   s    Q/var/www/html/ai-engine/env/lib/python3.11/site-packages/torch/_inductor/comms.pyr   r       s$     Ed       c                (    t          | ddd          S )z8
    Greedily schedules comms as early as possible.
    TFr   r   r    s    r!   r   r   )   s$     DU   r"   c                (    t          | ddd          S )a  
    This achieves the following overall scheduling procedure:
        Step 1: Given that we've currently scheduled comm N, we now schedule all compute nodes
            that are required for comm N + 1 but do not depend on comm N, to run at the same time with comm N.
        Step 2: If all those compute nodes are sufficient to overlap comm N, we're done.
            Otherwise, we now need to look elsewhere to find compute that overlaps with comm N.
            We prioritize compute nodes that are needed sooner.
        Step 3: We schedule the compute nodes dependent on comm N and required for comm N + 1.
        Step 4: We schedule comm N + 1.
        Repeat this for subsequent comm nodes.
    Tr   r   r    s    r!   reorder_compute_for_overlapr%   2   s$     DTt   r"   r   boolr   r   c                B   i }i i i i ct          |           D ]\  }}|                                D ]}|||<   |                                D ]}||<   ||                                <   |                                }	t          j        |	<   d|	<   ||	<   d}
| D ]}|rit          |          rZ|
|                                <   |j        D ]5}|                                         }t          |         |
          |<   6|
dz  }
m|r&t          |          rd|                                <    G fdd          d | D             g t          t                    d | D                                             D ]V\  }}t          |          dk    rt          j         |                     |D ]}|                             |           Wg fdfdfd	}t                    rPt          j                  j        }|rt          |          r ||           n |           t                    P                                D ]%\  }}t          |          dk    sJ d
             &S )a  
    Schedule `snodes` for various comm optimization objectives.

    Args:
        snodes: the nodes to be scheduled.
        raise_comms: whether to greedily schedule collectives as early as possible
        sink_wait: whether to greedily schedule waits as late as possible
        reorder_compute_for_overlap: whether to reorder compute nodes to
            optimize for compute/communication overlapping.

    Returns:
        The new schedule order.

    Some notes on the synergy between different options:
        - `raise_comms` provides more overlapping oppurtunies for `reorder_compute_for_overlap`.
        - When both `raise_comms` and `sink_waits` is `True`, `raise_comms` is prioritized.
    r   r	   c                  (    e Zd Zd fdZd ZdS )$_schedule_for_comm.<locals>.Runnabler   Nonec                    || _         t          t          |                                                    }|                                         }|         |         |         f| _        d S N)snodenextiterget_operation_namesget_namescore)selfr-   name
fused_namename_to_fused_nodescores_0scores_1scores_2s       r!   __init__z-_schedule_for_comm.<locals>.Runnable.__init__   sf    DJU668899::D+D1::<<J$$$DJJJr"   c                "    | j         |j         k     S r,   r2   )r3   others     r!   __lt__z+_schedule_for_comm.<locals>.Runnable.__lt__   s    :++r"   N)r   r*   )__name__
__module____qualname__r:   r>   )r6   r7   r8   r9   s   r!   Runnabler)      sQ        	 	 	 	 	 	 	 	 		, 	, 	, 	, 	,r"   rB   c                2    i | ]}|d  |j         D             S )c                    h | ]	}|j         
S  )r4   ).0deps     r!   	<setcomp>z0_schedule_for_comm.<locals>.<dictcomp>.<setcomp>   s    ===S===r"   )unmet_dependenciesrF   r-   s     r!   
<dictcomp>z&_schedule_for_comm.<locals>.<dictcomp>   s9     5 5 5BG==E$<===5 5 5r"   c                .    i | ]}|t          |          S rE   )estimate_op_runtimerJ   s     r!   rK   z&_schedule_for_comm.<locals>.<dictcomp>   s#    KKK5U/66KKKr"   c                                        |            |                                 D ]_}|         D ]T} |                              |           t          |                    dk    rt	          j         |                      U`dS )zU
        Schedules `snode` and put all unblocked nodes onto the ready queue.
        r   N)appendget_buffer_namesremovelenheapqheappush)r-   buf_namerB   buffer_usersready	scheduled
unmet_depss     r!   schedulez$_schedule_for_comm.<locals>.schedule   s     	..00 	; 	;H%h/ ; ;5!((222z%())Q..N5((5//:::;	; 	;r"   c                 j    d D             } t          |           dk    rdS t          | d           S )zh
        Return the next node in the ready queue that's neither a collective or
        a wait.
        c                b    g | ],}t          |j                  t          |j                  *|-S rE   )r   r-   r   rF   xs     r!   
<listcomp>zI_schedule_for_comm.<locals>.get_overlapping_candidate.<locals>.<listcomp>   sM     
 
 
&qw//
 9Fag8N8N

 
 
r"   r   Nc                    | j         S r,   r<   r^   s    r!   <lambda>zG_schedule_for_comm.<locals>.get_overlapping_candidate.<locals>.<lambda>   s    QW r"   key)rR   min)
candidatesrW   s    r!   get_overlapping_candidatez5_schedule_for_comm.<locals>.get_overlapping_candidate   sP    

 

 
 


 z??a4:#4#45555r"   c                *   t          |           sJ  |            |          }|dk    rS             x}G                    |            |j                   ||j                 z  }|dk    r             x}Gt          j                   dS )z
        Schedules collective node `snode`, along with one or more compute nodes
        to overlap with it. The strategy is described in the comment of
        `reorder_compute_for_overlap`.
        r   N)r   rQ   r-   rS   heapify)r-   collective_cost	candidaterg   rW   rZ   snode_to_costs      r!   schedule_collective_for_overlapz;_schedule_for_comm.<locals>.schedule_collective_for_overlap   s     #5)))))'.a77999FLL###HY_%%%}Y_==O a77999F
 	er"   z;Detected unscheduled nodes. Nodes with unmet dependencies: )	enumeraterP   r0   r1   sysmaxsizer   	ancestorsre   r   r   setitemsrR   rS   rT   addheappopr-   )r   r   r   r   buf_name_to_snodeidxr-   rU   op_name	node_namecomm_idxancanc_fused_namedepsrG   rm   rB   rV   rg   r6   rW   rZ   rX   r7   r8   r9   rl   rY   s                   @@@@@@@@@@@@r!   r   r   E   s   L #%r2 Hh'' " "
U..00 	0 	0H*/h''0022 	0 	0G*/w''/45>>++,NN$$	!k!H + + 	+.u55 	+)1HU^^%%& S S!3C!8!A!A!C!C+.x/G+R+R((MHH 	+M%00 	+)*HU^^%%&, , , , , , , , , , , , ,5 5KQ5 5 5J E6A#6F6FLKKFKKKM!'')) ) )tt99>>N5((5//222 	) 	)C!!%((((	) I	; 	; 	; 	; 	; 	; 	; 	; 	;6 6 6 6 6       & e** e$$* 	#6u#=#= 	++E2222HUOOO e**  "'')) 
 
t4yyA~~~;.8; ; ~~~ r"   nodesc                   t          d | D                       rt          | ||          } d | D             }t          dt          |                    D ]}t	          t          ||                                                             }||dz
                                           D ],}||                             t          ||                     -| S )z
    Decide global ordering of comms, by just enforcing the ordering that's in the input graph
    (might not be the same ordering as the eager mode program).
    TODO: Come up with a better approach
    c              3     K   | ]M}t          |j        t          j        j        j        j        t          j        j        j        j        h          V  Nd S r,   )r   nodetorchopsfsdpall_gather_copy_indefault	chunk_catr]   s     r!   	<genexpr>z2decide_global_ordering_of_comms.<locals>.<genexpr>   sf       	 	  	F	19	(0	
 	
	 	 	 	 	 	r"   c                0    g | ]}t          |          |S rE   )r   )rF   ns     r!   r_   z3decide_global_ordering_of_comms.<locals>.<listcomp>   s&    ===&9!&<&<=!===r"   r	   mutating_buf)	anyenforce_comm_ordering_for_fsdprangerR   r.   r/   rP   add_fake_depr   )r~   name_to_bufr6   
comm_nodesir   bufs          r!   decide_global_ordering_of_commsr      s      	 	 	 	 	 	 	 
W /ukCUVV==U===J1c*oo&& P PDA!?!?!A!ABBCCa!e$5577 	P 	PCqM&&ws'N'N'NOOOO	P Lr"   r-   r   floatc                    t           j        dk    r|                                 }n/t          t           j                  sJ t          j        |           }|S )z:
    Returns estimated op runtime in nanoseconds (ns)
    r   )r
   rM   get_estimated_runtimecallable)r-   runtimes     r!   rM   rM      sQ     !Y..--//233333,U33Nr"   c                   d}t          | j        t          j                  rd| j        j         d}d}t          | j        d          r[t          | j        j        d          rAt          | j        j        d          r'd| j        j        j         d| j        j        j         d}d}t          | j        d	          r| j        j	        }| j        j
        j         | | d| dS )
N z ()layoutsizestridez (size=z	, stride=r4   )
isinstancer   r   ExternKernelOutpython_kernel_namehasattrr   r   r   r4   	__class__r?   )r-   detailout_tensor_infory   s       r!   node_summaryr     s    F%*b011 76ej3666O
H%%
EJ%v..
 EJ%x00
 Sej',RRuz7H7ORRR 	 Iuz6"" $JO	j"+TVT_TT	TTTTr"   c                   d}d }| D ]}|t          |          r|t          |          z  }|j        }n5t          |j                  rt	          d          |t          |          z  }t
                              t          |                      t          |          rt	          d          t          |j                  r+t
                              t          |                      d }t
                              dt          |                      t
                              d|dz  dz              d S )Ng        z8Wait is not expected when there is no collective runningzkFound two collectives running at the same time. `visualize_overlap` needs to be updated to handle this casez| zEst. runtime (ms): i  )r   rM   r   r   AssertionErroroverlap_logdebugr   )ordertotal_est_runtimecur_comm_noder-   s       r!   visualize_overlapr     sr   "M > > "5)) @!%8%?%??! %
$$ @$N   "%8%?%??!e!4!467777"5)) 	>$R   $$ >!!\%%8%8":;;; $!!"<|E':':"<"<====?/$6=??    r"   c                   | }t           j        D ]g}t          |t                    r$|t	                      v rt	                      |         }t
          j                                        dk    rmt          	                    d| d           	 t          |           n># t          $ r1}t          	                    t          |                     Y d }~nd }~ww xY w ||          }t
          j                                        dk    rot          	                    d| d           	 t          |           (# t          $ r2}t          	                    t          |                     Y d }~_d }~ww xY wi|S )Nr   z.==== Visualize overlap before reordering pass z ====z-==== Visualize overlap after reordering pass )r
   'reorder_for_compute_comm_overlap_passesr   strglobalsr   distributedget_rankr   r   r   	Exception)r   r   pes       r!   $reorder_compute_and_comm_for_overlapr   =  s    E; * *a 	!wyy..		!A%%''1,,IIII  *!%(((( * * *!!#a&&))))))))*%%%''1,,HHHH  *!%(((( * * *!!#a&&))))))))* - Ls0   B
C&'CC&D77
E3'E..E3graphtorch.fx.Graphr*   c                  	 	 dd l 		j                                        sJ 	j        j        j        r	j        j        j        sJ n# t          t          t          f$ r Y d S w xY wddl
m}m}m}m}m} 	 	fd} |            } | |	j        j        j        j         |t"          j         |	j        j        j        j         |d           |d           |d           |d           |d	           |d
           |d                     |d                     |d           |d                    |d           d	fd            } ||            |                    |            d S )Nr   r	   )CallFunction
KeywordArgMatchPatternMatcherPassregister_graph_patternc                    t          | j                  }|D ]e}|j        t          j        k    rN|j        d         j        j        j        j        j	        u r&|j        d         dk    r| 
                    |           fd S )Nr   r	   )listr~   targetoperatorgetitemargsr   r   r   r   
erase_node)g	node_listr   r   s      r!   remove_unused_getitemz8reinplace_fsdp_all_gather.<locals>.remove_unused_getitem~  sv    MM	 	  	 AH,,,F1I$	(I(QQQF1INNQ	  	 r"   all_gather_inputsinp_split_sizesall_gather_input_numel
world_sizerankdtypedeviceitem_idx
group_size
group_namec                $    | j         d         dk    S )Nr   r   )kwargs)matchs    r!   rb   z+reinplace_fsdp_all_gather.<locals>.<lambda>  s    %,z":a"? r"   )	pass_dictextra_checkr   r   c                    fd}|                      ||d         |d         |d         |d         |d         |d         |d         |d	         |d
         g	           d S )Nc                     | d d         }| d         }| d         } j         j        j        j        | }|d         }|d         }j         j        j                            ||||          }|S )Nr   r	   )out)r   r   r   r   _c10d_functionalall_gather_into_tensor_out)	r   copy_in_argsr   r   r   r   	getitem_1all_gather_into_tensorr   s	           r!   replzEreinplace_fsdp_all_gather.<locals>.reinplace_all_gather.<locals>.repl  s      9LbJbJ!J!B!J" )+G*1-I	*EMMZ N   #
 *)r"   r   r   r   r   r   r   r   r   r   )replace_by_example)r   r   r   r   r   s       r!   reinplace_all_gatherz7reinplace_fsdp_all_gather.<locals>.reinplace_all_gather  s    0	* 	* 	* 	* 	*$ 	  *+()/0|$vwx |$|$
	
 	
 	
 	
 	
r"   )r   r   )4torch.distributed._composable.fsdp._fsdp_collectivesr   is_availabler   r   r   r   ImportErrorAttributeErrorr   pattern_matcherr   r   r   r   r   r   r   r   r   r   apply)
r   r   r   r   r   r   r   
graph_passr   r   s
            @r!   reinplace_fsdp_all_gatherr   Y  sL   
CCCC --///// I&=	
	*E	
 	
 	
 	
 8                 	  	  	  	  	  $#%%JI&=EL IN5=J233J011J788J|,,Jv&&Jw''Jx((	 	 
:&&  J|$$J|$$#	
 	
& ??+  . 
  
  
  
  
/ . 
D %   Us   AA A"!A"c                    t          | t          j        j        j        t          j        j        j        f          rJ t          |                                 dd                    S )N   )r   r   	_inductor	schedulerFusedSchedulerNodeGroupedSchedulerNodeintr1   )r-   s    r!   
get_op_idxr     s_    O%8O%:	
     u~~#$$$r"   1List[torch._inductor.scheduler.BaseSchedulerNode]r   4Dict[str, torch._inductor.scheduler.SchedulerBuffer]r6   Dict[str, BaseSchedulerNode]c           	     
   ! ddl m  g }t                      }d}d}i }i }i ! !fd}	| D ]}
t          |
j        t
          j        j        j        j	                  rt          fd|
j        D                       rd}|
}t                      }t          |||           t
          j        j        j        j	        t
          j        j        j        j	        t
          j        j        j        j	        t
          j        j        j        j        ht'          ||| fd	           t)          |d
           }t+          |          }d}t-          t+          |                    D ]G}||         }t/          |j        t
          j        j        j        j	                  r|dz  }|dk    r|} nH|d |         }d }t-          t+          |          dz
            D ]1}t1          ||dz            j        t2          j                  r|dz   } n2|J  |	|d |                   } |	||d                    }|||<   t/          |
j        t
          j        j        j        j	                  rd}|
}t                      }t'          |||           t)          |d           }d }t-          t+          |          dz
            D ]1}t1          ||dz            j        t2          j                  r|dz   } n2|J  |	|d |                   } |	||d                    }|||<   t+          !          dk    sJ |rt+          |          dk    sJ |rt+          |          dk    sJ | D ]a}
|
                                !v r!|
                                         }
|
|v r7|                    |
           |                    |
           bd }|                                D ]\  }}|{tA          tC          |"                                                    }|#                                D ]8}|$                    tK          |                                |                     9|}d }|                                D ]\  }}|{tA          tC          |"                                                    }|#                                D ]8}|$                    tK          |                                |                     9|}|S )Nr	   )r   Fc                    j                             |           }| D ]}||                                <   ||                                <   |S r,   )r   creater1   )snodes_to_group
group_noder-   r   snode_name_to_final_snodes      r!   _create_group_nodez:enforce_comm_ordering_for_fsdp.<locals>._create_group_node  s_    3::?KK
$ 	E 	EE:D%enn&6&677;E!*"5"5"7"78r"   )opc              3     K   | ]8}t          |         j        t          j        j        j        j                  V  9d S r,   )r   r   r   r   r   r   r   )rF   r^   r6   s     r!   r   z1enforce_comm_ordering_for_fsdp.<locals>.<genexpr>  sZ       
 
  "1%*EIN,M,U 
 
 
 
 
 
r"   Tc                v    t          | j                  p"t          | j                  o| j        j        v  S r,   )r   NopKernelSchedulerNodeExternKernelSchedulerNoder   op_overload)r^   allowed_opsr   s    r!   rb   z0enforce_comm_ordering_for_fsdp.<locals>.<lambda>  sA    q)"BCC "1i&IJJ >F.+=	' r"   )criteria_cbc                     t          |           S r,   r   ra   s    r!   rb   z0enforce_comm_ordering_for_fsdp.<locals>.<lambda>      JqMM r"   rc   r   c                     t          |           S r,   r  ra   s    r!   rb   z0enforce_comm_ordering_for_fsdp.<locals>.<lambda>H  r  r"   r   )&r   r   rr   r   r   r   r   r   r   r   r   rq   r   wait_tensorr   split_with_sizes_copyatenset_source_Tensorr   sortedrR   r   r   r   r   _WaitKernelr   r1   rO   rt   rs   r.   r/   rP   get_outputsr   r   )"r   r   r6   	new_orderrX   	ag_exists	rs_exists$ag_grouped_node_to_wait_grouped_node$rs_grouped_node_to_wait_grouped_noder  r-   ag_snodeag_related_snode_setag_related_snodesend_idx_of_current_ag_blockcopy_out_countr   	cur_snodewait_node_idxag_group_nodeag_wait_group_noders_snoders_related_snode_setrs_related_snodesrs_group_noders_wait_group_nodeprev_ag_waitwait_group_noder   oprev_rs_waitr	  r   r  s"     `                            @@@r!   r   r     sP   
 )+IIII+-(+-( "       mU mUJ595PX
 
 
 k	U 
 
 
 
 _	
 
 
 
 
k	U IHEHUU  ($"	   	*EM	*6>	4<	#1	K )$"        !'$*A*A! ! ! +..?*@*@'N301122  -a0	!NEIN$H$P  ( #a'N!A%%23/E & !22N3N2N O !M3011A566  /A6;R^LL $%EME !,,,../@-/PQQM "4!34Emnn4U!V!VBT0?? EJ	(@(HII 	UIH FIUU ($"	   !'$*A*A! ! !
 !M3011A566  /A6;R^LL $%EME !,,,../@-/PQQM "4!34Emnn4U!V!VBT0?())A---- =7881<<<< =7881<<<<   >>888-enn.>.>?EIe L*N*T*T*V*V ' '&#]%C%C%E%E F FGGL!--//  **AJJLL|DDD    ' L*N*T*T*V*V ' '&#]%C%C%E%E F FGGL!--//  **AJJLL|DDD    'r"   )r   r   r   r   )
r   r   r   r&   r   r&   r   r&   r   r   )r~   r   r   r   )r-   r   r   r   )r   r   r   r*   )r   r   r   r   r6   r   r   r   ),
__future__r   rS   r   ro   collectionsr   typingr   r   r   r   r   r   r
   r   dependenciesr   utilsr   r   r   r   r   r   r   _logginggetArtifactLoggerr?   r   r   r   r   r   r%   r   r   rM   r   r   r   r   r   r   rE   r"   r!   <module>r5     s)   # " " " " "   



 # # # # # # 1 1 1 1 1 1 1 1 1 1 1 1          ! ! ! ! ! !                  n..xCC -,,,,,,         &W W W Wt   @	 	 	 	U U U&  >   8l l l l^% % %m m m m m mr"   