
    קg9                      d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZmZmZmZmZmZmZmZmZmZmZmZmZ d dlZd dlZd dlZd dlmZm Z  d dl!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d d	l+m,Z, d
dl-m.Z.m/Z/m0Z0m1Z1m2Z2 d
dl3m4Z4 d
dl5m6Z6m7Z7m8Z8 d
dl9m:Z: d
dl0m;Z;m<Z<m=Z=m>Z> d
dl1m?Z?m@Z@mAZA d
dlBmCZC d
dlDmEZEmFZF d
dlGmHZH d
dlImJZJmKZKmLZLmMZMmNZNmOZOmPZPmQZQmRZRmSZSmTZT d
dlUmVZV  ejW        eX          ZYejZ        [                    eXd          Z\ejZ        [                    eXd          Z]ej^         G d d                      Z_ G d d          Z` G d d          ZadQd"Zb G d# d$          ZcdRd+Zdeje        jf        jg        eje        jf        jh        eje        jf        ji        eje        jf        jj        d,Zk G d- d.e`          Zl G d/ d0e`          Zm G d1 d2e`          ZndSd4ZodTd9Zp G d: d;e`          Zq G d< d=eq          Zr G d> d?e`          Zs	 dUdVdHZtej^         G dI dJ                      Zu ejv                    Zw G dK d6          Zx G dL dM          ZydWdPZzdS )X    )annotationsN)AnyCallableCounterDefaultDictDictGenericListOptionalSequenceSetTupleTypeVarUnion)countersdynamo_timed)get_metric_tableis_metric_table_enabled)free_unbacked_symbols)
OrderedSet)free_symbol_is_typeSymT)
has_triton   )commsconfigdependenciesirmetrics)
write_text)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime)Dep	MemoryDepStarDepWeakDep)ComputedBufferMultiOutputMultiOutputLayout)LoopBody)
green_textred_text)SimplifyIndexing)cache_on_selfcmpdevice_need_guardget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsIndentedBufferis_collectiveis_gpuis_waitsympy_product)Vfusionloop_orderingc                      e Zd ZU ded<   ded<   ded<    ej        e          Zded	<   ddZddZ	ddZ
ddZddZddZddZd dZdS )!SchedulerBuffer	Scheduler	schedulerz	ir.BuffernodeBaseSchedulerNodedefining_op)default_factoryList[NodeUser]usersreturnintc                4    t          | j        j                  S N)hashrB   nameselfs    U/var/www/html/ai-engine/env/lib/python3.11/site-packages/torch/_inductor/scheduler.py__hash__zSchedulerBuffer.__hash__P   s    DIN###    strc                   t                      }|                                 }|                    | dt          | j                  j                    |                    | d| j        j                    |                                 r9|                    | dt          |                                                       | 	                                r9|                    | dt          | 	                                                      t          | j                  dk    r |                    | d| j                    n}|                    | d           |                    d          5  | j        D ]}|                    | d           	 d d d            n# 1 swxY w Y   |                    d	           |                                S )
N: z
.layout = z.aliases = z.mutations = r   z	.users = z
.users = [,])r6   get_name	writelinetyperB   __name__layoutget_aliasespformatget_mutationslenrG   indentgetrawvalue)rO   resultrM   users       rP   	debug_strzSchedulerBuffer.debug_strS   s   !!}}D>>DOO$<>>???D>>DI,<>>??? 	PNN9I9I9K9K1L1LNNOOO 	TRR74;M;M;O;O3P3PRRSSStz??a;;tz;;<<<<000111q!! 1 1 J 1 1D$$ZZZ000011 1 1 1 1 1 1 1 1 1 1 1 1 1 1 S!!!!!###s   7#F''F+.F+c                4    | j                                         S rK   rB   rX   rN   s    rP   rX   zSchedulerBuffer.get_nameg       y!!###rR   Nonec                   | j         J | j                                         sd S | j                                         s| j                                         r+t          j        j                            | j                    d S t          t          j	        d          r| 
                                t          j	        j        v rht          j        j                            | j        j        t          j	        j        | 
                                                  j         | j                    d S t          j        j                            | j                    d S )Nargs)rB   should_allocateget_inputs_that_alias_outputget_mutation_namesr;   graphwrapper_codecodegen_allocationhasattrkernelrX   inplace_update_bufferscodegen_inplace_reuserA   name_to_bufrN   s    rP   allocatezSchedulerBuffer.allocatej   s   y$$$y((** 	F91133 	ty7S7S7U7U 	G 33DI>>>F AHf%%	?18#BBBG 66*H3DMMOOD		     G 33DI>>>>>rR   boolc                    | j         J t          | j         j        t          j                  rdS | j        D ]}t          |j         t                    r dS  dS )NFT)rB   
isinstancer\   r   
NoneLayoutrG   
OutputNode)rO   uses     rP   can_freezSchedulerBuffer.can_free   sd    y$$$di&66 	5: 	 	C#(J// uutrR   c                @   i }|D ]r}t          |j                  |v rC|                    |t          |j                                     |t          |j                  <   [||t          |j                  <   st          |                                          | _        d S rK   )idrB   mergelistvaluesrG   )rO   rG   rc   r}   s       rP   	set_userszSchedulerBuffer.set_users   s    &( 	+ 	+C#(||v%%'*yy381E'F'Fr#(||$$'*r#(||$$&--//**


rR   Sequence[str]c                F    | j         J | j                                         S rK   )rB   rm   rN   s    rP   r]   zSchedulerBuffer.get_aliases   s$    y$$$y55777rR   	List[str]c                F    | j         J | j                                         S rK   )rB   rn   rN   s    rP   r_   zSchedulerBuffer.get_mutations   $    y$$$y++---rR   NrH   rI   rH   rS   rH   ri   rH   rx   )rG   rF   rH   ri   rH   r   )rH   r   )r[   
__module____qualname____annotations__dataclassesfieldr   rG   rQ   re   rX   rw   r~   r   r]   r_    rR   rP   r?   r?   I   s         OOO""""-K-dCCCECCCC$ $ $ $$ $ $ $($ $ $ $? ? ? ?.   + + + +8 8 8 8. . . . . .rR   r?   c                  ~   e Zd ZU ded<   ded<   ded<   ded<   ded	<   dTdZdUdZdVdZdVdZdVdZdVdZ	dWdZ
dXdZdYdZdZd!Zd[d#Zd\d%Zd]d)ZdWd*Zd^d+Zd^d,ZdWd-ZdWd.Zd_d1ZdVd2ZdVd3Zd^d4Zd^d5Zd`d7Zdad9Zdbd<Zdcd>Zd[d?Zd[d@Z d[dAZ!d[dBZ"d[dCZ#dddFZ$d[dGZ%dWdHZ&	 dedfdMZ'dgdNZ(dhdPZ)didRZ*dSS )jrC   z7Tuple[torch.device, Tuple[Tuple[sympy.Expr, ...], ...]]groupdependencies.ReadWritesread_writeszOrderedSet[Dep]unmet_dependenciesrI   	min_order	max_orderrA   r@   rH   ri   c                    || _         d S rK   rA   )rO   rA   s     rP   __init__zBaseSchedulerNode.__init__   s    $-rR   rB   ir.Operationc                     | _         t                       _        t                       _        d _         fd|                                D              _        d  j        D              _        d S )NFc                >    g | ]}t          j        |           S ))rA   rB   rD   )r?   rA   ).0outputrO   s     rP   
<listcomp>z5BaseSchedulerNode._init_from_node.<locals>.<listcomp>   sE     /
 /
 /
  .   /
 /
 /
rR   c                8    i | ]}|                                 |S r   rX   r   bufs     rP   
<dictcomp>z5BaseSchedulerNode._init_from_node.<locals>.<dictcomp>   s/     <
 <
 <
$'CLLNNC<
 <
 <
rR   )rB   r   	ancestors
last_usagewrittenget_outputsoutputsoutputs_by_namerO   rB   s   ` rP   _init_from_nodez!BaseSchedulerNode._init_from_node   s    ,0	*4,, LL 	 /
 /
 /
 /
 **,,/
 /
 /
<
 <
+/<<
 <
 <
rR   rS   c                Z    t          |           j         d|                                 dS )Nz(name=)rZ   r[   rX   rN   s    rP   __repr__zBaseSchedulerNode.__repr__   s*    t**%AAT]]__AAAArR   c                   |                                  }t                      }|                    | dt          |           j         dt          t          | dd                    j         d| dt          | j        j                   d| dt          | j	                   d| d	t          | j        j
        | j	        z
             d| d
           |                                5  |                                 D ])}|                    |                                           *	 ddd           n# 1 swxY w Y   |                    d           	 |                    |                                            n,# t           $ r t"                              dd           Y nw xY w|                                                                S )#Longer form printout for trace logsrU   (rB   N)

.writes = 
.unmet_dependencies = .met_dependencies = z.outputs = [
        rW   Ignoring error in debug_str()Texc_info)rX   r6   splicerZ   r[   getattrr^   r   writesr   readsra   r   re   rY   debug_str_extra	Exceptionlogwarningrb   rstrip)rO   rM   r   outs       rP   re   zBaseSchedulerNode.debug_str   sG   }}

 	d	 #GD&$$?$?@@I  )011    %T%<==  	  #4#3#9D<S#STT	 
   	
 	
 	
 ZZ\\ 	, 	,'')) , ,

3==??++++,	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	c	HJJt++--.... 	H 	H 	HKK7$KGGGGG	H   '')))s$   %?D11D58D5'E: :&F#"F#c                    dS )N r   rN   s    rP   r   z!BaseSchedulerNode.debug_str_extra       rrR   c                   t          | j        dd           }d}t          |t          j        j        j                  r/d|                    |                                gdd          z   }net          |t          j        j        j	                  rAd|                    |
                                |                                gdd          z   }|  | S )Ndatar   z, F)shorten	multiline)r   rB   rz   torch	_inductorr   	Pointwise
str_helperget_size	Reductionget_reduction_sizeget_reduction_type)rO   
maybe_datadata_strs      rP   debug_str_shortz!BaseSchedulerNode.debug_str_short   s    TY55
j%/"4">?? 		j33$$&&'% 4   HH 
EO$6$@AA 	j33..00*2O2O2Q2QR 4   H
 """"rR   c                ^    t                               d| | j        | j        j                   d S )Nz(%s: unmet_dependencies = %s, writes = %s)r   infor   r   r   rN   s    rP   log_detailszBaseSchedulerNode.log_details   s7    6##		
 	
 	
 	
 	
rR   self_depr&   	other_depc                    d S rK   r   )rO   r   r   s      rP   reorder_loops_by_dep_pairz+BaseSchedulerNode.reorder_loops_by_dep_pair   s	     	rR   renamesDict[str, str]c                `    |                      | j                            |                     d S rK   )set_read_writesr   renamerO   r   s     rP   update_mutated_namesz&BaseSchedulerNode.update_mutated_names   s-    T-44W==>>>>>rR   depr%   c                `    |                      | j                            |                     d S rK   )r   r   	with_readrO   r   s     rP   add_fake_depzBaseSchedulerNode.add_fake_dep   s-    T-77<<=====rR   rx   c                X    t          d |                                 D                       S )Nc              3  f   K   | ],}|                                 p|                                V  -d S rK   )r]   r_   r   s     rP   	<genexpr>z=BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  sN       
 
9<COO4!2!2!4!4
 
 
 
 
 
rR   )anyr   rN   s    rP   has_aliasing_or_mutationz*BaseSchedulerNode.has_aliasing_or_mutation  s<     
 
@D@P@P@R@R
 
 
 
 
 	
rR   rwc                ^    || _         | j         j        | _        |                                  d S rK   )r   r   r   
prune_deps)rO   r   s     rP   r   z!BaseSchedulerNode.set_read_writes  s.    "&"2"8rR   future_used_buffersOrderedSet[str]mutation_real_namec                z    |                                  }t          fd|D                       }||z
  | _        d S )Nc                <    g | ]}                     ||          S r   )get)r   kr   s     rP   r   z4BaseSchedulerNode.set_last_usage.<locals>.<listcomp>  s*    "V"V"VA#5#9#9!Q#?#?"V"V"VrR   )used_or_aliased_buffer_namesr   r   )rO   r   r   used_bufferss     ` rP   set_last_usagez BaseSchedulerNode.set_last_usage  sI     88::!"V"V"V"V"V"V"VWW&)<<rR   c                B    | j         D ]}|                                 d S rK   )r   rw   )rO   r   s     rP   mark_runzBaseSchedulerNode.mark_run  s,    < 	 	CLLNNNN	 	rR   c                    t          d t          j        | j        j        | j        j                  D                       S )Nc              3  $   K   | ]}|j         V  d S rK   rM   r   r   s     rP   r   z6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr>  s8       
 
 H
 
 
 
 
 
rR   )r   	itertoolschainr   r   r   rN   s    rP   used_buffer_namesz#BaseSchedulerNode.used_buffer_names  sH     
 
 t'7'=t?O?VWW
 
 
 
 
 	
rR   c                   t                      }d t          j        | j        j        | j        j                  D             }t          |          dk    r|                                }|                    |           t          j
        j                            |          rEt          j
        j        |                                         D ]}||vr|                    |           t          |          dk    |S )Nc                    g | ]	}|j         
S r   r  r  s     rP   r   zBBaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<listcomp>  s*     
 
 
 H
 
 
rR   r   )r   r  r	  r   r   r   r`   popaddr;   ro   name_to_bufferr   rm   append)rO   
used_namesdepsr   aliass        rP   r   z.BaseSchedulerNode.used_or_aliased_buffer_names  s    &0ll

 
 t'7'=t?O?VWW
 
 
 $ii!mm((**CNN3w%))#.. +W3C8UUWW + +EJ..E*** $ii!mm rR   c                R     t           fd j        D                        _        d S )Nc              3  B   K   | ]}|j         j        j        v|V  d S rK   )rM   rA   available_buffer_namesr   r   rO   s     rP   r   z/BaseSchedulerNode.prune_deps.<locals>.<genexpr>-  sA       -
 -
xt~DDD DDDD-
 -
rR   r   r   rN   s   `rP   r   zBaseSchedulerNode.prune_deps,  sD    ", -
 -
 -
 -
.-
 -
 -
 #
 #
rR   c                     d fdt          fd j        j        D                       }                      j                            |                     d S )Nr   r%   rH   rx   c                    t          | t                    sdS j        j        | j                 j        }|                                t          j        j	        v S NF)
rz   r(   rA   rv   rM   rD   rX   r;   ro   removed_operations)r   oprO   s     rP   should_prunez7BaseSchedulerNode.prune_weak_deps.<locals>.should_prune5  sF    c7++ u+CH5AB;;==AG$>>>rR   c              3  2   K   | ]} |          |V  d S rK   r   r   r   r  s     rP   r   z4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>;  sF       
 
\\#5F5F

 
 
 
 
 
rR   r   r%   rH   rx   )r   r   r   r   remove_reads)rO   	to_remover  s   ` @rP   prune_weak_depsz!BaseSchedulerNode.prune_weak_deps3  s    	? 	? 	? 	? 	? 	?  
 
 
 
+1
 
 
 
 
	 	T-::9EEFFFFFrR   name_to_fused_nodeDict[str, BaseSchedulerNode]c                <    t          | || j        j                   d S rK   )_prune_redundant_depsrA   rv   )rO   r%  s     rP   prune_redundant_depsz&BaseSchedulerNode.prune_redundant_deps@  s"     	d$68RSSSSSrR   c                F    | j         J | j                                         S rK   )rB   get_operation_namerN   s    rP   rX   zBaseSchedulerNode.get_nameE  r   rR   c                *    |                                  S rK   r   rN   s    rP   get_first_namez BaseSchedulerNode.get_first_nameI  s    }}rR   c                X    t          d |                                 D                       S )Nc              3  >   K   | ]}|                                 V  d S rK   r   r   rB   s     rP   r   z8BaseSchedulerNode.get_operation_names.<locals>.<genexpr>M  s*      GGd$--//GGGGGGrR   )r   	get_nodesrN   s    rP   get_operation_namesz%BaseSchedulerNode.get_operation_namesL  s)    GGdnn6F6FGGGGGGrR   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S rK   r   r   r   s     rP   r   z5BaseSchedulerNode.get_buffer_names.<locals>.<genexpr>P  s*      AAS#,,..AAAAAArR   )r   r   rN   s    rP   get_buffer_namesz"BaseSchedulerNode.get_buffer_namesO  s!    AADLAAAAAArR   Sequence[BaseSchedulerNode]c                    | gS rK   r   rN   s    rP   r1  zBaseSchedulerNode.get_nodesR  s	    vrR   Sequence[SchedulerBuffer]c                    | j         S rK   )r   rN   s    rP   r   zBaseSchedulerNode.get_outputsU  s
    |rR   buf_namer?   c                    | j         |         S rK   )r   )rO   r;  s     rP   
get_outputzBaseSchedulerNode.get_outputX  s    #H--rR   torch.devicec                F    | j         J | j                                         S rK   )rB   
get_devicerN   s    rP   r@  zBaseSchedulerNode.get_device[  s$    y$$$y##%%%rR   c                    dS r  r   rN   s    rP   is_reductionzBaseSchedulerNode.is_reduction_      urR   c                    dS r  r   rN   s    rP   is_split_scanzBaseSchedulerNode.is_split_scanb  rC  rR   c                    dS r  r   rN   s    rP   is_templatezBaseSchedulerNode.is_templatee  rC  rR   c                    dS r  r   rN   s    rP   	is_externzBaseSchedulerNode.is_externh  rC  rR   c                    dS r  r   rN   s    rP   
is_foreachzBaseSchedulerNode.is_foreachk  rC  rR   read_depdependencies.Depc                    dS r  r   rO   rL  s     rP   can_inplacezBaseSchedulerNode.can_inplacen  rC  rR   c                    dS r  r   rN   s    rP   has_side_effectsz"BaseSchedulerNode.has_side_effectsq  rC  rR   c                    ddl m} t           t          f          rt          j        rt          j                             	                                t          j                  rht          t          j        t          j        j        j        j                  rt%          t          j        dd          t'          t          j        d          sdS t)           j        j        d           }                                 D ]?}|j        }|J |                                rM|                                s9|                                s%|                                t          j        j        v rp|D ]} j        j                             |j!                  }|rt          j        j"        #                    |           r{t          |j$        tJ                    s`|j&        J  fd|j&        D             }tO          |          dk    r0|d	         j(        r"|d	         j         u r|j        
t          |j        )                                tT          j+        tT          j,        f          st          |j$        j        tT          j-        tT          j.        f          r+tO          |j                                                  d	k    sq ||j                   ||j                  k    rNt          j        j/        0                    |                                |                                           t          t          j        t          j        j        j        j                  rlt          j        j1        2                    |                                           t          j        j1        2                    |                                            j3        4                    |                                           |                                t          j        j5        |                                <    n͐AdS )
z~
        Decide if there should be inplace updates for the node
        and record the decision in the active kernel.
        r   )buffer_reuse_key	mutationsNrk   c                    | j         S rK   r  xs    rP   <lambda>z9BaseSchedulerNode.decide_inplace_update.<locals>.<lambda>  s    QV rR   keyc                ^    g | ])}|j                                         j        j        v'|*S r   )rB   rX   rA   completed_operationsr   rX  rO   s     rP   r   z;BaseSchedulerNode.decide_inplace_update.<locals>.<listcomp>  s?     & & &6??,,DN4WWW WWWrR   r   )6codegen.wrapperrT  rz   SchedulerNoder   inplace_buffersr;   ro   has_featurer@  r!   INPLACE_BUFFERSrs   r   r   codegensimd
SIMDKernelr   rr   sortedr   r   r   rB   rl   rm   rn   rX   removed_buffersrA   rv   r   rM   rp   	can_reuserD   NopKernelSchedulerNoderG   r`   rP  
get_layoutr   r+   MutationLayoutSHOULDREMOVEFallbackKernelr*   rk   make_inplacerU  r  r   discardrt   )rO   rT  ordered_readsr   buf_noderead	input_bufremaining_usess   `       rP   decide_inplace_updatez'BaseSchedulerNode.decide_inplace_updatet  s   
 	655555 tm-..	&	 ##DOO$5$5~7UVV	
 qx)@)E)PQQ	 18[$77C &)) D
 Ft/5;K;KLLL##%% A	 A	CxH''',,..88:: ..00 <<>>QW%<<<% 6 67;~7Q7U7UI8 8	 2,66y$GG2 'y'<>TUU2
 %?666& & & &!*& & &N N++q00*1-9 1*1-2d::%N6 *%N5577 " 4 " =! ! 7 ' ) 5 :!#!2BN C  7 !$IN$O$O$Q$Q R RUV V V,,Y^<<++CH556 6 2293E3E3G3GXXX%Heo&=&B&M  C H.2293E3E3G3GHHHH.223<<>>BBB //	0B0B0D0DEEE &..00 7LLNN CA	 A	rR   Tbufferr6   	only_oncec                B   t           j        sd S |r	| j        rd S | j        J | j                                        }g }|D ],}|j        dk    r|                    d           |                    d           d|j         d|j         }d|j        v r|d|j        d          z   }|                    |           d|j        v r|j        d          }|	                    d	          d
         }|                    d|
                    dd          
                    dd          
                    dd          z              |                    d           |                    d           .t          |          dk    rd S |                    |           d| _        d S )Nr   r   z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|{z{{}z}}r   \z#pragma CMT END ORIGINr   T)r   comment_originr   rB   get_originsr  r  targetmetasplitreplacer`   
writelines)	rO   rv  rw  origins	out_linesoop_info_strr{  stack_trace_last_lines	            rP   codegen_originating_infoz*BaseSchedulerNode.codegen_originating_info  s    $ 	F 	 	Fy$$$)''))	 	% 	%AtxR   2333:::::K16!!),Iqvh7G,I,II[)))&&!"!68(3(9(9#(>(>r(B%  "+33C>>WS$''WT4(()     !9:::  $$$y>>QF 	)$$$rR   c                   	
 t           t                    rdS t           t                    rt           j        t                    rdS ddt           t
                    rY t                                           d                   t                                           d                   z            nt          d          t          j
        t                    } j        j         j        j        z  D ]"}||j                                     |           #t#          d	  j        j        D                       }t#          d
  j        j        D                       }d fdt           t$                    r&t#           fd|D                       }||z
  }||z
  }d}||z  D ]}t'          fd||         D                       	|t(          j        j        v rt(          j        j        |         }n,|t(          j        j        v rt(          j        j        |         }nzd	
 fd
| 
|          z  }|S )aM  
        Counting the number of bytes accessed for a kernel is
        surprisingly tricky. In particular, there is a differentiation
        between 'theoretical' memory accesses and practical memory
        accesses. For example, a layernorm kernel may actually access an
        input 3 times, but in theory, it only needs to access its input
        once (and may be optimized to do so through say, persistent
        reductions)

        Another example is that even though a buffer is passed in, we may
        not access the entire buffer. This may occur if we are accessing
        a slice of the buffer. Another tricky case is for indirect
        indexing, where the amount of bytes accessed depends on the
        values of the input.

        What this function aims to compute is the memory accesses for
        worst-case inputs, best-case optimization. What this means is
        that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

        1. Numel in ranges multiplied by number of deps the buffer has
        2. The buffer size
        r   s
sympy.ExprrH   rI   c                N    t           j        j                            | d          S )Nr   fallback)r;   ro   sizevars	size_hint)r  s    rP   try_size_hintzEBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.try_size_hint  s     7#--a!-<<<rR   r       eAc              3  $   K   | ]}|j         V  d S rK   r  r  s     rP   r   zABaseSchedulerNode.get_read_write_buffers_sizes.<locals>.<genexpr>&  s$      FF38FFFFFFrR   c              3  $   K   | ]}|j         V  d S rK   r  r  s     rP   r   zABaseSchedulerNode.get_read_write_buffers_sizes.<locals>.<genexpr>'  s$      HHCHHHHHHHrR   r   rS   snodesr7  rx   c                    j         j        |          j        }t          d |D                       }t	          |t          |          z
            dk    S )Nc              3  $   K   | ]}|j         V  d S rK   )rB   r   rd   s     rP   r   zZBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.is_materialized.<locals>.<genexpr>+  s$      !>!>$)!>!>!>!>!>!>rR   r   )rA   rv   rG   r   r`   )r   r  rG   buf_usesrO   s       rP   is_materializedzGBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.is_materialized)  sQ    N.s39E!!>!>!>!>!>>>Hx*V"4"445599rR   c              3  >   K   | ]} |j                   |V  d S rK   r  )r   r   r  rO   s     rP   r   zABaseSchedulerNode.get_read_write_buffers_sizes.<locals>.<genexpr>/  sJ       ) )__S$+-N-N)) ) ) ) ) )rR   c              3     K   | ]}V  d S rK   r   )r   r   
node_numels     rP   r   zABaseSchedulerNode.get_read_write_buffers_sizes.<locals>.<genexpr>7  s#      $R$RCZ$R$R$R$R$R$RrR   (Optional[Union[ir.Buffer, ir.TensorBox]]c                   | sdS t          | j        t                    rj        j        |                                          j        }d}|D ]o}t          |j        t                    sJ t          |j        j        t                    r0|j        
                                D ]}| |j                  z  }m dS |S t          | j        t          j                  r-t          fd|                                 D                       S  	t          |                                                     }t#          |                                           t'          |          z  S )Nr   c              3  h   K   | ],} t           j                            |                    V  -d S rK   )r;   ro   
get_buffer)r   mut_nameget_buf_bytess     rP   r   zXBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.get_buf_bytes.<locals>.<genexpr>T  sQ        $ &ag&8&8&B&BCC     rR   )rz   r\   r+   rA   rv   rX   rG   rB   rC   r*   r   r   r{   sumrn   r:   r   r4   	get_dtypemin)
r   rG   totrd   	sched_buf	buf_elemsbuf_accessed_elemsr  rO   r  s
         rP   r  zEBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.get_buf_bytes@  sx    1 cj*;<<  N6s||~~FLEC % 	% 	%)$)5FGGGGG%dinkBB %-1Y-B-B-D-D E E	 #}}Y^'D'D DE $%11J
BM:: 	    (+(>(>(@(@     
 !.mCLLNN.K.K L LI)#--//::S*I> >  rR   )r  r  rH   rI   )r   rS   r  r7  rH   rx   )r   r  rH   rI   )rz   rj  ExternKernelSchedulerNoderB   r*   r`  r:   
get_rangesrI   collectionsdefaultdictr   r   r   r   rM   r  r   FusedSchedulerNoder  r;   ro   r  graph_inputs)rO   buf_accessesr   r   r   rh  
node_bytesr;  r   r  r  r  r  r  s   `        @@@@@rP   get_read_write_buffers_sizesz.BaseSchedulerNode.get_read_write_buffers_sizes  s   . d233 	1d566 	:I{<
 <
 	 1	= 	= 	= 	= dM** 	"&doo//233 1 1! 4556 JJ
 SJ".t44#)D,<,CC 	/ 	/C"))#....FFt/?/EFFFFFHH0@0GHHHHH	: 	: 	: 	: 	: 	:
 d.// 	,( ) ) ) ) )%) ) )  O o-FO+E
 (	- (	-H!$$R$R$R$R<;Q$R$R$R!R!R17111g,X6QW111g*84        < --,,,JJrR   floatc           	     P   |                                  d                                         d         }|j                                        }|j                                        }|j        t          |j        j                  sdS t          | j                  rht          | j        t          j                  sJ 	 t          | j                  S # t          $ r%}t                              |           Y d}~dS d}~ww xY wt!          | j                  rdS 	 t#                      }t%          |          dz  }n# t&          $ r Y dS w xY wt          | t(                    rt          | j        t          j                  sJ dt          | j                              t,                              t1          | j        dd          d          }|ddlm} ddlm}	 t;          d	 | j        j        D                       rdS  |            5 }
 |	d
          5 }t?          j         | j        j!                  5  t?          j"        |
          5  ddl
m# fd| j        j        D             }| j        j$        } |j%        |g|R i | j        j&         d}|'                                }| (                                }||z  |z  dz  }||z  }tS          ||          cddd           cddd           cddd           cddd           S # 1 swxY w Y   ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   nFt          | tT                    st          | j        tV                    r| (                                |z  S dS )zB
        Returns estimated op runtime in nanoseconds (ns)
        r   Nl    J)type(self.node)=python_kernel_namer   )FakeTensorMode)FlopCounterModec              3  z   K   | ]6}t          t          |                                                    d k    V  7dS r   N)r`   r   	get_numelr   ns     rP   r   z:BaseSchedulerNode.get_estimated_runtime.<locals>.<genexpr>  sT         -akkmm<<==A     rR   F)displayr   )ir_node_to_tensorc                *    g | ]} |d           S )F)guard_shaper   )r   inputr  s     rP   r   z;BaseSchedulerNode.get_estimated_runtime.<locals>.<listcomp>  s9     # # #! *)%UCCC# # #rR   g      ?r  ),r1  r   rB   rk  r  devicer8   rZ   r7   rz   r   IRNoder$   
ValueErrorr   r   r9   r5   r3   r   r  ExternKernelkernel_name_to_opr   r   torch._subclasses.fake_tensorr  torch.utils.flop_counterr  r   inputsr;   set_current_nodefx_nodeset_fake_moder  	__class__process_kernelkwargsget_total_flopsr  maxr  r)   )rO   r   r\   dtypeegpu_memory_bandwidth	gpu_flopsr  r  r  	fake_modeflop_counter_modefake_inputsclsfactorcounted_flopscounted_bytescompute_timetransfer_timer  s                      @rP   get_estimated_runtimez'BaseSchedulerNode.get_estimated_runtimeb  sX    nnq!--//2$$&&""$$=$VFM4F-G-G$1 ## 	di333337	BBB    qqqqq	 TY 	
 1	#4#6#6 )%0069II 	 	 	11	 d566 1	Ndi99PP;Pd49oo;P;PPPP"&&	#7<<d B
 ~HHHHHHDDDDDD  !Y-      1#^%% <OO!5 5 5 <&(:I%) )< < ? 	< < 655555# # # #%)Y%5# # #K )-C&C&rLKLLL49;KLLL !F$5$E$E$G$GM$($E$E$G$GM$*]$:Y$F##ML$14H$HM |];;1< < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < <4 011 	NZI~6
 6
 	N 44669MMMqs   >C 
DC<<D D< <
E
	E
ML=:L&BL	L&*L=6MLL&LL&L=&L**L=-L*.L=1M=M	MM	MMMOptional[ir.TemplateBuffer]c                    d S rK   r   rN   s    rP   get_template_nodez#BaseSchedulerNode.get_template_node      trR   N)rA   r@   rH   ri   )rB   r   rH   ri   r   r   r   r&   r   r&   rH   ri   r   r   rH   ri   )r   r%   rH   ri   r   )r   r   rH   ri   r   r   r   r   rH   ri   rH   r   r%  r&  rH   ri   rH   r7  )rH   r9  )r;  rS   rH   r?   rH   r>  rL  rM  rH   rx   )T)rv  r6   rw  rx   rH   ri   r   )rH   r  rH   r  )+r[   r   r   r   r   r   r   re   r   r   r   r   r   r   r   r   r  r  r
  r   r   r$  r)  rX   r-  r2  r6  r1  r   r=  r@  rB  rE  rG  rI  rK  rP  rR  ru  r  r  r  r  r   rR   rP   rC   rC      s        BBBB(((('''' NNNNNN. . . .
 
 
 
&B B B B* * * *2   # # # #
 
 
 
   
? ? ? ?> > > >
 
 
 

   
= = = =   
 
 
 
    
 
 
 
G G G GT T T T
. . . .   H H H HB B B B      . . . .& & & &                     W W W Wt 9=* * * * *Xg g g gRW W W Wr     rR   rC   c                  D    e Zd ZU g dZded<   ded<   ddZddZddZdS )	WhyNoFuse)node1node2reasonrk   rS   r  zTuple[Any, ...]rk   r  rC   r  rH   ri   c                "    || _         || _        d S rK   )r  r  rO   r  r  s      rP   r   zWhyNoFuse.__init__  s    



rR   r   c                V    || _         || _        t                              |            d S rK   )r  rk   
fusion_logdebug)rO   r  rk   s      rP   __call__zWhyNoFuse.__call__  s*    	rR   c                    d| j                                          d| j                                         d| j        | j        z  z   S )Nzcannot fuse z with rU   )r  rX   r  r  rk   rN   s    rP   __str__zWhyNoFuse.__str__  sJ    Tdj1133TT4:;N;N;P;PTTTK$)#
 	
rR   N)r  rC   r  rC   rH   ri   )r  rS   rk   r   rH   ri   r   )r[   r   r   	__slots__r   r   r  r  r   rR   rP   r  r    st          544IKKK      

 
 
 
 
 
rR   r  objr   rH   rS   c                    t          | t                    rt          | t                    } t	          j        | d          }d|v rdt          j        |d           S |S )NrZ     )ra   r       )rz   r   rg  rS   pprintr^   textwrapra   )r  rc   s     rP   r^   r^     sb    #z"" #Sc"""^C***Fv~~6HOFG44666MrR   c                  2    e Zd ZddZddZdd	ZddZeZdS )r|   r   r'   rH   ri   c                0    t          |g          | _        d S rK   r  r   s     rP   r   zOutputNode.__init__  s    ",cU"3"3rR   rx   c                    dS r  r   rN   s    rP   rB  zOutputNode.is_reduction  rC  rR   r   c                    dS )Nr   r   rN   s    rP   rm   z'OutputNode.get_inputs_that_alias_output  r   rR   rS   c                    dS )NOUTPUTr   rN   s    rP   rX   zOutputNode.get_name  s    xrR   N)r   r'   rH   ri   r   r   r   )r[   r   r   r   rB  rm   rX   r   r   rR   rP   r|   r|     se        4 4 4 4          HHHrR   r|   rB   r%  r&  rv   Dict[str, SchedulerBuffer]ri   c                    t          j                     j        D ]c}t          |t                    sL|j                 j        }|                                                                         xx         dz  cc<   dd	 fdt          fd j        D                       }|r> j        |z
   _         	                     j
                            |                     dS dS )
am  
    Prunes weakdeps intended for mutation ordering
    on an upstream fused node if after fusion there is another dependency
    on the fused upstream node, making the weakdep redundant

    In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
    be incrementally removed, enabling other fusions, ensuring they are fused in order.
    r   r   r%   rH   rx   c                    t          | t                    rX| j                 j                                        }|                                                  dk    }|         k    }|p|S dS )Nr   F)rz   r(   rM   rD   rX   )r   op_nameis_redundantis_self_deprv   name_to_dep_countr%  rB   s       rP   r  z+_prune_redundant_deps.<locals>.should_prune  ss    c7## 		!#(+7@@BBG,-?-H-Q-Q-S-STWXXL -W5=K.;.5rR   c              3  2   K   | ]} |          |V  d S rK   r   r   s     rP   r   z(_prune_redundant_deps.<locals>.<genexpr>  sF        ,,s2C2C     rR   Nr!  )r  r   r   rz   r(   rM   rD   rX   r   r   r   r"  )rB   r%  rv   r   r  deps_to_pruner  r  s   ```   @@rP   r(  r(    s>    '2&9&;&;& Q Q#w'' 	QSX&2B0?HHJJKKKqPKKK
 
 
 
 
 
 
 
 
     .    M  K"&"9M"IT-::=IIJJJJJK KrR   )zextern_kernels.convolutionzextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmc                  8     e Zd Zd fdZdd	ZddZddZ xZS )r  rA   r@   rB   r   rH   ri   c                    t                                          |           |                     |           |                     |                                           d S rK   superr   r   r   get_read_writesrO   rA   rB   r  s      rP   r   z"ExternKernelSchedulerNode.__init__"  U    ###T"""T113344444rR   rS   c                \    |                                   dt          | j        dd            S )Nz.node.kernel = r  )rX   r   rB   rN   s    rP   r   z)ExternKernelSchedulerNode.debug_str_extra'  s.    --//bb'$)EY[_2`2`bbbrR   rx   c                    dS NTr   rN   s    rP   rI  z#ExternKernelSchedulerNode.is_extern*  r  rR   c                p    | j         J t          | j         d          o| j                                         S )NrR  )rB   rr   rR  rN   s    rP   rR  z*ExternKernelSchedulerNode.has_side_effects-  s6    y$$$ty"455V$):T:T:V:VVrR   rA   r@   rB   r   rH   ri   r   r   )r[   r   r   r   r   rI  rR  __classcell__r  s   @rP   r  r  !  s        5 5 5 5 5 5
c c c c   W W W W W W W WrR   r  c                        e Zd Zd fdZ xZS )	rj  rA   r@   rB   r   rH   ri   c                    t                                          |           |                     |           |                     |                                           d S rK   r  r  s      rP   r   zNopKernelSchedulerNode.__init__3  r  rR   r#  )r[   r   r   r   r$  r%  s   @rP   rj  rj  2  s=        5 5 5 5 5 5 5 5 5 5rR   rj  c                       e Zd Zd/ fdZ	 	 d0d1dZ	 	 d0d1dZd2dZd3dZd4dZd5dZ	d6dZ
d7dZd7dZd7dZd8d!Zd9d$Zd:d&Zd;d'Zed<d)            Zd=d,Zed>d.            Z xZS )?r`  rA   r@   rB   +Union[ir.ComputedBuffer, ir.TemplateBuffer]rH   ri   c                    t                                          |           |                     |           |                                  d S rK   )r  r   r   _compute_attrsr  s      rP   r   zSchedulerNode.__init__:  sI    
 	###T"""rR   Nextra_indexing_constraints*Optional[Tuple[Dict[Any, Any], List[Any]]]recompute_sizes_body_funcOptional[Callable[..., Any]]c                   t          | j        t          j        t          j        f          sJ | j                            ||          \  | _        | _        | j        	                    | j        
                                          j        }| j        
                                 || j                  f| _        t          j         p!| j        
                                j        dk    }t          | j        t          j                  r0|                     | j                            |                     d S |                     t%          j        | j        g| j        R d|i           d S )Nr,  r.  cuda	normalizer4  )rz   rB   r   r)   TemplateBuffersimplify_and_reorder_sizes_bodyrA   get_backendr@  group_fnr   r   loop_ordering_after_fusionrZ   r   extract_read_writesr   )rO   r,  r.  r:  should_normalizes        rP   r+  zSchedulerNode._compute_attrsC  sv   
 $)b&79J%KLLLLL"&)"@"@'A&? #A #
 #
TZ
 >--di.B.B.D.DEENi**,,hht{.C.CD

 11 5y##%%*f4 	
 di!233 		  	--8H-II       0J!%  8H     rR   c                4    |                      ||           d S )Nr1  )r+  )rO   r,  r.  s      rP   recompute_size_and_bodyz%SchedulerNode.recompute_size_and_bodyc  s1    
 	'A&? 	 	
 	
 	
 	
 	
rR   r4  rx   c                    d | j         j        D             }|                     t          j        | j        g| j        R d|i                    |                     d S )Nc                J    h | ] }t          |t          t          f          |!S r   )rz   r(   r'   r  s     rP   	<setcomp>z5SchedulerNode.refresh_dependencies.<locals>.<setcomp>p  s<     
 
 
ZgwEW5X5X

 
 
rR   r4  )r   r   r   r   r<  r8  r7  r   )rO   r4  	fake_depss      rP   refresh_dependenciesz"SchedulerNode.refresh_dependenciesm  s    
 
+1
 
 
	 	,
![  4= i	""	
 	
 	
 	
 	
rR   	new_orderSequence[int]c                    | j                             |          | _         | j         j        | _        |                     d           d S )NFr3  )r8  reorder_iter_loopssizesr7  rD  )rO   rE  s     rP   apply_new_loop_orderz"SchedulerNode.apply_new_loop_order|  sI    Z22
 

 j&!!E!22222rR   r   r&   r   c                   d }| j         d         }t          |          |j        cxk    r|j        k    rn n|                    |          }|rZt          xj        dz  c_        t                              d|                                 |           | 	                    |           d S t                              d|                                            d S )Nr   r   z"Reorder loops for %s with order %szEDon't reordering %s because we can not decide the suitable loop order)
r7  r`   num_varsdecide_loop_order_to_matchr   num_loop_reorderingloop_ordering_logr  rX   rJ  )rO   r   r   rE  
self_sizess        rP   r   z'SchedulerNode.reorder_loops_by_dep_pair  s     	[^
z??h/EEEE93EEEEEE ;;IFFI 
	''1,''##4dmmooy   %%i00000##W    rR   rS   c                H   |                                  }| d| j        d          | d| j        d          | d| j         g}| j                                        D ]i}t          |t                    sR|j        }t          j	        
                    |          }|                    | dt          |j                              jt          | j        t                    rX|                    d| d           |                    t!          j        | j                                        d	                     | j        J t)          j        | j                                                  r"|                    t1          |                      d
                    |          S )Nz.group.device = r   z.group.iteration = r   z	.sizes = z
_layout = zclass z_loop_body:r  r   )rX   r   r7  r   reads_and_writesrz   r(   rM   r;   ro   r  r  r^   r\   r8  r,   r  ra   re   rB   r   	is_tritonr@  extenddebug_triton_codejoin)rO   rM   linesr   r;  r   s         rP   r   zSchedulerNode.debug_str_extra  s   }}44TZ]4477
177++dk++

 #4466 	K 	KCc7++ K8g((22IIGCJ4G4GIIJJJdj(++ 	JLL3$333444LL)=)=)?)?HHIIIy$$$<	,,..// 	2LL*400111yyrR   Sequence[Sequence[sympy.Expr]]c                    | j         S rK   )r7  rN   s    rP   r  zSchedulerNode.get_ranges  
    {rR   c                    t          | j        t          j        t          j        f          sJ dt          | j                              t          | j                                                  S Nr  )rz   rB   r   r)   r5  rZ   rx   r   rN   s    rP   rB  zSchedulerNode.is_reduction  sk    I)2+<=
 
 	! 	! d49oo  	! 	! 	! DI0022333rR   c                   t          | j        t          j        t          j        f          sJ dt          | j                              t          | j        t          j                  o#t          | j        j        t          j                  S r\  )rz   rB   r   r)   r5  rZ   r   	SplitScanrN   s    rP   rE  zSchedulerNode.is_split_scan  s    I)2+<=
 
 	! 	! d49oo  	! 	! 	! $)R%677 
JINBL=
 =
 	
rR   c                @    t          | j        t          j                  S rK   rz   rB   r   r5  rN   s    rP   rG  zSchedulerNode.is_template  s    $)R%6777rR   r  c                R    t          | j        t          j                  r| j        nd S rK   r`  rN   s    rP   r  zSchedulerNode.get_template_node  s"    &ty"2CDDNtyy$NrR   
index_varsSequence[sympy.Expr]c                    |                                   |                                  |                     |           d S rK   )ru  r  rd  )rO   rb  s     rP   runzSchedulerNode.run  s9    ""$$$Z     rR   Dict[sympy.Expr, sympy.Expr]c                R   | j         }t          t          t          |                    t          t          t          |                    k    sJ t	          t          t          j                            |          t          j                            |                              }|S rK   )	r7  r  mapr`   dictzipr  r	  from_iterable)rO   rb  rI  
var_rangess       rP   ranges_from_index_varsz$SchedulerNode.ranges_from_index_vars  s     3sE??##s3sJ+?+?'@'@@@@@--j99--e44 
 

 rR   c                   |                      |          }	 t          j        t          t          j                    |                    5  t          j                            |           5   | j        |  d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S # t          $ r" t          
                    d| j                    w xY w)NzError in codegen for %s)rm  r;   set_ops_handlerr/   get_ops_handlerrs   r  r8  r   r   fatalrB   )rO   rb  rl  s      rP   rd  zSchedulerNode.codegen  sZ   00<<
	" !2!4!4jAA  ( (x((..( ( 
J''( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( (  	 	 	II/;;;	sS   3B& 
 B*B5BB	BB		BB& BB&  B!B& &,Cr   c                    | j         \  }}t          j        | j        |t	          j        d          gt          |          z  g          S )zH
        Get the memory dependencies in the non-reduction axis.
        r   )hidden_args)r7  r   r<  r8  sympyIntegerr`   )rO   rI  reduction_sizess      rP   pointwise_read_writesz#SchedulerNode.pointwise_read_writes  sP    
 "&/JU]1-=-=,>_AUAU,U+V
 
 
 	
rR   rL  rM  c                   |                                  rdS t          d |                                 D                       rdS t          | j        j                  dk    rt          |t          j                  rzt          t          | j        j                            }t          |t          j                  sJ dt          |                      |j        |j        k    o|j        |j        k    S dS )NFc              3  >   K   | ]}|                                 V  d S rK   )r]   r5  s     rP   r   z,SchedulerNode.can_inplace.<locals>.<genexpr>  s,      ??Ss  ??????rR   r   ztype(write_dep)=)rG  r   r   r`   r   r   rz   r   r&   nextiterrZ   indexsize)rO   rL  	write_deps      rP   rP  zSchedulerNode.can_inplace  s     	5??D,<,<,>,>????? 	5t&''1,,l,2
 2
, T$"2"9::;;Ii)?@@WWBWT)__BWBWWWW>Y_4X).9XXurR   r   c                   t                      }t          | j        t                    r| j                                        D ]}|j        dk    r|j        dk    rd|j        v r|j        d         dk    s)t          |j	                  dk    ra|j	        d         dk    rP|
                    d|j        v r|j        d         n&t          |j	                  dk    r|j	        d	         nd
           |S )Ncall_methodstoremode
atomic_add   r  rM      r   r   )r   rz   r8  r,   r1  r  r  r  r`   rk   r  )rO   buffers_store_as_atomic_addrB   s      rP   _get_atomic_add_buffersz%SchedulerNode._get_atomic_add_buffers  s    7A||#dj(++ 	
,,..  G},,w..4;..4;v3F,3V3V	NNa//DIaLL4P4P 033!T[00 F++.1$)nn.A.Adillr  
 +*rR   )rA   r@   rB   r)  rH   ri   NN)r,  r-  r.  r/  rH   ri   )r4  rx   rH   ri   )rE  rF  rH   ri   r  r   )rH   rX  r   r  )rb  rc  rH   ri   )rb  rX  rH   rf  )rb  rX  rH   ri   )rH   r   r  r  )r[   r   r   r   r+  r?  rD  rJ  r   r   r  rB  rE  rG  r  re  rm  rd  r0   rw  rP  r  r$  r%  s   @rP   r`  r`  9  s             RVBF    D RVBF
 
 
 
 

 
 
 
3 3 3 3   (       ,   4 4 4 4
 
 
 
8 8 8 8O O O O! ! ! !
   	 	 	 	 
 
 
 ]
    + + + ]+ + + + +rR   r`  group_snodec                     j         }                     t          j                            d |D                                  t           fdt          j        d |D              D                        j        j        z
   _	        d S )Nc                    g | ]	}|j         
S r   )r   r   rX  s     rP   r   z3refresh_group_node_dependencies.<locals>.<listcomp>  s    +J+J+JaAM+J+J+JrR   c              3  R   K   | ]!}|j                                         v|V  "d S rK   rM   r6  )r   r   r  s     rP   r   z2refresh_group_node_dependencies.<locals>.<genexpr>  sH       
 
x{;;==== ====
 
rR   c                    g | ]	}|j         
S r   )r   r  s     rP   r   z3refresh_group_node_dependencies.<locals>.<listcomp>  s    )O)O)O1!*>)O)O)OrR   )
r  r   r   
ReadWrites
merge_listr   unionr   r   r   )r  r  s   ` rP   refresh_group_node_dependenciesr    s    F**+J+J6+J+J+JKK  
 	 
 
 
 
!')O)O)O)O)OP
 
 
 	
 	

 
!
(	) """rR   rA   r@   r  List[BaseSchedulerNode]c                   t          | t          t          f          sJ || _        || _        d | _        t          j        d |D              | _        t          |            t          d | j        D                       | _        t          d | j        D                       | _        d |                                 D             | _        d S )Nc                *    g | ]}|j         	|j         S rK   )r   r  s     rP   r   z#init_group_node.<locals>.<listcomp>(  s!    	A	A	A!)@!+)@)@)@rR   c              3  $   K   | ]}|j         V  d S rK   r   r  s     rP   r   z"init_group_node.<locals>.<genexpr>-  $      HHHHHHHHrR   c              3  $   K   | ]}|j         V  d S rK   )r   r  s     rP   r   z"init_group_node.<locals>.<genexpr>.  r  rR   c                8    i | ]}|                                 |S r   r   r   s     rP   r   z#init_group_node.<locals>.<dictcomp>/  s/     # # # ## # #rR   )rz   r  GroupedSchedulerNoder  rA   rB   r   r  r   r  r  r   r  r   r   r   )r  rA   r  s      rP   init_group_noder    s    
 k$68L#MNNNNNK%KK&,	A	Av	A	A	AK $K000HH[5GHHHHHKHH[5GHHHHHK# #'2'>'>'@'@# # #KrR   c                      e Zd ZU dZded<   ed5d            Zd6dZd7 fdZe	d8d            Z
d8dZe	d9d            Zd:dZd8dZd8dZd; fdZe	d9d            Ze	d9d            Zd<d!Zd8d"Ze	d=d$            Ze	d=d%            Ze	d=d&            Ze	d>d(            Zd?d*Ze	d=d+            Zd@d-ZdAd0ZdBd3Zd8d4Z xZS )Cr  z
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be fused together. The way it does this is by maintaining
    its unmet dependencies as the union of its constituent nodes.
    r  r  r  rC   r  rH   c                H   |j         |j         u sJ t          |t          t          f          sJ t          |t          t          f          sJ t	          t          j        |                                |                                                    } | |j         |          S rK   )rA   rz   r`  r  r   r  r	  r1  )r  r  r  nodess       rP   fusezFusedSchedulerNode.fuse=  s     %/1111%-1C!DEEEEE%-1C!DEEEEEY_U__%6%68I8IJJKKs5?E***rR   r   r&   r   ri   c                   |                                  rd S d }| j        D ]V}t          |t                    sJ |.||j        d         k    rt
                              d            d S |j        d         }Wd }|J t          |          |j        cxk    r|j        k    rn n|	                    |          }|s/t
                              d| 
                                           d S t          xj        dz  c_        t
                              d| 
                                |           | j        D ].}t          |t                    sJ |                    |           /t          |            d S )Nr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr   z-Reorder loops for fused node %s with order %s)rG  r  rz   r`  r7  rO  r  r`   rL  rM  rX   r   rN  rJ  r  )rO   r   r   rP  snoderE  s         rP   r   z,FusedSchedulerNode.reorder_loops_by_dep_pairG  s     	F
[ 	) 	)Ee]33333%*Q*G*G!''G   aJJ	%%%z??h/EEEE93EEEEEE ;;IFFI 	##a   F##q(##;T]]__i	
 	
 	
 [ 	2 	2Ee]33333&&y1111'-----rR   rA   r@   c                    t                                          |           t          | ||           g | _        t	          |d           j        | _        d S )Nc                D    t          |                                           S rK   )rI   rB  rW  s    rP   rY  z-FusedSchedulerNode.__init__.<locals>.<lambda>p  s    s1>>3C3C/D/D rR   rZ  )r  r   r  rG   r  r   rO   rA   r  r  s      rP   r   zFusedSchedulerNode.__init__l  sS    ###i000%'
%D%DEEEK


rR   rS   c                J    d                     d | j        D                       S )N_c                6    g | ]}|                                 S r   r   r  s     rP   r   z/FusedSchedulerNode.get_name.<locals>.<listcomp>t       ;;;!;;;rR   rV  r  rN   s    rP   rX   zFusedSchedulerNode.get_namer  %    xx;;t{;;;<<<rR   c                @    | j         d                                         S Nr   r  rX   rN   s    rP   r-  z!FusedSchedulerNode.get_first_namev      {1~&&(((rR   r   c                <    t          j        d | j        D              S )Nc                6    g | ]}|                                 S r   r6  r  s     rP   r   z7FusedSchedulerNode.get_buffer_names.<locals>.<listcomp>{  $    !L!L!L1!"4"4"6"6!L!L!LrR   r   r  r  rN   s    rP   r6  z#FusedSchedulerNode.get_buffer_namesy  !    !L!L!L!L!LMMrR   List[SchedulerBuffer]c                l    g }| j         D ])}|                    |                                           *|S rK   r  rT  r   rO   rc   rB   s      rP   r   zFusedSchedulerNode.get_outputs}  >    (*K 	. 	.DMM$**,,----rR   c                t     fdt           j                  D             } j        d         j        }|J|                                }t	          j        |          r"|                    t                                t          j	        d
                    |                                          d          S )Nc                r    g | ]3\  }}                                  d | d|                                 4S )z.snodes[z] =
)rX   re   )r   irB   rO   s      rP   r   z6FusedSchedulerNode.debug_str_extra.<locals>.<listcomp>  sU     
 
 
4 }}BBBB0@0@BB
 
 
rR   r   r   r  )	enumerater  rB   r@  r   rS  rT  rU  r  ra   rV  r   )rO   rW  rB   r  s   `   rP   r   z"FusedSchedulerNode.debug_str_extra  s    
 
 
 
$T[11
 
 
 {1~"__&&F|F## 6.t44555tyy//6688&AAArR   c                2    d | j         D             }|  d| S )Nc                6    g | ]}|                                 S r   )r   r0  s     rP   r   z6FusedSchedulerNode.debug_str_short.<locals>.<listcomp>  s$    EEEd**,,EEErR   z
, snodes: r  )rO   
snodes_strs     rP   r   z"FusedSchedulerNode.debug_str_short  s+    EEEEE
..*...rR   r   r   r   c                    t                                          ||           t                      }t          | j                  D ]2}|                    ||           |                    |j                   3d S rK   )r  r  r   reversedr  updater   )rO   r   r   rB   r  s       rP   r  z!FusedSchedulerNode.set_last_usage  s    
 	24FGGG 0:||T[)) 	8 	8D 35GHHH&&t7777	8 	8rR   c                <    t          j        d | j        D              S )Nc                6    g | ]}|                                 S r   )r
  r  s     rP   r   z8FusedSchedulerNode.used_buffer_names.<locals>.<listcomp>  s$    !M!M!MA!"5"5"7"7!M!M!MrR   r  rN   s    rP   r
  z$FusedSchedulerNode.used_buffer_names  s!    !M!M!M!M!MNNrR   c                <    t          j        d | j        D              S )Nc                6    g | ]}|                                 S r   )r   r  s     rP   r   zCFusedSchedulerNode.used_or_aliased_buffer_names.<locals>.<listcomp>  s$    DDD1a,,..DDDrR   r  rN   s    rP   r   z/FusedSchedulerNode.used_or_aliased_buffer_names  s%    DDDDD
 	
rR   r7  c                    | j         S rK   r  rN   s    rP   r1  zFusedSchedulerNode.get_nodes  rZ  rR   c                Z    t          |           j         d|                                  dS )Nz(nodes=r   r   rN   s    rP   r   zFusedSchedulerNode.__repr__  s*    t**%@@dmmoo@@@@rR   rx   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S rK   )rB  r  s     rP   r   z2FusedSchedulerNode.is_reduction.<locals>.<genexpr>  s,      991>>##999999rR   r   r  rN   s    rP   rB  zFusedSchedulerNode.is_reduction  s!    99T[999999rR   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S rK   )rE  r  s     rP   r   z3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>  s,      ::1??$$::::::rR   r  rN   s    rP   rE  z FusedSchedulerNode.is_split_scan  s!    ::dk::::::rR   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S rK   rG  r  s     rP   r   z1FusedSchedulerNode.is_template.<locals>.<genexpr>  s*      88q1==??888888rR   r  rN   s    rP   rG  zFusedSchedulerNode.is_template  s!    88DK888888rR   r  c                n    | j         D ],}|                                r|                                c S -d S rK   )r  rG  r  r   s     rP   r  z$FusedSchedulerNode.get_template_node  sI    K 	0 	0D!! 0--/////0trR   r>  c                    | j         d         S r  )r   rN   s    rP   r@  zFusedSchedulerNode.get_device  s    z!}rR   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S rK   )r   r  s     rP   r   z>FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s.      EEA1--//EEEEEErR   r  rN   s    rP   r   z+FusedSchedulerNode.has_aliasing_or_mutation  s!    EEEEEEEErR   r   c                    t           rK   NotImplementedErrorr   s     rP   r   z'FusedSchedulerNode.update_mutated_names      !!rR   rM   r%   c                    t           rK   r  rO   rM   s     rP   r   zFusedSchedulerNode.add_fake_dep  r  rR   rL  rM  c                    t           rK   r  rO  s     rP   rP  zFusedSchedulerNode.can_inplace  r  rR   c                   |                                  }d                    d | j        D                       }t                      }|                    | dt          |           j         d| d| dt          | j        j	                   d| dt          | j
                   d| d	t          | j        j        | j
        z
             d| d
           |                                5  |                                 D ])}|                    |                                           *	 ddd           n# 1 swxY w Y   |                    d           	 |                    |                                            n,# t"          $ r t$                              dd           Y nw xY w|                                                                S )r   rV   c              3  >   K   | ]}t          |          j        V  d S rK   )rZ   r[   r  s     rP   r   z/FusedSchedulerNode.debug_str.<locals>.<genexpr>  s+      FFQQ 0FFFFFFrR   rU   r   r   r   r   r   r   z.outputs = [
            NrW   r   Tr   )rX   rV  r  r6   r   rZ   r[   r^   r   r   r   r   ra   r   re   rY   r   r   r   r   rb   r   )rO   rM   node_typestrr   r   s        rP   re   zFusedSchedulerNode.debug_str  sR   }}xxFF$+FFFFF

 	d	 +  )011    %T%<==  	  #4#3#9D<S#STT	 
   	
 	
 	
 ZZ\\ 	, 	,'')) , ,

3==??++++,	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	c	HJJt++--.... 	H 	H 	HKK7$KGGGGG	H   '')))s$   (?D44D8;D8'E= =&F&%F&r  rC   r  rC   rH   r  r  rA   r@   r  r  rH   ri   r   r  rH   r  r  r  r   r  r  r  )rM   r%   rH   ri   r  ) r[   r   r   __doc__r   classmethodr  r   r   r0   rX   r-  r6  r   r   r   r  r
  r   r1  r   rB  rE  rG  r  r@  r   r   r   rP  re   r$  r%  s   @rP   r  r  4  s          $###+ + + [+#. #. #. #.JL L L L L L = = = ]=) ) ) ) N N N ]N   B B B B/ / / /8 8 8 8 8 8 O O O ]O 
 
 
 ]

   A A A A : : : ]: ; ; ; ]; 9 9 9 ]9    ]    F F F ]F
" " " "" " " "" " " "* * * * * * * *rR   r  c                      e Zd ZU dZd,dZd-dZed.d
            Zed/d            Z	 	 	 d0d1 fdZ	ed2d            Z
ed3d            ZeZded<   ed4d            Zed3d             Zd5d!Zd5d"Zd6d#Zd7d$Zd8d&Zd9d(Zd:d+Z xZS );ForeachKernelSchedulerNodez
    This is a schedular node that consists of a set of scheduler nodes that
    has no data dependencies among them and can be executed in parallel.
    producerrC   rH   Optional[BaseSchedulerNode]c                    |                                 D ]>}|                                | j        v r!| j        |                                         c S ?d S rK   )r   rX   read_to_node)rO   r  r   s      rP   get_consumer_subnode_forz3ForeachKernelSchedulerNode.get_consumer_subnode_for  s]     '')) 	9 	9C||~~!222(8888 3 trR   consumerc                t   t                      }|j        j        D ]m}|j        | j        j        vr| j        j        |j                 j                                        }|| j        v r |	                    | j        |                    nt          |          dk    rt          t          |                    S d S Nr   )setr   r   rM   rA   rv   rD   rX   name_to_noder  r`   rz  r{  )rO   r  	producersrd	node_names        rP   get_producer_subnode_forz3ForeachKernelSchedulerNode.get_producer_subnode_for   s     EE	&, 	< 	<Bwdn888227;GPPRRID---d/	:;;; y>>QY(((4rR   rx   c                
   t          |          }                                r|                                rt          j        t                    t          j        t          |          }t          j                  t          |j                  k    }|s |d           |o2t          fdt          j        |j                  D                       S |                                rz	                                r |d           dS t          j        t          |          }|
                              }||j                            |          S  |d           dS                                 rz|	                                r |d           dS t          j        t                                        |          }|j                            ||          S  |d           dS t          d          )	Nzforeach do not have same lengthc              3  T   K   | ]"\  }}j                             ||          V  #d S rK   )rA   can_fuse)r   lrr  s      rP   r   z6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>  sN       ) )Aq "++Aq11) ) ) ) ) )rR   zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r  rK  typingcastr  r`   r  allrj  rB  r  rA   r  r  AssertionError)r  r  r  whyforeach_matchconsumer_subnodeproducer_subnodes    `     rP   r  z#ForeachKernelSchedulerNode.can_fuse  s<   (++   &	X%8%8%:%: &	{#=xHHH{#=xHHH00C4H4HHM  75666  S ) ) ) )AA) ) ) & &    "" 	$$&& n   u{#=xHHH'@@JJ+)228=MNNNCGHHH5  "" 	$$&& n   u{#=xHHH'@@JJ+)223CXNNNCGHHH5f
 
 	
rR   c                   |                                 s|                                 sJ |                                 r)t          j        t          |          }|j        }|j        }n(t          j        t          |          }|j        }|j        }d }d }|                                 rn|                                 rZt          j        t          |          }t          j        t          |          }d t          |j        |j                  D             }nO|                                 rt          j        t          |          }|                    |          }g }|}d }|j        D ]N}	|	|u r3t          
                    |	|          }
|
}|                    |
           9|                    |	           On|                                 rt          j        t          |          }|                    |          }g }|}d }|j        D ]N}	|	|u r3t          
                    ||	          }
|
}|                    |
           9|                    |	           Ont          d           | |j        |||||          S )Nc                J    g | ] \  }}t                               ||          !S r   )r  r  )r   r  r  s      rP   r   z3ForeachKernelSchedulerNode.fuse.<locals>.<listcomp>U  s<       Aq #''1--  rR   zTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)rK  r  r  r  r  r	  rj  r  r  r  r  r  r  r  rA   )r  r  r  r  r	  r  r  fused_nodesr  rB   new_noder  s               rP   r  zForeachKernelSchedulerNode.fuseB  s    ""$$=(;(;(=(====   	7{#=xHHH(0(J%&6OO{#=xHHH(0(J%&6O   &	X%8%8%:%: &	{#=xHHH{#=xHHH AA  KK   "" 	{#=xHHH'@@JJK"KK  - -+++166tXFFH"*K&&x0000&&t,,,,-   "" 	{#=xHHH'@@JJK"KK  - -+++166xFFH"*K&&x0000&&t,,,,- !f   s&?##+
 
 
 	
rR   NFrA   r@   r  r  r  r  r  r	  ri   c                    i  _         i  _        ||ht                                          ||           |D ]A}|j        j        D ]}| j         |j        <   |                                D ]}	| j        |	<   Bn| _        | _	        d  _
        g  _                             t          j                            |j        |j        g                     t!           fdt!          j        |j        |j                  D                        j        j        z
   _        t)          |j        |j        g           _        t-          |j        |j        g           _        |                                rt3          |t4                    sJ ||}}
nt3          |t4                    sJ ||}}
|
j         _         j                            |j                   |
j         _        |                                D ]}	| j        |	<   | _        |d                                         t?          j         d          fff _!        t!                       _"        | _#        d S )Nc              3  R   K   | ]!}|j                                         v|V  "d S rK   r  r  s     rP   r   z6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>  sL         xt'<'<'>'>>>	  ?>>> rR   r   combo_kernel)$r  r  r  r   r   r   rM   r2  rA   r  rB   rG   r   r   r  r  r   r  r   r   r  r   r  r   rK  rz   r  r   r  r  r@  rt  Exprr   r  r	  )rO   rA   r  r  r  r  r	  rB   rr  rM   foreach_node
other_noder  s   `           rP   r   z#ForeachKernelSchedulerNode.__init__  s    +"5GGY/// 3 3 ,2 8 8D37D%di00 4466 3 3D.2D%d++3	3 'DN DKDI)+DJ  '22 ,k.EF        )/#68V        ")* # !+"79N!OPPDN +"79N!OPPDN%%'' D!+/IJJJJJ+6j!+/IJJJJJ+6j)3DNN!!*"6777 , 9D"6688 5 5*4!$'')B&Qi**,,
>0J0J/L.NO
2<,,.rR   r  c                   d |D             }|r3t                               dt          |          d |D                        d |D             }d |D             }|r(t                               dt          |                     d |D             }d |D             r)t                               d	t                    h           fd
|D             }|S )Nc                <    g | ]}t          |t                    |S r   )rz   r  r  s     rP   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>  s(    OOOj4M&N&NO!OOOrR   z/ComboKernels: %d external nodes are filtered %sc                N    g | ]"}|j         	|j                                         #S rK   rB   r  r0  s     rP   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>  s-    UUUTty?T&&((?T?T?TrR   c                J    g | ] }t          |t          t          f          |!S r   )rz   rj  r  r  s     rP   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>  s?     
 
 
a"8:S!TUU

 
 
rR   c                <    g | ]}t          |t                    |S r   rz   r  r  s     rP   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>  s8     
 
 
A7Q)R)R

 
 
rR   z+ComboKernels: %d foreach nodes are filteredc                <    g | ]}t          |t                    |S r   r  r  s     rP   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>  s8     
 
 
Z;U-V-V

 
 
rR   c                :    g | ]}|                                 |S r   r  r  s     rP   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>  s%    GGGq}}G!GGGrR   z,ComboKernels: %d template nodes are filteredc                    g | ]}|v|	S r   r   )r   rX  template_nodess     rP   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>  s#    OOOq7N7N!7N7N7NrR   )r   r  r`   )r  r  externfiltered_nodesforeach_nodesr  s        @rP   combinable_nodesz+ForeachKernelSchedulerNode.combinable_nodes  s<    POUOOO 	IIAFUUVUUU  

 

 
 


 
%
 
 
  	YIICSEWEWXXX
 
%
 
 
 HG^GGG 	II>^ATAT@U   POOO^OOOrR   List[List[BaseSchedulerNode]]c           
         |                                  }g }d|D ]@|                    fdt          dt                              D                        A|S )zS
        Returns a list of lists of nodes that are to be grouped together.
           c                *    g | ]}||z            S r   r   )r   r  max_num_nodesr  s     rP   r   zUForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels.<locals>.<listcomp>  s8        !a-//0  rR   r   )_topological_sort_nodesrT  ranger`   )rA   sorted_nodesgrouped_nodesr%  r  s      @@rP   &_default_group_nodes_for_combo_kernelszAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels  s     !88::! 	 	E      "1c%jj-@@      rR   4Callable[[Scheduler], List[List[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelscustom_group_algorithmc                    | t           _        d S rK   r  r,  )r-  s    rP   %set_group_algorithm_for_combo_kernelsz@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernels  s    
 # 	#DDDrR   c                6    t                               |           S rK   r/  r   s    rP   group_nodes_for_combo_kernelsz8ForeachKernelSchedulerNode.group_nodes_for_combo_kernels  s     *KKIVVVrR   c                    t           rK   r  rN   s    rP   r  z#ForeachKernelSchedulerNode.mark_run  r  rR   c                   t          | j        t          j                  sJ dt	          | j                               | j                                         | j                                                               d S r\  )rz   rB   r   r)   rZ   get_store_functionmake_loaderrN   s    rP   rd  z"ForeachKernelSchedulerNode.codegen  sq    $)R%677NN9NDOO9N9NNNN&	$$&&'>ty'<'<'>'>'@'@AAAAArR   c                    dS r!  r   rN   s    rP   rK  z%ForeachKernelSchedulerNode.is_foreach  r  rR   c                *    t          | j                  S )zeReturns a list of nodes which comprise the combo kernel.
        These nodes may be vertically fused.)r   r  rN   s    rP   get_subkernel_nodesz.ForeachKernelSchedulerNode.get_subkernel_nodes  s     DK   rR   r7  c                x    t          t          j                            d | j        D                                 S )zqReturns all nodes contained in this kernel, unpacking fused nodes
        into their constituent scheduler nodes.c              3  >   K   | ]}|                                 V  d S rK   )r1  r  s     rP   r   z7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>  s*      1U1UA!++--1U1U1U1U1U1UrR   )r   r  r	  rk  r  rN   s    rP   r1  z$ForeachKernelSchedulerNode.get_nodes  s3     IO111U1U1U1U1UUUVVVrR   rS   c                @    | j         d                                         S r  )r  r-  rN   s    rP   r-  z)ForeachKernelSchedulerNode.get_first_name  s    {1~,,...rR   r%  r&  c                z    t          | || j        j                   | j        D ]}|                    |           d S rK   )r(  rA   rv   r  r)  )rO   r%  rB   s      rP   r)  z/ForeachKernelSchedulerNode.prune_redundant_deps"  sO     	d$68RSSSK 	: 	:D%%&89999	: 	:rR   )r  rC   rH   r  )r  rC   rH   r  r  rC   r  rC   rH   rx   )r  rC   r  rC   rH   r  )NNF)rA   r@   r  r  r  rx   r  r  r  r  r	  rx   rH   ri   r  r  rH   r  )rA   r@   rH   r!  )r-  r+  rH   ri   r   r   rH   r  r  r   r  )r[   r   r   r  r  r  r  r  r  r   r   staticmethodr*  r,  r   r0  r2  r  rd  rK  r9  r1  r-  r)  r$  r%  s   @rP   r  r    s         
      & ,
 ,
 ,
 [,
\ >
 >
 >
 [>
J 4837 %@/ @/ @/ @/ @/ @/ @/D    [>    \* 	/ & / / / / 
 
 
 \
 W W W \W
" " " "B B B B   ! ! ! !
W W W W
/ / / /: : : : : : : :rR   r  c                       e Zd ZU dZded<   edd            Zd fd	Zdd
ZddZ	e
d d            Zd dZe
d!d            Zd"dZd#dZed$d            Z xZS )%r  aC  
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be *grouped* together (it does not allow another node to be scheduled
    in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
    The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
    Fusion will still happen among the nodes within each GroupedSchedulerNode.
    At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
    r  r  rH   c                    |d         j         t          fd|D                       sJ  | |          }|D ]}|j        |                                <   |j        |                                <   |S )Nr   c              3  *   K   | ]}|j         u V  d S rK   r   )r   rB   rA   s     rP   r   z.GroupedSchedulerNode.create.<locals>.<genexpr>:  s*      BB44>Y.BBBBBBrR   )rA   r  r%  rX   )r  r  grouped_snoder  rA   s       @rP   createzGroupedSchedulerNode.create7  s    1I'	BBBB6BBBBBBBBIv.. 	K 	KE=JI()9)9::AN	$]%;%;%=%=>rR   rA   r@   ri   c                l    t                                          |           t          | ||           d S rK   )r  r   r  r  s      rP   r   zGroupedSchedulerNode.__init__A  s3    ###i00000rR   c                    | j         D ]#}|| j        j        |                                <   $| j        j        |                                 = | j                            | j                   S )z
        Do fusion among nodes within this GroupedSchedulerNode,
        and then unpack this GroupedSchedulerNode into regular nodes.
        )r  rA   r%  rX   
fuse_nodes)rO   r  s     rP   unpackzGroupedSchedulerNode.unpackE  s`    
 [ 	H 	HEBGDN-enn.>.>??N-dmmoo>~((555rR   fake_depr%   c                    |                      | j                            |                     | j                            |           d S rK   )r   r   r   r   r  )rO   rK  s     rP   r   z!GroupedSchedulerNode.add_fake_depO  sD    T-77AABBB##H-----rR   rS   c                J    d                     d | j        D                       S )Nr  c                6    g | ]}|                                 S r   r   r  s     rP   r   z1GroupedSchedulerNode.get_name.<locals>.<listcomp>U  r  rR   r  rN   s    rP   rX   zGroupedSchedulerNode.get_nameS  r  rR   c                @    | j         d                                         S r  r  rN   s    rP   r-  z#GroupedSchedulerNode.get_first_nameW  r  rR   r   c                <    t          j        d | j        D              S )Nc                6    g | ]}|                                 S r   r  r  s     rP   r   z9GroupedSchedulerNode.get_buffer_names.<locals>.<listcomp>\  r  rR   r  rN   s    rP   r6  z%GroupedSchedulerNode.get_buffer_namesZ  r  rR   r  c                l    g }| j         D ])}|                    |                                           *|S rK   r  r  s      rP   r   z GroupedSchedulerNode.get_outputs^  r  rR   r7  c                    | j         S rK   r  rN   s    rP   r1  zGroupedSchedulerNode.get_nodesd  rZ  rR   r  rC   r  rx   c                    dS r  r   )r  r  r  s      rP   r  zGroupedSchedulerNode.can_fuseg  s	     urR   )r  r  rH   r  r  r@  )rK  r%   rH   ri   r   r  r  r  r>  )r[   r   r   r  r   r  rF  r   rJ  r   r0   rX   r-  r6  r   r1  r  r$  r%  s   @rP   r  r  +  s=          $###   [1 1 1 1 1 16 6 6 6. . . . = = = ]=) ) ) ) N N N ]N          [    rR   r  r   stride_lengthsList[List[int]]rI  List[sympy.Expr]priority_idxTuple[int, ...]	List[int]c           
     :    t           j        d	 fd            }t          t          t	          t           d                                                 }t          |          dk    r fd|D              t          j        r|                    |           |S )
z
    A heuristic to decide loop iteration orders.  This has not been well
    tuned and may be something we should autotune.
    arI   brH   c                              dk    s         dk    r$t                    dk             dk              S  fdD             }fdD             }t          d t          ||          D                       }t          d t          ||          D                       }||k    rdS ||k    rdS t                     S )Nr   c                :    g | ]}t          |                   S r   abs)r   slr\  s     rP   r   z6pick_loop_order.<locals>.index_cmp.<locals>.<listcomp>  #    <<<rBqE

<<<rR   c                :    g | ]}t          |                   S r   r`  )r   rb  r]  s     rP   r   z6pick_loop_order.<locals>.index_cmp.<locals>.<listcomp>  rc  rR   c              3  4   K   | ]\  }}|d k    p||k     V  dS r  r   r   sl_asl_bs      rP   r   z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>  D       
 
)3tDAI$
 
 
 
 
 
rR   c              3  4   K   | ]\  }}|d k    p||k     V  dS r  r   rf  s      rP   r   z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>  ri  rR   r}  )r1   r  rj  )r\  r]  stride_len_astride_len_ba_firstb_firstrI  rU  s   ``    rP   	index_cmpz"pick_loop_order.<locals>.index_cmpw  s   8q==E!HMMuQx1}eAh!m444 =<<<^<<<<<<<^<<<  
 
7:<7V7V
 
 
 
 
  
 
7:<7V7V
 
 
 
 
 W2W1 1ayyrR   r   c                     g | ]
}|         S r   r   )r   pirU  s     rP   r   z#pick_loop_order.<locals>.<listcomp>  s    DDD.,DDDrR   rZ  )r\  rI   r]  rI   rH   rI   )		functools
cmp_to_keyr   r  r'  r`   r   pick_loop_orderssort)rU  rI  rX  ro  orders   ``   rP   pick_loop_orderrw  m  s           4 %N1$5 6 6778899E
<1DDDD|DDD "

y
!!!LrR   c                  V    e Zd ZU ded<   dZded<   dZded<   dd	ZddZddZddZ	dS )NodeUser$Union[BaseSchedulerNode, OutputNode]rB   Frx   rP  is_weakrH   rI   c                h    t          | j                                        | j        | j        f          S rK   )rL   rB   rX   rP  r{  rN   s    rP   rQ   zNodeUser.__hash__  s*    TY''))4+;T\JKKKrR   otherobjectc                    t          |t                    oI|                                 |                                k    o| j        |j        k    o| j        |j        k    S rK   )rz   ry  rX   rP  r{  rO   r}  s     rP   __eq__zNodeUser.__eq__  sY    uh'' .5>>#3#33. E$55. -		
rR   rS   c                4    | j                                         S rK   rg   rN   s    rP   rX   zNodeUser.get_name  rh   rR   c                ~    | j         |j         u sJ t          | j         | j        o|j        | j        o|j                  S rK   )rB   ry  rP  r{  r  s     rP   r   zNodeUser.merge  sH    yEJ&&&&I2!2L*U]
 
 	
rR   Nr   )r}  r~  rH   rx   r   )r}  ry  rH   ry  )
r[   r   r   r   rP  r{  rQ   r  rX   r   r   rR   rP   ry  ry    s         ....K GL L L L
 
 
 
$ $ $ $
 
 
 
 
 
rR   ry  c                      e Zd ZU ded<   dddZdd fdZded
ZdfdZdgdZdhdZ	dfdZ
dfdZdfdZdidZdjdZdkdZdfdZdfdZdidZdfdZdld"Zdfd#Zdmd&Zdnd)Zdid*Zdodpd.Zdqd/Zdrd1Zdnd2Zdnd3Zdsd6Zdnd7Zdnd8Z dnd9Z!dtd<Z"dudAZ#dvdCZ$dwdFZ%dxdGZ&dydIZ'dzdKZ(dfdLZ)dfdMZ*dfdNZ+d{dPZ,d{dQZ-dfdRZ.d|dUZ/d}dXZ0d}dYZ1d~dZZ2dfd[Z3dfd\Z4dd^Z5dd_Z6ddbZ7dfdcZ8 xZ9S )r@   zDict[Dep, int]_Scheduler__dep_size_hint_cacher  List[ir.Operation]rH   ri   c                    t          d          5  |                     |           d d d            d S # 1 swxY w Y   d S )NzScheduler.__init__)r   _initrO   r  s     rP   r   zScheduler.__init__  s    .// 	 	JJu	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   377c                :	    t                                                       i  _         t          j        _        i  _        t          t                     _	        t                       _        t          g t          j        j                                        t          j        j                                        t          j        j                                                   _         fd|D              _                                           j                            t          j        j                                                    j        D ]}|                                 d  j        D              _        d  j        D              _         j                                         _        i  _        i  _                                                                j                   _                                          d  j        D              _                                          t>          j         r*tC          j"         j         j         j                   _        tF          xj$        tK           j                  z  c_$        t          j&        '                     j                   tK           j                   _(         )                                                       j                   _        t                       _*        t>          j+        t?          j+         j                   _         ,                     j                   _         -                                  .                                 t>          j         rtC          j/         j                   _        t>          j0        r 1                    d             2                                  3                                 t          j&        4                     j                   t          j&        5                     j                    6                                 d  _7        t                       _8        i  _9        tu          d          ;                     fd           d S )Nc                :    g | ]}                     |          S r   )create_scheduler_noder   r  rO   s     rP   r   z#Scheduler._init.<locals>.<listcomp>  s'    CCCd0033CCCrR   c                8    i | ]}|                                 |S r   r   r  s     rP   r   z#Scheduler._init.<locals>.<dictcomp>  s/     ;
 ;
 ;
 !AJJLL!;
 ;
 ;
rR   c                f    i | ].}|                                 D ]}|                                |/S r   )r   rX   )r   rB   r   s      rP   r   z#Scheduler._init.<locals>.<dictcomp>  sO     8
 8
 8
$($BRBRBTBT8
 8
;>CLLNNC8
 8
 8
 8
rR   c                8    i | ]}|                                 |S r   r   r  s     rP   r   z#Scheduler._init.<locals>.<dictcomp>  s"    "G"G"Gq1::<<"G"G"GrR   )num_ck_nodesgraph_statsc                 H     j          j        t           j                  dS )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesr`   r  rN   s   rP   rY  z!Scheduler._init.<locals>.<lambda>  s&     3+/+>*-dj//  rR   )<r  r   r  r;   ro   rA   backendsrz  _post_grad_graph_counterr  r   r]  r  keys	constantstorchbind_constantsr  r  update_zero_dim_cpu_tensorr  r   r  rv   copyr%  r   mutation_renamescompute_dependenciestopological_sort_scheduledead_node_eliminationcompute_ancestorsr    reorder_for_compute_comm_overlapr   decide_global_ordering_of_commsr   ir_nodes_pre_fusionr`   r  ir_pre_fusionr  create_foreach_nodeslogged_slow_fusion_pre_fusion_custom_passrI  merge_loopsfinalize_multi_template_buffers$reorder_compute_and_comm_for_overlapcombo_kernelscreate_combo_kernel_nodesprocess_grouped_nodescompute_last_usageir_post_fusiongraph_diagramdebug_draw_graphcurrent_devicebuffer_names_to_freeorigin_to_indexr   add_row)rO   r  rB   r  s   `  rP   r  zScheduler._init  s   %'" <>"&'?"@"@5?\\!&0%**,,"'')) ,1133'
 '
# DCCCUCCC
'')))#**17+<+A+A+C+CDDDJ 	 	DOO;
 ;
%)Z;
 ;
 ;
8
 8
,0J8
 8
 8
 AE@Q@V@V@X@X 35 13!!###33DJ??
""$$$"G"GDJ"G"G"G   2 	>
 ' DJ 	##s4:6##	dj)))!$*oo!!###33DJ??
?I||)57
CCDJ__TZ00
,,...2 	PCDJOODJ 	>***===""$$$!!!	tz***	dj))) 7;5?\\! :<''//   	
 	
 	
 	
 	
rR   r>  c                6    | j         x}r|S t          d          )NzNo current device)r  RuntimeErrorrO   r  s     rP   get_current_device_or_throwz%Scheduler.get_current_device_or_throw&  s&    ((6 	4M2333rR   c                    t           j                            dd          dk    rddlm}  || j        d           dS dS )z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)print_graph)osenvironr   r  r  r  )rO   r  s     rP   r  zScheduler.debug_draw_graph,  sV    :>>:DAASHH++++++L666666 IHrR   labelrS   c                    t                               t          j                  r9t                               d|           | j        D ]}|                                 d S d S )Nz%s:)r   isEnabledForloggingINFOr   r  r   )rO   r  rB   s      rP   debug_print_nodeszScheduler.debug_print_nodes3  sh    GL)) 	#HHUE"""
 # #  """"	# 	## #rR   rB   r   rC   c                d   |                                 
J d            |                                rt          | |          S t          |t          j        t          j        f          rt          | |          S t          |t          j                  rt          | |          S t          |          )Nz2All nodes passed to scheduling must have an origin)r  is_no_oprj  rz   r   r)   r5  r`  r  r  r  r   s     rP   r  zScheduler.create_scheduler_node9  s    **? +**==?? 	,)$555r0"2CDEE 	, t,,,bo.. 	,,T4888%d+++rR   c                    t                      g } j                                        t          j        j                                        D ]~} fd|D             }|s                    |            fd|D             }t          j	        dk    }t           |d|          }|                    |           |D ]}| j        |<   fd j        D             t          |          z    _        d S )Nc                \    g | ](}|v t          j        |         t                    &|)S r   )rz   r  rj  )r   rM   kept_node_namesrO   s     rP   r   z2Scheduler.create_foreach_nodes.<locals>.<listcomp>L  sI       ?**"4#4T#:<RSS + ***rR   c                *    g | ]}j         |         S r   )r  )r   rM   rO   s     rP   r   z2Scheduler.create_foreach_nodes.<locals>.<listcomp>W  s!    @@@$d'-@@@rR   r   Fr  r	  c                @    g | ]}|                                 v|S r   r   )r   rB   removed_node_namess     rP   r   z2Scheduler.create_foreach_nodes.<locals>.<listcomp>f  s3     
 
 
4==??BT+T+TD+T+T+TrR   )r   r%  r  r;   ro   listsr   r  r   combo_kernels_autotuner  r  r  r   )	rO   fe_nodesnamesr  r	  fe_noderM   r  r  s	   `      @@rP   r  zScheduler.create_foreach_nodesF  sQ   .8ll16688W]))++ 	8 	8E    !  E  %%e,,,@@@@%@@@F$;a?O0*/ /	  G OOG$$$ 8 807'--8
 
 
 
!Z
 
 
NN


rR   c                    t          d          } G fddt          |                   t          j                   j        D ]}|                                D ]}|                                }|                                D ]k}|v rJ|v rF|         }|         }||z   }                                D ]}	|	         |u s
|	         |u r||	<   P|v r|         |<   `|         |<   ld  fd	 	 d!d"fd}
i }t          j
        j                                        D ].\  }}t          |t          j                  r|j        D ]}d||<   / j        D ]}t"                              d|j                   |j        J t)          |j                                        d           }|D ]9}t          |t          j                  sJ ||vr|                                ||<   :t)          |j                                        d           }|D ]u}||v sJ | d|             ||         x}V j        |                                         D ]6}|                    t5          |                                                     7vt7          |j        j                  dk    rEt=          t?          |j        j                            x}rt          |t@                    r|j!        }nd}|                                D ]@}t7          |"                                          dk    sJ |"                                D ] } |          } |
||           |                    t5          ||                     |         j        D ]}|                                |                                k    r-t          |j        tF                    sJ |j        $                                D ]Q} |          }|                    tK          ||                                                      |
||d           RB|j        j&        D ]<}t          |tJ                    s% |
|j'        ||(                    |                     =|)                     j*                   |                                D ]}|"                                D ]x}|                                 j*         |          <   |                                 j*        |<    j+        ,                    ||           j+        |                                <   yt          j
        -                                D ]C}t"                              d|            |
|t]          t5          |                               Dt          j
        j/        D ]}|                                D ]}||v s!J | d|                                             ||         x}rd j        |         $                                D ]D}t"                              d||            |
|t]          t5          |                               E j*        D ]}|t          j
        j        v rK |
|t]          t5          |                               t          j
        j0        1                    |           `|t          j
        j2        v r& |
|t]          t5          |                               d tg          t          j
        j                                                  D             fdt          j
        j0        D             t          j
        _4         j        D ]K}|                                D ]4}|5                    |                                         j                   5LdS )#zi
        Create dependency edges between nodes, handling aliasing and
        mutation properly.
        Tc                  6    e Zd ZdZ	 	 ddd	ZddZd fdZdS )1Scheduler.compute_dependencies.<locals>.DedupListan  
            This data structure behaves like a list except it makes sure the
            elements remain unique.
            Normally one could use a OrderedSet/dict for this purpose however
            the list in question gets elements appended as it is being
            iterated over which means that we need to keep the list
            semantics.
            NitemsOptional[List[T]]
membershipOptional[OrderedSet[T]]rH   ri   c                B    |pg | _         |pt                      | _        d S rK   )r  r   r  )rO   r  r  s      rP   r   z:Scheduler.compute_dependencies.<locals>.DedupList.__init__|  s#    
 #[b
","<
rR   	node_userr  c                    || j         v rd S | j                            |           | j                             |           d S rK   )r  r  r  r  )rO   r  s     rP   r  z8Scheduler.compute_dependencies.<locals>.DedupList.append  sF    //F
!!),,,##I.....rR   r}  DedupList[T]c                     t          j         j        |j                  } j         fd|j        D             z   } ||          S )Nc                &    g | ]}|j         v|S r   )r  r^  s     rP   r   zMScheduler.compute_dependencies.<locals>.DedupList.__add__.<locals>.<listcomp>  s,     * * *at.F.FA.F.F.FrR   )r   r  r  r  )rO   r}  new_membership	new_items	DedupLists   `   rP   __add__z9Scheduler.compute_dependencies.<locals>.DedupList.__add__  sc    !+!1$/5CS!T!T J * * * *${* * * 	 !yN;;;rR   r  )r  r  r  r  rH   ri   )r  r  rH   ri   )r}  r  rH   r  )r[   r   r   r  r   r  r  )r  s   rP   r  r  r  sr          ,06:= = = = =/ / / /< < < < < < < <rR   r  r  rS   rH   c                F    | j         v r j         |                    S | S rK   )r  )r  r   rO   s    rP   r   z.Scheduler.compute_dependencies.<locals>.rename  s.    D)))vd3A6777HrR   Fused_by_name	user_noderz  rP  rx   r{  ri   c                n     |                                         t          |||                     d S rK   )r  ry  )r  r  rP  r{  name_to_usersr   s       rP   add_userz0Scheduler.compute_dependencies.<locals>.add_user  sE     &&../66K99    rR   Nzscheduling %sc                    | j         S rK   r  rW  s    rP   rY  z0Scheduler.compute_dependencies.<locals>.<lambda>      AF rR   rZ  c                    | j         S rK   r  rW  s    rP   rY  z0Scheduler.compute_dependencies.<locals>.<lambda>  r  rR   z not in r   )r  )mutating_bufT)r{  zscheduling output %sz+scheduling output %s for unbacked symint %sc                    i | ]\  }}||	S r   r   )r   r|  rM   s      rP   r   z2Scheduler.compute_dependencies.<locals>.<dictcomp>,  s+     
 
 
'E4D%
 
 
rR   c                     g | ]
}|         S r   r   )r   rM   	inp_namess     rP   r   z2Scheduler.compute_dependencies.<locals>.<listcomp>/  s*     &
 &
 &
 $IdO&
 &
 &
rR   )r  rS   rH   rS   )FF)
r  rS   r  rz  rP  rx   r{  rx   rH   ri   )6r   r	   r  r  r  r   rX   r]   r  r;   ro   r  r  rz   rt  r  free_symbolsr   r  rB   rg  get_unbacked_symbol_defsSymbolget_unbacked_symbol_usesr  r   r'   r`   r   r   rz  r{  r&   r  r_   rC   r6  r(   r   rM   rP  r   r  r   r   get_output_namesr|   graph_outputsmutated_inputsr  r  r  mutated_input_idxsr   ) rO   r  rB   buf1	buf1_name	buf2_namelist1list2combinedr[  r  unbacked_symbol_to_origin_noderM   valfsunbacked_symbol_defsr  unbacked_symbol_usesr  r   r   	node_modealt_namerd   
other_namerr  r;  r   r  r  r  r   s    `                           @@@@rP   r  zScheduler.compute_dependenciesj  sV	    CLL	< 	< 	< 	< 	< 	< 	<
 	< 	< 	<> @K?V@
 @
 J 	L 	LD((** L L MMOO	!%!1!1!3!3 L LI M11i=6P6P -i 8 -i 8#(5=#0#5#5#7#7 > >C -c 2e ; ;#0#5#>#>5=c 2> #m333@3Ki003@3Ki00LL&	 	 	 	 	 	 	 !&!		 	 	 	 	 	 	 	 MO&
 -3355 	> 	>ID##uz** >* > >B9=2266J J	H J	HDIIoty111 9(((#)	2244:J:J$ $ $  * H H!!U\22222 :::8<215#)	2244:J:J$ $ $  * C C7777AA!?AA 8777::AG#03??AA C C))'#,,..*A*ABBBB D$+,,11 d&6&=!>!>???S 2sI.. 2  H		 	 '')) E E3,,..//14444 # 1 1 3 3 E EH%vh//HHXt,,,%%ghY&G&G&GHHH -h 7 = E E==??dmmoo==$)$)5FGGGGG*.)*D*D*F*F E EJ)/
););J -- '
 P P P   %HZtDDDDDEEE& (. F F!$00 FHTYd.>.>t.D.DEEE%%d&;<<< '')) H H # 1 1 3 3 H HH>AllnnD)&&*:*:;69llnnD)(3 /33HhGG + HH 0022 	> 	>HII,h777HXz'(*;*;<<==== 7( 
	J 
	JC1133 	J 	J7777HH!?!D!D!F!FHH 8776q991 J$($5a$8$I$I$K$K J J		I8UV   !:gh6G6G+H+HIIII	J ) 	: 	:Dqw+++z'$--88999&**40000***z'$--88999
 
+4QW5I5N5N5P5P+Q+Q
 
 
	&
 &
 &
 &
()(>&
 &
 &
"
 J 	C 	CD'')) C CmCLLNN;ABBBBC	C 	CrR   c                ,   g }t          | j                  D ];}ddd}|                                D ]}t          fd|j        D                       }|rdt
                              d|                                           t          j	        j
                            |                                           d	}|                                 o| }|s|                    |           t
                              d
|                                           t          j	        j                            |                                           =t          t          |                    | _        | j        D ]}|                                 dS )z0
        Remove any nodes without users
        rd   ry  rH   rx   c                Z    | j         p$|                                 t          j        j        v S rK   )r{  rX   r;   ro   r  )rd   s    rP   can_eliminate_userz;Scheduler.dead_node_elimination.<locals>.can_eliminate_userB  s!    |Tt}}!':T'TTrR   Fc              3  .   K   | ]} |          V  d S rK   r   )r   ur  s     rP   r   z2Scheduler.dead_node_elimination.<locals>.<genexpr>G  s/      #M#Ma$6$6q$9$9#M#M#M#M#M#MrR   zremoved dead buffer: %sTzremoved dead operation: %sN)rd   ry  rH   rx   )r  r  r   r  rG   r   r  rX   r;   ro   rh  r  rR  r  r  r   r$  )rO   updated_nodesrB   active_buffersr   can_eliminater  s         @rP   r  zScheduler.dead_node_elimination8  s    TZ(( 	@ 	@DU U U U #N'')) * * ##M#M#M#M39#M#M#M M M  *II7HHHG+//????%)NN $ 5 5 7 77N<NM  @$$T**** 		6HHH*..t}}????(=1122
 J 	# 	#D  """"	# 	#rR   r  c                    t                      t                      g dfd|D ]}|                                D ]}||<   |D ]} |           S )z?
        Ensure nodes is in topologically sorted order
        r  rC   rH   ri   c                    | vrf                     |            t          | j        d           D ]"}|j        vr |j                            #                    |            d S d S )Nc                    | j         S rK   r  )ds    rP   rY  zDScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>j  s    af rR   rZ  )r  rg  r   rM   r  )r  r   r  rc   seenvisits     rP   r  z2Scheduler.topological_sort_schedule.<locals>.visitg  s    }}!!"6<L<LMMM 2 2Cx|33 E,sx01111a      }rR   )r  rC   rH   ri   )r   ri  r6  )rO   r  rB   rM   r  rc   r  r  s       @@@@rP   r  z#Scheduler.topological_sort_schedule]  s     /9ll59VV*,	! 	! 	! 	! 	! 	! 	! 	! 	!  	* 	*D--// * *%)T""* 	 	DE$KKKKrR   r  c                L    t                      }t          |t          t          t          t
          f          r%|j        D ]}|                    |j                   n t          dt          |           d           fd|D             }t           fd|D                       S )Nz+get_unmet_dep_nodes is not implemented for .c              3  <   K   | ]}j         |         j        V  d S rK   )rv   rD   r  s     rP   r   z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>  s.      QQs)#.:QQQQQQrR   c                N    h | ]!}j         |                                         "S r   )r%  rX   r  s     rP   rB  z1Scheduler._get_unmet_dep_nodes.<locals>.<setcomp>  s)    RRRqT,QZZ\\:RRRrR   )r  rz   r`  r  rj  r  r   r  rM   r  rZ   r   )rO   r  
unmet_depsr   unmet_dep_opss   `    rP   _get_unmet_dep_nodeszScheduler._get_unmet_dep_nodesx  s    UU
)&"	
 
 	 / ) )sx(((() Ld5kkLLL   RQQQjQQQRRRRMRRRSSSrR   r!  c                b   g }t                               | j        d          }i }| j        D ]^}|                     |          }t	          |          ||<   |D ]2}|                    |g           }|                    |           |||<   3_d |                                D             }|rx|                    |           |D ]@}	|                    |	g           D ]}
||
xx         dz  cc<   |                    |	           Ad |                                D             }|x|r
J d            |S )zU
        Sort nodes by their topological order, return a list of node lists.
        r   c                $    g | ]\  }}|d k    |S r   r   r   r  vs      rP   r   z5Scheduler._topological_sort_nodes.<locals>.<listcomp>  s!    @@@1a!rR   r   c                $    g | ]\  }}|d k    |S r%  r   r&  s      rP   r   z5Scheduler._topological_sort_nodes.<locals>.<listcomp>  s!    DDDDAqQ!VVaVVVrR   zTopological sort failed!)	ri  fromkeysr  r"  r`   r   r  r  r  )rO   rv  r  childrenrB   r  r   czero_deg_nodesr  rd   s              rP   r&  z!Scheduler._topological_sort_nodes  sg    dj!,,#%J 	" 	"D,,T22Dd))E$K " "LLb)) !"
 A@@@@ 	ELL(((#  $LLB// % %D$KKK1$KKKK		!DDEKKMMDDDN  	E 444444rR   c                l   i }| j         D ]}t                      }|j        D ]K}| j        |j                 j                                        }|                    |           |||         z  }L|||                                <   ||_        t          | j                   D ]\  }}||_
        ||_        dS )z.
        Populate each node.ancestors
        N)r  r   r   rv   rM   rD   rX   r  r   r  r   r   )rO   name_to_ancestorsrB   r   r   dep_node_namerv  s          rP   r  zScheduler.compute_ancestors  s    
 9;J 	' 	'D)3I. > > $ 0 : F O O Q Qm,,,.}==		1:dmmoo.&DNN$TZ00 	# 	#KE4"DN"DNN	# 	#rR   c                   | j         D ]}t          j        st          |t          t
          f          r-|                                j        dk    rt          j        dk    rY|	                                D ]q}t          |t                    r|
                                r,|j                                        |_        |j        j        |_        |                    d           rd S )Nr2  halideTr3  )r  r   r;  rz   r`  r  r@  rZ   cpu_backendr1  rG  r8  r  rI  r7  rD  )rO   rB   r  s      rP   r  zScheduler.merge_loops  s    J 	; 	;D4  d]4F$GHH !!&&00V5G85S5S)) ; ;!%77 5;L;L;N;N #k5577${0
 **T*::::;	; 	;rR   c                f   t          d          D ]}t          |          }t                              d|dz   |           |                     |          }t          |          }t                              d|dz   ||           ||k    s|dk    r t                              d|dz               n|S )zB
        Combine eligible nodes into FusedSchedulerNodes.
        
   z/===== attempting fusion (%d/10): %d nodes =====r   z=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====)r'  r`   r  r  fuse_nodes_once)rO   r  r  old_lennew_lens        rP   rI  zScheduler.fuse_nodes  s     r 	 	A%jjGAA  
 ((//E%jjGPA	   '!!W\\  !NPQTUPUVVV &2 rR   c                    g }| j         D ]A}|                    t          |t                    r|                                n|g           B|| _         dS )zA
        Unpack GroupedSchedulerNode into regular nodes.
        N)r  rT  rz   r  rJ  )rO   	new_nodesrB   s      rP   r  zScheduler.process_grouped_nodes  se     .0	J 	 	D!+D2F!G!GSdV    


rR   r7  Tuple[float, str]c                    t          |          dk    sJ |d                                         }|| _        |                     |          }|                    |          S 
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        r   )r`   r@  r  r9  benchmark_fused_nodes)rO   r  r  backends       rP   r>  zScheduler.benchmark_fused_nodes  s[     5zzA~~~~q$$&&$""6**,,U333rR   c                
   dd}t          | j                  D ]\  }}t          |t                    rt          |j        t
          j                  r|j        }|                                \  }}t          |t          j	        j        j
                  r|j                            |           |                                }|j        }t          |t
          j                  sJ |j        }	t          |	t
          j                  sJ |j        |	_         |||	           |                     |	          }
|
| j        |<   |
| j        |                                <   |
| j        |                                <   t+          |
                                |                                          D ]-\  }}|| j        |                                <   |j        |_        .|j        |
_        |j        |
_        |j        |
_        d S )	N	orig_nodeir.MultiTemplateBufferr  ir.OperationBufferrH   ri   c                z   |                                 }|                                  }t          |t                    rt          |t                    sJ |                                }|                                 }t          |t                    rt          |t                    sJ t          j        j        |= ||_        t          j        j        |= ||_	        t          j        j
                            |           }t          j        j
                            |           |t          j        j
        |<   |t          j        j        |<   t          j        j                            |           }t          j        j                            |           |t          j        j        |<   |t          j        j        |<   d S rK   )rX   rz   rS   r+  r;   ro   r  rM   
name_to_opoperation_namebuffersr|  remove
operations)rA  r  replaced_buf_nameorig_buf_namereplaced_op_nameorig_op_nameorigs          rP   replace_operation_bufferzKScheduler.finalize_multi_template_buffers.<locals>.replace_operation_buffer	  sd    !) 1 1 3 3%..00MmS11XjARTW6X6XXXX'::<<$7799LlC00VZ@PRU5V5VVVV&'89)HM"#34&2H#7?((33DGO""8,,,$,AGOD!4<AG"=17%++I66DG%%h///'/AGt$/7AG|,,,rR   )rA  rB  r  rC  rH   ri   )r  r  rz   r`  rB   r   MultiTemplateBufferget_min_choicer   r   TritonTemplateCallerBasefinalize_as_triton_calleroutput_noder   
StorageBoxOperationBufferr\   r  r  rX   r%  rj  r   rv   rG   r   r   r   )rO   rO  r  rB   
multi_nodemin_node_unfusedr  out_tensorboxout_storage
out_buffernew_scheduler_nodenew_outold_outs                rP   r  z)Scheduler.finalize_multi_template_buffers
	  s   	8 	8 	8 	86 !,, $	@ $	@GAt$.. #@:	214 4 #@ "Y
&0&?&?&A&A# !$O&?   I778HIII 0 < < > >+0!+r}=====(-
!*b.@AAAAA$.$5
!((Z@@@%)%?%?
%K%K" 2
15G!$--//2;M'8(+&2244d6F6F6H6H) ) 2 2$GW <CD$W%5%5%7%78$+MGMM/3~",/3~",04"-I$	@ $	@rR   	node_listrx   c                4    t          d |D                       S )Nc              3     K   | ]Q}t          |j        d           o7|j        duo.t          |j        j        d          o|j        j        j        dk    V  RdS )r   Nscatter_moder  )rr   rB   r   rb  r  s     rP   r   z,Scheduler._any_atomic_add.<locals>.<genexpr>M	  s       
 

 	 AFF## 9d"9^449 (L8	
 
 
 
 
 
rR   )r   rO   r_  s     rP   _any_atomic_addzScheduler._any_atomic_addL	  s2     
 

 
 
 
 
 
 	
rR   r  r  c                t                                    o+t                                          t          j                  }t
          j        s|sdS                                  r,t                                          t          j                  r(                                s                                rdS 	                                }|d         
                                }|j        dk    rdS 	                                }t          t          j        ||                    }|                     |          rdS ddlm} t%                    }	dfd}
t          t&                    rkt          j        t          j                  rKj        }|j        }|                                \  }|                     |          \  t1          d          }d}d}t3          |                                d           D ]\  }}t          |t6          j        j        j                  s*|z   k    r nl|dz  }|t
          j        k    r nUj                            |          5  |                     |          \  }|k     r}|}ddd           n# 1 swxY w Y    |
|           |z   k     r|j                             |           dS dS 	 |                     |          \  tC          j"                  r |	d           dS |                     |          \  tC          j"                  r |	d           dS |                     |          \  tC          j"                  r |	d           dS n%# |$ r}dtG          |          v rY d}~dS  d}~ww xY w |
           tI          d          rZz   k    rQf| j%        vrF| j%        &                    f           tO          d          (                    fd           z   k     S )
        If config.benchmark_fusion is False, always return True.
        Otherwise, return True if fusion can brings speedup.
        Tr   cpuCompilationErrorms_fusedr  ms1ms2rH   ri   c           
        t                               t          j                  r| ||z   k     rXt                               d                                                                t          ||z   | z  d                     d S t                               d                                                                t          | ||z   z  d                     d S d S )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)r  r  r  DEBUGr  r6  r-   r.   )rj  rk  rl  r  r  s      rP   
log_fusionz/Scheduler.speedup_by_fusion.<locals>.log_fusion	  s    &&w}55 cCi''$$S..00..00"sSyH&<#B#BCC	     $$W..00..00 Hc	$:!@!@AA	     rR   infNc                    | d         S r  r   rW  s    rP   rY  z-Scheduler.speedup_by_fusion.<locals>.<lambda>	  s
    ad rR   rZ  r   Fz%register spilling of the first kernelz&register spilling of the second kernelz%register spilling of the fused kernelLoop-carried variableslow_fusionc            	     $      z   z  dS )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratior   )rk  rl  rj  path1path2
path_fuseds   rP   rY  z-Scheduler.speedup_by_fusion.<locals>.<lambda>	  s-    $)'*$)'*)3,4'/39'=  rR   )rj  r  rk  r  rl  r  rH   ri   ))rG  rz   r  r   rP  r   benchmark_fusionTritonTemplateBufferrK  r1  r@  rZ   r   r  r	  rd  triton.compiler.errorsri  r  r`  rB   choice_timingsrQ  r>  r  rg  r  r   r   rR   max_epilogue_benchmarked_choicesswap_as_triton_callerrS  mathisinfrS   r   r  r  r   r  )rO   r  r  is_multi_templatenode_list_1r  node_list_2node_list_fusedri  r   rp  rW  r  r  min_ms_fusedms_fused_choicetriton_choiceschoiceunfused_timer  rk  rl  rj  r}  r~  r  s    ``                 @@@@@@rP   speedup_by_fusionzScheduler.speedup_by_fusionU	  s=    "--// 
J##%%r'=5
 5
 & 	/@ 	4 	u6688":QRR	 !!	 !!		 4oo''Q**,, ;%4oo''y{KHHII
 00 	4;;;;;;u%%	 	 	 	 	 	 	" e]++ @	
J.1
 1
 @	 J'6N..00FAs33K@@JC <<L"ON(.$$&&NN) ) ) 1 1$ "&%/*<*UVV 39,,E!#!F$KKKE Z55f== 1 1"&"<"<_"M"MKHa,..'/*01 1 1 1 1 1 1 1 1 1 1 1 1 1 1 J|S#... sSy))o.I
44_EEEtu!77DD
U:c?? !C?@@@ 5!77DD
U:c?? !C@AAA 5'+'A'A/'R'R$*:h'' !C?@@@ 5! $   *c!ff4444444 	
8S#&&&#M22	C#I%%d&===#''777]++33        
 
 
 #)##s<   8#J''J+	.J+	*7N #7N 7N N8N32N33N8c                   t          |          }t                              t          j                  rNt                              d           |D ]1}t                              d|                                z              2|                     |          D ]}\  }}| j        |	                                         }| j        |	                                         }| 
                    ||          r"|                     ||          s|                     ||          st                              d|                                |                                           |                                }|                     |                              ||          |                    |           |                    |           |                               | j                            fd                                D                        t+          |d           }|                     |          }|                     |           |S )a  
        Combine eligible nodes into FusedSchedulerNodes.

        This relies on two key functions to control the logic:
            - self.can_fuse(): checks if a fusion is legal
            - self.score_fusion(): assigns priority to a given fusion
        zfuse_nodes_once, candidates:z  zfusing %s with %sc                :    i | ]}|                                 S r   r   )r   r  node3s     rP   r   z-Scheduler.fuse_nodes_once.<locals>.<dictcomp>	
  s#    DDDQQZZ\\5DDDrR   c                    | j         S rK   r  rW  s    rP   rY  z+Scheduler.fuse_nodes_once.<locals>.<lambda>
  s    !+ rR   rZ  )r   r  r  r  ro  r  r   get_possible_fusionsr%  r-  r  will_fusion_create_cycler  rX   r@  r9  r  rH  r  r  r1  rg  r  r)  )rO   r  r
  rB   r  r  r  r  s          @rP   r5  zScheduler.fuse_nodes_once	  s?    !''""7=11 	@;<<<# @ @  (<(<(>(>!>???? 55e<< 	 	LE5+E,@,@,B,BCE+E,@,@,B,BCE}}UE** 43P3Pu4 4  --eU;;   ')9)95>>;K;K  
 ))++((0055eUCC""5)))""5)))&&&'..DDDD%//2C2CDDD   {(=(=>>>..u55!!%(((rR   Nr  Optional[int]c                F  	 t          | j                  }d}t          | j                  }t                              d|           t          t                              |                     D ]2\  }}t                              |          }t          |          dk     r4|||k    r n| 	                    |          st                              d|           o|dz  }t          j        dk    }t          |d         j        |d|          	t                              d	t          |          |           |D ]}|                    |           |                    	           | j                            	fd
	                                D                        4t'          |d           | _        |                     | j                  | _        t                              d||t          | j                             |                     | j                   dS )z'
        Groups parallel nodes
        r   z2ComboKernels: Generating with num_ck_nodes = %d...r  Nz)ComboKernels: Not speeding up %d-th groupr   Tr  z0ComboKernels: Combining %d nodes for %d-th groupc                :    i | ]}|                                 S r   r   )r   r  r  s     rP   r   z7Scheduler.create_combo_kernel_nodes.<locals>.<dictcomp>4
  s#    LLLq{LLLrR   c                    | j         S rK   r  rW  s    rP   rY  z5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>6
  s    q{ rR   rZ  zEGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodels)r  r  r`   r   r  r  r  r2  r   speedup_by_combo_kernelr   r  rA   r   rH  r  r%  r  r1  rg  r  r)  )
rO   r  r
  countnum_nodes_orignumr_  r	  rB   r  s
            @rP   r  z#Scheduler.create_combo_kernel_nodes
  s-    $*ooTZ		FUUU'&DDTJJ
 
 	 	NC 3CCINNI9~~!!'EL,@,@//	:: 		EsKKKQJE$;a?O4!&*. /	  K HHBI  
 " ) )""4((((OOK(((#**LLLLK4I4I4K4KLLL    K-B-BCCC
33DJ??
S
OO		
 	
 	
 	!!$*-----rR   c                D    |D ]}|                     | j                   d S rK   )r)  r%  )rO   r  rB   s      rP   r)  zScheduler.prune_redundant_deps@
  s5     	? 	?D%%d&=>>>>	? 	?rR   1List[Tuple[BaseSchedulerNode, BaseSchedulerNode]]c                   	
 g 	t                      
d	
 fd}t          j        t                    }|D ]4}|                                D ]}||                             |           5|                                D ]} ||           t          j        rnt          j        t                    }|D ]0}t          |dd          }|r||                             |           1|                                D ]} ||            
                    	          		                     j        d	           t                              d
t          	                     	S )z^
        Helper to find all legal fusion opportunities, sorted by self.score_fusion()
        r  r  rH   ri   c                   t          |           D ]\  }}| |dz   d          D ]}||f}|v r                    |                               ||          r                    |           L|                                s|                                r-                    ||          r                    ||f           d S r  )r  r  r  r  rG  rK  )r  node1_indexr  r  r[  possible_fusionsr  rO   s        rP   check_all_pairsz7Scheduler.get_possible_fusions.<locals>.check_all_pairsM
  s   &/&6&6 @ @"U";?#4#45 @ @E %.Cd{{ HHSMMM}}UE22 @(//4444++-- @1A1A1C1C @uJ J @ )//???@@ @rR   r   NT)r[  reversezfound %d possible fusionsr  r  rH   ri   )r   r  r  r   r
  r  r   r   aggressive_fusionr   *get_possible_fusions_with_highest_priorityru  score_fusion_keyr  r  r`   )rO   r  r  buffer_names_groupingrB   r   node_groupinggroup_groupingr   r  r  s   `        @@rP   r  zScheduler.get_possible_fusionsD
  s    HR	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@  !, 7 = = 	8 	8D--// 8 8%c*11$77778299;; 	+ 	+MOM****# 	/(4T::N 7 7gt44 7"5)00666!/!6!6!8!8 / /....JJ
 
 	$"7FFF4c:J6K6KLLLrR   c                    t                      d fd|                                j                                        |                                j                                        z  |j        j                                        |j        j                                        z  z
  t           fdD                       }|r t          ||          d           |S )	z~
        Finds whether there's a path from node1 to node2 (or vice-versa)
        caused indirectly by other fusions.
        rB   rC   rH   rx   c                ,   t          | t                    r}| vry                    |            |                                                               rdS t          | j        z            p#t          fd| j        z
  D                       S dS )NFc              3  D   K   | ]} j         |                   V  d S rK   r%  r   r  
found_pathrO   s     rP   r   zIScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>
  sQ       H H #
4#:1#=>>H H H H H HrR   )rz   r  r  r2  issubsetrx   r   r   )rB   combined_ancestorscombined_namesr  rO   visiteds    rP   r  z6Scheduler.will_fusion_create_cycle.<locals>.found_path~
  s    $ 233 G8K8KD!!!++--667IJJ  !5   ?@@ C H H H H H!%2D!DH H H E E  5rR   c              3  D   K   | ]} j         |                   V  d S rK   r  r  s     rP   r   z5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>
  s5      WWqJJt6q9::WWWWWWrR   zwill create cycle)rB   rC   rH   rx   )r  r2  _dictr  r   r   r  )rO   r  r  cycler  r  r  r  s   `   @@@@rP   r  z"Scheduler.will_fusion_create_cyclet
  s    ,/55	 	 	 	 	 	 	 	 	 	2 %%''-2244''))/44667 	
 O!&&((5?+@+E+E+G+GG WWWWWDVWWWWW 	9#IeU##$7888rR   c                    t          t          |j        |j        z
            t          |j        |j        z
                      }|dk    S )aB  
        This function prevents fusion for nodes that can increase memory
        footprint. This problem is more common in horizontal fusion, where nodes
        that are far apart in the original order get fused, lengthening the live
        intervals of tensors. This is very evident in models with activation
        checkpointing, where the recomputed nodes from different checkpointed
        regions get fused and significantly increase the memory footprint.

        The current attempt is a quick, possibly hacky, heuristic to prevent the
        fusion of nodes that are far away in the original order.

        A better but difficult to implement heurisitic would be to use live
        intervals of the buffers, find region of peak pressure in the original
        program and prevent fusion that crosses that peak region. We might need
        special care or good approximation in this implementation, as fusion of
        node changes live intervals, and re-computing live intervals and peak
        memory after each fusion can introduce large compilation overhead.
        @   )r  ra  r   r   )rO   r  r  proximity_scores       rP   can_fusion_increase_peak_memoryz)Scheduler.can_fusion_increase_peak_memory
  sH    * %/122%/122
 
 ##rR   common_buf_namesTuple[str, ...]c                   i }d |j                                         D             }d |j                                         D             }|D ]}t          j                            |          }||         }	||         }
|	                                |
                                k    r0d|	                                 d|
                                 ||<   t          |	j                  t          |
j                  k    rd||<   t          |	t                    rt          |
t                    s'dt          |	           dt          |
           ||<   |	                                }|
                                }||k    rd| d| ||<   H|	                                |
                                k    rd|	 d|
 ||<   d	|	 d|
 d
|j         ||<   t          |          S )z}
        Try to decide reasons why fusion fail due to no shared memory even though
        there are common buffers.
        c                    i | ]
}|j         |S r   r  r  s     rP   r   z7Scheduler.decide_fusion_fail_reason.<locals>.<dictcomp>
      XXXC#(CXXXrR   c                    i | ]
}|j         |S r   r  r  s     rP   r   z7Scheduler.decide_fusion_fail_reason.<locals>.<dictcomp>
  r  rR   zdifferent numel: z v.s. 	broadcastznot MemoryDep: zdifferent offset: zMismatch loop orders: zUnknown reason: z
. Layout: )r   rR  r;   ro   r  r  r:   r}  rz   r&   rZ   
get_offsetnormalize_with_stride_orderr\   rS   )rO   r  r  r  reasonsnode1_name2depnode2_name2depr;  r   lhs_deprhs_deplhs_offrhs_offs                rP   decide_fusion_fail_reasonz#Scheduler.decide_fusion_fail_reason
  sF    XX53D3U3U3W3WXXXXX53D3U3U3W3WXXX( )	R )	RH'$$X..C$X.G$X.G  ""g&7&7&9&999 Y(9(9(;(;XX7CTCTCVCVXX   W\**mGL.I.III$/!gy11 GY9W9W  Kd7mmJJ4==JJ  ((**G((**G'!! %R$Q$Q$Q$Q! 335566889 9 %VW$U$UG$U$U!
 R7QQ'QQSZQQ   7||rR   c                   t           j        rt          d ||fD                       rdS |j                                        }|j                                        }||z  }|sdS d |j                                        D             }d |j                                        D             }g }|D ]}	||	         }
||	         }|
                                |                                k    rN|                    t          j	        j
                            |
                                d          |
|f           t          |          dk    rdS t          |dd 	          d         \  }}
}|
j        |j        k    r*|
                                |                                k    S |                                s|                    |
|           nk|                                s|                    ||
           n@t&                              d
|                                |                                           |                     ||          dk    S )z
        Right now just greedily reorder the loop of node1 to be compatible with node2,
        but ideally we should have some heuristics to reorder the loop for node2
        to be compatibile with node1 if that's more efficient.
        c              3  P   K   | ]!}|                                 j        d k    V  "dS )rg  N)r@  rZ   r  s     rP   r   zBScheduler.has_shared_data_after_reordering_loop.<locals>.<genexpr>  sC       8
 8
-.ALLNN5(8
 8
 8
 8
 8
 8
rR   Fc                    i | ]
}|j         |S r   r  r  s     rP   r   zCScheduler.has_shared_data_after_reordering_loop.<locals>.<dictcomp>  r  rR   c                    i | ]
}|j         |S r   r  r  s     rP   r   zCScheduler.has_shared_data_after_reordering_loop.<locals>.<dictcomp>  r  rR   r   r  Tc                    | d         S r  r   rW  s    rP   rY  zAScheduler.has_shared_data_after_reordering_loop.<locals>.<lambda>'  s    QRSTQU rR   )r  r[  z?Don't reorder loops since both nodes are reductions: %s v.s. %s)r   r;  r   r   buffer_namesrR  r  r  r;   ro   r  r  r  r`   rg  rL  r4  rB  r   rO  r  rX   score_fusion_memory)rO   r  r  node1_buffer_namesnode2_buffer_namescommon_buffer_namesr  r  
candidatesbuffer_namer  r  numels                rP   %has_shared_data_after_reordering_loopz/Scheduler.has_shared_data_after_reordering_loop
  s    0 	C 8
 8
38%.8
 8
 8
 5
 5
 	 5".;;==".;;==03EE" 	5XX53D3U3U3W3WXXXXX53D3U3U3W3WXXX 
. 	 	K$[1G$[1G335566889 9 !!(2273D3D3F3FQR2SS   z??a5 #)T~~"V"V"V#
w w///
 $$&&'*;*;*=*=== !!## 		++GW====##%% 	++GW====##Q       ''u5599rR   c                    u rdS t                    }t          t                    st          t                    r |d           dS t          t          t          f          r!                                s |d           dS t          t          t          f          r!                                s |d           dS                                 j        z  r |d           dS                                 r |d           dS                                 rA                                s 	                                st          j        s |d           dS                                                                 z  t          j        j        z  r |d           dS                                 }                                }||k    r |d	||           dS ~                               d
k    }|r                                }t&                              d                                                                |rdnd           |rt          j        r(	                                s	                                rt/          d          r{j                                        j                                        z  t5                    d
k    r5t7          d                               fd            |d           dS  |d           dS                                 sr                                s^t5                                                    t5                                                    z   t          j        k    r |d           dS                                 j        z  rA                                sdS  !                    |                                         S  "                              r |d           dS  !                    |          #                              S )zj
        Determine if it is possible to combine node1 and node2 into a
        single fused node.
        Fz/grouped node must not be fused with other nodesznode1 is extern or nopznode2 is extern or nopznode1 must go before node2z!templates can only fuse epiloguesztemplate epilogue not satisfiedz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)r   z%s and %s has%s shared dataz nor   'fusion_failure_due_to_indexing_mismatchc                 ^   t           j        j        t           j        j                                                                        t                                                    t                                                    t                                                    dS )N)pre_grad_graph_idr  
node1_name
node2_namenode1_debug_strnode2_debug_strr  failure_reason)	r;   ro   r  r  rX   r    re   r   r  )r  r  r  rO   s   rP   rY  z$Scheduler.can_fuse.<locals>.<lambda>  s    121A23'2L*/..*:*:*/..*:*:/9%//:K:K/L/L/9%//:K:K/L/L378H3I3I.2.L.L %u.>/ /! ! rR   z'no shared data due to indexing mismatchzno shared datazexceeds max fusionzwill increase peak memory)$r  rz   r  r  rj  rG  r2  r   r   rB  r   epilogue_fusionr6  r;   ro   no_fuse_buffer_namesr@  r  r  rO  r  rX   r  r   r   r  r`   r   r  rK  r1  max_fusion_sizecan_fuse_verticalr9  r  can_fuse_horizontal)rO   r  r  r   r  device2no_shared_datar  s   ```    @rP   r  zScheduler.can_fuse@  s    E>>5u%%e122 	j'7
 7
 	 CABBB5u8:PQRR	%%''	 C()))5u8:PQRR	%%''	 C()))5$$&&8 	C,---5 	C34445 	**,,	!!##	 )	
 C12225 ""$$u'='='?'??G() 	 C56665!!##""$$WC,fg>>>511%??1D 	!%!K!Ku" " N 	)NNNN#+EE		
 	
 	
  	(	,1,>,>,@,@	DIDVDVDXDX	 ''PQQ !%2244u7H7U7U7W7WW ! '((1,,$%NOOWW         CABBB 5C !!!5   ""	$$&&	 EOO%%&&U__->->)?)??&BXXXC$%%%5$$&&8 		N))%77 u##F++==eUKKK33E5AA /000u##F++??uMMMrR   c                   |                                 }|                                }t                      }t          ||          }|j        j        D ]M}t          |t                    s|j        D ]-}| 	                    ||          r|
                    |           .N|j        D ]C}	t          |	t                    r,|                     |	||          r|
                    |	           Dt          d |j        |z
  D                       }
|
|z  r |d           dS |
D ]I}| j        |         j                                        }|| j        |         j        z  r |d            dS JdS )a  
        Check if it is legal to fuse a consumer (node2) into a producer (node1).

        We can fuse them if all the reads of node2 either match
        corresponding writes in node1, or are written by nodes that can
        be scheduled before the fusion of node1 and node2.
        c              3  $   K   | ]}|j         V  d S rK   r  r  s     rP   r   z.Scheduler.can_fuse_vertical.<locals>.<genexpr>  s5       $
 $
CH$
 $
 $
 $
 $
 $
rR   zmemory deps did not matchFz(intermediate nodes between node1 & node2T)r6  r2  r   r  r   r   rz   r&   r   fusable_read_and_writer  r(   fusable_weak_deprv   rD   rX   r%  r   )rO   r  r  node1_buf_namesnode1_op_namescomputed_depsr   cdr  r   remaining_depsrM   r  s                rP   r  zScheduler.can_fuse_vertical  s     00222244)3u%%#* 	* 	*Bb),, . * *..r266 *!%%b)))* + 	' 	'C#w'' 'D,A,A#ue,T,T '!!#&&&# $
 $
 % 8= H$
 $
 $
 
 
 O+ 	
 C+,,,5" 	 	D&t,8AACCG 7 @ JJ >???uu trR   weak_depr(   c                   j         |                                vrdS fd|j        j        D             }t	          |          dk    rdS |d         t          t                    sJ t          j        t          j
                  rdS | j        j                 fd|j        j        D             }t          fd|D                       S )NFc                4    g | ]}|j         j        k    |S r   )rM   r  )r   writer  s     rP   r   z.Scheduler.fusable_weak_dep.<locals>.<listcomp>  s3     
 
 
zX222 222rR   r   r   c                *    g | ]}|j         k    |S r   r  )r   rr  	real_names     rP   r   z.Scheduler.fusable_weak_dep.<locals>.<listcomp>  s,     
 
 
	Y8N8ND8N8N8NrR   c              3     K   | ]Y}t          |t                    o?t          |j        t          j                   o|j        j        k    o|j        j        k    V  Zd S rK   )rz   r&   r   r|  r   TMPr}  )r   rr  r  s     rP   r   z-Scheduler.fusable_weak_dep.<locals>.<genexpr>  s       
 

 	 tY'' ('
DH===(
ek)( 	UZ'	
 
 
 
 
 
rR   )rM   r6  r   r   r`   rz   r&   r   r|  r   r  r   r  r   r  )rO   r  r  r  mutating_writesrelevant_readsr  r  s    `    @@rP   r  zScheduler.fusable_weak_dep  s"    = 6 6 8 8885
 
 
 
*1
 
 

 1$$5"%+++++u{DH55 	5+H,AB	
 
 
 
".4
 
 
  
 
 
 

 '
 
 
 
 
 	
rR   rr  r%   r  r&   c                   t          |t                    r1|j        |j        k    r	|j        dS | j                            |j        |j                  }||j        k    s>t          |j        t          j	                  st          |j        t          j	                  rdS t          j        r8|j        |j        k    r(|                                }|                                }|j        |j        k    oSt          |j                  t          |j                  k    o)|j        d t          |j                           |j        k    S t          |t                     ri| j                            |j        |j                  }| j                            |j        |j                  }|j        |j        k    r|j        ||k    rdS dS )NTF)rz   r&   r  r  r   rM   r   r|  r   r  r   r;  rL  r4  r`   r}  r'   )rO   rr  r  	read_name
write_names        rP   r  z Scheduler.fusable_read_and_write  s   dI&&  	yEJ&&5:+At-11$)TYGGI UZ''&tz48<< (&u{DH== ( u0 *T]en5T5T ~~'')) 
ek) ?	NNc%*oo5?I/EJ/0EJ>
 g&& 	-11$)TYGGI.225:uzJJJ	UZ''J*++turR   Tuple[bool, bool, int, int]c                l   |                      ||          }t          t          |j        |j        z
            t          |j        |j        z
                       }|                                t          j        k    o|dk    |                                |                                k    o|dk    ||fS )a\  
        Assign a score (higher comes first) to the fusion of node1
        and node2.  When different fusions conflict with each other,
        this is the way we decide what order to run them in.

        Our current score is based on:
        - Estimate of the saved memory operations
        - Fusions closer together in original order
        r   )	r  r  ra  r   r   rG  r   epilogue_fusion_firstrB  )rO   r  r  memory_scorer  s        rP   score_fusionzScheduler.score_fusion)  s     //u==%/122%/122
 
 

 6#??TLSTDT  E$6$6$8$88M\A=M	
 	
rR   r   rI   c                    d}|| j         vrE	 |                                s|                                }n# t          $ r Y nw xY w|| j         |<   n| j         |         }|S r  )r  has_unbacked_symbolsnumbytes_hintKeyError)rO   r   ress      rP   dep_size_hintzScheduler.dep_size_hintA  s    d000//11 .++--C    	
 /2D&s++,S1C
s   (6 
AAc                    t          |j        j                  t          |j        j                  z   }t          |j        j                  t          |j        j                  z   }t	          ||          dz  t          ||          k    rv||k    r|}|}|}g }|j        j        |j        j        z  D ]3}||j        j        v s||j        j        v r|                    |           4t           fd|D                       S |j        j        |j        j        z  |j        j        |j        j        z  z  }t           fd|D                       S )zn
        The first term in our fusion score that estimates number of saved
        memory operations.
        r  c              3  B   K   | ]}                     |          V  d S rK   r  r  s     rP   r   z0Scheduler.score_fusion_memory.<locals>.<genexpr>g  s1      ??3t))#..??????rR   c              3  B   K   | ]}                     |          V  d S rK   r  r  s     rP   r   z0Scheduler.score_fusion_memory.<locals>.<genexpr>l  s1      IIs4%%c**IIIIIIrR   )r`   r   r   r   r  r  r  r  )	rO   r  r  node1_dep_lennode2_dep_lentmpr  r   common_memory_depss	   `        rP   r  zScheduler.score_fusion_memoryQ  sc    E-344s5;L;S7T7TTE-344s5;L;S7T7TT }m,,q03}m3T3TTT},,D(.1B1II % %%+111SE<M<T5T5TKK$$$????$??????#/58I8PP#e&7&>>
 IIII6HIIIIIIrR   r  c                $   t          |          dk    r|S i }|D ]\  }}|                                |                                k    sJ |                                }t          |                     |                              ||                    }||vr	||fg||<   ||                             ||f           t          |                                t          j	        d                    d         }t          |          dk    sJ |S )Nr   rZ  r   )
r`   r@  rI   r9  get_fusion_pair_priorityr  r  r  operator
itemgetter)rO   r  "possible_fusions_group_by_priorityr  r  r  fusion_pair_priority&possible_fusions_with_highest_prioritys           rP   r  z4Scheduler.get_possible_fusions_with_highest_priorityn  sY   
   A%%##  	+ - 	 	LE5##%%)9)9););;;;;%%''F#&  ((AA%OO$ $  $+MMMENL23GHH 33GHOOEN    25.4466H<OPQ<R<R2
 2
 2

2. 9::Q>>>>55rR   +Tuple[BaseSchedulerNode, BaseSchedulerNode]c                8    |\  }}|                      ||          S )z-
        Shim for list.sort(key=...)
        )r	  )rO   r  r  r  s       rP   r  zScheduler.score_fusion_key  s#     u  ...rR   c                    t          t          j                                                  }t	          | j                  D ]7}|                    || j                   |                    |j	                   8dS )zg
        Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
        N)
r   r;   ro   r  r  r  r  r   r  r   )rO   r   rB   s      rP   r  zScheduler.compute_last_usage  sv    
 0:!':R:R:T:T/U/UTZ(( 	8 	8D 3T5LMMM&&t7777	8 	8rR   c                p   t          | j        t          j        j        z
  t          j        j        j        z
            D ]}|| j        v rK| j        |         }|                                r)t          j        j        	                    |j
                   V|t          j        j        v rut          j        j        |         j        }t          |t          j                  r|                                sJ t          j        j        	                    |j                   | j                                         dS )z*Free any buffers that are no longer neededN)rg  r  r;   ro   rh  rp   freedrv   r~   codegen_freerB   r  r   rz   r   rU  is_input_bufferclear)rO   rM   r   storages       rP   free_bufferszScheduler.free_buffers  s   %g%&g"()
 
 	@ 	@D
 t'''&t,<<>> @G(55ch???---'.t49!'2=99Wg>U>U>W>WWWW$11',???!'')))))rR   c                j    t           fdt          j        j        D                       }g t          j        j        D ]z}| j        vr                    |           ! j        |         j        }|J t          d |D                       }|                    |          r                    |           {d fd}t          t          |                    D ]}|t          j        j
        j        v rt          j        j
        j        |         }t          |t                    r|                    d	          rat          fd
|j        D                       }|r                     |           t          j        j                            |                                |           dS )zr
        Any buffers that are both created and have a last use in the
        same kernel can be removed.
        c              3  r   K   | ]1}|j         v j         |         j                                        V  2d S rK   )rv   rD   rX   )r   r   rO   s     rP   r   z8Scheduler.remove_kernel_local_buffers.<locals>.<genexpr>  sT       &
 &
d&&& S!-6688&&&&&
 &
rR   Nc              3  L   K   | ]}|j         	|                                V   d S rK   )r{  rX   r  s     rP   r   z8Scheduler.remove_kernel_local_buffers.<locals>.<genexpr>  s1      UU4Ut}}UUUUUUrR   r  rS   rH   rx   c                ~    | t           j        j        vo)| t           j        j        j        vo| j        vo| j        vS rK   )r;   rs   must_keep_buffersrk   input_buffersr  r   r  rO   s    rP   remove_filterz<Scheduler.remove_kernel_local_buffers.<locals>.remove_filter  sK    33 5QX]885T225 T44	rR   REMOVEDc              3      K   | ]}|v V  	d S rK   r   )r   r  names_to_removes     rP   r   z8Scheduler.remove_kernel_local_buffers.<locals>.<genexpr>  s(      KKaQ/1KKKKKKrR   )r  rS   rH   rx   )r   r;   rs   store_buffer_namesrv   r  rG   r  r   filterrk   ra  rz   rS   
startswithr  other_namesremove_inplace_bufferinplaced_to_remover  remove_buffer)	rO   fused_node_namesout_bufrG   r0  rM   r   rH  r3  s	   `       @rP   remove_kernel_local_buffersz%Scheduler.remove_kernel_local_buffers  s    & &
 &
 &
 &
x2&
 &
 &
 
 

 x2 		0 		0Gd...&&w///$W-3E$$$UU5UUUUUE~~.// 0&&w///	 	 	 	 	 	 vm_EEFF# 
	) 
	)Dqx}444hm3D9c3'' CNN9,E,E KKKK3?KKKKK 5..t444+//5555""4((((
	) 
	)rR   rM   c                    t                               d|           dt          j        j        j        |<   t          j        j                            |           d S )Nzremove_buffer(%r)r1  )r   r  r;   rs   rk   output_buffersrh  r  r  s     rP   r:  zScheduler.remove_buffer  sI     			%t,,,-6$T*	 $$T*****rR   c                    t                               d|           t          j        j        j        |         j        }|                    dd          t          j        j        j        |<   t          j        j        	                    |           d S )Nzremoving_inplace_buffer(%r)
in_out_ptrr1  )
r   r  r;   rs   rk   ra  
inner_namer  rh  r  )rO   rM   rB  s      rP   r8  zScheduler.remove_inplace_buffer  sr    		/666X]248C
.8.@.@)/
 /
%d+ 	
 $$T*****rR   c                    | j                                         D ]}|                                 |                                  d S rK   )r  r   flushr(  )rO   r?  s     rP   rD  zScheduler.flush  sF    }++-- 	 	GMMOOOOrR   scheduler_noder  c                   t          |t                    sJ t          d         dxx         dz  cc<   t          j        t          d                    5  |                                 |                                 d d d            n# 1 swxY w Y   |j        }t          |t          j
                  sJ dt          |                      |                    t          j        j                   |                                  d S )Ninductorextern_callsr   F)increase_kernel_countztype(node)=)rz   r  r   r;   set_kernel_handlerr#   ru  r  rB   r   r  rZ   rd  ro   rp   r(  )rO   rE  rB   s      rP   codegen_extern_callzScheduler.codegen_extern_call  s9   .*CDDDDD
 	^,,,1,,,!&u"E"E"EFF 	& 	&00222##%%%	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& "$00BB2BT$ZZ2B2BBBBQW)***s   )B

BBr  BaseSchedulingc                   t          |j                  r|j        J | d            t          j                            |           t          |j                  }|t          d|j                   t                      s|j        dk    rQt          j
                            |          x}j        dk     r't          d|j         d|j         d|j                   t          |j                  rt          d           ||           S )	Nz( should have been normalized in loweringzUnsupported device type: r2     zFound z which is too old to be supported by the triton GPU compiler, which is used as the backend. Triton only supports devices of CUDA Capability >= 7.0, but your device is of CUDA capability r  zCannot find a working triton installation. Either the package is not installed or it is too old. More information on installing Triton can be found at https://github.com/openai/triton)r8   rZ   r|  r;   ro   add_device_infor"   r  r   r   r2  get_device_propertiesmajorrM   minor)rO   r  device_schedulingdevice_propss       rP   create_backendzScheduler.create_backend  sU   v{##	?'-|'?'?>>> (@'?'?	'''5fkBB$H6;HHIII|| 	v%%%*Z%E%Ef%M%MM\TWXXX" T\.  T  T  kw  k}  T  T  @L  @R  T  T   $$ " N   ! &&&rR   c                h    || j         vr|                     |          | j         |<   | j         |         S rK   )r  rU  r  s     rP   r9  zScheduler.get_backend  s6    &&$($7$7$?$?DM&!}V$$rR   c                4    d	 fdfd|                                 D             }t          |                                          }|rLt          |t	          j        d                    \  }}t          j        j        	                    |           d S d S )
Nr  torch.fx.NoderH   rI   c                    | j         vr;j                             d t          | j        j                  D                        j         |          S )Nc                    i | ]\  }}||	S r   r   )r   r  r  s      rP   r   z>Scheduler.enter_context.<locals>.get_order.<locals>.<dictcomp>&  s    ,W,W,WdaQ,W,W,WrR   )r  r  r  ro   r  r/  s    rP   	get_orderz*Scheduler.enter_context.<locals>.get_order$  sQ    ,,,$++,W,Wi>V>V,W,W,WXXX'**rR   c                r    i | ]3}|j         	|j                                         D ]} |          |fd 4S rK   r  )r   r  r  r[  s      rP   r   z+Scheduler.enter_context.<locals>.<dictcomp>*  sY     
 
 
v!V'')) "! Yq\\1t!!!!rR   r   rZ  )r  rX  rH   rI   )
r1  r   r  r  r  r  r;   ro   rp   enter_context)rO   rB   r  r  lastr[  s   `    @rP   r]  zScheduler.enter_context#  s    	+ 	+ 	+ 	+ 	+ 	+
 
 
 
^^%%
 
 
 w||~~&& 	5'x':1'='=>>>GAtG ..t44444	5 	5rR   c                |    t          d          5  |                                 cd d d            S # 1 swxY w Y   d S )NzScheduler.codegen)r   _codegenrN   s    rP   rd  zScheduler.codegen5  s    -.. 	# 	#==??	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	#s   155c                
   t           j        rdd l}t          j                    }t                      }t          |          D ]k}|j        dk    r|j        |j	        j
        j        k    r nC|j        |j        f}||vsJ d|j         d|j         d            |                    |           l| j        D ]l}	 t                              d|                                |                                           nD# t&          $ r7}t                              d|                                           Y d }~nd }~ww xY w|                     |           t+          |t,                    s |                                x}r|| j        k    s(|                                s|                                r|                                  || j        k    r| j        r<t9          | j        j                  r#t<          j        j         !                                 t9          |j                  r:|j"        
J d            t<          j        j         #                    |j"                   || _        | j$        %                    |j&                   |                                rA|'                                ^}}	| (                    |          )                    ||	           nN|                                r1tU          j+        tX          |          }| -                    |           n	|.                                rtU          j+        t^          |          }| (                    |          }
d	d
l0m1} d	dl2m3} t+          |
||f          r|
}nti          dt;          |                     |5                    |           npt+          |tl          tn          f          r)| (                    |          8                    |           n+t+          |t,                    sJ |9                                 t           j:        j;        r'| (                    |          <                                 | j=        %                    |>                                           | j?        %                    |@                                           t+          |t,                    sQ|                                }|;| (                    |          A                                r|                                  n| j        r<t9          | j        j                  r#t<          j        j         !                                 |                                  d S )Nr   _compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0zdevice should have an indexr   CUDACombinedSchedulingSIMDSchedulingztype(self)=)Br   "check_stack_no_cycles_TESTING_ONLYtorch._dynamo.convert_frame	tracebackextract_stackr  r  rM   filename_dynamoconvert_frame__file__linenor  r  r   r  rX   r  r   r]  rz   rj  r@  r  rI  rG  rD  r2   rZ   r;   ro   rp   codegen_device_guard_exitr|  codegen_device_guard_enterr  r  r   r1  r9  codegen_templater  r  r  rK  rK  r   codegen.cuda_combined_schedulingre  codegen.simdrg  r  codegen_combo_kernelr  r`  codegen_noder  tritondebug_sync_kernelcodegen_syncr  r6  r]  r2  ready_to_flush)rO   r   stackr  framer[  rB   r  r  epiloguebackend_re  rg  r?  s                 rP   r`  zScheduler._codegen9  sc   4 	....+--E55D!%   J"222%-*E*NNNE~u|4$JU^ J Jel J J J '
 J E	! E	!D
		KMMOO..00   
    		LMMOO        t$$$d$:;; 1//+++1 d111~~'' 2'')) 2 JJLLLT000* I/@+00 0 I ,FFHHH(55 V%|779V777,GGUUU*0D'%,,T_===!!  "&.."2"2x  ((99$IIII!!  {#<dCC((....""  {#=tDD++F33TTTTTT888888h9O(PQQ ;&GG()9DJJ)9)9:::,,T2222D#5}"EFF    ((55d;;;;!$(>?????}. 8  ((55777'..t/D/D/F/FGGG%,,T-E-E-G-GHHHd$:;; !**%$*:*:6*B*B*Q*Q*S*S%JJLLL 	=#4T5H5M#N#N 	= G ::<<<

s   8A C99
D:-D55D:Tuple[float, float, str]c                    |d                                          }| t          j        _        || _        |                     |          }|                    |          S r<  )r@  r;   ro   rA   r  r9  benchmark_combo_kernel)rO   r_  r  r?  s       rP   r  z Scheduler.benchmark_combo_kernel  sR     1((** $""6**--i888rR   c                v   t           j        sdS |}|d                                         }|j        dk    rdS ddlm} dg }}t          |          D ]\  }}|                                }	|                     |	          rt          
                    d           	 |                     |	          \  }
}t          j        |
          rt          
                    d|            dS n@# |$ r8}d	t          |          v r!t          
                    d
           Y d}~ dS  d}~ww xY w||
z  }|                    |           	 |                     |          \  }}}n?# |$ r7}d	t          |          v r t          
                    d
           Y d}~dS  d}~ww xY w||z
  dk     p|dk     }t                              t"          j                  rc||k    s|r.t          
                    dt'          ||z  d                     n-t          
                    dt)          ||z  d                     ||z
  |k     p|S )rf  Tr   rg  rh  g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFrs  zCComboKernel benchmark: return True because of loop-carried variableNg333333?z/can fuse (benchmark): fusing causes %sx speeduprn  z3cannot fuse (benchmark): fusing causes %sx slowdown)r   r  r@  rZ   r  ri  r  r1  rd  r  r  r>  r  r  rS   r  r  r  ro  r-   r.   )rO   r  subkernel_nodesr  ri  rk  
path1_listr  r  r_  mspathr  rl  	ms2_clone
path2_listsmall_kernels                    rP   r  z!Scheduler.speedup_by_combo_kernel  s   
 , 	4 #..00 ;%4;;;;;;rZ!/22 	$ 	$HAu))I ##I..   R  55i@@D:b>> !$$U   !55! $   *c!ff44$$]    444444 2ICd####
	)-)D)D_)U)U&CJJ 	 	 	&#a&&00  Y   ttttt	 Y,9c	""7=11 
	SyyLy  E#)1122   
   Ic	//00  
 Y$44s=   AC%%D"*+DDD"E F +FFFr;  	ir.Layoutc                `    | j         |         }|j        J |j                                        S rK   )rv   rB   rk  )rO   r;  r   s      rP   get_buffer_layoutzScheduler.get_buffer_layout  s1    x(x###x""$$$rR   c                   | j         D ]}|                                rt          |                                j                  r|j        j        D ]}t          j        j        	                    |j
                  }|r|                                rx|                                j        dk    r[t          |j        t                    sA|                                g k    r)t          j        j                            |j
                   d S )Nrg  )r  r@  r8   rZ   r   r   r;   ro   r  r   rM   rz   r\   r+   r   zero_dim_cpu_tensor_listr  )rO   rB   rr  rv  s       rP   r  z$Scheduler.update_zero_dim_cpu_tensor  s   J 	H 	HD   
HVDOO,=,=,B%C%C 
H ,2 	H 	HDW377	BBFH"--//H #--//4== *6=:K L L >"OO--338<<TYGGG	H 	HrR   )r  r  rH   ri   r  r   )r  rS   rH   ri   )rB   r   rH   rC   r?  )r  rC   rH   r  )rH   r!  r  r7  rH   r:  )r_  r7  rH   rx   r  rC   r  rC   rH   rx   rK   )r  r  rH   ri   r  )r  r  rH   r  )r  rC   r  rC   r  r  rH   rS   )r  r(   r  rC   r  rC   rH   rx   )rr  r%   r  r&   rH   rx   )r  rC   r  rC   rH   r  )r   r%   rH   rI   r  rC   r  rC   rH   rI   )r  r  rH   r  )r  r  rH   r  )rM   rS   rH   ri   )rE  r  rH   ri   )r  r>  rH   rL  )rB   rC   rH   ri   r_  r7  rH   r  )r  r  rH   rx   )r;  rS   rH   r  ):r[   r   r   r   r   r  r  r  r  r  r  r  r  r  r"  r&  r  r  rI  r  r>  r  rd  r  r5  r  r)  r  r  r  r  r  r  r  r  r  r	  r  r  r  r  r  r(  r=  r:  r8  rD  rK  rU  r9  r]  rd  r`  r  r  r  r  r$  r%  s   @rP   r@   r@     s        ))))   _
 _
 _
 _
 _
 _
B4 4 4 47 7 7 7# # # #, , , ," " " "HLC LC LC LC\## ## ## ##J   6T T T T(   4# # # #&; ; ; ;B   0	 	 	 	4 4 4 4@@ @@ @@ @@D
 
 
 
P$ P$ P$ P$d' ' ' 'R.. .. .. .. ..`? ? ? ?.  .  .  . `, , , ,\$ $ $ $69 9 9 9vF: F: F: F:PrN rN rN rNh* * * *X
 
 
 
J" " " "H
 
 
 
0    J J J J:6 6 6 6@/ / / /	8 	8 	8 	8* * * *$+) +) +) +)Z+ + + ++ + + +   
   ' ' ' '2% % % %
5 5 5 5$# # # #a a a aF9 9 9 9I5 I5 I5 I5V% % % %
H H H H H H H HrR   c                      e Zd Zed'd            Zd(d
Zd(dZd)dZd*dZd+dZ	d,dZ
d-dZd.dZd-dZd/d Zd0d"Zd1d%Zd&S )2rL  r  r>  rH   Sequence[BackendFeature]c                    dS )z0Return a set of .codegen.common.BackendFeature()r   r   )r  r  s     rP   get_backend_featuresz#BaseScheduling.get_backend_features	  s	     rrR   r  rC   r  rx   c                    t           )zO
        Check whether node1 and node2 can be vertically fused or not.
        r  r  s      rP   r  z BaseScheduling.can_fuse_vertical  
     "!rR   c                    t           )zQ
        Check whether node1 and node2 can be horizontally fused or not.
        r  r  s      rP   r  z"BaseScheduling.can_fuse_horizontal  r  rR   r  c                    |                                 s|                                 rt                              ||          S t                              ||          S )z 
        Fuse two nodes
        )rK  r  r  r  r  s      rP   r  zBaseScheduling.fuse  sW      	9!1!1!3!3 	9-225%@@@%**5%888rR   rI  rX  "Tuple[Tuple[sympy.Expr, ...], ...]c                    t           )z[
        Process the iteration sizes in case a transformation needs to be applied.
        r  )rO   rI  s     rP   r:  zBaseScheduling.group_fn)  r  rR   template_nodeepilogue_nodesr7  Optional[str]c                    t           )z
        Given a template node, generate a kernel.

        This function is only available for triton now. If the third-party backend behaves as a sub-class
        of TritonScheduling, it can override it or reuse it.
        r  )rO   r  r  s      rP   rs  zBaseScheduling.codegen_template1  s
     "!rR   rB   (Union[FusedSchedulerNode, SchedulerNode]ri   c                    t           )zD
        Generate a kernel given a list of pre-fused nodes.
        r  r   s     rP   rw  zBaseScheduling.codegen_node>  
     "!rR   c                    t           )zt
        Generate synchronization code for the kernel. This method depends on the hardware characteristics.
        r  rN   s    rP   rz  zBaseScheduling.codegen_syncD  r  rR   c                    dS )z
        Check whether the backend is requesting the scheduler to flush the generated kernel.
        If not supported, please return False.
        Fr   rN   s    rP   r{  zBaseScheduling.ready_to_flushJ  s	    
 urR   c                    t           )z]
        Flush the generated kernel and python wrapper code to the source code file.
        r  rN   s    rP   rD  zBaseScheduling.flushQ  r  rR   r  r:  c                    t           )r=  r  r  s     rP   r>  z$BaseScheduling.benchmark_fused_nodesW  
     "!rR   rI   c                    dS )z
        Return an unsigned integer which represents the priority of this fusion pair.
        The smaller is with higher priority.
        r   r   r  s      rP   r  z'BaseScheduling.get_fusion_pair_priority`  s	     qrR   r_  r  c                    t           )z
        Benchmark the list of nodes to combine and return the execution time
        and memory copy time in milliseconds on randomly generated inputs.
        r  rc  s     rP   r  z%BaseScheduling.benchmark_combo_kerneli  r  rR   N)r  r>  rH   r  r  r  )rI  rX  rH   r  )r  rC   r  r7  rH   r  )rB   r  rH   ri   r   r   r  r  r  )r[   r   r   r  r  r  r  r  r:  rs  rw  rz  r{  rD  r>  r  r  r   rR   rP   rL  rL    s"          [" " " "" " " "	9 	9 	9 	9" " " "" " " "" " " "" " " "   " " " "" " " "   " " " " " "rR   rL  (Union[SchedulerNode, FusedSchedulerNode]r   c                   g }|                                  }|t          |t          j                  sJ |r3|j        ,|                    |                                  d           nddlm} ddl	m
} t          | t                    r| fn| j        }|d                                         }| j                            |          }t          |||f          sJ |t           j        j        _        t&          j        }|                    |                                          }	|t&          _        |                    |                                  d           |                    t/          j        |	d                     |S )Nz" Unfinalized multi template bufferr   rd  r   rf  z Triton code:r  )r  rz   r   rP  make_kernel_renderr  rX   0torch._inductor.codegen.cuda_combined_schedulingre  ru  rg  r`  r  r@  rA   r9  r;   ro   r  r   generated_kernel_countgenerate_kernel_code_from_nodesstripr  ra   )
rB   rW  multi_templatere  rg  r  r  r?  old_generated_kernel_counttriton_codes
             rP   rU  rU  s  s   E++--N!Z@V%W%W!!! ;.;CKKKLLLL	
 	
 	
 	
 	
 	
 	100000&t];;L$%%''.,,V44'N4J#KLLLLL+1(
 &-%C"==fEEKKMM)C&666777X_[&99:::LrR   )r  r   rH   rS   )rB   rC   r%  r&  rv   r  rH   ri   )r  rC   rH   ri   )r  rC   rA   r@   r  r  rH   ri   )r   )rU  rV  rI  rW  rX  rY  rH   rZ  )rB   r  rH   r   ){
__future__r   r  r   rr  r  r  r  r  r  r  r  rj  r  r   r   r   r   r   r	   r
   r   r   r   r   r   r   rt  r   torch._inductor.async_compiletorch._dynamo.utilsr   r   torch._inductor.metricsr   r   %torch.fx.experimental.symbolic_shapesr   torch.utils._ordered_setr   torch.utils._sympy.symbolr   r   torch.utils._tritonr   r   r   r   r   r   r   	codecacher    codegen.commonr!   r"   r#   comm_analysisr$   r%   r&   r'   r(   r)   r*   r+   	loop_bodyr,   runtime.runtime_utilsr-   r.   r  r/   utilsr0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   virtualizedr;   	getLoggerr[   r   _logginggetArtifactLoggerr  rO  	dataclassr?   rC   r  r^   r|   r(  opsatenconvolutionmmbmmaddmmr  r  rj  r`  r  r  r  r  r  rw  ry  r  r  r@   rL  rU  r   rR   rP   <module>r     s   " " " " " "                    				                                        $ $ $ $ 6 6 6 6 6 6 6 6 M M M M M M M M G G G G G G / / / / / / ? ? ? ? ? ? ? ? * * * * * * 6 6 6 6 6 6 6 6 6 6 6 6 6 6 ! ! ! ! ! ! M M M M M M M M M M ; ; ; ; ; ; : : : : : : : : : : : : > > > > > > > > > >       7 7 7 7 7 7 7 7 & & & & & &                                g!!^--hAA
N44XOO  Q. Q. Q. Q. Q. Q. Q. Q.h^ ^ ^ ^ ^ ^ ^ ^B
 
 
 
 
 
 
 
,           &K &K &K &KV #()."<*).,!IN0	  W W W W W 1 W W W"5 5 5 5 5. 5 5 5R+ R+ R+ R+ R+% R+ R+ R+j       ,z* z* z* z* z** z* z* z*zw: w: w: w: w:!3 w: w: w:t	? ? ? ? ?, ? ? ?J %'+ + + + +\ 
 
 
 
 
 
 
 
> +9?,, GH GH GH GH GH GH GH GHT:h" h" h" h" h" h" h" h"V     rR   