
    קg                    j	   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZmZ d dlmZmZmZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmc mZ d dlmZ d dl m!Z!m"Z" d dl#m$Z$m%Z% d dl&m'Z'm(Z(m)Z)m*Z* d d	l+m,Z, d d
l-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7 erd dl8Z8e0j9        Z: ej;        e<          Z=ej>        j?        Z?ej>        j@        Z@e G d d                      ZAe G d d                      ZBe G d d                      ZCdejD        deEfdZFdejG        deEfdZHdejG        deEfdZIdejD        deJfdZK G d d          ZL eL            ZM	 dbdejN        d eejD                 d!eejD                 d"eeO         dejN        f
d#ZPdejD        deEfd$ZQdejD        deEfd%ZRdejD        deEfd&ZSdejD        deEfd'ZTdejD        deEfd(ZUdejD        deEfd)ZVdejD        deEfd*ZWdejD        deEfd+ZXd,ejG        deeejD                 eejD                 f         fd-ZYd.eejD                 d/eOfd0ZZd,ejG        d.eejD                 d1eejD                 d2eJdeejG        ejG        f         f
d3Z[d,ejG        deejG        ejG        f         fd4Z\ eJd5          Z]d6eJdeJfd7Z^dejD        deJfd8Z_d9ejN        fd:Z` eja        d          d;             Zbd<eejD        eJf         deeejD        eJf                  fd=Zcd>ejG        dejG        fd?Zdd,ejG        d@ejG        dAejG        dBeJdeejG        ejG        f         f
dCZed,ejG        dejG        fdDZf	 dbdejN        dEeBdFeCfdGZgdH ZhdeAfdIZid9ejN        fdJZjdKeek         dLeek         dMekdeekeeJ         eeJ         f         fdNZldKeek         dLeek         dMekdeekeeJ         eeJ         f         fdOZmdKeek         dLeek         dMekdeekeeJ         eeJ         f         fdPZndKeek         dLeek         dMekdeekeeJ         eeJ         f         fdQZod dRlpmqZq dS Zr	 dcdejN        dEeBdeejD                 fdTZs	 ddd,ejG        deejG        ejG        f         fdVZt	 	 	 	 	 dedZej        jG        d[eOd\eOd]eEd^eeeOeeO         f                  d_eEd`eeO         ddfdaZudS )f    Ndefaultdict)	dataclassreplace)CallableDictListOptionalSetTupleTYPE_CHECKINGUnion)BackwardState)is_sym_nodepy_sym_types)magic_methodsmethod_to_operator)find_symbol_binding_fx_nodesfree_symbolshint_intis_symbol_binding_fx_node)graph_drawer)CheckpointPolicy   )config)get_aot_graph_name)is_with_effects)fx_graph_cseget_aten_targetc                       e Zd ZU dZee         ed<   ee         ed<   ee         ed<   ee         ed<   ee         ed<   dej        fdZ	dej        fd	Z
dej        fd
Zdej        fdZdej        fdZdS )OpTypesz8Class for keeping track of different operator categoriesfusible_opscompute_intensive_ops
random_opsview_opsrecomputable_opsnodec                 .    t          |          | j        v S N)r   r"   selfr'   s     Y/var/www/html/ai-engine/env/lib/python3.11/site-packages/torch/_functorch/partitioners.py
is_fusiblezOpTypes.is_fusible9   s    t$$(888    c                 .    t          |          | j        v S r)   )r   r#   r*   s     r,   is_compute_intensivezOpTypes.is_compute_intensive<   s    t$$(BBBr.   c                 .    t          |          | j        v S r)   )r   r$   r*   s     r,   	is_randomzOpTypes.is_random?   s    t$$77r.   c                 .    t          |          | j        v S r)   )r   r%   r*   s     r,   is_viewzOpTypes.is_viewB   s    t$$55r.   c                 .    t          |          | j        v S r)   )r   r&   r*   s     r,   is_recomputablezOpTypes.is_recomputableE   s    t$$(===r.   N)__name__
__module____qualname____doc__r   r   __annotations__fxNoder-   r0   r2   r4   r6    r.   r,   r!   r!   /   s        BBXx=(((H(m(m###9rw 9 9 9 9C C C C C8bg 8 8 8 86BG 6 6 6 6>BG > > > > > >r.   r!   c                   X   e Zd ZU eej                 ed<   eej                 ed<   eej                 ed<   eej                 ed<   eej        e	f         ed<   e
j        deej                 fd            Zdej        defd	Zdej        defd
Zdej        defdZdej        de	fdZdS )NodeInfoinputs_required_fw_nodesrequired_bw_nodesunclaimed_nodesfw_orderreturnc                 J     t          d  j        D              fd          S )Nc              3      K   | ]}|V  d S r)   r>   .0ns     r,   	<genexpr>z-NodeInfo.required_fw_nodes.<locals>.<genexpr>V   s"      001Q000000r.   c                     j         |          S r)   )rE   )rK   r+   s    r,   <lambda>z,NodeInfo.required_fw_nodes.<locals>.<lambda>V   s    a@P r.   key)sortedrB   r+   s   `r,   required_fw_nodeszNodeInfo.required_fw_nodesS   s:    00/0006P6P6P6P
 
 
 	
r.   rK   c                     || j         v S r)   )rB   r+   rK   s     r,   is_required_fwzNodeInfo.is_required_fwY   s    D+++r.   c                     || j         v S r)   )rC   rU   s     r,   is_required_bwzNodeInfo.is_required_bw\   s    D***r.   c                     || j         v S r)   )rD   rU   s     r,   is_unclaimedzNodeInfo.is_unclaimed_   s    D(((r.   c                 J    || j         v sJ d| d            | j        |         S )NNode z not in fw nodes!)rB   rE   rU   s     r,   get_fw_orderzNodeInfo.get_fw_orderb   s7    D++++-IQ-I-I-I+++}Qr.   N)r7   r8   r9   r	   r<   r=   r;   r   r   int	functoolscached_propertyrS   boolrV   rX   rZ   r]   r>   r.   r,   r@   r@   I   s6         MBG$$$27|###\!!!27C<    
4= 
 
 
 

, ,D , , , ,+ +D + + + +)bg )$ ) ) ) ) bg  #            r.   r@   c                   B    e Zd ZU eed<   eed<   eed<   eed<   eed<   dS )MinCutOptionsban_if_used_far_apartban_if_long_fusible_chainsban_if_materialized_backwardban_if_not_in_allowlistban_if_reductionN)r7   r8   r9   ra   r;   r>   r.   r,   rc   rc   g   sN          $$$$"&&&&!!!!r.   rc   r'   rF   c                 h    | j                             dd           t          j        t          j        fv S )N	recompute)metagetr   MUST_RECOMPUTEPREFER_RECOMPUTEr'   s    r,   must_recomputerp   p   s0    9==d++')0  r.   fx_gc                 L    d}| j         j        D ]}t          |          r dS dS )NFT)graphnodesrp   )rq   foundr'   s      r,   has_recomputable_opsrv   w   s<    E
   $ 	44	5r.   c                     | j         j        D ]F}t          |          r5t          |j        d          r t
          j        j        |j        j        v r dS GdS )NtagsTF)	rs   rt   rp   hasattrtargettorchTagnondeterministic_seededrx   )rq   r'   s     r,   has_recomputable_rng_opsr~      s^    
   4  	V,,	 	1T[5EEE445r.   c                     t          | j        d         t          j        t          j        f          rdS t          | j        d         t          j                  sJ dS )Nvalr      )
isinstancerk   r{   SymIntSymBoolSymFloatro   s    r,   sym_node_sizer      sM    $)E"U\5=$ABB qdi&777771r.   c                       e Zd Zd ZdS )InvalidNodeBasec                     dS )NzInvalid Noder>   rR   s    r,   __repr__zInvalidNodeBase.__repr__   s    ~r.   N)r7   r8   r9   r   r>   r.   r,   r   r      s#            r.   r   joint_graphrA   outputssubgraphc                   
 t          j                    }i 
|D ]-}|                    |j                  }|j        |_        |
|<   .| j        D ]}t          |          r|dk    rt          
|<   "|
v r'|j        dk    rt          
|<   =|j        dk    r`t          j
        |j        i |j        }
fd|D             }t          |          rt          
|<   |                    |
fd          
|<   |j        dk    r|                    |
fd          
|<   |j        dk    r	 g }|D ]}	t          |	t           j                  r\|	
vrt#          d	|	 d
          t          
|	         t$                    rJ d	|	 d            |                    
|	                    x|                    |	           |                    t+          |                     |                                 |                                 |S )a  
    Given a graph, extracts out a subgraph that takes the specified nodes as
    inputs and returns the specified outputs.

    This includes specifying non-placeholder nodes as inputs.

    The general strategy is to initialize all inputs with proxies as we
    encounter them, and trace through the graph, only keeping values which take
    in valid proxies. Then, all dead code is eliminated.
    backwardplaceholdercall_functionc                 z    g | ]7}t          |t          j                  t          |         t                    8S r>   )r   r<   r=   r   )rJ   xenvs     r,   
<listcomp>z6_extract_graph_with_inputs_outputs.<locals>.<listcomp>   sI       a))3q6?33  r.   c                     |          S r)   r>   r   r   s    r,   rN   z4_extract_graph_with_inputs_outputs.<locals>.<lambda>       CF r.   get_attrc                     |          S r)   r>   r   s    r,   rN   z4_extract_graph_with_inputs_outputs.<locals>.<lambda>   r   r.   outputr\   z couldn't be found in envz was invalid, but is output)r<   Graphr   namerk   rt   _must_be_in_backwardInvalidNodeoppytreearg_tree_leavesargskwargsany	node_copyr   r=   RuntimeErrorr   appendr   tupleeliminate_dead_codelint)r   rA   r   r   	new_graphr'   new_nodeall_argsoutput_valuesr   r   s             @r,   "_extract_graph_with_inputs_outputsr      s     

I
C   ((33	D		!  %% 	(j*@*@#CI3;; W%%#CIIW''-tyHDKHHH   !  H
 8}} 'D	!++D2B2B2B2BCCCIIW
""!++D2B2B2B2BCCCIIW  M 	$ 	$a!! 	$||"#G1#G#G#GHHH!A  6 65q5556 6 6   Q((((  ####U=))***!!###NNr.   c                     | j         dk    o5dt          | j                  vot          |            ot	          |            S Nr   tangents)r   strrz   _is_bwd_seed_offset_is_fwd_seed_offsetro   s    r,   
_is_primalr      sP    =  	*c$+...	*#D)))	* $D)))	r.   c                 D    | j         dk    odt          | j                  v S r   r   r   rz   ro   s    r,   _is_tangentr      s#    7m#F
c$+6F6F(FFr.   c                 p    | j         dk    o+dt          | j                  v pdt          | j                  v S )Nr   bwd_seedbwd_base_offsetr   ro   s    r,   r   r      =    7m# c$+&&&O*;s4;?O?O*Or.   c                 p    | j         dk    o+dt          | j                  v pdt          | j                  v S )Nr   fwd_seedfwd_base_offsetr   ro   s    r,   r   r      r   r.   c                 r    | j         dk    o,t          | j                            d          t                    S )Nr   r   )r   r   rk   rl   r   ro   s    r,   _is_backward_stater      s,    7m#W
49==3G3G(W(WWr.   c                 @    | j                             dd           dk    S )Npartitioner_tagis_backwardrk   rl   ro   s    r,   _has_tag_is_backwardr      s    9==*D11]BBr.   c                 @    | j                             dd           dk    S )Nr   must_be_in_backwardr   ro   s    r,   _has_tag_must_be_in_backwardr     s    9==*D115JJJr.   c                 \    t          |           pt          |           ot          |           S r)   )r   r   r   ro   s    r,   r   r     s/    '-- T""<t'<'<r.   joint_modulec                    t          j        d | j                            d          D              }|d |         }||d          }||fS )Nc              3   $   K   | ]}|j         V  d S r)   )r   rJ   r'   s     r,   rL   z+_extract_fwd_bwd_outputs.<locals>.<genexpr>  s$      	K	K$)	K	K	K	K	K	Kr.   r   r   )r   r   rs   
find_nodes)r   num_fwd_outputsr   fwd_outputsbwd_outputss        r,   _extract_fwd_bwd_outputsr     sc     $	K	K 2 = = = J J	K	K	KG *?*+K/**+K##r.   saved_valuesr   c                 V    | D ]%}|j         |k    r|                     |            d S &d S r)   )r   remove)r   r   saved_values      r,   _remove_by_namer     sI    #  t##,,,EE $ r.   saved_sym_nodesr   c                   t          | |          \  }}| j                            d          }g t          t          |          }g t          t
          |          }g t          t          |          }	g t          t          |          }
g t          t          |          }t          | j        ||z   |z   |
z   |d          }|                    d          D ]\}|j
        s+t          ||j                   t          ||j                   4t          |          rt          ||j                   |sJ ]t                      }g }g }|D ]S}t          |          }|r+|                    |           |                    |           >|                    |           Tt#          | j                  }t%          j        |||          D ]c}d|j        vrt+          |j        d                   |z
  }t-          |d           D ]"}||vr|                    ||                    #||z  }d|                                 |                    ||z              t          | j        ||	z   ||z   |z   d          }t          | j        ||z   |z   |
z   |z   |d          }t2          j                            | |          }t2          j                            | |          }||fS )	Nr   r   r   r   r   c                     | j         S r)   r   )ss    r,   rN   z*_extract_fwd_bwd_modules.<locals>.<lambda>Y  s    16 r.   rO   forward)r   rs   r   filterr   r   r   r   r   r   usersr   r   setr   addr   r   	itertoolschainrk   r   rQ   clearextendr<   _lazy_graph_module_make_graph_module)r   r   r   r   r   r   placeholdersprimal_inputstangent_inputsfwd_seed_offset_inputsbwd_seed_offset_inputsbackward_state_inputs	bwd_graphr'   saved_symbolssaved_sym_nodes_bindingsaved_sym_nodes_derivedsymbolsymbol_bindingsnew_symbolsr   	fwd_graph
fwd_module
bwd_modules                           r,   _extract_fwd_bwd_modulesr     si     8o     K  %00M0BBL7fZ667M9vk<889NIv&9<HHIIv&9<HHIGf%7FFG2,&7:PP	 I $$$66 ) )z 	)L$)444OTY7777%% 	)L$)444(((( (+uuM     1 1*400 	1f%%%#**40000#**40000 3<3EFFO 7~VV % %	!!"49U#344}D)9)9::: 	? 	?A ''#**?1+=>>>>$
 25LLMMM 3..l"_4	 I 3
	
	 !	!  		 
 		 	I &99,	RRJ&99,	RRJz!!r.   c                ^   t          |           rt          | ||          S t          t          t          | j        j                            }t          t          t          | j        j                            }||z   }t          | |          \  }}t          | j        ||d          }d |j        D             g }	g }
| j        j        D ]}|j
        vrt          |          r|
                    |           1d|j        vrC|j        dk    r8|j        }t!          d |D                       sJ |	                    |           }fd|j        D             }d|j        v r/t!          d |D                       r|
                    |           |	                    |           t          t$                              |	                                                    }	t          t$                              |
                                                    }
t+          | |	|
|	          S )
a  
    Partitions the :attr:`joint_module` in a manner that closely resembles the
    behavior observed in the original ``.forward()`` and ``.backward()`` of the
    callable, i.e., the resulting forward graph contains those operators that
    are executed in the original ``.forward()`` callable passed to
    :func:`aot_function`.

    The default partitioner collects the operators that are between the forward
    inputs and the forward outputs. This helps in finding the tensors which have
    to be stashed for the backward pass. These stashed tensors become the output
    of the generated forward graph. The remaining operators are then placed in
    the backward graph.

    .. warning::
        This API is experimental and likely to change.

    Args:
        joint_module(fx.GraphModule): The joint forward and backward graph. This
            is the result of AOT Autograd tracing.

    Returns:
        Returns the generated forward and backward Fx graph modules.
    r   r   c                 2    h | ]}|j         d k    |j        S r   r   r   r   s     r,   	<setcomp>z$default_partition.<locals>.<setcomp>  s-       $'X:M:M	:M:M:Mr.   tensor_metar   c              3   @   K   | ]}|j         t          j        k    V  d S r)   )rz   operatorgetitem)rJ   users     r,   rL   z$default_partition.<locals>.<genexpr>  s,      II4t{h&66IIIIIIr.   c                 &    g | ]}|j         v|S r>   r   )rJ   rK   forward_node_namess     r,   r   z%default_partition.<locals>.<listcomp>  s-       7I)I)I)I)I)Ir.   c              3   4   K   | ]}t          |          V  d S r)   r   rI   s     r,   rL   z$default_partition.<locals>.<genexpr>  s9       2 2#$A2 2 2 2 2 2r.   r   r   )rv   #min_cut_rematerialization_partitionlistr   r   rs   rt   r   r   r   r   r   r   rk   r   r   allr   dictfromkeyskeysr   )r   _joint_inputsr   r   r   rA   r   r   forward_only_graphr   r   r'   r   backward_usagesr	  s                 @r,   default_partitionr    s   4 L)) 
2-
 
 
 	
 
L,>,DEEFFM!&)<l>P>V"W"WXX33F7o     K <FK  06   LO"( * *9...t 	* ""4(((($)++?0J0JJEII5IIIIIIII&&&&   :  O 	))c 2 2(72 2 2 / /)  &&7777##D))))l3388::;;L4==99>>@@AAO#''	   r.   g    .Anumelc                     | |j         z  S r)   )itemsize)r  dtypes     r,   _tensor_nbytesr    s    5>!!r.   c                 4   dt           fdd| j        v r| j        d         }t          |t                    rdS t          |t          t
          f          rt          fd|D                       S t          |t                    r-t          fd|                                D                       S t          |t          j
                  r |          S t          dt          |           d|            | j        d	k    rd
S t          d|  d          )NrF   c                     t          | t          j                  sdS t          t	          |                                 d          | j                  S )Nr      fallback)r   r{   Tensorr  r   r  r  r   s    r,   object_nbytesz_size_of.<locals>.object_nbytes  sB    !U\** 	1hqwwyy4@@@!'JJJr.   r   r   c              3   .   K   | ]} |          V  d S r)   r>   )rJ   rK   r#  s     r,   rL   z_size_of.<locals>.<genexpr>  s-      55A}}Q''555555r.   c              3   4   K   | ]\  }} |          V  d S r)   r>   )rJ   _rK   r#  s      r,   rL   z_size_of.<locals>.<genexpr>  s1      @@DAq}}Q''@@@@@@r.   zUnknown metadata type z	 on node r   r   r\   zO didn't have `val` metadata; we should always have `val` metadata on the nodes.)r^   rk   r   r   r  r   sumr  itemsr{   r!  r   typer   )r'   r   r#  s     @r,   _size_ofr*    sH   KC K K K K
 	ic<(( 
	&1 dE]++ 	&5555555555T"" 	&@@@@CIIKK@@@@@@U\** 	& =%%%NDIINNNNOOOw*q
eeee  r.   rs   c                     ddl m}  |t                    }| j        D ]'}|j        dk    r||j        j        xx         dz  cc<   (t          t          |	                                d d                     d S )Nr   r   r   r   c                     | d         S Nr   r>   r"  s    r,   rN   z_count_ops.<locals>.<lambda>  s
    AaD r.   TrP   reverse)
collectionsr   r^   rt   r   rz   r7   printrQ   r(  )rs   r   cntr'   s       r,   
_count_opsr3    s    ''''''%+c**C + +7o%%$%%%*%%%	&..$
?
?
?@@@@@r.   c                  v   g } t          t          j        j                  D ]}t	          t          j        j        |          }t          |t          j        j                  sA|                                D ]A}t	          ||          }t          j	        j
        |j        v r|                     |            nB| S r)   )dirr{   opsatengetattrr   _opsOpOverloadPacket	overloadsr|   	pointwiserx   r   )r6  	attr_nameopoverloadpacketoverloadop_overloads        r,   pointwise_opsrA    s    
C(( 
 
	"59>9==*EJ,GHH 	(2244 	 	H!"2H==Ky"k&666

+,,, 7
 Jr.   	depth_mapc                 j    fd| D             }t          |                                d d          S )Nc                 j    i | ]/}t          |t          j        j        j                  &||         0S r>   )r   r{   r<   r'   r=   )rJ   argrB  s     r,   
<dictcomp>zsort_depths.<locals>.<dictcomp>  sE        #z#ux}?Q/R/RYs^  r.   c                     | d         S r-  r>   r"  s    r,   rN   zsort_depths.<locals>.<lambda>  s
    AaD r.   Tr.  )rQ   r(  )r   rB  
arg_depthss    ` r,   sort_depthsrI    sP       '+  J *""$$..$GGGGr.   gmc                 t  
 t          j                    i 
| j                            d          D ]}                    |
fd          
|<   i t          | j        j                  D ]
\  }}||<   
fd}t          t          t          | j        j                            }d}t          j        }|D ]"}|j        D ]}|         |k     r
|         }|}#|| S t          | j        j                  |         d         D ]} ||           t          j                             |           }	|	S )a  
    This pass finds the first bwd node in the graph (by looking at users of
    tangents) and then reorders the graph by walking from this node to all the
    way to the end of the graph. At each op in this traveral, we insert this op
    in a new graph and try to bring only the relevant subgraph from the other
    non-bwd edges relevant for this op. This closely mimics the behavior of
    autograd engine.

    Why is this pass required in the first place?

    This is an artifact of how partitioners work today. The starting point of
    partitioner is a joint graph, which is fwd and then bwd graph. In the case
    of checkpointing, we keep portions of fwd graph in their original place in
    the joint graph, while obtaining a bwd graph. As a result, the resulting bwd
    graph has copies of recomputed fwd subgraphs followed by the original bwd
    graph. If we run this naively, this leads to bad memory footprint, because
    the fwd subgraphs are live for way longer duration than necessary. This pass
    reorders the operations such that we prioritize the ops for the original bwd
    graph while only realizing those ops from the fwd graph that are necessary
    at any given point in the graph.
    r   r   c                     |          S r)   r>   r   s    r,   rN   z5reordering_to_mimic_autograd_engine.<locals>.<lambda>9  s    A r.   c                 X   | g}t                      }t          |          dk    rO|                                } | |v s| v r0|                    |            || j        z  }t          |          dk    Ot          |fd          }|D ]}                     | fd          | <   d S )Nr   c                     |          S r)   r>   )rK   orders    r,   rN   zSreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph.<locals>.<lambda>L  s    %( r.   rO   c                     |          S r)   r>   r   s    r,   rN   zSreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph.<locals>.<lambda>N  r   r.   )r   lenpopr   all_input_nodesrQ   r   )r'   	cur_nodesinsertable_nodesr   r   rO  s      r,   insert_node_in_graphzAreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph?  s    F	55)nnq  ==??D'''43;;  &&& --I )nnq   ""28J8J8J8JKKK$ 	D 	DD!++D2B2B2B2BCCCII	D 	Dr.   N)r<   r   rs   r   r   	enumeratert   r  r   r   mathinfr   r{   GraphModule)rJ  r'   idxrV  r   first_node_in_bwdminimum_ordertangentr  new_gmr   r   rO  s             @@@r,   #reordering_to_mimic_autograd_enginer`    s   . 

I"$C ##}#55 @ @''.>.>.>.>??D		Erx~..  	TdD D D D D D D$ &bhn==>>NHM! ) )M 	) 	)DT{]** %d$(!	)  	 RX^$$U+<%=%?%?@ # #T"""" X!!"i00FMr.   	fw_module	bw_modulenum_sym_nodesc                 (   t          j                    }d }d }d } ||           } ||          }	 ||          }
i }| j        j        D ]r}t	          |          rat          |j        d          rLt          j        j	        |j        j
        v r/||j                 }|	|j                 }|
|j                 }||d||<   st          j        j        j        }t          j        j        j        }d }|j                            d          D ]}d|j        v r|} n|t#          d	          g }|                                D ]\  }}|d
         }|d         }|j        }|                    |          5  |                    d||j        g|j        R |j                  }|                    dt.          j        |dfi           }|                    dt.          j        |dfi           }|                    |           |                    |           |                    |           d d d            n# 1 swxY w Y   |j        }|                    |          5  dt9          |           }|                    |          } | ||                    |j        d<   d d d            n# 1 swxY w Y   |                    |          5  |                    d|||j        g|j        R |j                  }|                    |           |                    |           d d d            n# 1 swxY w Y   t9          t?          |j                            d                              }|j        d         }tA          |          |z
  }|d |         tC          |          z   ||d          z   }|j        "                    |           |j                            |           |#                                 |#                                 ||fS )Nc                     i }| j         j        D ]I}|j        dk    r<t          |j        d          r't
          j        j        |j        j        v r
|||j	        <   J|S )Nr   rx   )
rs   rt   r   ry   rz   r{   r|   r}   rx   r   )gmodrandom_nodesr'   s      r,   get_rng_opsz*functionalize_rng_ops.<locals>.get_rng_ops  sd    J$ 	/ 	/D?**DK00 +I59III*.TY'r.   c                     d| j         vrdS | j         d         }t          |t                    s|f}|D ]/}t          |t          j                  r|j        j        dk    r dS 0dS )zV
        Check the example value of the node outputs to find the device type.
        r   Ncudacpu)rk   r   r   r{   r!  devicer)  )r'   
candidates	candidates      r,   
get_devicez)functionalize_rng_ops.<locals>.get_device  s|     	!!4Yu%
*e,, 	'$J# 	" 	"I)U\22 "#(F22!66ur.   c                 p    | dk    rt           j                                        S t          j                    S )Nrj  )r{   rj  get_rng_state)rl  s    r,   get_sample_rng_statez3functionalize_rng_ops.<locals>.get_sample_rng_state  s0    V:++---"$$$r.   rx   )fwdbwdr   r   r^  zaCouldn't find tangent node in graph inputs. This is unexpected, please file a bug if you see thisrs  rt  r   )r   r   r   r   rng_state_output_r   r   )$r   countrs   rt   rp   ry   rz   r{   r|   r}   rx   r   _prims	rng_primsrun_and_save_rng_staterun_with_rng_stater   r   r(  inserting_beforecreate_noder   r   r  r  replace_all_uses_with
erase_noder   nextr   rk   iterrQ  r   r   	recompile) r   ra  rb  rc  uidrh  ro  rr  joint_graph_rng_opsfw_graph_rng_opsbw_graph_rng_opsrecomputable_rng_ops_mapr'   	base_nodefw_nodebw_noderun_and_save_rngrz  bw_tangent_start_nodefw_rng_state_outputs	node_pairfw_graphfunctional_fw_nodestate
rng_outputbw_graph
state_namebw_rng_state_nodefw_output_node
fw_outputssym_node_start_idxr   s                                    r,   functionalize_rng_opsr  g  s   2 /

C	 	 	  $% % % &+l33"{9--"{9--!"( 	S 	S4  	SV,,	S 	1T[5EEE+DI6I&ty1G&ty1G:A'2R2R$Y/|-D/B **m*<<  	!!$(!E " $o
 
 	
  8 > > @ @ /) /)	9E"E"?&&w// 	/ 	/!)!5!5 n4w|44~	 "6 " " (( (!,	 )  E "-- &  .  J ))*555((( ''...1	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/6 ?&&'<== 	V 	V8T#YY88J ( 4 4Z @ @,@,@GATAT,U,U"5)	V 	V 	V 	V 	V 	V 	V 	V 	V 	V 	V 	V 	V 	V 	V
 &&w// 		) 		)!--"'G',GG~	 .  J ))*555(((		) 		) 		) 		) 		) 		) 		) 		) 		) 		) 		) 		) 		) 		) 		) $y99X9FFGGHHN$Q'JZ=8&&&'
$
%
%	&
'((
)	* 
 O7###O~...is9   1B6H33H7	:H7	AJ**J.	1J.	
AL..L2	5L2	c                     | j         j        D ]Z}t          |          rI|j        D ]A}t          |          r0|j        d         |j        d         k    rt
          j        |j        d<   B[| S )a  
    If there are two consecutive checkpointed blocks with no operator in
    between, we would still want to stash the tensor at the boundary of
    checkpointed blocks. The following pass makes the last output node
    non-recomputable to allow for that.
    ac_graph_idrj   )rs   rt   rp   r   rk   r   	MUST_SAVE)r   r'   r  s      r,   cleanup_recompute_tagsr    s     "( H H$ 	H
 H H"4((H	-049]3KKK-=-GDIk*r.   	node_infomin_cut_optionsc                   $%&'()*+,-./ t                      t                      .t          rCd | j        D             }|d .j        D             z
  }t          d|           t                       d %d &%&.fd'	 dd l}n"# t          $ r}t          d          |d }~ww xY w'.fd	)).fd
}'fd(dt          f(.fd}	|
                                -t                      $$-.fd}
| j        D ]}|j        dk    r|j        v r^|j        vr+-                    |j        dz   dt           j                   L-                    |j        dz   dt           j                   t%          |          r+-                    |j        dz   dt           j                   t'          |          st)          |          r |
|                               |          r ||          r |
|           d|j        vod|j        vp.d|j        v o%t/          |j        d         t0          j                   }t5          |          rt          t7          |                    }nI|r<t/          |j                            d          t:                    rdnt           j        }n |	|          }-                    |j        dz   |j        dz   |           |j        D ]4}-                    |j        dz   |j        dz   t           j                   5 dt>          t@          j!                 dtD          dtD          f'fd}j#        rj$        D ]}fd|j        D             }fd|j        D             }tK          |          dk    r ||tM          |                    }tO          |j                  D ]}                    |          rz(                    |          |k    ra '||          rU|$v rAtR          *                    d|(                    |          ||(                    |                      |
|           j+        rt                      }| j        D ]w}                    |          s(                    |          |fg}(                    |          }tK          |          dk    rtY          j-        |          \  }}||v r0|.                    |           (                    |          |dz   k    rctK          |          dk    rPtR          *                    d||(                    |          (                    |                      |
|           nm|j        D ]Q}                    |          r: '||          r.|$vr*tY          j/        |(                    |          |f           RtK          |          dk    y	 |0                    -dd          \  }}nl# tb          $ r_ t          d            t          d!2                    |j3        j4        5                    -                               tm          -            w xY w|\  },t                      }-fd"|D             D ]'\  /}|7                    ,/fd#|D                        (t                      }|D ]<\  } }!| d d$         |!d d%         k    sJ | d d$         }"|.                    |"           =tq          |           *d& ts          | j                  D             +tu          *fd'|D             +fd()          }#|#$fS )*Nc                     h | ];}|j         d k    t          |j        d          "t          |j        j                  <S )r   _overloadpacket)r   ry   rz   r   r  r   s     r,   r  z solve_min_cut.<locals>.<setcomp>!  sN     
 
 
w/))gdkCT.U.U) +,,)))r.   c                 ,    h | ]}t          |          S r>   )r   rJ   is     r,   r  z solve_min_cut.<locals>.<setcomp>&  s    )T)T)TQ#a&&)T)T)Tr.   z$Ops banned from re-materialization: c                    |j         t          j        j        j        k    rdS |j        d         }t          j        j                            |          \  }}|D ]2}|j	        |         }| |u r dS t          |t                    r| |v r dS 3dS NFr   T)rz   r{   r6  higher_orderauto_functionalizedr   _higher_order_opsauto_functionalizeget_mutable_argsr   r   r  )ab
mutable_opmutable_arg_namesr&  r   rE  s          r,   !can_fuse_into_auto_functionalizedz8solve_min_cut.<locals>.can_fuse_into_auto_functionalized*  s    8uy-AAA5VAY
 #6GG
SS	
% 	  	 D(4.CCxxtt#t$$  8844ur.   c                     |j         t          j        j        j        k    rdS |j        d         }|D ]}|j        d         |         }| |u r dS dS )NFtensors_to_cloner   T)rz   r{   r6  r   triton_kernel_wrapper_functionalr   )r  r  r  r   rE  s        r,   .can_fuse_into_triton_kernel_wrapper_functionalzEsolve_min_cut.<locals>.can_fuse_into_triton_kernel_wrapper_functional;  sd    8uy-NNN5H%78% 	 	D(8$T*CCxxtt ur.   c                     t          |          t          j        k    rdS  | |          rdS  | |          rdS                     |           o                    |          S )NT)r   r7  catr-   )r  r  r  r  op_typess     r,   r-   z!solve_min_cut.<locals>.is_fusibleE  sz     1))4,,Q22 	499!Q?? 	4""1%%@(*=*=a*@*@@r.   r   zANeed networkx installed to perform smart recomputation heuristicsc                 `                        |           rdS | h}t          |          dk    r|                                }|j        D ]P}                    |          s ||          s dS                      |          r|                    |           Qt          |          dk    dS r  )r4   rQ  rR  r   rV   r   )r'   rT  curr  r-   r  r  s       r,   is_materialized_backwardsz0solve_min_cut.<locals>.is_materialized_backwardsW  s    D!! 	5F	)nnq  --//C	 ( ( //55  jjd>S>S  44##D)) (MM$''' )nnq   ur.   c                    | j         dk    rdS | j        t          j        k    rdS | j                            dd           t          j        k    rdS t          j	        r
                    |           rdS | j        t          j        j        t          j        j        fv rdS j        r                    |           sdS n,                    |           s                    |           rdS j        r; |           r0t(                              d| t-          | j                             dS | j        dk     r| j        t          j        k    rdS j        r6t7          d | j        D                       }t;          |           }|dz  |k     S dS )	Nr   Frj   Tzmaterialized backwards: %s %si  c              3   h   K   | ]-}t          |t          j                  t          |          V  .d S r)   )r   r<   r=   r*  r  s     r,   rL   zBsolve_min_cut.<locals>.should_ban_recomputation.<locals>.<genexpr>  sM       % % !*Q2H2H%% % % % % %r.   r   )r   rz   r  r  rk   rl   r   r  r   recompute_viewsr4   r7  lift_fresh_copydefault
lift_freshrg   r6   r2   r0   rf   loginfor   r   dist_from_bwmax_dist_from_bwrh   r'  r   r*  )r'   input_tensors_sizeoutput_sizer  r  r  s      r,   should_ban_recomputationz/solve_min_cut.<locals>.should_ban_recomputatione  s   7o%%5;(***59==d++/?/III4! 	h&6&6t&<&< 	5;4/79PQQQ52 	++D11 t !!$'' 8+H+H+N+N t 7 	<U<U=
 =
 	 HH4dE$*<M<MNNN4 t##(9F<S(S(S4 + 	8!$ % %%)Y% % % " " #4..K?%777ur.   c                 d      j         dk    rdS t           fd j        D                        S )Nr   Tc              3   0   K   | ]} |          V  d S r)   r>   )rJ   r  r-   r'   s     r,   rL   z9solve_min_cut.<locals>.is_materialized.<locals>.<genexpr>  s/      EE$zz$--EEEEEEr.   )r   r  r   )r'   r-   s   `r,   is_materializedz&solve_min_cut.<locals>.is_materialized  sA    7m##4EEEEE$*EEEEEEEr.   rF   c           
         t          |           }t          j        r!                    |           rt          j        S t          | j        d         t                    r,t          | j        d         t          j
                  st          S t          |dt          t          | j        d          d          z  z            } |           r|S |dz  S )Nr   g?d   r      )r*  r   r  r4   rX  rY  r   rk   r   r{   r   INT_INFr^   maxminr  )r'   mem_szr  r  s     r,   get_node_weightz&solve_min_cut.<locals>.get_node_weight  s    $! 	h&6&6t&<&< 	 8Odi&55 	di.==  Vsc#d.?*E*Eq&I&IIJKK?4   	MA:r.   c                 B                        |           rdS | v rdS t          |           rdS d| j        v r't          | j        d         t          j                  rdS                     |                                d| j        dz   t          j
                   dS )NFr   source_incapacityT)r4   rp   rk   r   r{   r   r   add_edger   rX  rY  )r'   banned_nodesdont_bannx_graphr  s    r,   ban_recomputation_if_allowedz3solve_min_cut.<locals>.ban_recomputation_if_allowed  s    D!! 	585 $ 	5DI*TYu-=u~"N"N5
 	(DI$5IIItr.   r   r  sinkr  _outr   r          start_nodes	max_rangec           
         g }| D ]-}t          j        |	                    |          |df           .t          |          dk    rt          j        |          \  }}}|s	                    |          S |j        D ]f}	                    |          rO	                    |          |k    r1t          j        |	                    |          | ||          f           gt          |          dk    |S )z
        Finds the first unfusible node in the chain of nodes starting from
        `start_nodes` and returns its position.
        Tr   )heapqheappushr]   rQ  heappopr   rV   )
r  r  sorted_nodesrK   r&  r'   node_is_fusibler  r-   r  s
           r,   find_first_unfusiblez+solve_min_cut.<locals>.find_first_unfusible  s&   
 9; 	O 	OAN<)*@*@*C*CQ)MNNNN,!##',}\'B'B$At_" 4 --d333
  ++D11  --d33i?? N$"//55tZZd=S=ST   ,!## r.   c                 d    g | ],}                     |                              |          -S r>   )rV   r]   rJ   r  r  s     r,   r   z!solve_min_cut.<locals>.<listcomp>-  sK       ++D11&&t,,  r.   c                 >    g | ]}                     |          |S r>   )rV   r  s     r,   r   z!solve_min_cut.<locals>.<listcomp>2  s<       I4L4LT4R4R  r.   z1used above/below fusible %s:(%s) -> %s -> %s:(%s)r  ztoo long %s %s %s %sr  z-Failed to compute min-cut on following graph:
c              3   ,   K   | ]}||         fV  d S r)   r>   )rJ   rK   r  s     r,   rL   z solve_min_cut.<locals>.<genexpr>  s,      88Q$888888r.   c              3   (   K   | ]}|v |fV  d S r)   r>   )rJ   vnon_reachableus     r,   rL   z solve_min_cut.<locals>.<genexpr>  s1      AAa=.@.@q!f.@.@.@.@AAr.   c                     i | ]\  }}||	S r>   r>   )rJ   r[  r'   s      r,   rF  z!solve_min_cut.<locals>.<dictcomp>  s    HHHic4cHHHr.   c              3   (   K   | ]}|         V  d S r)   r>   rJ   r'   name_to_nodes     r,   rL   z solve_min_cut.<locals>.<genexpr>  s(      22d	222222r.   c                     |          S r)   r>   )r   node_idxs    r,   rN   zsolve_min_cut.<locals>.<lambda>  s    (1+ r.   rO   );r   get_default_op_listAOT_PARTITIONER_DEBUGrt   r&   r1  networkxImportErrorr   floatDiGraphr   rC   rA   r  r   rX  rY  rp   r   r   rV   rk   r   r{   r!  r   r   rl   r   r   r	   r<   r=   r^   rd   rS   rQ  r  r   r]   r  r  re   r  r  r   r  minimum_cut	Exceptionjoin	readwriteedgelistgenerate_edgelistvisualize_min_cut_graphupdateget_name_to_noderW  rQ   )0r   r  r  r  joint_module_opsops_ignorednxer  r  r  r'   is_non_tensor_nodeweightr  r  	used_nodeordersfw_usersfirst_unfusible_usevisited
start_nodefusiblestart_orderr&  r  	cut_value	partition	reachablecutsetnbrs	cut_nodesnode_innode_out	node_namer   r  r  r  r-   r  r  r  r  r  r  r  r  s0    ```                                @@@@@@@@@@@@r,   solve_min_cutr     st	    55"$$H 
 
#)
 
 

 ')T)T(:S)T)T)TT4kBBB  "  	A 	A 	A 	A 	A 	A 	A   R
 
	
      0 0 0 0 0 0 0dF F F F F       2 zz||H55L       * ! 3X 3X7h9...9+++!!$)e"3Vdh!OOO di&0&48LLL$ 	
 di%/$(KKKd 	/2488 	/((...
 ##D)) 	/.F.Ft.L.L 	/((... "E}DI'EUty SDIe4Del)S)S%S 	 t 	+=..//FF 	+!$)--"6"6FFTDH F %_T**F$)e+TY-?&QQQJ 	X 	XDdi&0$)e2CdhWWWW	X($rw- C C       . , ;"4 	; 	;I   %O  F
   !*  H 6{{Q&:&:8S[[&Q&Q#!)/22 ; ;D!0066;%22488;NNN&Jy$77 O  <//$O%%229==/ %22488   54T::: 1 !V%%%+ 	V 	VJ++J77 !..z::JGHG#00<<Kg,,""w//3'>>C    **3//+2CCCG))HH."!..s33!..z::   10555I V VD!0066V&JsD11V !44w1G1G1M1Mt0TUUU5 g,,""8!~~h&II	99   =>>>dii-??IIJJKKK)))	  )I}#&55F8888i888 B B4AAAAAdAAAAAAAI# ! !ss|x},,,,CRCL	i    #K00LHH9[5F+G+GHHHH2222	2228M8M8M8M  L %%s%   B 
B*B%%B*,X A)Y0c                    dd l }dd l}|j                            |                                           }|                    |          d         }|                                D ]}| |                                         |                                         d         }|	                    t          |                     |t          d          k    r|                    d           t          d           |                    d           d S )Nr   r  rY  redz2Visualizing the failed graph to min_cut_failed.svgzmin_cut_failed.svg)r  pydotnx_pydotto_pydot	to_stringgraph_from_dot_data	get_edges
get_sourceget_destination	set_labelr   r  	set_colorr1  	write_svg)r  r  r#  
dot_format	dot_graphedger  s          r,   r  r    s    LLL%%h//99;;J))*55a8I##%% " "$//++,T-A-A-C-CDZPs6{{###U5\\!!NN5!!!	
>???,-----r.   c                      g t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j	        t           j
        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j         t           j!        t           j"        t           j#        t           j$        t           j%        t           j&        t           j'        t           j(        t           j)        t           j*        t           j+        t           j,        t           j-        t           j.        t           j/        t           j0        t           j1        t           j2        t           j3        t           j4        t           j5        t           j6        t           j7        t           j8        t           j9        t           j:        t           j;        t           j<        t           j=        t           j>        t           j?        t           j@        t           jA        t           jB        t           jC        t           jD        t           jE        t           jF        t          jH        t           jI        t           jJ        t           jK        t           jL        } t           jI        t           jJ        t           jM        g}|t           jN        t           jO        t           jP        t          jR        t           jS        t           jT        t           jU        gz  }|}| g t          j        t          jV        t           jW        t           jL        t           jX        t          jY        t          j@        t           jY        t           jZ        t          jR        t           j[        t           j\        t           jN        t           jS        t           jO        t           j]        t           j^        t           j_        t           j`        t           ja        t           jb        t           jc        t           jd        t           je        t           jf        t           jg        t           jh        t           jT        t           ji        t           jj        t           jk        t           jl        t           jm        t          jn        t          jo        z  } | t           jp        t           jq        gz  } | |z  } | t                      z  } | t           js        gz  } | d t          D             z  } t          |           }t           jv        t           jw        t           jx        g}t           jy        t           jz        t           j{        t           j|        t           j}        t           j~        t           j        t           j        t           j        t           j        t           j        g}|t          |          z  }t	          t          |          t          |          t          |          t          |          t          |                    S )Nc                 ,    g | ]}t          |          S r>   )r   )rJ   ms     r,   r   z'get_default_op_list.<locals>.<listcomp>,  s!     N N N1!3A!6!6 N N Nr.   )r7  r   subdivatan2mulr  r  pow	remainderfmod__and____or____xor__
__lshift__
__rshift__eqnegegtleltabsbitwise_notceilfloorfracnegreluroundsilutruncr  log10log1plog2lgammaexpexpm1erferfccosacoscoshsinasinsinhtanatantanhatanhsqrtrsqrt
reciprocalsigmoidsoftplus	thresholdthreshold_backwardclampwherelerpaddcmulgelugelu_backwardr'  mean_grad_sum_to_sizesum_to_sizeamaxtotype_asr  r  squeeze	unsqueezersub_to_copyaliasviewslicetprimsbroadcast_in_dimexpand
as_stridedpermuteconvert_element_typeclone	full_likevarstdselect_unsafe_viewreshapebroadcast_tensorsscalar_tensorones	new_zerosr  arangetriuvar_meanisinfr   fullzerosempty
empty_likeargmaxmaximumiota)_low_memory_max_pool2d_offsets_to_indicesindexgatherrA  
zeros_liker   r   native_dropout	rand_like
randn_likemmconvolutionconvolution_backwardbmmaddmm#_scaled_dot_product_flash_attention'_scaled_dot_product_efficient_attention_flash_attention_forward_efficient_attention_forwardupsample_bilinear2d
_scaled_mmr!   )default_recomputable_opsrecomputable_view_opsr%   r&   r$   r#   r"   s          r,   r  r    sg   L0L0L0 	L0 	
	L0
 	L0 	L0 	L0 	L0 	L0 		L0 	L0 	L0 	L0 	L0 	L0  	!L0" 	#L0$ 	%L0& 	'L0( 	)L0* 	+L0, 	-L0. 	/L00 		1L02 	
3L04 		5L06 	7L08 		9L0: 	
;L0< 		=L0> 	
?L0@ 	AL0B 	
CL0D 	
EL0F 		GL0H 	IL0J 	KL0L 	
ML0N 	OL0P 		QL0R 	SL0T 		UL0V 		WL0X 	YL0Z 		[L0\ 		]L0^ 	_L0` 		aL0b 		cL0d 	
eL0f 		gL0h 	
iL0j 	kL0l 	mL0n 	oL0p 	qL0r 	sL0t 	
uL0v 	
wL0x 		yL0z 	{L0| 		}L0~ 	L0@ 	AL0B 		CL0D 	EL0F 	GL0H 		IL0J 	KL0L 	ML0N 	OL0P 	QL0R 	SL0T 		UL0V 	WL0Z "\4>4:F	
  %H $!	$!"$! 	
$! 		$!
 	$! 		$! 		$! 	$! 	$! 	$! 	$! 	$! 		$! 	$! 	
$!  	!$!" 	#$!$ 	%$!& 		'$!( 	)$!* 	+$!, 	-$!. 		/$!0 	1$!2 	
3$!4 	5$!6 		7$!8 	9$!: 	
;$!< 	
=$!> 	?$!@ 	A$!B 	C$!D 	
E$!F 	7G$! $L T[ 99(/!   N N N N NN344%t~tGJ!
04%)  #S__4KK!""JH  r.   c                 2    i }| j         D ]}|||j        <   |S r)   )rt   r   )rs   r  r'   s      r,   r  r  H  s-    L ' '"&TYr.   memoryruntimes
max_memoryc                 L    t                    }t          t          |                    }t          | fdd          }d}d}g }g }|D ]R}	| |	         z   |k    r,| |	         z  }||	         z  }|                    |	           =|                    |	           S|||fS )Nc                 &    |          |          z  S r)   r>   )r  r  r  s    r,   rN   z!greedy_knapsack.<locals>.<lambda>V  s    fQi(? r.   Tr.  r  )rQ  r  rangerQ   r   )
r  r  r  rK   r(  total_memorytotal_runtimeitems_to_saveitems_to_allow_recomputingr  s
   ``        r,   greedy_knapsackr  O  s     	HAqNNE 5?????NNNELMM!# 1 1&)#z11F1I%LXa[(M  ####&--a0000-)CCCr.   c           	      :   dd l }	 ddlm}m}m} n# t
          $ r t          d          d w xY w|                    |           }|                    |          }| }	 |||                    |                    }
|
g}|                    |	          } ||	|| |dd                    }|j	        st          d          g }g }t          |j                  D ]6\  }}|dk    r|                    |           !|                    |           7|j         ||fS )Nr   )BoundsLinearConstraintmilpzHTo use the ILP for memory budget checkpointing you need to install scipy)Aubr   )cconstraintsintegralityboundszSomehow scipy solving failed)numpyscipy.optimizer  r  r  r  r   array	ones_likesuccessrW  r   r   fun)r  r  r  npr  r  r  	np_memorynp_runtimesr  memory_constraintr  r  resr  r  r[  r  s                     r,   ilp_knapsackr  g  s    AAAAAAAAAAA   V
 
	
   I((8$$K	A((9*9M9MNNN$%K,,q//K
$
+ffQPQll  C ; ;9:::M!#CE"" 3 3Q66  %%%%&--c2222G8]$>>>s   
 ,c                 .   dt          j        fd| D             t           j        d          }t          j        |t           j        d          }t	          t          |z                      }t          |           }t          j        |dz   |dz   ft           j        d          }t          d|dz             D ]}||dz
           }||dz
           }	||dz
  d d f         ||d d f<   |dk    r||dz
  d d f         |	z   ||d d f<   Nt          j	        ||dz
  |d f         ||dz
  d | f         |	z             |||d f<   g }
g }|}t          |dd          D ]}||         |         ||dz
           |         k    rF|

                    |dz
             |t	          ||dz
                                                     z  }i|
                    |dz
             |
                                 ||         |                                         }||
|fS )Ni'  c                 N    g | ]!}t          t          |z                      "S r>   )r^   rM  )rJ   r3  Ss     r,   r   zdp_knapsack.<locals>.<listcomp>  s+    +++qU1q5\\		+++r.   rk  )r  rl  r   r   )r{   tensorlongfloat32r^   rM  rQ  r  r  r  r   itemr/  )r  r  r  quantized_memoryquantized_max_memoryrK   dpr  current_memorycurrent_runtimesaved_itemsrecomputable_itemsjmax_runtimer  s                 @r,   dp_knapsackr    sz    	A |++++F+++5:e   |HEM%HHHH uZ!^4455FA 
	
Q$q()u
 
 
B 1a!e__  )!a%0"1q5/ a!eQQQh<1aaa4 Q!a%(|o5Bq!!!tHH%*]1q5.//)*1q5*N?**+o=& &Bq.//!"" K!A1a__ - -a58r!a%y|##q1u%%%%a!e,1133444AA%%a!e,,,, Q%,-2244K%777r.   c                     t           j        }|dk    rt          | ||          S |dk    rt          | ||          S |dk    rt	          | ||          S t          d|           )Ngreedyilpr  z,Not aware of memory budget knapsack solver: )r   activation_memory_budget_solverr  r  r  r   )r  r  r  SOLVERs       r,   #_optimize_runtime_with_given_memoryr    sv    
 3Fvx<<<	5FHj999	468Z888R&RRSSSr.   no_dispatchc                 L   	 t           j        }d }|dk    rdS |dk    rnt                      5  ddlm} t          j        | j         j        f          \  	|	                    	 fd          }|cd d d            S # 1 swxY w Y   d S |dk    rdd	l
m} t          j        | j         j        f          \  	 |d
          5 }  j        i 	 d d d            n# 1 swxY w Y   |                                }t          |d          S t          d|           )Nc                    t          | t          j                  rt          | j        d         t          j                  rbt          | j        d         j                  }d fd|D             }| j        d                             || j        d         j	                  S t          | t          j                  rAt          | j        d         t          j
                  rt          | j        d         d          S t          | t          j                  r't          | j        d         t          j                  rdS t          | t          j                  r't          | j        d         t          j                  rd	S | S )
Nr   c                 $    t          | d          S )Nr  r  )r   )ds    r,   realize_symbolzAestimate_runtime.<locals>.materialize_arg.<locals>.realize_symbol  s    D1111r.   c                 &    g | ]} |          S r>   r>   )rJ   r   r  s     r,   r   z=estimate_runtime.<locals>.materialize_arg.<locals>.<listcomp>  s#    6661^^A&&666r.   r  )strider  r  g      ?T)r   r<   r=   rk   r{   r!  r  shapenew_empty_stridedr  r   r   r   r   )r   r  r  s     @r,   materialize_argz)estimate_runtime.<locals>.materialize_arg  sS   a!! 	j&M&M 	,--E2 2 2 7666666E6%=22af]3: 3    27## 	
16%=%,(O(O 	AF5MD999927## 	
16%=%.(Q(Q 	327## 	
16%=%-(P(P 	4Hr.   testingr   profiler   )benchmarkerc                       j          i S r)   )rz   )r   r   r'   s   r,   rN   z"estimate_runtime.<locals>.<lambda>  s    ;4;3O3O3O r.   flops)FlopCounterModeF)displayz Not aware of runtime estimator: )r   *activation_memory_budget_runtime_estimatorr  $torch._inductor.runtime.benchmarkingr  r   tree_mapr   r   benchmark_gputorch.utils.flop_counterr  rz   get_total_flopsr  r   )
r'   RUNTIME_MODEr  r  msr  modecounted_flopsr   r   s
   `       @@r,   estimate_runtimer    s   DL  ( y  q		"	"]] 	 	HHHHHH!??TY<TUULD&**+O+O+O+O+O+OPPB	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 
	 	 <<<<<<DK8PQQf_U+++ 	)tDK((((	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	),,..=!$$$LlLLMMMs$   ABB	BC%%C),C)c                     !"# |dk    s|dk     rt          d|           t          t          j        t          j        t          j        t          j        t          j                  }t          j        rt          |dddd          }|dk    rj
        S t           |          \  }}|dk    r|S dt          t          j                 dt          fd	 j
                  ! |          !k    r|S !fd
dt          t          j                 f!fd}t          |ddd          }t           |          \  }} ||          |k     r|S t          |d          t                     \  }	}
 ||	          |k     r|	S ddlm fdj
        D             dt          t          j                 dt          t          j                 ffd} ||
          }t%          |t&          d          t)                    dk    rj
        S fdD              d D             #ddlm"   "#fd}t          j        rg }t1          ddd          D ]D} ||dz            \  }}|                    |t5          #          |z
   ||          f           Edd lm} d |D             }d |D             }|                    d           |                    ||d !           t?          |          D ])\  }}|                     |d"|||         fd#d$d%&           *|!                    d'           |"                    d(           |#                    d)           |$                    d           |%                                }|&                                 d*tO                       d+}|(                    |           tR          *                    d,|            ||-          d         S ).Nr   r   zJThe valid ranges for memory budget are 0 <= m <= 1. The provided value is )rd   re   rf   rg   rh   F)rd   re   rf   rg   r   rF   c                 L    t          t          t          |                     dz  S N    eA)r'  mapr*  )r   s    r,   estimate_activations_sizez:choose_saved_values_set.<locals>.estimate_activations_size,  s    3x..//#55r.   c                     | dz  z
  z  S r  r>   )szmax_act_sizemin_act_sizes    r,   get_normalized_sizez4choose_saved_values_set.<locals>.get_normalized_size5  s    S\L899r.   activationsc                 ,     |           z
  z
  z  S r)   r>   )r  r
  r  r  s    r,   get_mem_ratioz.choose_saved_values_set.<locals>.get_mem_ratio8  s(    ))+66E<'
 	
r.   )rd   re   rf   )rg   get_node_storagec                 &    h | ]} |          S r>   r>   rJ   r'   r  s     r,   r  z*choose_saved_values_set.<locals>.<setcomp>V  s%    JJJ&&t,,JJJr.   r  c                 "    fd| D             S )Nc                 ^    g | ])}|j         t          d           k     r |          v'|*S )r  )r  r^   )rJ   r  r  input_storagess     r,   r   zRchoose_saved_values_set.<locals>.get_recomputable_banned_nodes.<locals>.<listcomp>Y  sP     
 
 
 S))$$Q''~== 
 >==r.   r>   )r  r  r  s    r,   get_recomputable_banned_nodesz>choose_saved_values_set.<locals>.get_recomputable_banned_nodesX  s3    
 
 
 
 
!
 
 
 	
r.   Tr.  c                 @    g | ]} t          |                    S r>   r*  )rJ   r  r  s     r,   r   z+choose_saved_values_set.<locals>.<listcomp>n  s8       -.HQKK((  r.   c                 ,    g | ]}t          |          S r>   )r  r   s     r,   r   z+choose_saved_values_set.<locals>.<listcomp>q  s.       #'  r.   r  c           	      J                5  t          t          | d                    \  }}}d d d            n# 1 swxY w Y   t                      }|D ]}|                    	|                    |                    	          sJ t          
|          \  }}||fS )Nr   )r  r  r   r   issubsetr   )memory_budgetexpected_runtimesaved_node_idxsrecomputable_node_idxsr  r[  r   r&  aggressive_optionsall_recomputable_banned_nodesr   memories_banned_nodesr  r  runtimes_banned_nodess           r,   get_saved_values_knapsackz:choose_saved_values_set.<locals>.get_saved_values_knapsackv  s   []] 	 	
 4%'<c-QR>S>S 	 &		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 55) 	= 	=CLL6s;<<<<  !>?????'	
 
a ---s   $<A A r  r  c                     g | ]
}|d          S )r  r>   rJ   r  s     r,   r   z+choose_saved_values_set.<locals>.<listcomp>      000DG000r.   c                     g | ]
}|d          S r   r>   r+  s     r,   r   z+choose_saved_values_set.<locals>.<listcomp>  r,  r.   )
      )figsizeo)markerz.2fzoffset points)r   r/  center)
textcoordsxytexthazMemory Budgetz Runtime of Recomputed Componentsz:Pareto Frontier of Memory Budget vs. Recomputation Runtimememory_budget_pareto_z.pngz%Generated Pareto frontier curve at %sr   )+r   rc   r   ban_recompute_used_far_apart!ban_recompute_long_fusible_chains#ban_recompute_materialized_backwardban_recompute_not_in_allowlistban_recompute_reductionsaggressive_recomputationr   rA   r   r	   r<   r=   r  torch._inductor.fx_utilsr  rQ   r*  rQ  torch.utils._mode_utilsr  visualize_memory_budget_paretor  r   r'  matplotlib.pyplotpyplotfigureplotrW  annotatexlabelylabeltitlegridgcfshowr   savefigr  warning)$r   r  r   r  runtime_optimized_saved_valuesr&  r  more_aggressive_optionsmore_aggressive_saved_values%aggressive_recomputation_saved_valuesr  r  recomputable_banned_nodesr(  optionssweep_memory_budgetr   r!  pltx_valuesy_valuesr  txtfigfig_namer$  r%  r
  r  r  r  r  r&  r  r  r'  s$   ``                       @@@@@@@@@@@r,   choose_saved_values_setr]  	  s    qMA--hYfhh
 
 	
 $$A#)#K%+%O & E8  O & 
!"'',).$)
 
 
 (5) )%"A --6RW 6% 6 6 6 6 -,Y-=>>L,,-KLLL|##--: : : : : :
4= 
 
 
 
 
 
 
 

 &##(%*	   '4Y 7' '# ! }122]BB++  %   ;HY 2; ;7)< }:;;mKK44999999JJJJ9IJJJN	
DM 	
d27m 	
 	
 	
 	
 	
 	
 	
 !> =l K K %+!x% % %! ())Q..   2O   +H   433333. . . . . . . . . . ., , )G#(b"#5#5 
	 
	-F-F#c). .*L* NN'-..1AA!M,//    	('''''0000000000 	

7
###8C000  )) 	 	FAsLLhqk"*      	

?###

5666		NOOOggii


E+=+?+?EEEH;XFFF %$=AAA!DDr.   inductorc          
         | j                                          |                                  | j         }t          j        rt          |          }|| _         | j         }t          |           }t          |           }|rt          |           } fd}	 |	|           }
t          |
j
                  dk    rt          | |          S t          | j         j                  D ]}|j        dk    rt          d          |_        "|
                    |          sd|_        ?t          d          |_        |j        D ]$}t'          |j        |j        dz             |_        %t          j        }|j        D ]?}t+          |j                            dd          t0                    r|j        d         } n@t3          ||
|	          }t5          t7          t8          |                    }t5          t7          d
 |                    }t;          | ||          \  }}|r$|r"t=          | ||t          |                    \  }}t?          |          }t@          rbddl!m" fd|D             }tG          dtI          d |D                       dz             tK          d |D                       }d |j         j        D             }d |j         j        D             }||z  }tM          t                    }|j         j        D ]G}|j'        |v r<tQ          |j)        d          r'|tU          |j)        j+                  xx         dz  cc<   HtG          dt          |           dt          |           dt          |                      tG          dtK          |,                                d d                     ||fS )ax  
    Partitions the joint graph such that the backward recomputes the forward.
    Recomputing helps in trading off memory bandwidth with computation.

    To create the fwd and bwd graph, we copy the joint graph, manually set the
    outputs to just original forward or backward outputs. And then we run the
    resulting graphs through dead code elimination.

    .. warning::
        This API is experimental and likely to change.

    Args:
        joint_module(fx.GraphModule): The joint forward and backward graph. This
            is the result of AOT Autograd tracing.
        _joint_inputs: The inputs to the joint graph. This is unused.
        compiler: This option determines the default set of recomputable ops.
            Currently, there are two options: ``nvfuser`` and ``inductor``.
        recomputable_ops: This is an optional set of recomputable ops. If this
            is not None, then this set of ops will be used instead of the
            default set of ops.
        num_fwd_outputs: The number of outputs from the forward graph.

    Returns:
        Returns the generated forward and backward Fx graph modules.
    c                 \   t          | j                  t                      | j        j        D ]s}|j        dk    rd|j        v r                    |           n$t          |          r                    |           |v r|j        D ]}                    |           tt          t          t          | j        j                            }t          t          t          | j        j                            }||z   }t          |           \  }}                    d |D                        t          | j        ||d          }fd|j        D             fd| j        j        D             }	d}
i }| j        j        D ]}|v r
|
||<   |
d	z  }
t!          ||	|          S )
Nr   r   r   c              3   4   K   | ]}||j         dk    |V  d S )Nr   r   )rJ   r2  s     r,   rL   zNmin_cut_rematerialization_partition.<locals>.classify_nodes.<locals>.<genexpr>  s;       !
 !
am8H8HA8H8H8H8H!
 !
r.   r   c                 @    h | ]}|j         d k    |j                 S r   r  r  s     r,   r  zNmin_cut_rematerialization_partition.<locals>.classify_nodes.<locals>.<setcomp>  s7     +
 +
 +
w("" #"""r.   c                 $    h | ]}|v|v
|S r>   r>   )rJ   r'   rC   rS   s     r,   r  zNmin_cut_rematerialization_partition.<locals>.classify_nodes.<locals>.<setcomp>  s:     
 
 
,,,=N1N1N 1N1N1Nr.   r   r   )r  rs   r   rt   r   rz   r   r   r   r  r   r   r   r   r  r   r@   )r   r'   r  r   r   rA   r   r   r  rD   fw_cntrE   r  rC   rS   r   s               @@@r,   classify_nodesz;min_cut_rematerialization_partition.<locals>.classify_nodes  s=   '(:;;EE &, 	0 	0Dw-''J$+,E,E!%%d++++%d++ ,!%%d+++((( J 0 0D%))$////VJ0B0HIIJJ!%&(:(@AA"
 "
 !77#;/$
 $
 $
 [ 	   !
 !
"!
 !
 !
 	
 	
 	
 @Y
 
+
 +
 +
 +
*0+
 +
 +


 
 
 
 
$*0
 
 

  &, 	 	D(((!'!%'8/8
 
 	
r.   r   r   r   r  r   r   Nr9  c                 "    t          |            S r)   r  )rK   s    r,   rN   z5min_cut_rematerialization_partition.<locals>.<lambda>?  s    [^^); r.   r  r  c                 &    h | ]} |          S r>   r>   r  s     r,   r  z6min_cut_rematerialization_partition.<locals>.<setcomp>S  s%    DDDt$$T**DDDr.   z Theoretical Activations Stored: c              3   4   K   | ]}t          |          V  d S r)   r  r  s     r,   rL   z6min_cut_rematerialization_partition.<locals>.<genexpr>V  s(      22222222r.   c                 J    g | ] }t          |          t          |          f!S r>   )r*  r   r  s     r,   r   z7min_cut_rematerialization_partition.<locals>.<listcomp>X  s)    KKKSVV4KKKr.   c                 2    h | ]}|j         d k    |j        S r   r  r   s     r,   r  z6min_cut_rematerialization_partition.<locals>.<setcomp>Y  -     
 
 
47o;U;UDI;U;U;Ur.   c                 2    h | ]}|j         d k    |j        S rk  r  r   s     r,   r  z6min_cut_rematerialization_partition.<locals>.<setcomp>\  rl  r.   r  z# remat/fw/bw: /zCount of Ops Rematerialized: c                     | d         S r-  r>   r"  s    r,   rN   z5min_cut_rematerialization_partition.<locals>.<lambda>j  s
    1 r.   Tr.  )-rs   r   r  r   cser   rv   r~   r  rQ  rC   r  reversedrt   r   r^   r  rV   r   r  activation_memory_budgetr   rk   rl   r  r]  r  r   r   r   r  r`  r  r@  r  r1  r'  rQ   r   r   ry   rz   r   r  r(  )r   r  compilerr   rq   	cse_graphr   graph_has_recomputable_opsgraph_has_recomputable_rng_opsre  r  r'   r  r   r   r   ra  rb  storagessorted_sizesfw_module_nodesbw_module_nodesremat_nodescountsr  s      `                    @r,   r  r    sq   B **,,,D z ' &&	&$K!5l!C!C%=l%K%K"! <-l;;-
 -
 -
 -
 -
^ |,,I
 9&''1,, -
 
 
 	
 +122 R R7h #CD))$// 	R !D #CD
 R R$'(94;Lq;P$Q$Q!!R 3M!  dimmOT::EBB 	 Io6ME	 +Ym  L 6+|<<==O;;\JJKKL 4''	  Iy " ) 	#8iC4H4H$ $ Iy 4I>>I 
======DDDD|DDD.22\22222S8	
 	
 	
 KKlKKKLL
 
"+/"7
 
 

 
"+/"7
 
 
 &7!,S!1!1O) 	> 	>DyK''GDKAR,S,S's4;677888A=888^c+..^^_1E1E^^OH\H\^^	
 	
 	
 	+6<<>>~~tDDD	
 	
 	
 ir.   fx_graphTFtracedfnamefigname
clear_metaprogparse_stack_tracedot_graph_shapec                    |rDt          j        | j                  }t          j        | |          } | j        j        D ]	}i |_        
t          j        	                    |          \  }	}
|
sdt          j        z   }
t          d|	 |
            t          j        | |||          }|                                }t!          |d|
                    d          z             }|	 |
 }| ||           d S  |||           d S )N.zWriting FX graph to file: )r  r  write_)r  )copydeepcopyrs   r<   rZ  rt   rk   ospathsplitextr   torch_compile_graph_formatr1  r   FxGraphDrawerget_main_dot_graphr8  lstrip)r~  r  r  r  r  r  r  r   r'   baseextgr   write_methods                 r,   
draw_graphr  o  s,     M&,//		22L& 	 	DDII  ''ID# 6F55	
2t
2S
2
2333"+'		 	 	A 	
A1hC899LNSNNE|UU&&&&&&r.   r)   r.  )r^  )r}  TNFN)vr  r_   r  r   loggingrX  r  r  r0  r   dataclassesr   r   typingr   r   r	   r
   r   r   r   r   r{   torch._inductor.inductor_primstorch.fxr<   torch.utils._pytreeutils_pytreer   %torch.fx.experimental._backward_stater   "torch.fx.experimental.proxy_tensorr   r   torch.fx.experimental.sym_noder   r   %torch.fx.experimental.symbolic_shapesr   r   r   r   torch.fx.passesr   torch.utils.checkpointr    r   _aot_autograd.logging_utilsr   _aot_autograd.utilsr   compile_utilsr   r   sympydebug_partitionerr  	getLoggerr7   r  r6  r7  r}  r!   r@   rc   r=   ra   rp   rZ  rv   r~   r^   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r*  r3  	lru_cacherA  rI  r`  r  r  r   r  r  r  r  r  r  r  r  rA  r  r  r]  r  r  r>   r.   r,   <module>r     s                				 # # # # # # * * * * * * * * S S S S S S S S S S S S S S S S S S S S  % % % %       $ $ $ $ $ $ $ $ $ ? ? ? ? ? ? H H H H H H H H L L L L L L L L            ) ( ( ( ( ( 3 3 3 3 3 3       ; ; ; ; ; ; 0 0 0 0 0 0 8 8 8 8 8 8 8 8  LLL 0 g!!y~	 > > > > > > > >2                :         T    r~ $    2> d     C           
 o #	D DDMD "']D sm	D
 XD D D DNRW     Gbg G$ G G G Gbg $    bg $    XRW X X X X XCrw C4 C C C CKrw K4 K K K Krw 4    $.$
4=$rw-'($ $ $ $$rw- s    `".`"rw-`" "']`"
 `" 2>2>)*`" `" `" `"FS.S
2>2>)*S S S Sl #c(("# " " " " "27 s    :Abh A A A A T  "Hbgsl!3 HU27C<=P8Q H H H HGBN Gr~ G G G GTZ .Z ~Z  ~Z  	Z 
 2>2>)*Z  Z  Z  Z z BN    , 	z& z&z&z& #z& z& z& z&z. . ."aW a a a aHBH    DKD#';D<AD
5$s)T#Y&'D D D D0!?K!?#';!?<A!?
5$s)T#Y&'!? !? !? !?H98K98#';98<A98
5$s)T#Y&'98 98 98 98xTKT5kT T 5$s)T#Y&'	T T T T  0 / / / / /,N ,N ,N` ?@tE tEtE&.tE	"']tE tE tE tEt l  l .l  2>2>)*l  l  l  l d ,0#%)' 'H '' ' 	'
 5d3i(
)' ' c]' 
' ' ' ' ' 'r.   