
    çgG                         d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	 ddl
mZ d Zdd	Z	 	 ddZddZ G d d          Z G d d          Zd Zd dZd dZd Zed!d            Zd dZdS )"    N)contextmanager)AnyDictList   )languagec                     d                     |           } dddd| z   dg}t          j        |          }|                    t          j        j                                      d          }d |D             }|S )N,
nvidia-smi-i0z--query-gpu=z--format=csv,noheader,nounitsc                 ,    g | ]}t          |          S  )int.0xs     J/var/www/html/ai-engine/env/lib/python3.11/site-packages/triton/testing.py
<listcomp>znvsmi.<locals>.<listcomp>   s    


a3q66


    )join
subprocesscheck_outputdecodesysstdoutencodingsplit)attrscmdoutrets       r   nvsmir#   
   ss    HHUOOEsNU$:<[
\C

!#
&
&C
**SZ(
)
)
/
/
4
4C

3


CJr      meanc                    ddl }|dv sJ |j                                        |j                                        k    rt	          d           |              |5|D ]2}|                                 |                    d           d|_        3|j                                        }|j        	                    |          5   |              ddd           n# 1 swxY w Y   |j        
                                 |j                            d          }|j                            d          }|                                 |                                 |                                 |j        
                                 |                    |          }	t          dt!          ||	z                      }
|j                                        }|j        	                    |          5  t#          |
          D ]}||D ]	}d|_        
 |              	 ddd           n# 1 swxY w Y   |j        
                                 g }d}t#          |          D ]}|j                            d          }|j                            d          }|                                 |                                 |                                 |j        
                                 ||                    |          |
z  gz  }|                    |          } t'          ||          |                                          S )	a+  
    Benchmark the runtime of the provided function.

    :param fn: Function to benchmark
    :type fn: Callable
    :param rep: Repetition time (in ms)
    :type rep: int
    :param grad_to_none: Reset the gradient of the provided tensor to None
    :type grad_to_none: torch.tensor, optional
    r   Nminmaxr%   medianzQCannot capture graph in default stream. Please use side stream in benchmark code.Tenable_timingr   
   )torchcudacurrent_streamdefault_streamRuntimeErrordetach_requires_grad_grad	CUDAGraphgraphsynchronizeEventrecordreplayelapsed_timer)   r   rangetensorgetattritem)fnrepgrad_to_nonereturn_moder.   r   gstart_event	end_eventestimate_msn_repeatir"   	n_retriestimess                  r   do_bench_cudagraphrM      s    LLL:::::z  ""ej&?&?&A&AAAnoooBDDD  	 	AIIKKKT"""AFF
A			!		  
              	J*"""66K
  t 44IHHJJJ	J**955K1c#+,,--H 	
A			!		  x 	 	A'% " "A!AFFBDDDD		               
J
CI9 @ @j&&T&::J$$4$88		



   ((33h>??LLE&75+&&u--22444s$   CC C0+H((H,/H,   d   Tr/   c                 P   |dv sJ ddl }|j        j                            |           |                                               |r+|                    t          d          |j        |          }	n*|                    t          d          |j        |          }	                    d          }
                    d          }|
	                                 t          d	          D ] }|	                                  |              !|	                                                                  |
                    |          d	z  }t          d
t          ||z                      }t          d
t          ||z                      }fdt          |          D             }
fdt          |          D             }t          |          D ]} |              t          |          D ]b}||D ]	}d|_        
|	                                 |
|         	                                  |              ||         	                                 c                                 |                    d t!          |
|          D             |j                  }|_|                    ||                    ||j                                                            }t)          |          d
k    r|d         }|S  t+          ||          |                                          S )a  
    Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
    the 20-th and 80-th performance percentile.

    :param fn: Function to benchmark
    :type fn: Callable
    :param warmup: Warmup time (in ms)
    :type warmup: int
    :param rep: Repetition time (in ms)
    :type rep: int
    :param grad_to_none: Reset the gradient of the provided tensor to None
    :type grad_to_none: torch.tensor, optional
    :param quantiles: Performance percentile to return in addition to the median.
    :type quantiles: list[float]
    :param fast_flush: Use faster kernel to flush L2 between measurements
    :type fast_flush: bool
    r'   r   Ng    A)dtypedeviceg    ATr+      r   c                 <    g | ]}                     d           S Tr+   r9   r   rJ   dis     r   r   zdo_bench.<locals>.<listcomp>   s'    IIIA288$8//IIIr   c                 <    g | ]}                     d           S rU   rV   rW   s     r   r   zdo_bench.<locals>.<listcomp>   s'    GGG!--GGGr   c                 >    g | ]\  }}|                     |          S r   )r<   )r   ses      r   r   zdo_bench.<locals>.<listcomp>   s(    TTT1!..++TTTr   )rQ   )r.   _dynamodevice_interfaceget_interface_for_devicer8   emptyr   int8r9   r:   r=   zero_r<   r)   r5   r>   zipfloatquantiletolistlenr?   r@   )rA   warmuprB   rC   	quantiles
fast_flushrD   device_typer.   cacherF   rG   _rH   n_warmuprI   rJ   r   rL   r"   rX   s                       @r   do_benchro   R   s   & :::::LLL		'	@	@	M	MBBDDDNN
  NC
OO59[QQCJJejMM (((..Kt,,I1XX  
NN**9559K 1c&;.//00H1c#+,,--HIIIIxIIIKGGGGuXGGGI8__  
8__   #!  A
!NNLLTTK8S8STTT\a\gLhhEnnUELL%+L$N$NOOVVXXs88q==a&C
&75+&&u--22444r    c                    dd l }dd l}t          | |j                  s|                    |           } t          ||j                  s|                    |          }|d}t          |          r || j                  n|}|d}t          |          r || j                  n|}t          | |j                  r\| j        |j        k    r|                                 } | 	                                
                                                                 } t          ||j                  r\|j        |j        k    r|                                }|	                                
                                                                 }| j        dk    s|j        dk    r!|j                            | |||d           d S |                    | |||          st          | d|  d	| d
| d| d
          d S )Nr   g{Gz?g        r   T)atolrtol	equal_nan)rr   rs    z is not close to z (atol=z, rtol=))numpyr.   
isinstanceTensorr>   callablerQ   bfloat16rd   cpudetachsizetestingassert_allcloseallcloseAssertionError)r   yrr   rs   err_msgnpr.   s          r   assert_closer      s   LLL a&& LLOOa&& LLOO|$TNN444===D|$TNN444===D !U\"" %7en$$		AEEGGNN""$$!U\"" %7en$$		AEEGGNN""$$ 	vzzQVaZZ

""1ad"NNN;;q!$T;22 ^\\!\\a\\\\UY\\\]]]^ ^r   c                       e Zd ZdZ	 	 	 	 	 	 ddee         dee         dedee         d	ee         d
edeeef         dedededefdZ	dS )	Benchmarkzk
    This class is used by the :code:`perf_report` function to generate line plots with a concise API.
    rp   FNx_namesx_valsline_arg	line_vals
line_names	plot_nameargsxlabelylabelx_logy_logc                     || _         || _        |
| _        || _        || _        || _        || _        || _        || _        |	| _	        || _
        || _        dS )a  
        Constructor.
        x_vals can be a list of scalars or a list of tuples/lists. If x_vals is a list
        of scalars and there are multiple x_names, all arguments will have the same value.
        If x_vals is a list of tuples/lists, each element should have the same length as
        x_names.

        :param x_names: Name of the arguments that should appear on the x axis of the plot.
        :type x_names: List[str]
        :param x_vals: List of values to use for the arguments in :code:`x_names`.
        :type x_vals: List[Any]
        :param line_arg: Argument name for which different values correspond to different lines in the plot.
        :type line_arg: str
        :param line_vals: List of values to use for the arguments in :code:`line_arg`.
        :type line_vals: List[Any]
        :param line_names: Label names for the different lines.
        :type line_names: List[str]
        :param plot_name: Name of the plot.
        :type plot_name: str
        :param args: Dictionary of keyword arguments to remain fixed throughout the benchmark.
        :type args: Dict[str, Any]
        :param xlabel: Label for the x axis of the plot.
        :type xlabel: str, optional
        :param ylabel: Label for the y axis of the plot.
        :type ylabel: str, optional
        :param x_log: Whether the x axis should be log scale.
        :type x_log: bool, optional
        :param y_log: Whether the y axis should be log scale.
        :type y_log: bool, optional
        N)r   r   r   r   r   r   r   stylesr   r   r   r   )selfr   r   r   r   r   r   r   r   r   r   r   colorr   s                 r   __init__zBenchmark.__init__   s]    \ 
 "$
"			r   )rp   rp   FFNN)
__name__
__module____qualname____doc__r   strr   r   boolr   r   r   r   r   r      s          : :c: S	: 	:
 9: I: : 38n: : : : : : : : : :r   r   c            	       :    e Zd Zd Z	 	 ddedededefdZdd
ZdS )Markc                 "    || _         || _        d S NrA   
benchmarks)r   rA   r   s      r   r   zMark.__init__
  s    $r   F   bench	save_path
show_plots
print_datac                 .	   dd l }dd lm}	 dd l}
|j        }d |j        D             }d |j        D             }t          |j                  }|
                    ||z   |z   |z             }|j        D ]t          t
          t          f          sfd|D             t                    t          |          k    r"t          dt          |           d           t          t          |                    }g g g }}}|j        D ]Q} | j        di ||j        |i|j        |}	 |\  }}}n# t&          $ r	 |d d }}}Y nw xY w||gz  }||gz  }||gz  }Rt                    |z   |z   |z   |j        t          |          <   |j        r4|	                                 |	                                }|d         }t1          |j                  D ]\  }}||dz            ||d	z            }}|j        r|j        |         d         nd }|j        r|j        |         d
         nd }|                    ||         ||         |||           |                                                                sz|                                                                sT|                    t<                    }|                    t<                    }|                    ||         ||d|           |                                  |!                    |j"        p|           |#                    |j$                   |%                    |j&        rdnd           |'                    |j(        rdnd           |r|	)                                 |r6|	*                    |j+        ,                    ||j         d                     |||j        z            }|rA|j-        d
         dk    r0|j.        /                                \  }}||         ||         z
  |d<   |r8ta          |j        dz              ta          |1                                           |r=|2                    |j+        ,                    ||j         d          d| dd           |S )Nr   c                     g | ]}| d S )-minr   r   s     r   r   zMark._run.<locals>.<listcomp>      666A666r   c                     g | ]}| d S )-maxr   r   s     r   r   zMark._run.<locals>.<listcomp>  r   r   )columnsc                     g | ]}S r   r   )r   rm   r   s     r   r   zMark._run.<locals>.<listcomp>  s    (((1Q(((r   z	Expected z values, got r   r   r   )labelr   lsg333333?)alphar   loglinearz.png   Diff:z.csvz%.fF)float_formatindexr   )3osmatplotlib.pyplotpyplotpandasr   listr   	DataFramer   rx   tuplerg   
ValueErrordictrc   r   rA   r   r   	TypeErrorlocr   figuresubplot	enumerater   plotisnullallastyperd   fill_betweenlegend
set_xlabelr   
set_ylabelr   
set_xscaler   
set_yscaler   showsavefigpathr   shaper   rf   print	to_stringto_csv)r   r   r   r   r   diff_colsave_precisionkwragsr   pltpdy_meany_miny_maxr   dfx_argsrow_meanrow_minrow_maxr   r"   axfirst_xrJ   colstycol0col1r   s                                @r   _runz	Mark._run  s   			''''''!66U%566666U%5666u}%%\\'F"2U":U"B\CC 	E 	EAa$// )(((((((1vvW%% !KS\\!K!K!K!KLLL#gq//**F)+RwgH_ # #dgVVV5>1*=VVvVV;+.(FE55  ; ; ;+.d5EFFF;VH$E7"E7""1gg07:WDBF3r77OO? 	OJJLLLBajG!%"233 V V1!!f*~r!f*~u,1LBel1oa((d,1LBel1oa((d7RU!33GGG||~~))++ VELLNN4F4F4H4H V!LL//E!LL//EOOBwKTQTOUUUIIKKKMM%,1'222MM%,'''MM5;<%%H===MM5;<%%H=== 


 OBGLLu4L4L4LMMNNN%**+ 	-q((**,,JD$DBtH,BvJ 	"%/C'(((",,..!!! 	#IIbgll9.F.F.FGGVl[iVlVlVl!  # # #	s   .D55EErp   c           	         t          | j        t                    }|r| j        gn| j        }g }|rYt          j        |d           t          t          j                            |d          d          }	|	                    d           |D ]F}
|	                     | j
        |
|||fi |           |r|	                    d|
j         d           G|r)|	                    d           |	                                 |r|r|d	         S |S d S )
NT)exist_okzresults.htmlwz<html><body>
z<image src="z.png"/>
z</body></html>
r   )rx   r   r   r   makedirsopenr   r   writeappendr   r   close)r   r   r   r   	return_dfkwargshas_single_benchr   
result_dfshtmlr   s              r   runzMark.runS  s5   %doyAA*:Odo&&

 	)K	D1111Y??EEDJJ'((( 	H 	HEidiy*j[[TZ[[\\\ H

F5?FFFGGG 	JJ)***JJLLL 	" "!!}$!!tr   N)Fr   )FFrp   F)	r   r   r   r   r   r   r   r   r   r   r   r   r   r     s        % % % chC C) C C CSW C C C CJ     r   r   c                       fd}|S )z
    Mark a function for benchmarking. The benchmark can then be executed by using the :code:`.run` method on the return value.

    :param benchmarks: Benchmarking configurations.
    :type benchmarks: List of :class:`Benchmark`
    c                 $    t          |           S r   )r   r   s    r   <lambda>zperf_report.<locals>.<lambda>r  s    b*-- r   r   )r   wrappers   ` r   perf_reportr  k  s     .---GNr   c                     ddl }ddlm} | s|j                                        } |j        j                            |           d         }|j        j                            |           d         }||z  dz  dz  d	z  }|S )
z return DRAM bandwidth in GB/s r   Nr   drivermem_clock_ratemem_bus_widthr   g    .A   )r.   runtimer  r/   current_deviceactiveutilsget_device_properties)rR   r.   r  mem_clock_khz	bus_widthbw_gbpss         r   get_dram_gbpsr  v  s    LLL -**,,M'==fEEFVWM#99&AA/RIi'!+c1A5GNr   c                    dd l }ddlm} |s|j                                        }|j        j                            |          d         dz  }|j                            |          }|d         dk     r| |j	        k    sJ d}ni| |j
        |j        fv rd}nV| |j	        |j        |j        fv rd}n=| |j        t          j        t          j        t          j        fv rd	}nt'          d
          ||z  |z  dz  }|S )Nr   r   r  multiprocessor_count   r     i   i   dtype not supported&.>)r.   r	  r  r/   r
  r  r  r  get_device_capabilityfloat16float32int32r{   int16ra   tl
float8e4nvfloat8e4b15float8e5r2   	rQ   
clock_raterR   r.   r  num_subcores
capabilityops_per_sub_coretflopss	            r   get_max_tensorcore_tflopsr'    s(   LLL -**,,=&<<VDDE[\_``L11&99J!}q%%%%U]EK000"u}enekBBB"uz2="."+NNN#4555J&)99D@FMr   c                        fd}|S )Nc                 J     t          j                    fd            }|S )Nc                  p   dd l }|                    t          j                                                              }
                                |                                k    }|r|dk    rt          j                            j        d                   }t          j	        d         dd}d|v s
J d            |d         j
        j        j        }| d	j         d
| d}t          j        ddd|gd|          }	|	j        dk    s
J d            dt#          |	j                  v sJ d S  | i | d S )Nr   zcuda-memcheck__file__PATH1)r,  PYTORCH_NO_CUDA_MEMORY_CACHINGrequestz@memcheck'ed test must have a (possibly unused) `request` fixturez::[]pytestz-vsT)capture_outputenvz7cuda-memcheck returned an error: bounds checking failedzERROR SUMMARY: 0 errors)psutilProcessr   getppidnameitemsr   realpath__globals__environnodecallspecidr   r   r   
returncoder   r   )r   r   r5  	ppid_namerun_cuda_memcheckr   r4  test_idr    r!   target_kwargstest_fns             r   r  z1cuda_memcheck.<locals>.decorator.<locals>.wrapper  sW   MMMrz||4499;;I - 3 3 5 5 G  )Y/%A%Aw''(;J(GHH!z&1UXYY F***,n*** +09<>>!1>>G>>> nox%L]agjkkk~***,e***0C
OOCCCCCC((((((r   )	functoolswraps)rE  r  rD  s   ` r   	decoratorz cuda_memcheck.<locals>.decorator  s>    		!	!	) 	) 	) 	) 	) 
"	!	)" r   r   )rD  rH  s   ` r   cuda_memcheckrI    s$        , r   F    c           	   #     K   	 t          j        g d           t          j        dddd|  d|  g           t          j        dddd| d| g           t          dg          d	         }t          d
g          d	         }t          || z
            dk     sJ d|  d            t          ||z
            dk     sJ d| d            d| z  }d|z  dz  }||fV  t          j        g d           t          j        g d           t          j        g d           d S # t          j        g d           t          j        g d           t          j        g d           w xY w)N)r   r   r   -pmr-  r   r   r   z--lock-gpu-clocks=r
   z--lock-memory-clocks=zclocks.current.smr   zclocks.current.memoryr-   zGPU SMs must run at z MHzg 3O?i   gMbP?)r   r   r   rM  r   )r   r   r   z-rgc)r   r   r   z-rmc)r   r   r#   abs)ref_sm_clockref_mem_clockcur_sm_clockcur_mem_clockr&  gbpss         r   set_gpu_clockrT    s     C E E EFFF>>>>>	!
 	 	 	 	CMCCMCC	!
 	 	 	 1233A66788;<,.//"4446_\6_6_6_444==011B6668b}8b8b8b666)L8&-dl E E EFFF A A ABBB A A ABBBBB 	 E E EFFF A A ABBB A A ABBBBs   CD! !AE%c                    dd l }ddlm} |s|j                                        }|j        j                            |          d         dz  }|j                                        }|d         dk     r+| |j	        k    rd}nM| |j
        k    rd}n?t          d	          | |j	        k    rd}n"| |j
        |j        fv rd}nt          d	          ||z  |z  d
z  }|S )Nr   r   r  r  r  r      @   r  r  )r.   r	  r  r/   r
  r  r  r  r  r  r  r2   r{   r!  s	            r   get_max_simd_tflopsrX    s   LLL -**,,=&<<VDDE[\_``L1133J!}qEM!!!em##!4555EM!!!u}en555!4555J&)99D@FMr   )r$   Nr%   )rN   rO   NNTr%   r/   )NNrp   r   )rJ  rK  )rF  r   r   r   
contextlibr   typingr   r   r   rp   r   r  r#   rM   ro   r   r   r   r  r  r'  rI  rT  rX  r   r   r   <module>r[     s       				     



 % % % % % % " " " " " " " " " "        <5 <5 <5 <5~ flL5 L5 L5 L5^"^ "^ "^ "^J? ? ? ? ? ? ? ?D` ` ` ` ` ` ` `F  
 
 
 
   :  6 C C C C8     r   