
    çg3                        d dl mZmZ d dlmZmZmZmZ d dlm	Z	 d dl
Z
d dlmZmZmZ d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ  e
j                    defd            Z e
j                    d	             Z e
j                    d
efd            Z e
j        d          d             Z e	d           G d d                      Z G d de          ZdS )    )BaseBackend	GPUTarget)irpassesllvmnvidia)	dataclassN)AnyTupleOptional)Pathbinaryc                    t           j                            d|                                  dd          t           j                            t           j                            t                    d|           g}|D ]}t           j                            |          rt           j        	                    |          rst          j        |dgt          j                  }|Ot          j        d|                    d          t          j        	          }|||                    d
          fc S t%          d|            )NTRITON__PATH bin	--version)stderrz.*release (\d+\.\d+).*utf-8flags   zCannot find )osenvirongetupperpathjoindirname__file__existsisfile
subprocesscheck_outputSTDOUTresearchdecode	MULTILINEgroupRuntimeError)r   pathsr   resultversions        [/var/www/html/ai-engine/env/lib/python3.11/site-packages/triton/backends/nvidia/compiler.py_path_to_binaryr1      s    	
6666;;
RW__X..v>>E
  1 17>># 	127>>##6#6 	1,c;-?
HYZZZF!)$=v}}W?U?U]_]ijjj&a 0 00000
.f..
/
//    c                  ~    t          j        t          d          d         dg                              d          } | S )Nptxasr   r   r   )r$   r%   r1   r)   )r/   s    r0   get_ptxas_versionr5   !   s8    %w'?'?'BK&PQQXXY`aaGNr2   returnc                     t          | t                    sJ t          t          |                     d                    \  }}|dk    rd|z   S |dk    rd|z   S |dk    rd|z   S t          d          )	zK
    Get the highest PTX version supported by the current CUDA driver.
    .   P      F   
   ?   z'Triton only support CUDA 10.0 or higher)
isinstancestrmapintsplitr,   )cuda_versionmajorminors      r0   ptx_get_versionrG   '   s    
 lC(((((sL..s3344LE5{{Ez{{Ez{{Ez
@
A
AAr2   c                     t          | d          5 }t          j        |                                                                          cd d d            S # 1 swxY w Y   d S )Nrb)openhashlibsha256read	hexdigest)r   fs     r0   	file_hashrP   7   s    	dD		 4Q~affhh''11334 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4s   8AAAT)frozenc                      e Zd ZU dZeed<   dZeed<   dZeed<   dZe	e         ed<   d	Z
eed
<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZee         ed<   dZeed<   dZeed<   dZeed<   dZeed<   d Zd ZdS )CUDAOptions   	num_warpsr   num_ctas   
num_stagesNmaxnreg)r   r   r   cluster_dimsptx_versionTenable_fp_fusionFallow_fp8e4nvallow_fp8e4b15tf32default_dot_input_precision)r_   tf32x3ieeeallowed_dot_input_precisionsmax_num_imprecise_acc_defaultextern_libsdebugcudabackend_namec                    t          t                    j        dz  }| j        i nt	          | j                  }|                    dd           s(t          j        dt          |dz                      |d<   t          
                    | dt          |                                                     | j        dk    r| j        | j        dz
  z  dk    s
J d            d S )	Nlib	libdeviceTRITON_LIBDEVICE_PATHzlibdevice.10.bcre   r   r   znum_warps must be a power of 2)r   r!   parentre   dictr   r   getenvr@   object__setattr__tupleitemsrU   )selfdefault_libdirre   s      r0   __post_init__zCUDAOptions.__post_init__Q   s    h.6 ,4bb$t?O:P:P{D11 	s')y1H#n_pNpJqJq'r'rK$4k6G6G6I6I0J0JKKK~!!t~!9K'LQR&R&R&R/ 'S&R&R&R&Rr2   c                 v   t          | j                  }t          d t          |d                   D                       |d<   d                    d t          |                                          D                       }t          j        |                    d                    	                                S )Nc              3   >   K   | ]\  }}|t          |          fV  d S N)rP   ).0kvs      r0   	<genexpr>z#CUDAOptions.hash.<locals>.<genexpr>\   s1      (h(htq!!Yq\\):(h(h(h(h(h(hr2   re   _c                 "    g | ]\  }}| d | S )- )rz   namevals      r0   
<listcomp>z$CUDAOptions.hash.<locals>.<listcomp>]   s&    SSSID#4#SSSr2   r   )
rn   __dict__rr   sortedr   rs   rK   rL   encoderN   )rt   	hash_dictkeys      r0   hashzCUDAOptions.hashZ   s    ''	#((h(hviXeNfGgGg(h(h(h#h#h	- hhSS	@Q@Q9R9RSSSTT~cjj1122<<>>>r2   )__name__
__module____qualname__rU   rB   __annotations__rV   rX   rY   r   rZ   rr   r[   r\   boolr]   r^   r`   r@   rc   r   rd   re   rn   rf   rh   rv   r   r   r2   r0   rS   rS   =   s2        IsHcJ "GXc]!!!#L%###K!d!!!M4 ND   '----/I %*III*.!4...KE4L#0 0 0? ? ? ? ?r2   rS   c                       e Zd Zedefd            Zdeddf fdZdefdZd Z	d Z
d	 Zed
             Zed             Zed             Zed             Zed             Zd Z ej                    d             Z xZS )CUDABackendtargetc                     | j         dk    S )Nrg   )backend)r   s    r0   supports_targetzCUDABackend.supports_targetc   s    ~''r2   r6   Nc                     t                                          |           |j        | _        t	          | j        t
                    sJ d| _        d S )Ncubin)super__init__arch
capabilityr?   rB   
binary_ext)rt   r   	__class__s     r0   r   zCUDABackend.__init__g   sG        +$/3/////!r2   c                     fdt           j                                        D             }| j        dk    |d<   | j        dk     |d<   | j        dk    rdnd|d<   t          d	i |S )
Nc                 *    i | ]}|v ||         S r   r   )rz   r{   optss     r0   
<dictcomp>z-CUDABackend.parse_options.<locals>.<dictcomp>n   s%    YYYqqTXyy47yyyr2   Y   r]   Z   r^   i   @r   rd   r   )rS   __dataclass_fields__keysr   )rt   r   argss    ` r0   parse_optionszCUDABackend.parse_optionsm   s~    YYYYK$D$I$I$K$KYYY $2 5_!%2!59=B9N9NTU,-""T"""r2   c                 r    |j         |j        |j        |j        d         |j        d         |j        d         fS )Nr   r      )rU   rV   sharedrZ   )rt   metadatas     r0   pack_metadatazCUDABackend.pack_metadatat   s>    O!!$!!$!!$
 	
r2   c                 T    dd l mc mc m} d| j        dk    r|j        n|j        i}|S )Nr   convert_custom_typesr:   )triton.language.extra.cudalanguageextrarg   r   convert_custom_float8_sm80convert_custom_float8_sm70)rt   rg   codegen_fnss      r0   get_codegen_implementationz&CUDABackend.get_codegen_implementation~   sR    111111111111"/3"/D/DD++$Ji
 r2   c                 .    t          j        |           d S ry   )r   load_dialects)rt   ctxs     r0   r   zCUDABackend.load_dialects   s    S!!!!!r2   c                 z   t          j        | j                  }|                                 t          j                            |           t          j                            |           t          j        	                    |           t          j        
                    |           t          j                            |           t          j                            |           t          j                            |           t          j                            |           |                    |            | S ry   )r   pass_managercontextenable_debugr   commonadd_inlinerttiradd_rewrite_tensor_pointeradd_combineadd_canonicalizeradd_reorder_broadcastadd_cseadd_licmadd_symbol_dcerun)modr   optpms       r0   	make_ttirzCUDABackend.make_ttir   s    _S[))
!!"%%%..r222###''+++))"---b!!!r"""$$R(((
s
r2   c                 D   t          j                    }|j        6|j        d         |_        |j        d         |_        |j        d         |_        t          j        | j                  }|	                                 t          j                            |d| |j        d|j                   t          j                            |           |dz  dk    rt          j                            |           t           j
        j                            ||           t          j                            |           t          j                            |           t          j                            |           t          j                            |           t          j                            ||dk               t          j                            |           |dz  dk    rDt          j                            |           t          j                            ||j                   t          j                            |           t          j                            ||dk               t          j                            |           t          j                            |           t          j                            |           t          j                            |           t          j                             |           |dz  d	k    rHt           j
        j        !                    |           t           j
        j        "                    |           t          j        #                    |           |$                    |            |j        |j        |j        f|d
<   | S )Nr   r   r   zcuda:    r=      r:   	   rZ   )%r   ClusterInforZ   clusterDimXclusterDimYclusterDimZr   r   r   r   r   r   add_convert_to_ttgpuirrU   rV   ttgpuiradd_coalesceadd_f32_dot_tc	ttnvgpuiradd_plan_ctaadd_remove_layout_conversionsadd_optimize_thread_localityadd_accelerate_matmuladd_optimize_dot_operandsr   r    add_combine_tensor_select_and_ifadd_pipelinerX   add_prefetchadd_reduce_data_duplicationadd_reorder_instructionsr   add_fence_insertionadd_tma_loweringr   r   )r   r   r   r   cluster_infor   s         r0   
make_ttgirzCUDABackend.make_ttgir   s   )++''*'7':L$'*'7':L$'*'7':L$_S[))
**2/Cz/C/CS]TVX[Xdeee##B'''q  N))"---,,R>>>44R88833B777,,R00044R88800Z25EFFFb!!!q  N;;B???N''CN;;;##B'''00Z25EFFF44R888222666//333b!!!$$R(((q  M#77;;;M#44R888''+++
s$0$<l>VXdXp#q 
r2   c                 t   |                      d          }||dxx         |z  cc<   | }t          j        |j                  }|                                 t
          j        j                            |           t          j        	                    |           t          j
                            |           t          j
                            |           t          j                            |           t
          j        j                            ||           t
          j        j                            |           t          j
                            |           t          j                            |           t          j                            |           t          j                            |           t,          j                            dd          dk    rt          j                            |           |                    |           t9          j                     t9          j                    }t9          j        ||          }t          j        |           |j         Y|!                                D ]D}	|	"                                s.|	#                                r|	$                    |j                    E|j%        r&d |j%        D             }
t9          j&        ||
           t9          j'        |t8          j(                   |                      d          |d<   tS          |          }~~|S )Nz"triton_gpu.num-warp-groups-per-ctarU   TRITON_DISABLE_LINE_INFO0c                     g | ]\  }}|S r   r   )rz   r   r   s      r0   r   z)CUDABackend.make_llir.<locals>.<listcomp>   s    BBBltTTBBBr2   ztriton_gpu.sharedr   )*get_int_attrr   r   r   r   r   r   r   %add_decompose_unsupported_conversionsr   convertadd_scf_to_cfadd_index_to_llvmiradd_allocate_shared_memoryadd_to_llvmirr   add_nvgpu_to_llvmadd_arith_to_llvmirr   r   r   r   r   r   r   llvmiradd_di_scoper   r   init_targets	to_moduleset_nvvm_reflect_ftzrY   get_functionsis_declarationis_external_linkageset_nvvm_maxnregre   link_extern_libsoptimize_moduleOPTIMIZE_O3r@   )srcr   optionsr   num_warp_groupsr   r   r   llvm_modr{   r-   rets               r0   	make_llirzCUDABackend.make_llir   s    **+OPP&[!!!_4!!!_S[))
CCBGGG77;;;$$R(((**2...11"555++B
;;;11"555**2...''+++b!!!$$R(((:>>4c::cAAM&&r***
s,..>#w//#H--- ?&++-- 8 8'')) 8a.C.C.E.E 8&&w777 	3BBg.ABBBE!(E222Xt'7888 !--.ABB(mm
r2   c           	      `   |j         }|!t          d          \  }}t          |          }t          d|          }d}|dk    rdnd| }	d| }
t	          j        | ||	|
dg|j        d	          }t          j        d
|          }t          |          dk    sJ |d         |d<   |dz   d|dz   }t          j
        dd| |t          j                  }t          j
        dd|          }t          j                            dd          dk    rt          d           t          |           |S )Nr4   S   znvptx64-nvidia-cudar   sm_90asm_z+ptxznvptx-short-ptrFz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r   r   r   r=   r8   z\.version \d+\.\d+z	.version r   z,\s*debug|debug,\s*r   NVPTX_ENABLE_DUMPr   1z // -----// NVPTX Dump //----- //)r[   r1   rG   minr   translate_to_asmr\   r'   findalllensubr*   r   r   r   print)r   r   r   r   r[   r~   rD   llvm_ptx_versiontripleprocfeaturesr  namess                r0   make_ptxzCUDABackend.make_ptx   sV   o-g66OA|),77K r;//&%++xx1Cz1C1C,*,,#CxBSATVYVjlqrr
FLL5zzQ 8$b;;;r>;;f*,E,E,EsRTR^___f+R55:>>-s33s::4555#JJJ
r2   c                 
   t          d          \  }}t          j        ddd          5 }t          j        ddd          5 }|                    |            |                                 |j        dz   }t          j                            d	          rd
nd}	|j	        rd
nd}
|dk    rdnd}t          j                            dd          dk    r | |	 |
 d| | |j         d| d|j         }n| |	 |
 d| | |j         d| d|j         }	 t          j        |dd           n# t          j        $ r}t          |j                  5 }|                                }d d d            n# 1 swxY w Y   |j        dk    rt!          d|           |j        dt"          j        z   k    rt!          d|j         d|           t!          d|j         d|           d }~ww xY w	 t          j                            |j                  rt          j        |j                   t          j                            |j                  rt          j        |j                   n# t          j                            |j                  rt          j        |j                   t          j                            |j                  rt          j        |j                   w w xY wt          |d           5 }|                                }d d d            n# 1 swxY w Y   t          j                            |          rt          j        |           d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   |S )!Nr4   Fwz.ptx)deletemodesuffixrz.logz.or   r   z
 -lineinfoz --fmad=falser   za  DISABLE_PTXAS_OPTr   r  z  -v --opt-level 0 --gpu-name=sm_z -o z 2> z -v --gpu-name=sm_T)shellcheck   z$Internal Triton PTX codegen error: 
   zPlease run `ptxas z+` to confirm that this is a bug in `ptxas`
z`ptxas` failed with error code z: 
rI   )r1   tempfileNamedTemporaryFilewriteflushr   r   r   r   r\   r$   r   CalledProcessErrorrJ   rM   
returncoder,   signalSIGSEGVr   r"   remove)r   r   r   r   r4   r~   fsrcflogfbin	line_infofmadr  cmdelog_filelogrO   r   s                     r0   
make_cubinzCUDABackend.make_cubin  s   "7++q(COOO #	 SW'u3vNNN#	 RVJJsOOOJJLLL9t#D jnn-GHHZlI-B22?D'2--TT3Fz~~13773>>  J	  J4  J  JQ[  J]c  Jeien  J  Jtx  J  J  C  H  J  J{	{4{{:{v{W[W`{{fj{{ptpy{{)s$d;;;;;0 	b 	b 	b$)__ *"--//C* * * * * * * * * * * * * * *<3&&&'Ts'T'TUUU\S6>%999&iTYiidgiik k k ''`'`'`[^'`'`aaa	b < 7>>$),, )Idi(((7>>$),, )Idi((( 7>>$),, )Idi(((7>>$),, )Idi(((() dD!! !Q! ! ! ! ! ! ! ! ! ! ! ! ! ! !w~~d##  	$G#	  #	  #	  #	  #	  #	  #	  #	  #	  #	  #	  #	  #	  #	  #	  #	  #	  #	  #	  #	  #	  #	  #	  #	  #	  #	  #	  #	  #	  #	 H s   M8CM!D*)I&*G'9G"E.	"G".E22G"5E26A,G""G''I&+A;M!&A=K##M!6LM!LM!L6M!M8!M%	%M8(M%	)M88M<?M<c                 d      fd|d<    fd|d<    fd|d<    fd|d<    fd	|d
<   d S )Nc                 2                         | |          S ry   )r   r   r   r  rt   s     r0   <lambda>z(CUDABackend.add_stages.<locals>.<lambda><  s    t~~c8W/U/U r2   r   c                 >                         | |j                  S ry   )r   r   r9  s     r0   r:  z(CUDABackend.add_stages.<locals>.<lambda>=      XwX\Xg0h0h r2   ttgirc                 >                         | |j                  S ry   )r  r   r9  s     r0   r:  z(CUDABackend.add_stages.<locals>.<lambda>>  s    t~~c8WVZVe/f/f r2   llirc                 >                         | |j                  S ry   )r  r   r9  s     r0   r:  z(CUDABackend.add_stages.<locals>.<lambda>?  s    dmmC7TXTc.d.d r2   ptxc                 >                         | |j                  S ry   )r6  r   r9  s     r0   r:  z(CUDABackend.add_stages.<locals>.<lambda>@  r<  r2   r   r   )rt   stagesr  s   ` `r0   
add_stageszCUDABackend.add_stages;  sq    UUUUUvhhhhhwfffffvddddduhhhhhwr2   c                 6    t                      }| d| j         S )Nr   )r5   r   )rt   r/   s     r0   r   zCUDABackend.hashB  s#    #%%--DO---r2   )r   r   r   staticmethodr   r   r   r
   r   r   r   r   r   r   r  r  r6  rD  	functools	lru_cacher   __classcell__)r   s   @r0   r   r   a   s{       (	 ( ( ( \("y "T " " " " " "#S # # # #
 
 
  " " "   \ & & \&P . . \.`   \> & & \&Pi i i Y. . . . . . .r2   r   ) triton.backends.compilerr   r   triton._C.libtritonr   r   r   r   dataclassesr	   rG  typingr
   r   r   rK   r'   r$  r*  r   r$   pathlibr   rH  r@   r1   r5   rB   rG   rP   rS   r   r   r2   r0   <module>rO     s   ; ; ; ; ; ; ; ; 8 8 8 8 8 8 8 8 8 8 8 8 ! ! ! ! ! !     ' ' ' ' ' ' ' ' ' '  				   				           0C 0 0 0 0    
 BS B B B B T4 4 4
 $ ?  ?  ?  ?  ?  ?  ?  ?Fd. d. d. d. d.+ d. d. d. d. d.r2   