
    çg3                        d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ ej                            ej                            e                    Zej                            ed          gZej                            ed          Zd	gZ e j                    d
             Z e j                    d             Zd Z G d de          Zd Zd Z G d de          Z G d de          Z dS )    N)Path)_build)get_cache_manager)	GPUTarget)	GPUDriverincludelibcudac                     t          j        d          } | r| gS t          j        ddg                                          }d |                                D             }d |D             }t          j        d          }|r!|sd |                    d          D             }d	}|r|d
t          |          z  z  }|dz  }n
|dz  }|dz  }t          d |D                       s
J |            |S )NTRITON_LIBCUDA_PATHz/sbin/ldconfigz-pc                 J    g | ] }d |v |                                 d         !S )libcuda.so.1)split).0lines     Y/var/www/html/ai-engine/env/lib/python3.11/site-packages/triton/backends/nvidia/driver.py
<listcomp>z libcuda_dirs.<locals>.<listcomp>   s0    UUUnPT>T>TDJJLL>T>T>T    c                 L    g | ]!}t           j                            |          "S  )ospathdirname)r   locs     r   r   z libcuda_dirs.<locals>.<listcomp>   s&    111SBGOOC  111r   LD_LIBRARY_PATHc                     g | ]A}t           j                            t           j                            |d                     ?|BS )r   r   r   existsjoin)r   dirs     r   r   z libcuda_dirs.<locals>.<listcomp>   sA    sssPRPWP\P\]`bpPqPqArArssssr   :zlibcuda.so cannot found!
z!Possible files are located at %s.z:Please create a symlink of libcuda.so to any of the files.z<Please make sure GPU is set up and then run "/sbin/ldconfig"z- (requires sudo) to refresh the linker cache.c              3      K   | ]A}t           j                            t           j                            |d                     V  BdS )r   Nr   )r   r   s     r   	<genexpr>zlibcuda_dirs.<locals>.<genexpr>'   s@      SSdrw~~bgll4@@AASSSSSSr   )	r   getenv
subprocesscheck_outputdecode
splitlinesr   strany)env_libcuda_pathlibslocsdirsenv_ld_library_pathmsgs         r   libcuda_dirsr2      s4   y!677 " !!"$4d#;<<CCEED VU):):UUUD11D111D)$566 t4 tss288==sss
&C ?2SYY>>KKMM>>SSdSSSSSXXUXXXXKr   c                  .    t           gt                      S N)libdevice_dirr2   r   r   r   library_dirsr6   +   s    +LNN++r   c           	         t          j        |                     d                                                    }t	          |          }|                    | d          }|t          j                    5 }t          j	        
                    |d          }t          |d          5 }|                    |            d d d            n# 1 swxY w Y   t          |||t                      t          t                     }t          |d          5 }|                    |                                | dd          }d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   dd l}	|	j                            ||          }
|	j                            |
          }|
j                            |           |S )	Nzutf-8z.sozmain.cwrbT)binaryr   )hashlibsha256encode	hexdigestr   get_filetempfileTemporaryDirectoryr   r   r    openwriter   r6   include_dir	librariesputreadimportlib.utilutilspec_from_file_locationmodule_from_specloaderexec_module)srcnamekeycache
cache_pathtmpdirsrc_pathfso	importlibspecmods               r   compile_module_from_srcrZ   0   sO   
.G,,
-
-
7
7
9
9Cc""E4--J(** 	Lfw||FH55Hh$$               hYWWBb$ L1"YYqvvxxDdYKK
L L L L L L L L L L L L L L L	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L 	L >11$
CCD
.
)
)$
/
/CKC   JsZ   71E (C
>E 
C	E C	=E .E	=E 	E	E E	E  E$'E$c                   $     e Zd Z fdZd Z xZS )	CudaUtilsc                     t          | d          s-t          t          |                               |           | _        | j        S )Ninstance)hasattrsuperr\   __new__r^   )cls	__class__s    r   ra   zCudaUtils.__new__J   s<    sJ'' 	> C0088==CL|r   c                 :   t          t          t          j                            t
          d                                                    d          }|j        | _        |j        | _        |j	        | _	        |j
        | _
        |j        | _        |j        | _        d S )Nzdriver.c
cuda_utils)rZ   r   r   r   r    r   	read_textload_binaryget_device_propertiescuOccupancyMaxActiveClustersset_printf_fifo_sizefill_1d_tma_descriptorfill_2d_tma_descriptor)selfrY   s     r   __init__zCudaUtils.__init__O   s}    %d27<<+L+L&M&M&W&W&Y&Y[ghh?%(%>",/,L)$'$<!&)&@#&)&@###r   )__name__
__module____qualname__ra   rn   __classcell__rc   s   @r   r\   r\   H   sN            
A A A A A A Ar   r\   c                 N    | d         dk    rdS ddddddd	d
dddddddd|          S )Nr   *CUdeviceptrint32_tint8_tint16_tint64_tuint32_tuint8_tuint16_tuint64_tfloatdouble)i1i8i16i32i64u1u8u16u32u64fp16bf16fp32f32fp64r   tys    r   	ty_to_cppr   ^   sZ    	!u||}   	!
 
r   c                    	
 d                     d |                                D                       }d 	d 
d                     	
fd|                                D                       }d|z   }t          |          dk    r4dd                     d	 |                                D                       z   nd} fd
|                                D             }dt          |          dk    rd|z   nd dd                     d |D                        dd                     	fd|                                D                        d| d| dd                     d |                                D                        dt          |          dk    r4dd                     d |                                D                       z   nd d}|S )N, c              3   D   K   | ]\  }}t          |           d | V  dS )z argNr   r   ir   s      r   r$   z make_launcher.<locals>.<genexpr>w   s:      SS2Yr]]3333SSSSSSr   c                 <    | d         dk    rdS t          |           S )Nr   ru   	PyObject*r   r   s    r   _extracted_typez&make_launcher.<locals>._extracted_typey   s     a5C<<;}}r   c                 ,    dddddddddd	d
dd|          S )NOrU   dlbhr   BHIK)r   r   r   longrx   ry   rw   rz   r|   r}   r{   r~   r   r   s    r   	format_ofz make_launcher.<locals>.format_of~   s?    
 
  	r    c                 8    g | ]}  |                    S r   r   )r   r   r   r   s     r   r   z!make_launcher.<locals>.<listcomp>   s-    WWWb99__R%8%899WWWr   	iiiKKOOOOr   c              3   &   K   | ]\  }}d | V  dS )z&_argNr   r   s      r   r$   z make_launcher.<locals>.<genexpr>   s,       L LB L L L L L Lr   c                     g | ]}|v|	S r   r   )r   r   	constantss     r   r   z!make_launcher.<locals>.<listcomp>   s#    @@@AQi-?-?a-?-?-?r   a	  
#include "cuda.h"
#include <stdbool.h>
#include <Python.h>
#include <dlfcn.h>

static inline void gpuAssert(CUresult code, const char *file, int line)
{
   if (code != CUDA_SUCCESS)
   {
      const char* prefix = "Triton Error [CUDA]: ";
      const char* str;
      cuGetErrorString(code, &str);
      char err[1024] = {0};
      strcat(err, prefix);
      strcat(err, str);
      PyGILState_STATE gil_state;
      gil_state = PyGILState_Ensure();
      PyErr_SetString(PyExc_RuntimeError, err);
      PyGILState_Release(gil_state);
   }
}

#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra);

static cuLaunchKernelEx_t getLaunchKernelExHandle() {
  // Open the shared library
  void* handle = dlopen("libcuda.so.1", RTLD_LAZY);
  if (!handle) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");
    return NULL;
  }
  // Clear any existing error
  dlerror();
  cuLaunchKernelEx_t cuLaunchKernelExHandle = (cuLaunchKernelEx_t)dlsym(handle, "cuLaunchKernelEx");
  // Check for errors
  const char *dlsym_error = dlerror();
  if (dlsym_error) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from libcuda.so.1");
    return NULL;
  }
  return cuLaunchKernelExHandle;
}

static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction functionz) {
  void *params[] = { c              3       K   | ]	}d | V  
dS )z&argNr   )r   r   s     r   r$   z make_launcher.<locals>.<genexpr>   s(       < < < < < < < <r   a   };
  if (gridX*gridY*gridZ > 0) {
    if (num_ctas == 1) {
      CUDA_CHECK(cuLaunchKernel(function, gridX, gridY, gridZ, 32*num_warps, 1, 1, shared_memory, stream, params, 0));
    } else {
      CUlaunchAttribute launchAttr[2];
      launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
      launchAttr[0].value.clusterDim.x = clusterDimX;
      launchAttr[0].value.clusterDim.y = clusterDimY;
      launchAttr[0].value.clusterDim.z = clusterDimZ;
      launchAttr[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
      launchAttr[1].value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
      CUlaunchConfig config;
      config.gridDimX = gridX * clusterDimX;
      config.gridDimY = gridY * clusterDimY;
      config.gridDimZ = gridZ * clusterDimZ;
      config.blockDimX = 32 * num_warps;
      config.blockDimY = 1;
      config.blockDimZ = 1;
      config.sharedMemBytes = shared_memory;
      config.hStream = stream;
      config.attrs = launchAttr;
      config.numAttrs = 2;
      static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
      if (cuLaunchKernelExHandle == NULL) {
        cuLaunchKernelExHandle = getLaunchKernelExHandle();
      }
      CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
    }
  }
}

typedef struct _DevicePtrInfo {
    CUdeviceptr dev_ptr;
    bool valid;
} DevicePtrInfo;

static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {
  DevicePtrInfo ptr_info;
  ptr_info.dev_ptr = 0;
  ptr_info.valid = true;
  if (PyLong_Check(obj)) {
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj);
    return ptr_info;
  }
  if (obj == Py_None) {
    // valid nullptr
    return ptr_info;
  }
  PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr");
  if(ptr){
    PyObject *empty_tuple = PyTuple_New(0);
    PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL);
    Py_DECREF(empty_tuple);
    Py_DECREF(ptr);
    if (!PyLong_Check(ret)) {
      PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
      ptr_info.valid = false;
      return ptr_info;
    }
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret);
    if(!ptr_info.dev_ptr)
      return ptr_info;
    uint64_t dev_ptr;
    int status = cuPointerGetAttribute(&dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
    if (status == CUDA_ERROR_INVALID_VALUE) {
        PyErr_Format(PyExc_ValueError,
                     "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
        ptr_info.valid = false;
    }
    ptr_info.dev_ptr = dev_ptr;
    Py_DECREF(ret);  // Thanks ChatGPT!
    return ptr_info;
  }
  PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
  ptr_info.valid = false;
  return ptr_info;
}

static PyObject* launch(PyObject* self, PyObject* args) {
  int gridX, gridY, gridZ;
  uint64_t _stream;
  uint64_t _function;
  PyObject *launch_enter_hook = NULL;
  PyObject *launch_exit_hook = NULL;
  PyObject *kernel_metadata = NULL;
  PyObject *launch_metadata = NULL;
   c                 8    g | ]\  }} |           d | dS )z _arg; r   )r   r   r   r   s      r   r   z!make_launcher.<locals>.<listcomp>  s8    OOOEAr##//!///OOOr   z
  if(!PyArg_ParseTuple(args, "z", &gridX, &gridY, &gridZ, &_stream, &_function,
                                           &kernel_metadata, &launch_metadata,
                                           &launch_enter_hook, &launch_exit_hook ai  )) {
    return NULL;
  }

  int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ;
  if (!PyArg_ParseTuple(kernel_metadata, "iiiiii", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {
    PyErr_SetString(PyExc_TypeError, "kernel_metadata must be a tuple");
    return NULL;
  }

  // extract launch metadata
  if (launch_enter_hook != Py_None){
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_enter_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;
  }

  // raise exception asap
  r   c                 N    g | ]"\  }}|d          dk    rd| d| d| d| d	nd#S )r   ru   zDevicePtrInfo ptr_infoz = getPointer(_argr   z); if (!ptr_infoz.valid) return NULL;r   r   r   s      r   r   z!make_launcher.<locals>.<listcomp>1  s~      d  d  d  IN  IJ  LNoqrsotx{o{o{kqkkAkkkkTUkkkk  BD  d  d  dr   z;
  Py_BEGIN_ALLOW_THREADS;
  _launch(gridX, gridY, gridZ, num_warps, num_ctas, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (CUstream)_stream, (CUfunction)_functionc              3   J   K   | ]\  }}|d          dk    rd| dnd| V  dS )r   ru   ptr_infoz.dev_ptr_argNr   r   s      r   r$   z make_launcher.<locals>.<genexpr>3  s        dx  dx  ]b  ]^  `b  A  BC  D  FI  I  I  e{  pq  e{  e{  e{  e{  OY  VW  OY  OY  dx  dx  dx  dx  dx  dxr   aW  );
  Py_END_ALLOW_THREADS;
  if (PyErr_Occurred()) {
    return NULL;
  }

  if(launch_exit_hook != Py_None){
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_exit_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;

  }

  // return None
  Py_INCREF(Py_None);
  return Py_None;
}

static PyMethodDef ModuleMethods[] = {
  {"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"},
  {NULL, NULL, 0, NULL} // sentinel
};

static struct PyModuleDef ModuleDef = {
  PyModuleDef_HEAD_INIT,
  "__triton_launcher",
  NULL, //documentation
  -1, //size
  ModuleMethods
};

PyMODINIT_FUNC PyInit___triton_launcher(void) {
  PyObject *m = PyModule_Create(&ModuleDef);
  if(m == NULL) {
    return NULL;
  }
  PyModule_AddFunctions(m, ModuleMethods);
  return m;
}
)r    itemsvalueslenkeys)r   	signatureids	arg_declsargs_formatformat	args_listparamsrN   r   r   s   `        @@r   make_launcherr   t   s3    		SSARARSSSSSI  
    ''WWWWWIDTDTDVDVWWWXXK;&FPST]P^P^abPbPbtyy L L)//:K:K L L LLLLLhjI A@@@))@@@FH\ RU  V_  R`  R`  cd  Rd  Rd  ~B  EN  ~N  ~N  jl]H H^ yy < <V < < <<<_H HL 88OOOOY__=N=NOOOPPMH HN !'OH HR S\SH Hz 99  d  d  R[  Ra  Ra  Rc  Rc  d  d  d  e  e{H H~ |  @I  |J  |J  MN  |N  |N  TX  [_  [d  [d  dx  dx  fo  fu  fu  fw  fw  dx  dx  dx  [x  [x  Tx  Tx  TVH H HCR Jr   c                       e Zd Zd Zd ZdS )CudaLauncherc                    dt          d          rj        j        nt                      i}t          d          rj        nt                      }fdfd|                                D             }fdj                                        D             }t          |||          t          d          }|j
        | _
        d S )Nids_of_const_exprsfnr   c                 p    t          | t                    rj        j                            |           n| S r4   )
isinstancer*   r   	arg_namesindex)r   rN   s    r   <lambda>z'CudaLauncher.__init__.<locals>.<lambda>e  s/    As9K9KRCF,221555QR r   c                 .    i | ]\  }} |          |S r   r   r   rP   valuecst_keys      r   
<dictcomp>z)CudaLauncher.__init__.<locals>.<dictcomp>f  s'    MMMZS%WWS\\5MMMr   c                 .    i | ]\  }} |          |S r   r   r   s      r   r   z)CudaLauncher.__init__.<locals>.<dictcomp>g  s'    QQQZS%WWS\\5QQQr   __triton_launcher)r_   r   
constexprstupler   dictr   r   r   rZ   launch)rm   rN   metadatar   r   r   rY   r   s    `     @r   rn   zCudaLauncher.__init__b  s    #'#t:L:L%YSV%6%6RWRYRYZ%,S+%>%>JCMMDFF	RRRRMMMM9??;L;LMMM	QQQQ3=;N;N;P;PQQQ	Iy#66%c+>??jr   c                       | j         |i | d S r4   )r   )rm   argskwargss      r   __call__zCudaLauncher.__call__l  s    T$V$$$$$r   N)ro   rp   rq   rn   r   r   r   r   r   r   `  s2        ! ! !% % % % %r   r   c                   :     e Zd Z fdZd Zed             Z xZS )
CudaDriverc                     t                      | _        t          | _        t	                                                       d S r4   )r\   utilsr   launcher_clsr`   rn   )rm   rc   s    r   rn   zCudaDriver.__init__r  s2    [[
(r   c                     |                                  }|                     |          }|d         dz  |d         z   }d}t          d||          S )Nr   
          r
   )get_current_deviceget_device_capabilityr   )rm   device
capability	warp_sizes       r   get_current_targetzCudaDriver.get_current_targetw  sT    ((**//77
]R'*Q-7
	Y777r   c                  X    dd l } | j                                        o| j        j        d u S )Nr   )torchr
   is_availableversionhip)r   s    r   	is_activezCudaDriver.is_active~  s.    z&&((Hem.?4.GHr   )ro   rp   rq   rn   r   staticmethodr   rr   rs   s   @r   r   r   p  si            
8 8 8 I I \I I I I Ir   r   )!	functoolsr   r;   r&   r@   pathlibr   triton.runtime.buildr   triton.runtime.cacher   triton.backends.compilerr   triton.backends.driverr   r   r   realpath__file__r    rD   r5   rE   	lru_cacher2   r6   rZ   objectr\   r   r   r   r   r   r   r   <module>r      s       				             ' ' ' ' ' ' 2 2 2 2 2 2 . . . . . . , , , , , ,
'//"'**844
5
5w||GY//0We,,H	   0 , , ,  0A A A A A A A A,
 
 
,i i iX% % % % %6 % % % I I I I I I I I I Ir   