
    Pid                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ ej                            ej                            e                    Zej                            ed          gZej                            ed	          Zd
gZda e j                    d             Z e j                    d             Z G d de          Zd ZddddddZ ddddddZ!dZ" e#e"          Z$d Z% e&d  e'd          D                       Z(de(d<   de(d<   de(d<   d Z)d  Z* G d! d"e          Z+ G d# d$e          Z,dS )%    N)Path)knobs)compile_module_from_src)_allocation)	GPUTarget)	GPUDriverincludelibcudac                     t           j        j        x} r| gS t          j        ddg                              d          }d |                                D             }d |D             }t          j        d          }|r!|sd |	                    d	          D             }d
}|r|dt          |          z  z  }|dz  }n
|dz  }|dz  }t          d |D                       s
J |            |S )Nz/sbin/ldconfigz-pignore)errorsc                 J    g | ] }d |v |                                 d         !S )libcuda.so.1)split).0lines     u/var/www/development/aibuddy-work/election-extract/venv/lib/python3.11/site-packages/triton/backends/nvidia/driver.py
<listcomp>z libcuda_dirs.<locals>.<listcomp>   s0    UUUnPT>T>TDJJLL>T>T>T    c                 L    g | ]!}t           j                            |          "S  )ospathdirname)r   locs     r   r   z libcuda_dirs.<locals>.<listcomp>   s&    111SBGOOC  111r   LD_LIBRARY_PATHc                     g | ]A}t           j                            t           j                            |d                     ?|BS )r   r   r   existsjoin)r   dirs     r   r   z libcuda_dirs.<locals>.<listcomp>!   sA    sssPRPWP\P\]`bpPqPqArArssssr   :zlibcuda.so cannot found!
z!Possible files are located at %s.z:Please create a symlink of libcuda.so to any of the files.z<Please make sure GPU is set up and then run "/sbin/ldconfig"z- (requires sudo) to refresh the linker cache.c              3      K   | ]A}t           j                            t           j                            |d                     V  BdS )r   Nr    )r   r   s     r   	<genexpr>zlibcuda_dirs.<locals>.<genexpr>)   s@      SSdrw~~bgll4@@AASSSSSSr   )r   nvidialibcuda_path
subprocesscheck_outputdecode
splitlinesr   getenvr   strany)env_libcuda_pathlibslocsdirsenv_ld_library_pathmsgs         r   libcuda_dirsr6      s3    <44 " !!"$4d#;<<CC8CTTD VU):):UUUD11D111D)$566 t4 tss288==sss
&C ?2SYY>>KKMM>>SSdSSSSSXXUXXXXKr   c                  .    t           gt                      S N)libdevice_dirr6   r   r   r   library_dirsr:   -   s    +LNN++r   c                   $     e Zd Z fdZd Z xZS )	CudaUtilsc                     t          | d          s-t          t          |                               |           | _        | j        S )Ninstance)hasattrsuperr<   __new__r>   )cls	__class__s    r   rA   zCudaUtils.__new__9   s<    sJ'' 	> C0088==CL|r   c                 d   t          t          t          j                            t
          d                                                    dt                      t          t                    }|j
        a
|j        | _        |j        | _        |j        | _        |j        | _        |j        | _        d S )Nzdriver.c
cuda_utilssrcnamer:   include_dirs	libraries)r   r   r   r   r"   r   	read_textr:   rI   rJ   PyCUtensorMapload_binaryget_device_propertiescuOccupancyMaxActiveClustersset_printf_fifo_sizefill_tma_descriptor)selfmods     r   __init__zCudaUtils.__init__>   s    %RW\\':6677AACC%%
 
 
 )?%(%>",/,L)$'$<!#&#:   r   )__name__
__module____qualname__rA   rT   __classcell__rC   s   @r   r<   r<   7   sG            
; ; ; ; ; ; ;r   r<   c                     | d         dk    rdS |                      d          rdS i ddddd	d
dddddddddddddddddddddddddd|          S )Nr   *CUdeviceptr
tensordescCUtensorMapi1int8_ti8i16int16_ti32int32_ti64int64_tu1uint8_tu8u16uint16_tu32uint32_tu64uint64_tfp16doublebf16fp32f32fp64	nvTmaDesc)
startswith)tys    r   	ty_to_cpprz   T   s    	!u||}	}}\"" }hh 	y 	y	
 	y 	i 	i 	z 	z 	z 	 	 	 	x 	  	]!" 	#
 
r   rl   rn   rp   )rq   rs   rt   ru   rv   	pack_fp16	pack_bf16	pack_fp32	pack_fp64iiiKKppOOOOOOc                    fd}fdfdfd ||                                           }d t          |          D             }d                    fd|                                 D                       }t          |z   }g }|                                 D ]} ||           d t          |          D             }t	          |          d	k    r4d
d
                    d |                                D                       z   nd}	g }
|                                D ]b\  }}|dk    r|t          v r&|
                    t          |          d|            ;|
                    t          |           d|            cd
                    |
          }g }|                                D ]\  }}|d	         dk    r|                    d| d           +|t          v r|                    d| d           N|dk    r|                    d|            m|dk    r|                    d|            t          t	          |                    }d}d |                                D             }d |                                D             }d |                                D             }d |                                D             }|                    d           |                    d           dt	          |          d	k    rd
|z   nd dd
                    |           d|                    fd|                                D                        d | d!|	 d"|                    |           d|                    |           d|                    |           d#t	          |          d	k    rd
d
                    |          z   nd d$}|S )%Nc                    g }d}| D ]k}t          |t                    r=|                    d          r'
r
|         nd }|dz  }t          j        d|          }|                    d          }|                    d          }|                    d          dz   }|X|                    d|z              t          d|z            D ]}	|                    d           |                    d	           n|                    d
           t          |          D ]}	|                    d           t          |          D ]}	|                    d           V|                    |           m
r|t          
          k    sJ |S )Nr   r]      ztensordesc<([^[>]*)\[([^]]*)\]   ,r[   rf   r_   rw   rd   )

isinstancer.   rx   rematchgroupcountappendrangelen)	signatureoutputtensordesc_idxsigmetar   dtypeshapendim_tensordesc_metas             r   _expand_signaturez(make_launcher.<locals>._expand_signature   s     	# 	#C#s## #|(D(D #:IS~66t!#!CSIIAA{{3''!+<MM#+... #1t8__ - -e,,,,MM$''''MM+...t ) )AMM%((((t ) )AMM%(((() c"""""LnO8L8L&L&L&L&Lr   c                     t          | t                    r| D ]} ||           d S |                    |            d S r8   )r   tupler   )r   r   x_flatten_signatures      r   r   z)make_launcher.<locals>._flatten_signature   s^    c5!! 	 . .""1f----. . MM#r   c                     t          | t                    r)d                    t          |                     }d| dS | d         dk    rdS | dv rdS t	          |           S )Nr   []r   r[   z	PyObject*	constexprrw   )r   r   r"   maprz   )ry   val_extracted_types     r   r   z&make_launcher.<locals>._extracted_type   sm    b%   	((33344Cs:::a5C<<;+++;}}r   c                    t          | t                    r)d                    t          |                     }d| dS | d         dk    rdS | dv rdS |                     d          rdS d	d
ddddddddd
t          |                    S )N ()r   r[   Or   r]   dlbhiLBHIK)
rr   longr`   rc   re   rg   ri   rl   rn   rp   )r   r   r"   r   rx   rz   )ry   r   	format_ofs     r   r   z make_launcher.<locals>.format_of   s    b%   	''#i,,--Cs:::a5C<<3+++3==&& 	3
 
 B-- 	r   c                     i | ]\  }}||	S r   r   r   r   ss      r   
<dictcomp>z!make_launcher.<locals>.<dictcomp>   s    >>>$!QA>>>r   r   c                 &    g | ]} |          S r   r   )r   ry   r   s     r   r   z!make_launcher.<locals>.<listcomp>   s!    FFFR99R==FFFr   c                     i | ]\  }}||	S r   r   r   s      r   r   z!make_launcher.<locals>.<dictcomp>   s    <<<$!QA<<<r   r   , c              3   &   K   | ]\  }}d | V  dS )z&_argNr   r   r   ry   s      r   r&   z make_launcher.<locals>.<genexpr>   s,       L LB L L L L L Lr   r   z argr[   ptr_infoz.dev_ptr_arg_storagerw   z*tma_ptrz
  c                 J    g | ] \  }}|d          dk    d| d| d| d| d	!S )r   r[   zDevicePtrInfo ptr_infoz = getPointer(_argr   z); if (!ptr_infoz.valid) return NULL;r   r   s      r   r   z!make_launcher.<locals>.<listcomp>   sU       Ara5C<< 	feeaee1eeaeee<<r   c           	      8    g | ]\  }}|d k    d| d| d| dS )rw   zCUtensorMap* tma_ptrz = getTmaDesc(_argz); if (!tma_ptrz) return NULL;r   r   s      r   r   z!make_launcher.<locals>.<listcomp>   sO       \a\]_a 	XqWWAWWaWWWr   c           
      n    g | ]2\  }}|t           v t           |          d | dt          |          d| d3S ) _argz_storage = z(_argz);)FLOAT_STORAGE_TYPEFLOAT_PACK_FUNCTIONr   s      r   r   z!make_launcher.<locals>.<listcomp>   s_       Ar### b!YYYY6I"6MYYTUYYY###r   c                 *    g | ]\  }}|d k    d| S )r   z&argr   r   s      r   r   z!make_launcher.<locals>.<listcomp>  s,    MMMUQ2;L;LjQjj;L;L;Lr   z&global_scratchz&profile_scratcha  
#include "cuda.h"
#include <dlfcn.h>
#include <stdbool.h>
#include <stdlib.h>
#define PY_SSIZE_T_CLEAN
#include <Python.h>

typedef struct {
  PyObject_HEAD;
  _Alignas(128) CUtensorMap tensorMap;
} PyCUtensorMapObject;

static inline void gpuAssert(CUresult code, const char *file, int line)
{
   if (code != CUDA_SUCCESS)
   {
      const char* prefix = "Triton Error [CUDA]: ";
      const char* str;
      cuGetErrorString(code, &str);
      char err[1024] = {0};
      strcat(err, prefix);
      strcat(err, str);
      PyGILState_STATE gil_state;
      gil_state = PyGILState_Ensure();
      PyErr_SetString(PyExc_RuntimeError, err);
      PyGILState_Release(gil_state);
   }
}

#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra);

static cuLaunchKernelEx_t getLaunchKernelExHandle() {
  // Open the shared library
  void* handle = dlopen("libcuda.so.1", RTLD_LAZY);
  if (!handle) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");
    return NULL;
  }
  // Clear any existing error
  dlerror();
  cuLaunchKernelEx_t cuLaunchKernelExHandle = (cuLaunchKernelEx_t)dlsym(handle, "cuLaunchKernelEx");
  // Check for errors
  const char *dlsym_error = dlerror();
  if (dlsym_error) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from libcuda.so.1");
    return NULL;
  }
  return cuLaunchKernelExHandle;
}

static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int launch_pdl, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction function, CUdeviceptr global_scratch, CUdeviceptr profile_scratchz) {
  void *params[] = { a   };
  if (gridX*gridY*gridZ > 0) {
    // 4 attributes that we can currently pass maximum
    CUlaunchAttribute launchAttr[4];
    static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
    if (cuLaunchKernelExHandle == NULL) {
      cuLaunchKernelExHandle = getLaunchKernelExHandle();
    }
    CUlaunchConfig config;
    config.gridDimX = gridX;
    config.gridDimY = gridY;
    config.gridDimZ = gridZ;

    if (num_ctas != 1) {
      config.gridDimX *= clusterDimX;
      config.gridDimY *= clusterDimY;
      config.gridDimZ *= clusterDimZ;
    }

    config.blockDimX = 32 * num_warps;
    config.blockDimY = 1;
    config.blockDimZ = 1;
    config.sharedMemBytes = shared_memory;
    config.hStream = stream;
    config.attrs = launchAttr;
    int num_attrs = 0;

    if (launch_pdl != 0) {
      CUlaunchAttribute pdlAttr = { .id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION, .value = 1};
      launchAttr[num_attrs] = pdlAttr;
      ++num_attrs;
    }

    if (launch_cooperative_grid != 0) {
      CUlaunchAttribute coopAttr = { .id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE, .value = 1};
      launchAttr[num_attrs] = coopAttr;
      ++num_attrs;
    }

    if (num_ctas != 1) {
      CUlaunchAttribute clusterAttr = {};
      clusterAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
      clusterAttr.value.clusterDim.x = clusterDimX;
      clusterAttr.value.clusterDim.y = clusterDimY;
      clusterAttr.value.clusterDim.z = clusterDimZ;
      launchAttr[num_attrs] = clusterAttr;
      ++num_attrs;

      CUlaunchAttribute clusterSchedulingAttr = {};
      clusterSchedulingAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
      clusterSchedulingAttr.value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
      launchAttr[num_attrs] = clusterSchedulingAttr;
      ++num_attrs;
    }

    config.numAttrs = num_attrs;

    CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
  }
}

typedef struct _DevicePtrInfo {
    CUdeviceptr dev_ptr;
    bool valid;
} DevicePtrInfo;

static PyObject* data_ptr_str = NULL;
static PyObject* py_tensor_map_type = NULL;

static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {
  DevicePtrInfo ptr_info;
  ptr_info.dev_ptr = 0;
  ptr_info.valid = true;
  if (PyLong_Check(obj)) {
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj);
    return ptr_info;
  }
  if (obj == Py_None) {
    // valid nullptr
    return ptr_info;
  }
  PyObject *ret = PyObject_CallMethodNoArgs(obj, data_ptr_str);
  if (!ret) {
    PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
    ptr_info.valid = false;
    goto cleanup;
  }
  if (!PyLong_Check(ret)) {
    PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
    ptr_info.valid = false;
    goto cleanup;
  }
  ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret);
  if(!ptr_info.dev_ptr)
    return ptr_info;
  uint64_t dev_ptr;
  int status = cuPointerGetAttribute(&dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
  if (status == CUDA_ERROR_INVALID_VALUE) {
      PyErr_Format(PyExc_ValueError,
                   "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
      ptr_info.valid = false;
  } else if (status != CUDA_SUCCESS) {
      CUDA_CHECK(status);  // Catch any other cuda API errors
      ptr_info.valid = false;
  }
  ptr_info.dev_ptr = dev_ptr;
cleanup:
  Py_XDECREF(ret);
  return ptr_info;

}

static inline CUtensorMap* getTmaDesc(PyObject *obj) {
  if (sizeof(CUtensorMap*) != 8) {
    PyErr_SetString(PyExc_SystemError, "getTmaDesc() requires 64-bit compilation");
    return NULL;
  }

if (Py_TYPE(obj) != (PyTypeObject*)py_tensor_map_type) {
    PyErr_Format(PyExc_TypeError, "object must be of type PyCUtensorMap, got %s", Py_TYPE(obj)->tp_name);
    return NULL;
}

  CUtensorMap* map = &((PyCUtensorMapObject*)obj)->tensorMap;
  uintptr_t align_128 = (uintptr_t)map & (128 - 1);
  if (align_128 != 0) {
    PyErr_Format(PyExc_ValueError, "CUtensorMap must be aligned to 128B, but got (&map) mod 128 = %ld", align_128);
    return NULL;
  }
  return map;
}

static void ensureCudaContext() {
  CUcontext pctx;
  CUDA_CHECK(cuCtxGetCurrent(&pctx));
  if (!pctx) {
    // Ensure device context.
    CUdevice device;
    CUDA_CHECK(cuDeviceGet(&device, 0));
    CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device));
    CUDA_CHECK(cuCtxSetCurrent(pctx));
  }
}

static uint16_t pack_fp16(double f) {
    uint16_t result;
    // from https://github.com/python/pythoncapi-compat
#if 0x030600B1 <= PY_VERSION_HEX && PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION)
    _PyFloat_Pack2(f, (unsigned char*)&result, 1);
#else
    PyFloat_Pack2(f, (unsigned char*)&result, 1);
#endif
    return result;
}

static uint16_t pack_bf16(double f) {
    float f32 = (float)f;
    uint32_t u32 = *(uint32_t*)&f32;
    return (uint16_t)(u32 >> 16);
}

static uint32_t pack_fp32(double f) {
    float f32 = (float)f;
    return *(uint32_t*)&f32;
}

static uint64_t pack_fp64(double f) {
    return *(uint64_t*)&f;
}

static PyObject* launch(PyObject* self, PyObject* args) {
  // ensure cuda context is valid before calling any CUDA APIs, e.g. before getPointer calls cuPointerGetAttributes
  ensureCudaContext();

  int gridX, gridY, gridZ;
  uint64_t _stream;
  uint64_t _function;
  int launch_cooperative_grid;
  int launch_pdl;
  PyObject *launch_enter_hook = NULL;
  PyObject *launch_exit_hook = NULL;
  PyObject *kernel_metadata = NULL;
  PyObject *launch_metadata = NULL;
  PyObject *global_scratch_obj = NULL;
  PyObject *profile_scratch_obj = NULL;
  c                 8    g | ]\  }} |           d | dS )r   ;r   )r   r   ry   r   s      r   r   z!make_launcher.<locals>.<listcomp>  s8    RRRuq"OOB''22a222RRRr   z
  if(!PyArg_ParseTuple(args, "aM  ", &gridX, &gridY, &gridZ,
                                           &_stream, &_function, &launch_cooperative_grid, &launch_pdl, &global_scratch_obj, &profile_scratch_obj,
                                           &kernel_metadata, &launch_metadata,
                                           &launch_enter_hook, &launch_exit_hookaT  )) {
    return NULL;
  }

  int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ;
  if (!PyArg_ParseTuple(kernel_metadata, "iiiiii", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {
    PyErr_SetString(PyExc_TypeError, "kernel_metadata must be a tuple");
    return NULL;
  }

  // extract launch metadata
  if (launch_enter_hook != Py_None){
    PyObject* ret = PyObject_CallOneArg(launch_enter_hook, launch_metadata);
    if (!ret)
      return NULL;
    Py_DECREF(ret);
  }

  CUdeviceptr global_scratch = 0;
  if (global_scratch_obj != Py_None) {
    DevicePtrInfo global_scratch_info = getPointer(global_scratch_obj, -1);
    if (!global_scratch_info.valid) {
      return NULL;
    }
    global_scratch = global_scratch_info.dev_ptr;
  }

  CUdeviceptr profile_scratch = 0;
  if (profile_scratch_obj != Py_None) {
    DevicePtrInfo profile_scratch_info = getPointer(profile_scratch_obj, -1);
    if (!profile_scratch_info.valid) {
      return NULL;
    }
    profile_scratch = profile_scratch_info.dev_ptr;
  }

  // raise exception asap
  z
  Py_BEGIN_ALLOW_THREADS;
  _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, launch_pdl, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (CUstream)_stream, (CUfunction)_function, global_scratch, profile_scratchap  );
  Py_END_ALLOW_THREADS;
  if (PyErr_Occurred()) {
    return NULL;
  }

  if(launch_exit_hook != Py_None){
    PyObject* ret = PyObject_CallOneArg(launch_exit_hook, launch_metadata);
    if (!ret)
      return NULL;
    Py_DECREF(ret);
  }

  Py_RETURN_NONE;
}

static PyMethodDef ModuleMethods[] = {
  {"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"},
  {NULL, NULL, 0, NULL} // sentinel
};

static struct PyModuleDef ModuleDef = {
  PyModuleDef_HEAD_INIT,
  "__triton_launcher",
  NULL, //documentation
  -1, //size
  ModuleMethods
};

PyMODINIT_FUNC PyInit___triton_launcher(void) {
  data_ptr_str = PyUnicode_InternFromString("data_ptr");
  if(data_ptr_str == NULL) {
    return NULL;
  }
  PyObject* driver_mod = PyImport_ImportModule("triton.backends.nvidia.driver");
  if (driver_mod == NULL) {
    return NULL;
  }
  py_tensor_map_type = PyObject_GetAttrString(driver_mod, "PyCUtensorMap");
  if (py_tensor_map_type == NULL) {
    return NULL;
  }

  PyObject *m = PyModule_Create(&ModuleDef);
  if(m == NULL) {
    return NULL;
  }
  PyModule_AddFunctions(m, ModuleMethods);
  return m;
}
)
values	enumerater"   _BASE_ARGS_FORMATr   itemsr   r   rz   r   )	constantsr   r   r   expand_signatureargs_formatformatflat_signaturer   	args_listarg_decl_listr   ry   	arg_declsinternal_args_listparamsnewline	ptr_decls	tma_declsfloat_storage_declsrG   r   r   r   s     `                  @@@r   make_launcherr      s   % % % % %N            . )()9)9););<<>>),<"="=>>>I''FFFF93C3C3E3EFFFGGK,FN!! 0 03////<<)N";";<<<IPST]P^P^abPbPbtyy L L)//:K:K L L LLLLLhjI M"" < <2###  $6r$:!C!C!C!CDDDD  IbMM!:!:q!:!:;;;;		-((I"" 	2 	22a5C<<%%&<&<&<&<====%%%%%&8Q&8&8&89999;%%nnn5555;%%jQjj1113y>>""F G __&&  I
 enetetevev  I __&&  
 NMioo&7&7MMMF
MM#$$$
MM$%%%Nj x{  |E  xF  xF  IJ  xJ  xJ  dh  kt  dt  dt  PRkN Nl yy((mN N^ <<RRRR	@Q@QRRRSS_N N` !'aN Nf R[gN Np <<	qN Nr <<	sN Nt <<#$$uN Nx BE  FX  BY  BY  \]  B]  B]  Z^  ae  aj  aj  k}  a~  a~  Z~  Z~  ceyN N NC^
 Jr   c              #      K   | ]}||fV  	d S r8   r   )r   r   s     r   r&   r&   [  s&      ::1A::::::r      
      	   c           
         |/| j         g| j        | j        | j        dk    | j        | j        S |d         }|d         }|d         }|d         }|d         }| j        }| j        }|d         dk    sJ | j        dk    rdnd	}	|rt	          |          }|dxx         d
z  cc<   t
          j        j        j        j	        
                    | j                                         ||t          |         ||||	          }
|
g||S )Nnanswizzle	elem_size	elem_type
block_size
fp4_paddedr   r   r   r   )baser   stridespaddinglisttritonruntimedriveractiveutilsrQ   data_ptrTMA_DTYPE_DEVICE_TO_HOST)argmetadatar   r   r   r   r   r   r   r   cu_tensor_maps              r   make_tensordesc_argr   a  s1    c39cs{cCK54Hc39cWZWbccy!G%I%I,'J,'JIEkG2;!;%''aaQG Ub			Q			N)06JJ +	 	M ,E,G,,r   c                 Z    t          d |                                D                       }|s S t          d t          |                                          D                       r"t	                    t	                    k    sJ sd gt	                    z   fd}|S )Nc              3   j   K   | ].}t          |t                    o|                    d           V  /dS )r]   Nr   r.   rx   )r   r   s     r   r&   z)wrap_handle_tensordesc.<locals>.<genexpr>  s>      rrX[jc22Ss~~l7S7Srrrrrrr   c                 l    g | ]1\  }}t          |t                    |                    d           /|2S )r]   r   )r   r   r   s      r   r   z*wrap_handle_tensordesc.<locals>.<listcomp>  s@    pppvq#*S#:N:NpSVSaSabnSoSoppppr   c                     t          | d t                             }d}t          | t          d                    D ]M\  }}|v r/|                    t	          ||                              |dz  }8|                    |           N | S )Nr   r   )r   _BASE_ARGS_FORMAT_LENr   extendr   r   )args
final_argsr   r   r   launchertensordesc_indicesr   s        r   innerz%wrap_handle_tensordesc.<locals>.inner  s    $5 55677
%:%;%; <== 	' 	'FAs&&&!!"5c?>;Z"["[\\\!#!!#&&&&x$$r   )r/   r   setr   r   )r  r   r   has_tensor_desc_argr  r  s   ` `  @r   wrap_handle_tensordescr    s    rr_h_o_o_q_qrrrrr pp9#3#3#5#566pppr rQ#o"6"6#>P:Q:Q"Q"Q"Q"Q ;&3'9#:#::	% 	% 	% 	% 	% 	% 	% Lr   c                       e Zd Zd Zd ZdS )CudaLauncherc                    t          d          rj        nt                      }fdfd|                                D             }d j                                        D             }t          |dd           }t          |||          t          dt                      t          t                    }t          j        t          j        |j        d          | _        t#          |j        ||          | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        d S )	Nr   c                 r    t          | t                    r j        j                            |           fn| S r8   )r   r.   fn	arg_namesindex)r   rG   s    r   <lambda>z'CudaLauncher.__init__.<locals>.<lambda>  s2    Z3=O=OVSV-33A6699UV r   c                 .    i | ]\  }} |          |S r   r   )r   idxvaluearg_idxs      r   r   z)CudaLauncher.__init__.<locals>.<dictcomp>  s'    MMMZS%WWS\\5MMMr   c                     i | ]\  }}||	S r   r   )r   r  r  s      r   r   z)CudaLauncher.__init__.<locals>.<dictcomp>  s    HHHJCS%HHHr   r   __triton_launcherrF   r   )r?   r   dictr   r   getattrr   r   r:   rI   rJ   	functoolsreduceoperatormulcluster_dimsnum_ctasr  launchglobal_scratch_sizeglobal_scratch_alignprofile_scratch_sizeprofile_scratch_alignlaunch_cooperative_grid
launch_pdl)rR   rG   r   r   r   r   rS   r  s    `     @r   rT   zCudaLauncher.__init__  s5   %,S+%>%>JCMMDFF	VVVVMMMM9??;L;LMMM	HH#-2E2E2G2GHHH	!(,=tDDIy/BB%$%%
 
 
 "(x7LaPP,SZOTT#+#? $,$A!$,$A!%-%C"'/'G$"-r   c                       fd} | j          j        t          j                  } | j         j        t          j                  }	  j        | j         j	        ||	g	|R   d S )Nc                     | dk    r6z  z  }|	j         z  | z  }|                                } |||
          S d S Nr   )r  get)sizealign	allocator	grid_size
alloc_sizealloc_fngridXgridYgridZrR   streams         r   allocate_scratchz/CudaLauncher.__call__.<locals>.allocate_scratch  sQ    axx!EME1	&6=
$==??x
E6:::4r   )
r  r  r   
_allocatorr   r!  _profile_allocatorr  r"  r#  )
rR   r.  r/  r0  r1  functionr   r2  global_scratchprofile_scratchs
   `````     r   __call__zCudaLauncher.__call__  s    	 	 	 	 	 	 	 	 	 *)$*BDD]_j_uvv**4+DdF`+6+IK KE5%4;WY]Yh"O	<6:	< 	< 	< 	< 	< 	<r   N)rU   rV   rW   rT   r8  r   r   r   r  r    s2        . . .0< < < < <r   r  c                   h     e Zd Z fdZd Zd Zd Zed             Zde	de	fdZ
d	 Zd
 Zd Z xZS )
CudaDriverc                     t                      | _        t          | _        t	                                                       d S r8   )r<   r   r  launcher_clsr@   rT   )rR   rC   s    r   rT   zCudaDriver.__init__  s2    [[
(r   c                     |                                  }|                     |          }|d         dz  |d         z   }d}t          d||          S )Nr   r   r       r   )get_current_deviceget_device_capabilityr   )rR   device
capability	warp_sizes       r   get_current_targetzCudaDriver.get_current_target  sT    ((**//77
]R'*Q-7
	Y777r   c                 Z    dd l }|                    d|                                           S )Nr   r   )torchrA  r?  rR   rF  s     r   get_active_torch_devicez"CudaDriver.get_active_torch_device  s+    ||FD$;$;$=$=>>>r   c                     dd l }|j        S r&  )rF  r   rG  s     r   get_device_interfacezCudaDriver.get_device_interface  s    zr   c                  |    	 dd l } | j                                        o| j        j        d u S # t
          $ r Y dS w xY w)Nr   F)rF  r   is_availableversionhipImportError)rF  s    r   	is_activezCudaDriver.is_active  sU    	LLL:**,,L%-2Ct2KL 	 	 	55	s   *- 
;;ry   returnc                      t          |          S r8   )rz   )rR   ry   s     r   map_python_to_cpp_typez!CudaDriver.map_python_to_cpp_type  s    }}r   c                     ddl m} |S )Nr   )do_bench)triton.testingrU  )rR   rU  s     r   get_benchmarkerzCudaDriver.get_benchmarker  s    ++++++r   c                 h    dd l }d}|                    t          |dz            |j        d          S )Nr   i      r   )r   rA  )rF  emptyint)rR   rF  
cache_sizes      r   get_empty_cache_for_benchmarkz(CudaDriver.get_empty_cache_for_benchmark  s8    
 '
{{3zQ//uy{PPPr   c                 .    |                                  d S r8   )zero_)rR   caches     r   clear_cachezCudaDriver.clear_cache  s    r   )rU   rV   rW   rT   rD  rH  rJ  staticmethodrP  r.   rS  rW  r]  ra  rX   rY   s   @r   r:  r:    s            
8 8 8? ? ?     \       Q Q Q      r   r:  )-r  r  r   r)   r   r   pathlibr   r   triton.runtime.buildr   triton.runtimer   triton.backends.compilerr   triton.backends.driverr   r   r   realpath__file__r"   rI   r9   rJ   rL   	lru_cacher6   r:   objectr<   rz   r   r   r   r   r   r   r  r   r   r   r  r  r:  r   r   r   <module>rl     s        				      				             8 8 8 8 8 8 & & & & & & . . . . . . , , , , , ,
'//"'**844
5
5Wi001We,,H	   . , , ,; ; ; ; ; ; ; ;:
 
 
4       $ -.. W W Wv  4::b		:::::         $- $- $-N  2(< (< (< (< (<6 (< (< (<V/ / / / / / / / / /r   