[RUNTIME] Make APIs compatible with CUDA 11 drivers (#2081)

https://github.com/openai/triton/issues/2042
This commit is contained in:
Keren Zhou
2023-08-11 20:46:56 -04:00
committed by GitHub
parent 421ce18988
commit 382e8fb1fa
2 changed files with 67 additions and 7 deletions

View File

@@ -250,6 +250,7 @@ def generate_launcher(constants, signature, ids):
#include \"cuda.h\"
#include <stdbool.h>
#include <Python.h>
#include <dlfcn.h>
static inline void gpuAssert(CUresult code, const char *file, int line)
{{
@@ -267,9 +268,30 @@ static inline void gpuAssert(CUresult code, const char *file, int line)
#define CUDA_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }}
typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra);
// Resolve cuLaunchKernelEx at runtime via dlopen/dlsym so the generated
// launcher still loads under CUDA 11 drivers, whose libcuda does not export
// this entry point (linking against it directly would fail at load time).
// Returns NULL with a Python RuntimeError set when the library or the symbol
// cannot be resolved; the caller must check the result before invoking it.
static cuLaunchKernelEx_t getLaunchKernelExHandle() {{
  // Open the shared library. Prefer the un-versioned name, but fall back to
  // the SONAME "libcuda.so.1": the plain "libcuda.so" symlink is only
  // installed by the CUDA toolkit, not by the driver alone.
  void* handle = dlopen("libcuda.so", RTLD_LAZY);
  if (!handle) {{
    handle = dlopen("libcuda.so.1", RTLD_LAZY);
  }}
  if (!handle) {{
    PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so");
    return NULL;
  }}
  // Clear any existing error
  dlerror();
  cuLaunchKernelEx_t cuLaunchKernelExHandle = (cuLaunchKernelEx_t)dlsym(handle, "cuLaunchKernelEx");
  // Check for errors
  const char *dlsym_error = dlerror();
  if (dlsym_error) {{
    // Drop the reference taken by dlopen; on success the handle is
    // intentionally kept open for the lifetime of the process.
    dlclose(handle);
    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from libcuda.so");
    return NULL;
  }}
  return cuLaunchKernelExHandle;
}}
static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction function{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
void *params[] = {{ {', '.join(f"&arg{i}" for i in params)} }};
if(gridX*gridY*gridZ > 0){{
if (gridX*gridY*gridZ > 0) {{
if (num_ctas == 1) {{
CUDA_CHECK(cuLaunchKernel(function, gridX, gridY, gridZ, 32*num_warps, 1, 1, shared_memory, stream, params, 0));
}} else {{
@@ -291,7 +313,11 @@ static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas
config.hStream = stream;
config.attrs = launchAttr;
config.numAttrs = 2;
CUDA_CHECK(cuLaunchKernelEx(&config, function, params, 0));
static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
if (cuLaunchKernelExHandle == NULL) {{
cuLaunchKernelExHandle = getLaunchKernelExHandle();
}}
CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
}}
}}
}}

View File

@@ -1,4 +1,5 @@
#include "cuda.h"
#include <dlfcn.h>
#define PY_SSIZE_T_CLEAN
#include <Python.h>
@@ -338,6 +339,36 @@ static cuuint32_t *list_to_cuuint32_array(PyObject *listObj) {
return array;
}
typedef CUresult (*cuTensorMapEncodeTiled_t)(
    CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType,
    cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim,
    const cuuint64_t *globalStrides, const cuuint32_t *boxDim,
    const cuuint32_t *elementStrides, CUtensorMapInterleave interleave,
    CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion,
    CUtensorMapFloatOOBfill oobFill);

// Resolve cuTensorMapEncodeTiled at runtime via dlopen/dlsym so this module
// still loads under CUDA 11 drivers, whose libcuda does not export the
// symbol (a direct link-time reference would make import fail entirely).
// Returns NULL with a Python RuntimeError set when the library or the symbol
// cannot be resolved; the caller must check the result before invoking it.
static cuTensorMapEncodeTiledHandle_t_helper_unused_guard;
static cuTensorMapEncodeTiled_t getCuTensorMapEncodeTiledHandle() {
  // Open the shared library. Prefer the un-versioned name, but fall back to
  // the SONAME "libcuda.so.1": the plain "libcuda.so" symlink is only
  // installed by the CUDA toolkit, not by the driver alone.
  void *handle = dlopen("libcuda.so", RTLD_LAZY);
  if (!handle) {
    handle = dlopen("libcuda.so.1", RTLD_LAZY);
  }
  if (!handle) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so");
    return NULL;
  }
  // Clear any stale dlerror state before probing the symbol.
  dlerror();
  cuTensorMapEncodeTiled_t cuTensorMapEncodeTiledHandle =
      (cuTensorMapEncodeTiled_t)dlsym(handle, "cuTensorMapEncodeTiled");
  // dlsym may legitimately return NULL; dlerror distinguishes failure.
  const char *dlsym_error = dlerror();
  if (dlsym_error) {
    // Drop the reference taken by dlopen; on success the handle is
    // intentionally kept open for the lifetime of the process.
    dlclose(handle);
    PyErr_SetString(
        PyExc_RuntimeError,
        "Failed to retrieve cuTensorMapEncodeTiled from libcuda.so");
    return NULL;
  }
  return cuTensorMapEncodeTiledHandle;
}
static PyObject *tensorMapEncodeTiled(PyObject *self, PyObject *args) {
CUtensorMap *tensorMap = (CUtensorMap *)malloc(sizeof(CUtensorMap));
CUtensorMapDataType tensorDataType;
@@ -364,18 +395,21 @@ static PyObject *tensorMapEncodeTiled(PyObject *self, PyObject *args) {
cuuint32_t *boxDim = list_to_cuuint32_array(boxDimObj);
cuuint32_t *elementStrides = list_to_cuuint32_array(elementStridesObj);
static cuTensorMapEncodeTiled_t cuTensorMapEncodeTiledHandle = NULL;
if (cuTensorMapEncodeTiledHandle == NULL) {
cuTensorMapEncodeTiledHandle = getCuTensorMapEncodeTiledHandle();
}
// Call the function
CUDA_CHECK(cuTensorMapEncodeTiled(tensorMap, tensorDataType, tensorRank,
globalAddress, globalDim, globalStrides,
boxDim, elementStrides, interleave, swizzle,
l2Promotion, oobFill));
CUDA_CHECK(cuTensorMapEncodeTiledHandle(
tensorMap, tensorDataType, tensorRank, globalAddress, globalDim,
globalStrides, boxDim, elementStrides, interleave, swizzle, l2Promotion,
oobFill));
// Clean up
free(globalDim);
free(globalStrides);
free(boxDim);
free(elementStrides);
// Return the tensor map as a normal pointer
return PyLong_FromUnsignedLongLong((unsigned long long)tensorMap);
}