[RUNTIME] Make APIs compatible with CUDA 11 drivers (#2081)

https://github.com/openai/triton/issues/2042
This commit is contained in:
Keren Zhou
2023-08-11 20:46:56 -04:00
committed by GitHub
parent 421ce18988
commit 382e8fb1fa
2 changed files with 67 additions and 7 deletions

View File

@@ -250,6 +250,7 @@ def generate_launcher(constants, signature, ids):
#include \"cuda.h\"
#include <stdbool.h>
#include <Python.h>
#include <dlfcn.h>
static inline void gpuAssert(CUresult code, const char *file, int line)
{{
@@ -267,9 +268,30 @@ static inline void gpuAssert(CUresult code, const char *file, int line)
#define CUDA_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }}
typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra);
// Resolve cuLaunchKernelEx at runtime via dlopen/dlsym so the generated
// launcher still loads under CUDA 11 drivers, whose libcuda does not export
// this entry point (linking against it directly would fail at load time).
// Returns NULL with a Python RuntimeError set when the library or the symbol
// cannot be resolved; the caller must check the result before invoking it.
static cuLaunchKernelEx_t getLaunchKernelExHandle() {{
  // Open the shared library. Prefer the un-versioned name, but fall back to
  // the SONAME "libcuda.so.1": the plain "libcuda.so" symlink is only
  // installed by the CUDA toolkit, not by the driver alone.
  void* handle = dlopen("libcuda.so", RTLD_LAZY);
  if (!handle) {{
    handle = dlopen("libcuda.so.1", RTLD_LAZY);
  }}
  if (!handle) {{
    PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so");
    return NULL;
  }}
  // Clear any existing error
  dlerror();
  cuLaunchKernelEx_t cuLaunchKernelExHandle = (cuLaunchKernelEx_t)dlsym(handle, "cuLaunchKernelEx");
  // Check for errors
  const char *dlsym_error = dlerror();
  if (dlsym_error) {{
    // Drop the reference taken by dlopen; on success the handle is
    // intentionally kept open for the lifetime of the process.
    dlclose(handle);
    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from libcuda.so");
    return NULL;
  }}
  return cuLaunchKernelExHandle;
}}
static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction function{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
void *params[] = {{ {', '.join(f"&arg{i}" for i in params)} }};
if(gridX*gridY*gridZ > 0){{
if (gridX*gridY*gridZ > 0) {{
if (num_ctas == 1) {{
CUDA_CHECK(cuLaunchKernel(function, gridX, gridY, gridZ, 32*num_warps, 1, 1, shared_memory, stream, params, 0));
}} else {{
@@ -291,7 +313,11 @@ static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas
config.hStream = stream;
config.attrs = launchAttr;
config.numAttrs = 2;
CUDA_CHECK(cuLaunchKernelEx(&config, function, params, 0));
static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
if (cuLaunchKernelExHandle == NULL) {{
cuLaunchKernelExHandle = getLaunchKernelExHandle();
}}
CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
}}
}}
}}

View File

@@ -1,4 +1,5 @@
#include "cuda.h"
#include <dlfcn.h>
#define PY_SSIZE_T_CLEAN
#include <Python.h>
@@ -338,6 +339,36 @@ static cuuint32_t *list_to_cuuint32_array(PyObject *listObj) {
return array;
}
typedef CUresult (*cuTensorMapEncodeTiled_t)(
    CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType,
    cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim,
    const cuuint64_t *globalStrides, const cuuint32_t *boxDim,
    const cuuint32_t *elementStrides, CUtensorMapInterleave interleave,
    CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion,
    CUtensorMapFloatOOBfill oobFill);

// Resolve cuTensorMapEncodeTiled at runtime via dlopen/dlsym so this module
// still loads under CUDA 11 drivers, whose libcuda does not export the
// symbol (a direct link-time reference would make import fail entirely).
// Returns NULL with a Python RuntimeError set when the library or the symbol
// cannot be resolved; the caller must check the result before invoking it.
static cuTensorMapEncodeTiledHandle_t_helper_unused_guard;
static cuTensorMapEncodeTiled_t getCuTensorMapEncodeTiledHandle() {
  // Open the shared library. Prefer the un-versioned name, but fall back to
  // the SONAME "libcuda.so.1": the plain "libcuda.so" symlink is only
  // installed by the CUDA toolkit, not by the driver alone.
  void *handle = dlopen("libcuda.so", RTLD_LAZY);
  if (!handle) {
    handle = dlopen("libcuda.so.1", RTLD_LAZY);
  }
  if (!handle) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so");
    return NULL;
  }
  // Clear any stale dlerror state before probing the symbol.
  dlerror();
  cuTensorMapEncodeTiled_t cuTensorMapEncodeTiledHandle =
      (cuTensorMapEncodeTiled_t)dlsym(handle, "cuTensorMapEncodeTiled");
  // dlsym may legitimately return NULL; dlerror distinguishes failure.
  const char *dlsym_error = dlerror();
  if (dlsym_error) {
    // Drop the reference taken by dlopen; on success the handle is
    // intentionally kept open for the lifetime of the process.
    dlclose(handle);
    PyErr_SetString(
        PyExc_RuntimeError,
        "Failed to retrieve cuTensorMapEncodeTiled from libcuda.so");
    return NULL;
  }
  return cuTensorMapEncodeTiledHandle;
}
static PyObject *tensorMapEncodeTiled(PyObject *self, PyObject *args) {
CUtensorMap *tensorMap = (CUtensorMap *)malloc(sizeof(CUtensorMap));
CUtensorMapDataType tensorDataType;
@@ -364,18 +395,21 @@ static PyObject *tensorMapEncodeTiled(PyObject *self, PyObject *args) {
cuuint32_t *boxDim = list_to_cuuint32_array(boxDimObj);
cuuint32_t *elementStrides = list_to_cuuint32_array(elementStridesObj);
static cuTensorMapEncodeTiled_t cuTensorMapEncodeTiledHandle = NULL;
if (cuTensorMapEncodeTiledHandle == NULL) {
cuTensorMapEncodeTiledHandle = getCuTensorMapEncodeTiledHandle();
}
// Call the function
CUDA_CHECK(cuTensorMapEncodeTiled(tensorMap, tensorDataType, tensorRank,
globalAddress, globalDim, globalStrides,
boxDim, elementStrides, interleave, swizzle,
l2Promotion, oobFill));
CUDA_CHECK(cuTensorMapEncodeTiledHandle(
tensorMap, tensorDataType, tensorRank, globalAddress, globalDim,
globalStrides, boxDim, elementStrides, interleave, swizzle, l2Promotion,
oobFill));
// Clean up
free(globalDim);
free(globalStrides);
free(boxDim);
free(elementStrides);
// Return the tensor map as a normal pointer
return PyLong_FromUnsignedLongLong((unsigned long long)tensorMap);
}