mirror of
https://github.com/nod-ai/AMD-SHARK-Studio.git
synced 2026-04-03 03:00:17 -04:00
Pass --iree-hal-cuda-llvm-target-arch for GPU execution automatically. (#66)
* Pass --iree-hal-cuda-llvm-target-arch for GPU execution automatically. Add IREE CUDA compute capability flag only if sm_70 or later. * Add Validate Models step to nightly workflow.
This commit is contained in:
7
.github/workflows/nightly.yml
vendored
7
.github/workflows/nightly.yml
vendored
@@ -52,6 +52,13 @@ jobs:
|
||||
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
|
||||
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
|
||||
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
|
||||
- name: Validate Models
|
||||
run: |
|
||||
cd $GITHUB_WORKSPACE
|
||||
./setup_venv.sh
|
||||
source shark.venv/bin/activate
|
||||
pytest
|
||||
|
||||
- name: Build the package
|
||||
run: |
|
||||
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"
|
||||
|
||||
71
shark/cuda_utils.py
Normal file
71
shark/cuda_utils.py
Normal file
@@ -0,0 +1,71 @@
|
||||
# Copyright 2020 The Nod Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
import ctypes
|
||||
|
||||
|
||||
#Some constants taken from cuda.h
|
||||
CUDA_SUCCESS = 0
|
||||
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
|
||||
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39
|
||||
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
|
||||
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36
|
||||
|
||||
def get_cuda_sm_cc():
|
||||
libnames = ('libcuda.so', 'libcuda.dylib', 'cuda.dll')
|
||||
for libname in libnames:
|
||||
try:
|
||||
cuda = ctypes.CDLL(libname)
|
||||
except OSError:
|
||||
continue
|
||||
else:
|
||||
break
|
||||
else:
|
||||
raise OSError("could not load any of: " + ' '.join(libnames))
|
||||
|
||||
nGpus = ctypes.c_int()
|
||||
name = b' ' * 100
|
||||
cc_major = ctypes.c_int()
|
||||
cc_minor = ctypes.c_int()
|
||||
|
||||
result = ctypes.c_int()
|
||||
device = ctypes.c_int()
|
||||
context = ctypes.c_void_p()
|
||||
error_str = ctypes.c_char_p()
|
||||
|
||||
result = cuda.cuInit(0)
|
||||
if result != CUDA_SUCCESS:
|
||||
cuda.cuGetErrorString(result, ctypes.byref(error_str))
|
||||
print("cuInit failed with error code %d: %s" % (result, error_str.value.decode()))
|
||||
return 1
|
||||
result = cuda.cuDeviceGetCount(ctypes.byref(nGpus))
|
||||
if result != CUDA_SUCCESS:
|
||||
cuda.cuGetErrorString(result, ctypes.byref(error_str))
|
||||
print("cuDeviceGetCount failed with error code %d: %s" % (result, error_str.value.decode()))
|
||||
return 1
|
||||
print("Found %d device(s)." % nGpus.value)
|
||||
for i in range(nGpus.value):
|
||||
result = cuda.cuDeviceGet(ctypes.byref(device),i)
|
||||
if result != CUDA_SUCCESS:
|
||||
cuda.cuGetErrorString(result, ctypes.byref(error_str))
|
||||
print("cuDeviceGet failed with error code %d: %s" % (result, error_str.value.decode()))
|
||||
return 1
|
||||
print("Device: %d" % i)
|
||||
if cuda.cuDeviceGetName(ctypes.c_char_p(name), len(name), device) == CUDA_SUCCESS:
|
||||
print(" Name: %s" % (name.split(b'\0', 1)[0].decode()))
|
||||
if cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), device) == CUDA_SUCCESS:
|
||||
print(" Compute Capability: %d.%d" % (cc_major.value, cc_minor.value))
|
||||
sm = f"sm_{cc_major.value}{cc_minor.value}"
|
||||
return sm
|
||||
@@ -17,6 +17,7 @@ import iree.runtime.scripts.iree_benchmark_module as benchmark_module
|
||||
import iree.compiler as ireec
|
||||
from iree.compiler import tf as tfc
|
||||
from shark.torch_mlir_utils import get_module_name_for_asm_dump
|
||||
from shark.cuda_utils import get_cuda_sm_cc
|
||||
import subprocess
|
||||
import numpy as np
|
||||
import os
|
||||
@@ -79,8 +80,14 @@ def get_iree_cpu_args():
|
||||
def get_iree_gpu_args():
|
||||
ireert.flags.FUNCTION_INPUT_VALIDATION = False
|
||||
ireert.flags.parse_flags("--cuda_allow_inline_execution")
|
||||
return ["--iree-hal-cuda-disable-loop-nounroll-wa"]
|
||||
|
||||
sm_arch = get_cuda_sm_cc()
|
||||
if sm_arch in ['sm_70', 'sm_72', 'sm_75', 'sm_80', 'sm_84', 'sm_86']:
|
||||
return [
|
||||
"--iree-hal-cuda-disable-loop-nounroll-wa",
|
||||
f"--iree-hal-cuda-llvm-target-arch={sm_arch}"
|
||||
]
|
||||
else:
|
||||
return ["--iree-hal-cuda-disable-loop-nounroll-wa"]
|
||||
|
||||
def get_iree_vulkan_args():
|
||||
return [
|
||||
@@ -174,7 +181,13 @@ def get_results(compiled_vm, input, config, frontend="torch"):
|
||||
device_inputs = input
|
||||
if frontend in ["torch", "pytorch"]:
|
||||
device_inputs = [ireert.asdevicearray(config.device, a) for a in input]
|
||||
|
||||
if frontend in ["tensorflow", "tf"]:
|
||||
device_inputs = []
|
||||
for a in input:
|
||||
if (isinstance(a, list)):
|
||||
device_inputs.append([ireert.asdevicearray(config.device, val, dtype=np.int32) for val in a])
|
||||
else:
|
||||
device_inputs.append(ireert.asdevicearray(config.device, a))
|
||||
result = compiled_vm(*device_inputs)
|
||||
result_tensors = []
|
||||
if (isinstance(result, tuple)):
|
||||
|
||||
Reference in New Issue
Block a user