Pass --iree-hal-cuda-llvm-target-arch for GPU execution automatically. (#66)

* Pass --iree-hal-cuda-llvm-target-arch for GPU execution automatically.

Add IREE CUDA compute capability flag only if sm_70 or later.

* Add Validate Models step to nightly workflow.
This commit is contained in:
Ean Garvey
2022-06-01 10:38:25 -05:00
committed by GitHub
parent f57730d2db
commit 87098e315f
3 changed files with 94 additions and 3 deletions

View File

@@ -52,6 +52,13 @@ jobs:
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Validate Models
run: |
cd $GITHUB_WORKSPACE
./setup_venv.sh
source shark.venv/bin/activate
pytest
- name: Build the package
run: |
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"

71
shark/cuda_utils.py Normal file
View File

@@ -0,0 +1,71 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import ctypes
# Constants taken from cuda.h (CUDA driver API).
# CUDA_SUCCESS is the CUresult success code returned by driver calls.
CUDA_SUCCESS = 0
# CUdevice_attribute enum values, usable with cuDeviceGetAttribute.
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36
def get_cuda_sm_cc():
    """Query the CUDA driver for the first device's compute capability.

    Loads the CUDA driver library via ctypes, initializes the driver,
    enumerates devices, and returns the compute capability of the first
    device as an IREE-style arch string.

    Returns:
        str: "sm_<major><minor>" (e.g. "sm_75") for the first device.
        int: 1 if any CUDA driver call fails (error is printed).

    Raises:
        OSError: if no CUDA driver library could be loaded.
    """
    libnames = ('libcuda.so', 'libcuda.dylib', 'cuda.dll')
    for libname in libnames:
        try:
            cuda = ctypes.CDLL(libname)
        except OSError:
            continue
        else:
            break
    else:
        raise OSError("could not load any of: " + ' '.join(libnames))

    nGpus = ctypes.c_int()
    # Writable buffer for cuDeviceGetName. The previous code passed an
    # immutable bytes object through ctypes.c_char_p and let the driver
    # write into it, which is undefined behavior in CPython.
    name = ctypes.create_string_buffer(100)
    cc_major = ctypes.c_int()
    cc_minor = ctypes.c_int()
    device = ctypes.c_int()
    error_str = ctypes.c_char_p()

    result = cuda.cuInit(0)
    if result != CUDA_SUCCESS:
        cuda.cuGetErrorString(result, ctypes.byref(error_str))
        print("cuInit failed with error code %d: %s" % (result, error_str.value.decode()))
        return 1
    result = cuda.cuDeviceGetCount(ctypes.byref(nGpus))
    if result != CUDA_SUCCESS:
        cuda.cuGetErrorString(result, ctypes.byref(error_str))
        print("cuDeviceGetCount failed with error code %d: %s" % (result, error_str.value.decode()))
        return 1
    print("Found %d device(s)." % nGpus.value)
    for i in range(nGpus.value):
        result = cuda.cuDeviceGet(ctypes.byref(device), i)
        if result != CUDA_SUCCESS:
            cuda.cuGetErrorString(result, ctypes.byref(error_str))
            print("cuDeviceGet failed with error code %d: %s" % (result, error_str.value.decode()))
            return 1
        print("Device: %d" % i)
        if cuda.cuDeviceGetName(name, len(name), device) == CUDA_SUCCESS:
            # .value yields the bytes up to the first NUL terminator.
            print(" Name: %s" % name.value.decode())
        if cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), device) == CUDA_SUCCESS:
            print(" Compute Capability: %d.%d" % (cc_major.value, cc_minor.value))
        # Return after the first device: only one arch string is needed.
        return f"sm_{cc_major.value}{cc_minor.value}"

View File

@@ -17,6 +17,7 @@ import iree.runtime.scripts.iree_benchmark_module as benchmark_module
import iree.compiler as ireec
from iree.compiler import tf as tfc
from shark.torch_mlir_utils import get_module_name_for_asm_dump
from shark.cuda_utils import get_cuda_sm_cc
import subprocess
import numpy as np
import os
@@ -79,8 +80,14 @@ def get_iree_cpu_args():
def get_iree_gpu_args():
    """Return IREE compiler flags for the CUDA backend.

    Side effects: disables IREE runtime function-input validation and
    enables inline CUDA execution via runtime flags.

    Returns:
        list[str]: compiler flags. An explicit
        --iree-hal-cuda-llvm-target-arch=<sm> is added only when the
        detected compute capability is in the supported sm_70+ list;
        otherwise IREE's default target arch is used.
    """
    ireert.flags.FUNCTION_INPUT_VALIDATION = False
    ireert.flags.parse_flags("--cuda_allow_inline_execution")
    # The rendered diff left the pre-change `return` above the arch
    # detection, making it dead code; the arch selection below is the
    # intended behavior (pass the arch flag only for sm_70 or later).
    sm_arch = get_cuda_sm_cc()
    # NOTE(review): "sm_84" is not a real CUDA compute capability
    # (sm_86/sm_87 are) — confirm whether "sm_87" was intended.
    if sm_arch in ['sm_70', 'sm_72', 'sm_75', 'sm_80', 'sm_84', 'sm_86']:
        return [
            "--iree-hal-cuda-disable-loop-nounroll-wa",
            f"--iree-hal-cuda-llvm-target-arch={sm_arch}",
        ]
    else:
        return ["--iree-hal-cuda-disable-loop-nounroll-wa"]
def get_iree_vulkan_args():
return [
@@ -174,7 +181,13 @@ def get_results(compiled_vm, input, config, frontend="torch"):
device_inputs = input
if frontend in ["torch", "pytorch"]:
device_inputs = [ireert.asdevicearray(config.device, a) for a in input]
if frontend in ["tensorflow", "tf"]:
device_inputs = []
for a in input:
if (isinstance(a, list)):
device_inputs.append([ireert.asdevicearray(config.device, val, dtype=np.int32) for val in a])
else:
device_inputs.append(ireert.asdevicearray(config.device, a))
result = compiled_vm(*device_inputs)
result_tensors = []
if (isinstance(result, tuple)):