Pass --iree-hal-cuda-llvm-target-arch for GPU execution automatically. (#66)

* Pass --iree-hal-cuda-llvm-target-arch for GPU execution automatically.

Add IREE CUDA compute capability flag only if sm_70 or later.

* Add Validate Models step to nightly workflow.
This commit is contained in:
Ean Garvey
2022-06-01 10:38:25 -05:00
committed by GitHub
parent f57730d2db
commit 87098e315f
3 changed files with 94 additions and 3 deletions

View File

@@ -52,6 +52,13 @@ jobs:
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Validate Models
run: |
cd $GITHUB_WORKSPACE
./setup_venv.sh
source shark.venv/bin/activate
pytest
- name: Build the package
run: |
package_version="$(printf '%(%Y%m%d)T.${{ github.run_number }}')"

71
shark/cuda_utils.py Normal file
View File

@@ -0,0 +1,71 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import ctypes
# Constants taken from cuda.h (CUDA driver API).
# CUDA_SUCCESS is the CUresult success code returned by driver calls.
CUDA_SUCCESS = 0
# CUdevice_attribute enum values, usable with cuDeviceGetAttribute.
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36
def get_cuda_sm_cc():
    """Query the CUDA driver for the first device's compute capability.

    Loads the CUDA driver library via ctypes, initializes the driver,
    enumerates devices, and returns the compute capability of the first
    device as an IREE-style arch string.

    Returns:
        str: "sm_<major><minor>" (e.g. "sm_75") for the first device.
        int: 1 if any CUDA driver call fails (error is printed).

    Raises:
        OSError: if no CUDA driver library could be loaded.
    """
    libnames = ('libcuda.so', 'libcuda.dylib', 'cuda.dll')
    for libname in libnames:
        try:
            cuda = ctypes.CDLL(libname)
        except OSError:
            continue
        else:
            break
    else:
        raise OSError("could not load any of: " + ' '.join(libnames))

    nGpus = ctypes.c_int()
    # Writable buffer for cuDeviceGetName. The previous code passed an
    # immutable bytes object through ctypes.c_char_p and let the driver
    # write into it, which is undefined behavior in CPython.
    name = ctypes.create_string_buffer(100)
    cc_major = ctypes.c_int()
    cc_minor = ctypes.c_int()
    device = ctypes.c_int()
    error_str = ctypes.c_char_p()

    result = cuda.cuInit(0)
    if result != CUDA_SUCCESS:
        cuda.cuGetErrorString(result, ctypes.byref(error_str))
        print("cuInit failed with error code %d: %s" % (result, error_str.value.decode()))
        return 1
    result = cuda.cuDeviceGetCount(ctypes.byref(nGpus))
    if result != CUDA_SUCCESS:
        cuda.cuGetErrorString(result, ctypes.byref(error_str))
        print("cuDeviceGetCount failed with error code %d: %s" % (result, error_str.value.decode()))
        return 1
    print("Found %d device(s)." % nGpus.value)
    for i in range(nGpus.value):
        result = cuda.cuDeviceGet(ctypes.byref(device), i)
        if result != CUDA_SUCCESS:
            cuda.cuGetErrorString(result, ctypes.byref(error_str))
            print("cuDeviceGet failed with error code %d: %s" % (result, error_str.value.decode()))
            return 1
        print("Device: %d" % i)
        if cuda.cuDeviceGetName(name, len(name), device) == CUDA_SUCCESS:
            # .value yields the bytes up to the first NUL terminator.
            print(" Name: %s" % name.value.decode())
        if cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), device) == CUDA_SUCCESS:
            print(" Compute Capability: %d.%d" % (cc_major.value, cc_minor.value))
        # Return after the first device: only one arch string is needed.
        return f"sm_{cc_major.value}{cc_minor.value}"

View File

@@ -17,6 +17,7 @@ import iree.runtime.scripts.iree_benchmark_module as benchmark_module
import iree.compiler as ireec
from iree.compiler import tf as tfc
from shark.torch_mlir_utils import get_module_name_for_asm_dump
from shark.cuda_utils import get_cuda_sm_cc
import subprocess
import numpy as np
import os
@@ -79,8 +80,14 @@ def get_iree_cpu_args():
def get_iree_gpu_args():
    """Return IREE compiler flags for the CUDA backend.

    Side effects: disables IREE runtime function-input validation and
    enables inline CUDA execution via runtime flags.

    Returns:
        list[str]: compiler flags. An explicit
        --iree-hal-cuda-llvm-target-arch=<sm> is added only when the
        detected compute capability is in the supported sm_70+ list;
        otherwise IREE's default target arch is used.
    """
    ireert.flags.FUNCTION_INPUT_VALIDATION = False
    ireert.flags.parse_flags("--cuda_allow_inline_execution")
    # The rendered diff left the pre-change `return` above the arch
    # detection, making it dead code; the arch selection below is the
    # intended behavior (pass the arch flag only for sm_70 or later).
    sm_arch = get_cuda_sm_cc()
    # NOTE(review): "sm_84" is not a real CUDA compute capability
    # (sm_86/sm_87 are) — confirm whether "sm_87" was intended.
    if sm_arch in ['sm_70', 'sm_72', 'sm_75', 'sm_80', 'sm_84', 'sm_86']:
        return [
            "--iree-hal-cuda-disable-loop-nounroll-wa",
            f"--iree-hal-cuda-llvm-target-arch={sm_arch}",
        ]
    else:
        return ["--iree-hal-cuda-disable-loop-nounroll-wa"]
def get_iree_vulkan_args():
return [
@@ -174,7 +181,13 @@ def get_results(compiled_vm, input, config, frontend="torch"):
device_inputs = input
if frontend in ["torch", "pytorch"]:
device_inputs = [ireert.asdevicearray(config.device, a) for a in input]
if frontend in ["tensorflow", "tf"]:
device_inputs = []
for a in input:
if (isinstance(a, list)):
device_inputs.append([ireert.asdevicearray(config.device, val, dtype=np.int32) for val in a])
else:
device_inputs.append(ireert.asdevicearray(config.device, a))
result = compiled_vm(*device_inputs)
result_tensors = []
if (isinstance(result, tuple)):