HIP CI that compiles (to RDNA3) but doesn't have to run (#2482)

* hip amd compilation
* gate the test properly
* cleanup unused import
* remove superfluous numpy conversion
* add SpeedyResNet tests (f32 [passes] & f16 [fails])
* make CI verbose (error log from hip compiler)
* test the real ops_hip
* Merge branch 'tinygrad:master' into ci/hip-compilation
* fix CI
* cleanup
* really fix CI
* Fix CI Three: the refixening

---------

Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
2026-01-09 15:08:02 -05:00 · 2023-11-28 12:17:06 +07:00
parent 756b01f46f
commit 136dbd8b36
3 changed files with 88 additions and 6 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -276,6 +276,47 @@ jobs:
    - name: Run WEBGPU Efficientnet
      run: node test/test_webgpu.js

+  testhipcompilation:  # compile-only job: MOCKHIP=1 makes the HIP runtime build kernels for gfx1100 without launching them
+    name: HIP Compilation Tests
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+
+    steps:
+    - name: Checkout Code
+      uses: actions/checkout@v3
+    - name: Set up Python 3.11
+      uses: actions/setup-python@v4
+      with:
+        python-version: 3.11
+    - name: Cache python packages
+      uses: actions/cache@v3
+      with:
+        path: ${{ env.Python3_ROOT_DIR }}/lib/python3.11/site-packages
+        key: testing-packages-${{ hashFiles('**/setup.py') }}
+    - name: Cache downloads
+      uses: actions/cache@v3
+      with:
+        path: ~/.cache/tinygrad/downloads/
+        key: downloads-cache-hipcompilation-${{ env.DOWNLOAD_CACHE_VERSION }}
+    - name: Install HIP tools
+      run: |
+        wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
+        # ROCm repository for jammy; signed-by ties it to the key above, so packages are signature-checked on install
+        sudo tee /etc/apt/sources.list.d/rocm.list <<'EOF'
+        deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/debian jammy main
+        EOF
+        # Prefer packages from the rocm repository over system packages
+        echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
+        sudo apt-get update
+        sudo apt-get install -y rocm-hip-libraries hip-dev
+    - name: Install Python Dependencies
+      run: pip install -e '.[testing]'
+    - name: Test HIP compilation on RDNA3 [gfx1100]
+      run: |
+        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/hip/lib
+        MOCKHIP=1 HIP=1 python -m pytest -s test/test_hip_rdna3.py
+
+
  tests:
    strategy:
      fail-fast: false
--- a/test/test_hip_rdna3.py
+++ b/test/test_hip_rdna3.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+import unittest
+from tinygrad import Tensor, Device
+from tinygrad.helpers import dtypes
+from examples.beautiful_mnist import Model as MNIST
+from examples.hlb_cifar10 import SpeedyResNet
+
+@unittest.skipIf(Device.DEFAULT != "HIP", reason="testing HIP->rdna3 compilation needs HIP=1")
+class TestHIPCompilationRDNA(unittest.TestCase):
+  """Evaluate example models on the HIP device; CI runs this with MOCKHIP=1, where kernels are compiled but not run."""
+  def test_compile_hip_mnist(self):
+    """Evaluate the beautiful_mnist example model on a random (512,1,28,28) input."""
+    model = MNIST()
+    x = Tensor.rand(512,1,28,28)
+    model(x).numpy()
+
+  def test_compile_hip_speedyresnet(self):
+    """Evaluate the hlb_cifar10 SpeedyResNet on a random (512,3,32,32) input with the default dtype."""
+    W = Tensor.rand(12,3,2,2)
+    model = SpeedyResNet(W)
+
+    x = Tensor.rand(512, 3, 32, 32)
+    model(x).numpy()
+
+  @unittest.expectedFailure
+  def test_compile_hip_speedyresnet_hf(self):
+    """Same as the SpeedyResNet test but with float16 as the default dtype; currently expected to fail."""
+    # default_type is process-wide state: restore it afterwards (cleanups run even when the test fails) so it cannot leak
+    self.addCleanup(setattr, Tensor, "default_type", Tensor.default_type)
+    Tensor.default_type = dtypes.float16
+    W = Tensor.rand(12,3,2,2)
+    model = SpeedyResNet(W)
+    x = Tensor.rand(512, 3, 32, 32)
+    model(x).numpy()
+
+if __name__ == "__main__":
+  unittest.main()
--- a/tinygrad/runtime/ops_hip.py
+++ b/tinygrad/runtime/ops_hip.py
@@ -5,7 +5,7 @@ from typing import Tuple, List, Any, Dict, cast, Optional, Callable
 from tinygrad.helpers import DEBUG, getenv, diskcache
 from tinygrad.device import Compiled, CompiledASTRunner, update_stats
 from tinygrad.renderer.hip import HIPRenderer
-from tinygrad.runtime.lib import RawBufferCopyInOut, LRUAllocator, RawBufferTransfer, RawBuffer
+from tinygrad.runtime.lib import RawBufferCopyInOut, LRUAllocator, RawBufferTransfer, RawBuffer, RawMallocBuffer
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.shape.symbolic import Variable
 from tinygrad.jit import JitItem, get_input_replace, get_jit_stats, get_jc_idxs_with_updatable_launch_dims, get_jc_idxs_with_updatable_var_vals, GraphException
@@ -24,12 +24,14 @@ class HIPAllocator(LRUAllocator):
  def _do_free(self, buf): hip.hipFree(buf)
  def _cached_bufkey(self, size, dtype, device): return (device, size*dtype.itemsize) # Buffers of the same length could be reused, no matter what dtype.

+MOCKHIP = getenv("MOCKHIP") # for CI. don't run kernels, only check if they compile
+
 class _HIP:
  def __init__(self, device=None):
    self.default_device = device or getenv("HIP_DEFAULT_DEVICE")
-    hip.hipSetDevice(self.default_device)
-    self.device_count = hip.hipGetDeviceCount()
-    self.allocator = HIPAllocator(hip.hipGetDeviceProperties(self.default_device).totalGlobalMem)
+    self.device_count = 0 if MOCKHIP else hip.hipGetDeviceCount()  # MOCKHIP: every hip.* call in this constructor is skipped
+    if not MOCKHIP: hip.hipSetDevice(self.default_device)
+    self.allocator = None if MOCKHIP else HIPAllocator(hip.hipGetDeviceProperties(self.default_device).totalGlobalMem)
 HIP = _HIP()

 class RawHIPBuffer(RawBufferCopyInOut, RawBufferTransfer):
@@ -47,7 +49,8 @@ class RawHIPBuffer(RawBufferCopyInOut, RawBufferTransfer):
@diskcache
 def compile_hip(prg) -> bytes:
  prog = hip.hiprtcCreateProgram(prg, "<null>", [], [])
-  hip.hiprtcCompileProgram(prog, [f'--offload-arch={hip.hipGetDeviceProperties(HIP.default_device).gcnArchName}'])
+  arch = "gfx1100" if MOCKHIP else hip.hipGetDeviceProperties(HIP.default_device).gcnArchName  # MOCKHIP: don't query a device
+  hip.hiprtcCompileProgram(prog, [f'--offload-arch={arch}'])
  return hip.hiprtcGetCode(prog)

 def time_execution(cb, enable=False):
@@ -77,6 +80,7 @@ class HIPProgram:
      self.prgs.append(hip.hipModuleGetFunction(self.modules[-1], name))

  def __call__(self, *args, global_size:Tuple[int,int,int], local_size:Tuple[int,int,int], wait=False):
+    if MOCKHIP: return
    hip.hipSetDevice(args[0]._device)
    if self.c_struct_t is None: self.c_struct_t = hip.getCStructForType([(ctypes.c_void_p if not isinstance(x, int) else ctypes.c_int) for x in args])
    c_params = cast(Callable, self.c_struct_t)(*[x._buf if not isinstance(x, int) else x for x in args])
@@ -137,4 +141,4 @@ class HIPGraph:
    update_stats(f"<batched {len(self.jit_cache)}>", self.op_estimate, self.mem_estimate, var_vals, et, buf_count=len(input_rawbuffers), jit=jit, num_kernels=len(self.jit_cache))
    return et

-HIPDevice = Compiled(RawHIPBuffer, LinearizerOptions(device="HIP"), HIPRenderer, compile_hip, HIPProgram, hip.hipDeviceSynchronize, graph=HIPGraph)
+HIPDevice = Compiled(RawHIPBuffer if not MOCKHIP else RawMallocBuffer, LinearizerOptions(device="HIP"), HIPRenderer, compile_hip, HIPProgram, hip.hipDeviceSynchronize, graph=HIPGraph)