HIP compilation on CI targeting RDNA3 (#2459)

* hip amd compilation

* gate the test properly

* cleanup unused import

* remove superfluous numpy conversion

* add SpeedyResNet tests (f32 [passes] & f16 [fails])

* make CI verbose (error log from hip compiler)

* test the real ops_hip

* Merge branch 'tinygrad:master' into ci/hip-compilation

* fix CI

* cleanup

* really fix CI
This commit is contained in:
Davi Silva
2023-11-28 11:33:11 +07:00
committed by GitHub
parent d43485ae9e
commit d275ff930a
3 changed files with 87 additions and 7 deletions

View File

@@ -276,6 +276,47 @@ jobs:
- name: Run WEBGPU Efficientnet
run: node test/test_webgpu.js
testhipcompilation:
  # Compile-only smoke test for the HIP backend targeting RDNA3 (gfx1100).
  # With CI set, ops_hip pins the offload arch to gfx1100 and returns before
  # launching kernels, so only the HIP toolchain needs to be installed.
  name: HIP Compilation Tests
  runs-on: ubuntu-latest
  timeout-minutes: 20
  steps:
  - name: Checkout Code
    uses: actions/checkout@v3
  - name: Set up Python 3.11
    uses: actions/setup-python@v4
    with:
      python-version: 3.11
  - name: Cache python packages
    uses: actions/cache@v3
    with:
      path: ${{ env.Python3_ROOT_DIR }}/lib/python3.11/site-packages
      key: testing-packages-${{ hashFiles('**/setup.py') }}
  - name: Cache downloads
    uses: actions/cache@v3
    with:
      path: ~/.cache/tinygrad/downloads/
      key: downloads-cache-hipcompilation-${{ env.DOWNLOAD_CACHE_VERSION }}
  - name: Install HIP tools
    run: |
      # Fail the step if any stage of a pipeline fails (e.g. the key download),
      # instead of silently writing an empty keyring.
      set -o pipefail
      # Make sure the keyring directory exists before writing the key into it.
      sudo mkdir -p -m 0755 /etc/apt/keyrings
      wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
      # ROCm repository for jammy
      sudo tee /etc/apt/sources.list.d/rocm.list <<'EOF'
      deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/debian jammy main
      EOF
      # Prefer packages from the rocm repository over system packages
      echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
      sudo apt update
      # The repository is authenticated through the signed-by keyring above,
      # so packages are installed with signature verification left enabled.
      sudo apt install -y rocm-hip-libraries hip-dev
  - name: Install Python Dependencies
    run: pip install -e '.[testing]'
  - name: Test HIP compilation on RDNA3 [gfx1100]
    run: |
      export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/hip/lib
      # HIP=1 selects the HIP backend. The test class is skipped unless
      # Device.DEFAULT is "HIP" (its skip reason asks for HIP=1), so without
      # this flag the job would pass without compiling anything.
      CI=1 HIP=1 python -m pytest -s test/test_hip_rdna3.py
tests:
strategy:
fail-fast: false

37
test/test_hip_rdna3.py Normal file
View File

@@ -0,0 +1,37 @@
#!/usr/bin/env python
import unittest
from tinygrad import Tensor, Device
from tinygrad.helpers import dtypes
from examples.beautiful_mnist import Model as MNIST
from examples.hlb_cifar10 import SpeedyResNet
@unittest.skipIf(Device.DEFAULT != "HIP", reason="testing HIP->rdna3 compilation needs HIP=1")
class TestHIPCompilationRDNA(unittest.TestCase):
  """Run example models through the HIP backend to exercise kernel compilation.

  Each test builds a model, runs one forward pass on a random batch and
  realizes the result with `.numpy()`. The whole class is skipped unless the
  default device is HIP.
  """

  def test_compile_hip_mnist(self):
    """Forward pass of the beautiful_mnist model on a 512x1x28x28 batch."""
    model = MNIST()
    batch = Tensor.rand(512,1,28,28)
    model(batch).numpy()

  def test_compile_hip_speedyresnet(self):
    """Forward pass of SpeedyResNet on a 512x3x32x32 batch with the default dtype."""
    W = Tensor.rand(12,3,2,2)
    model = SpeedyResNet(W)
    batch = Tensor.rand(512, 3, 32, 32)
    model(batch).numpy()

  @unittest.expectedFailure
  def test_compile_hip_speedyresnet_hf(self):
    """Same as the SpeedyResNet test but with float16 as the default dtype.

    Marked as an expected failure. The global default dtype is restored
    afterwards so the float16 setting cannot leak into other tests,
    whether this test fails or not.
    """
    previous_type = Tensor.default_type
    Tensor.default_type = dtypes.float16
    try:
      W = Tensor.rand(12,3,2,2)
      model = SpeedyResNet(W)
      batch = Tensor.rand(512, 3, 32, 32)
      model(batch).numpy()
    finally:
      # Undo the global mutation even when the forward pass raises.
      Tensor.default_type = previous_type
# Allow running this test file directly as a script, outside of pytest.
if __name__ == "__main__":
  unittest.main()

View File

@@ -2,10 +2,10 @@ import numpy as np
import ctypes
import extra.hip_wrapper as hip
from typing import Tuple, List, Any, Dict, cast, Optional, Callable
from tinygrad.helpers import DEBUG, getenv, diskcache
from tinygrad.helpers import CI, DEBUG, getenv, diskcache
from tinygrad.device import Compiled, CompiledASTRunner, update_stats
from tinygrad.renderer.hip import HIPRenderer
from tinygrad.runtime.lib import RawBufferCopyInOut, LRUAllocator, RawBufferTransfer, RawBuffer
from tinygrad.runtime.lib import RawBufferCopyInOut, LRUAllocator, RawBufferTransfer, RawBuffer, RawMallocBuffer
from tinygrad.codegen.kernel import LinearizerOptions
from tinygrad.shape.symbolic import Variable
from tinygrad.jit import JitItem, get_input_replace, get_jit_stats, get_jc_idxs_with_updatable_launch_dims, get_jc_idxs_with_updatable_var_vals, GraphException
@@ -27,9 +27,9 @@ class HIPAllocator(LRUAllocator):
class _HIP:
  """Holds the default HIP device, the device count and the allocator.

  When CI is set no HIP runtime calls are made: the device count is 0, the
  device is not selected and no allocator is created.
  """
  def __init__(self, device=None):
    # Fall back to the HIP_DEFAULT_DEVICE environment setting when no device is given.
    self.default_device = device or getenv("HIP_DEFAULT_DEVICE")
    # Every runtime query below is guarded by CI; the earlier unguarded calls
    # (hipSetDevice / hipGetDeviceCount / hipGetDeviceProperties) are dropped
    # because they would run even when CI is set.
    self.device_count = 0 if CI else hip.hipGetDeviceCount()
    if not CI: hip.hipSetDevice(self.default_device)
    self.allocator = None if CI else HIPAllocator(hip.hipGetDeviceProperties(self.default_device).totalGlobalMem)
HIP = _HIP()
class RawHIPBuffer(RawBufferCopyInOut, RawBufferTransfer):
@@ -47,7 +47,8 @@ class RawHIPBuffer(RawBufferCopyInOut, RawBufferTransfer):
@diskcache
def compile_hip(prg) -> bytes:
  """Compile HIP source `prg` with hiprtc and return the result of hiprtcGetCode.

  The offload architecture is "gfx1100" when CI is set; otherwise it is the
  gcnArchName reported for HIP.default_device.
  """
  # Resolve the target first so device properties are never queried when CI is set,
  # and the program is compiled exactly once for that target.
  arch = "gfx1100" if CI else hip.hipGetDeviceProperties(HIP.default_device).gcnArchName
  prog = hip.hiprtcCreateProgram(prg, "<null>", [], [])
  hip.hiprtcCompileProgram(prog, [f'--offload-arch={arch}'])
  return hip.hiprtcGetCode(prog)
def time_execution(cb, enable=False):
@@ -77,6 +78,7 @@ class HIPProgram:
self.prgs.append(hip.hipModuleGetFunction(self.modules[-1], name))
def __call__(self, *args, global_size:Tuple[int,int,int], local_size:Tuple[int,int,int], wait=False):
if CI: return
hip.hipSetDevice(args[0]._device)
if self.c_struct_t is None: self.c_struct_t = hip.getCStructForType([(ctypes.c_void_p if not isinstance(x, int) else ctypes.c_int) for x in args])
c_params = cast(Callable, self.c_struct_t)(*[x._buf if not isinstance(x, int) else x for x in args])
@@ -137,4 +139,4 @@ class HIPGraph:
update_stats(f"<batched {len(self.jit_cache)}>", self.op_estimate, self.mem_estimate, var_vals, et, buf_count=len(input_rawbuffers), jit=jit, num_kernels=len(self.jit_cache))
return et
# When CI is set, buffers are backed by RawMallocBuffer instead of RawHIPBuffer.
# Single assignment: the earlier unconditional RawHIPBuffer binding was dead code.
HIPBuffer = Compiled(RawHIPBuffer if not CI else RawMallocBuffer, LinearizerOptions(device="HIP"), HIPRenderer, compile_hip, HIPProgram, hip.hipDeviceSynchronize, graph=HIPGraph)