HIP CI that compiles (to RDNA3) but doesn't have to run (#2482)

* hip amd compilation

* gate the test properly

* cleanup unused import

* remove superfluous numpy conversion

* add SpeedyResNet tests (f32 [passes] & f16 [fails])

* make CI verbose (error log from hip compiler)

* test the real ops_hip

* Merge branch 'tinygrad:master' into ci/hip-compilation

* fix CI

* cleanup

* really fix CI

* Fix CI Three: the refixening

---------

Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
This commit is contained in:
Davi Silva
2023-11-28 12:17:06 +07:00
committed by GitHub
parent 756b01f46f
commit 136dbd8b36
3 changed files with 88 additions and 6 deletions

View File

@@ -276,6 +276,47 @@ jobs:
- name: Run WEBGPU Efficientnet
run: node test/test_webgpu.js
# CI job: checks that tinygrad kernels compile with the HIP toolchain for RDNA3 (gfx1100).
# The final step sets MOCKHIP=1, which makes the HIP backend compile kernels without launching them.
testhipcompilation:
name: HIP Compilation Tests
runs-on: ubuntu-latest
timeout-minutes: 20
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.11
# Reuse installed site-packages between runs; the cache key changes whenever any setup.py changes.
- name: Cache python packages
uses: actions/cache@v3
with:
path: ${{ env.Python3_ROOT_DIR }}/lib/python3.11/site-packages
key: testing-packages-${{ hashFiles('**/setup.py') }}
# Separate cache for files under ~/.cache/tinygrad/downloads/, versioned by DOWNLOAD_CACHE_VERSION.
- name: Cache downloads
uses: actions/cache@v3
with:
path: ~/.cache/tinygrad/downloads/
key: downloads-cache-hipcompilation-${{ env.DOWNLOAD_CACHE_VERSION }}
# Registers AMD's ROCm apt repository (signing key, source list, pin priority) and installs the HIP packages.
# NOTE(review): `--allow-unauthenticated` skips apt signature verification even though the repository key
# is installed and referenced via `signed-by` in this same step -- confirm whether the flag is still needed.
- name: Install HIP tools
run: |
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
# ROCm repository for jammy
sudo tee /etc/apt/sources.list.d/rocm.list <<'EOF'
deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/debian jammy main
EOF
# Prefer packages from the rocm repository over system packages
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
sudo apt update
sudo apt install --allow-unauthenticated -y rocm-hip-libraries hip-dev
- name: Install Python Dependencies
run: pip install -e '.[testing]'
# HIP=1 selects the HIP backend and MOCKHIP=1 skips kernel execution; `-s` turns off pytest output capture.
# /opt/rocm/hip/lib is appended to LD_LIBRARY_PATH, presumably so the HIP shared libraries can be found at import time.
- name: Test HIP compilation on RDNA3 [gfx1100]
run: |
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/hip/lib
MOCKHIP=1 HIP=1 python -m pytest -s test/test_hip_rdna3.py
tests:
strategy:
fail-fast: false

37
test/test_hip_rdna3.py Normal file
View File

@@ -0,0 +1,37 @@
#!/usr/bin/env python
import unittest
from tinygrad import Tensor, Device
from tinygrad.helpers import dtypes
from examples.beautiful_mnist import Model as MNIST
from examples.hlb_cifar10 import SpeedyResNet
@unittest.skipIf(Device.DEFAULT != "HIP", reason="testing HIP->rdna3 compilation needs HIP=1")
class TestHIPCompilationRDNA(unittest.TestCase):
  """Smoke tests that push small example models through the HIP backend.

  Nothing is asserted about the numerical results: each test passes as long as
  a forward pass completes without raising. The whole class is skipped unless
  HIP is the default device.
  """

  def test_compile_hip_mnist(self):
    """Run a forward pass of the beautiful_mnist example model on a random batch."""
    model = MNIST()
    batch = Tensor.rand(512, 1, 28, 28)
    # The result is discarded; .numpy() is called only to force the output to be evaluated.
    model(batch).numpy()

  def test_compile_hip_speedyresnet(self):
    """Run a forward pass of the hlb_cifar10 SpeedyResNet with the default dtype."""
    W = Tensor.rand(12, 3, 2, 2)  # random tensor handed to the SpeedyResNet constructor
    model = SpeedyResNet(W)
    batch = Tensor.rand(512, 3, 32, 32)
    model(batch).numpy()

  @unittest.expectedFailure
  def test_compile_hip_speedyresnet_hf(self):
    """Run the SpeedyResNet forward pass with float16 as the default dtype.

    Marked as an expected failure: the float16 variant does not pass yet.
    """
    previous_default = Tensor.default_type
    Tensor.default_type = dtypes.float16
    try:
      W = Tensor.rand(12, 3, 2, 2)
      model = SpeedyResNet(W)
      batch = Tensor.rand(512, 3, 32, 32)
      model(batch).numpy()
    finally:
      # Tensor.default_type is process-wide state; restore it so the float16
      # default does not leak into tests that run later in the same process.
      Tensor.default_type = previous_default
if __name__ == "__main__":
unittest.main()

View File

@@ -5,7 +5,7 @@ from typing import Tuple, List, Any, Dict, cast, Optional, Callable
from tinygrad.helpers import DEBUG, getenv, diskcache
from tinygrad.device import Compiled, CompiledASTRunner, update_stats
from tinygrad.renderer.hip import HIPRenderer
from tinygrad.runtime.lib import RawBufferCopyInOut, LRUAllocator, RawBufferTransfer, RawBuffer
from tinygrad.runtime.lib import RawBufferCopyInOut, LRUAllocator, RawBufferTransfer, RawBuffer, RawMallocBuffer
from tinygrad.codegen.kernel import LinearizerOptions
from tinygrad.shape.symbolic import Variable
from tinygrad.jit import JitItem, get_input_replace, get_jit_stats, get_jc_idxs_with_updatable_launch_dims, get_jc_idxs_with_updatable_var_vals, GraphException
@@ -24,12 +24,14 @@ class HIPAllocator(LRUAllocator):
def _do_free(self, buf): hip.hipFree(buf)
def _cached_bufkey(self, size, dtype, device): return (device, size*dtype.itemsize) # Buffers of the same length could be reused, no matter what dtype.
MOCKHIP = getenv("MOCKHIP") # for CI. don't run kernels, only check if they compile
class _HIP:
def __init__(self, device=None):
self.default_device = device or getenv("HIP_DEFAULT_DEVICE")
hip.hipSetDevice(self.default_device)
self.device_count = hip.hipGetDeviceCount()
self.allocator = HIPAllocator(hip.hipGetDeviceProperties(self.default_device).totalGlobalMem)
self.device_count = 0 if MOCKHIP else hip.hipGetDeviceCount()
if not MOCKHIP: hip.hipSetDevice(self.default_device)
self.allocator = None if MOCKHIP else HIPAllocator(hip.hipGetDeviceProperties(self.default_device).totalGlobalMem)
HIP = _HIP()
class RawHIPBuffer(RawBufferCopyInOut, RawBufferTransfer):
@@ -47,7 +49,8 @@ class RawHIPBuffer(RawBufferCopyInOut, RawBufferTransfer):
@diskcache
def compile_hip(prg) -> bytes:
prog = hip.hiprtcCreateProgram(prg, "<null>", [], [])
hip.hiprtcCompileProgram(prog, [f'--offload-arch={hip.hipGetDeviceProperties(HIP.default_device).gcnArchName}'])
arch = "gfx1100" if MOCKHIP else hip.hipGetDeviceProperties(HIP.default_device).gcnArchName
hip.hiprtcCompileProgram(prog, [f'--offload-arch={arch}'])
return hip.hiprtcGetCode(prog)
def time_execution(cb, enable=False):
@@ -77,6 +80,7 @@ class HIPProgram:
self.prgs.append(hip.hipModuleGetFunction(self.modules[-1], name))
def __call__(self, *args, global_size:Tuple[int,int,int], local_size:Tuple[int,int,int], wait=False):
if MOCKHIP: return
hip.hipSetDevice(args[0]._device)
if self.c_struct_t is None: self.c_struct_t = hip.getCStructForType([(ctypes.c_void_p if not isinstance(x, int) else ctypes.c_int) for x in args])
c_params = cast(Callable, self.c_struct_t)(*[x._buf if not isinstance(x, int) else x for x in args])
@@ -137,4 +141,4 @@ class HIPGraph:
update_stats(f"<batched {len(self.jit_cache)}>", self.op_estimate, self.mem_estimate, var_vals, et, buf_count=len(input_rawbuffers), jit=jit, num_kernels=len(self.jit_cache))
return et
HIPDevice = Compiled(RawHIPBuffer, LinearizerOptions(device="HIP"), HIPRenderer, compile_hip, HIPProgram, hip.hipDeviceSynchronize, graph=HIPGraph)
HIPDevice = Compiled(RawHIPBuffer if not MOCKHIP else RawMallocBuffer, LinearizerOptions(device="HIP"), HIPRenderer, compile_hip, HIPProgram, hip.hipDeviceSynchronize, graph=HIPGraph)