diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index f37a652f4d..0123e1b54c 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -292,55 +292,6 @@ jobs:
     - name: Test LLaMA compile speed
       run: PYTHONPATH="." METAL=1 python test/external/external_test_speed_llama.py
 
-  testhipcompilation:
-    name: HIP Compilation Tests
-    runs-on: ubuntu-latest
-    timeout-minutes: 20
-
-    steps:
-    - name: Checkout Code
-      uses: actions/checkout@v4
-    - name: Set up Python 3.11
-      uses: actions/setup-python@v5
-      with:
-        python-version: 3.11
-    - name: Cache python packages
-      uses: actions/cache@v3
-      with:
-        path: ${{ env.Python3_ROOT_DIR }}/lib/python3.11/site-packages
-        key: testing-packages-${{ hashFiles('**/setup.py') }}
-    - name: Cache downloads
-      uses: actions/cache@v3
-      with:
-        path: ~/.cache/tinygrad/downloads/
-        key: downloads-cache-hipcompilation-${{ env.DOWNLOAD_CACHE_VERSION }}
-    - name: Install HIP tools
-      run: |
-        echo 'Acquire::http::Pipeline-Depth "5";' | sudo tee -a /etc/apt/apt.conf.d/99parallel
-        wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
-        # ROCm repository for jammy
-        sudo tee /etc/apt/sources.list.d/rocm.list <<'EOF'
-        deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/debian jammy main
-        EOF
-        # Prefer packages from the rocm repository over system packages
-        echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
-        sudo apt update
-        sudo apt install --no-install-recommends --allow-unauthenticated -y rocm-hip-libraries hip-dev
-    - name: Install Python Dependencies
-      run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
-    - name: Verify HIP autogen
-      run: |
-        cp tinygrad/runtime/autogen/hip.py /tmp/hip.py.bak
-        cp tinygrad/runtime/autogen/comgr.py /tmp/comgr.py.bak
-        ./autogen_stubs.sh hip
-        diff /tmp/hip.py.bak tinygrad/runtime/autogen/hip.py
-        diff /tmp/comgr.py.bak tinygrad/runtime/autogen/comgr.py
-    - name: Test HIP compilation on RDNA3 [gfx1100]
-      # test/test_symbolic_ops.py can't run here, it was comparing empty memory
-      run: |
-        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/hip/lib
-        MOCKHIP=1 HIP=1 python -m pytest -s test/test_hip_rdna3.py
-
   # testwebgl:
   #   name: WebGL Tests
   #   runs-on: ubuntu-latest
@@ -378,7 +329,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        backend: [llvm, clang, gpu, cuda] #, triton] #, ptx]
+        backend: [llvm, clang, gpu, cuda, hip] #, triton] #, ptx]
 
     name: Tests on (${{ matrix.backend }})
     runs-on: ubuntu-latest
@@ -402,7 +353,7 @@ jobs:
         path: ~/.cache/tinygrad/downloads/
         key: downloads-cache-${{ matrix.backend }}-${{ env.DOWNLOAD_CACHE_VERSION }}
     - name: Set env
-      run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'cuda' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\n' || matrix.backend == 'PTX' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nPTX=1' || matrix.backend == 'triton' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nTRITON=1\nTRITON_PTXAS_PATH=/usr/bin/ptxas'}}" >> $GITHUB_ENV
+      run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'cuda' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\n' || matrix.backend == 'PTX' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nPTX=1' || matrix.backend == 'triton' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nTRITON=1\nTRITON_PTXAS_PATH=/usr/bin/ptxas' || matrix.backend == 'hip' && 'HIP=1\nHIPCPU=1\nFORWARD_ONLY=1' }}" >> $GITHUB_ENV
     - name: Install OpenCL
       if: matrix.backend == 'gpu'
      run: |
@@ -444,11 +395,25 @@ jobs:
       run: |
         cd ${{ github.workspace }}/gpuocelot/ocelot/build
         sudo ninja install -d explain
+    - name: Install packages (hip)
+      if: matrix.backend == 'hip'
+      run: |
+        echo 'Acquire::http::Pipeline-Depth "5";' | sudo tee -a /etc/apt/apt.conf.d/99parallel
+        wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
+        sudo tee /etc/apt/sources.list.d/rocm.list <<'EOF'
+        deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/debian jammy main
+        EOF
+        echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
+        sudo apt update
+        sudo apt install --no-install-recommends --allow-unauthenticated -y rocm-hip-libraries hip-dev
+        curl -s https://api.github.com/repos/Qazalin/remu/releases/latest | \
+          jq -r '.assets[] | select(.name == "libremu.so").browser_download_url' | \
+          sudo xargs curl -L -o /usr/local/lib/libremu.so
     - name: Install dependencies
       run: pip install -e '.[testing${{matrix.backend=='llvm'&&',llvm'||matrix.backend=='cuda'&&',cuda'||matrix.backend=='ptx'&&',cuda'||matrix.backend=='triton'&&',triton'||''}}]' --extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/
     - name: Check Device.DEFAULT and print some source
       run: |
-        python -c "from tinygrad import Device; assert Device.DEFAULT in ['LLVM','CLANG','CUDA','GPU'], Device.DEFAULT"
+        python -c "from tinygrad import Device; assert Device.DEFAULT in ['LLVM','CLANG','CUDA','GPU', 'HIP'], Device.DEFAULT"
         DEBUG=5 PYTHONPATH=${{ github.workspace }} FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
     - name: Verify OpenCL autogen
       if: matrix.backend == 'gpu'
@@ -462,8 +427,16 @@ jobs:
         cp tinygrad/runtime/autogen/cuda.py /tmp/cuda.py.bak
         ./autogen_stubs.sh cuda
         diff /tmp/cuda.py.bak tinygrad/runtime/autogen/cuda.py
-    - name: Run pytest (not cuda)
-      if: matrix.backend!='cuda' && matrix.backend!='ptx' && matrix.backend!='triton'
+    - name: Verify HIP autogen
+      if: matrix.backend == 'hip'
+      run: |
+        cp tinygrad/runtime/autogen/hip.py /tmp/hip.py.bak
+        cp tinygrad/runtime/autogen/comgr.py /tmp/comgr.py.bak
+        ./autogen_stubs.sh hip
+        diff /tmp/hip.py.bak tinygrad/runtime/autogen/hip.py
+        diff /tmp/comgr.py.bak tinygrad/runtime/autogen/comgr.py
+    - name: Run pytest (not cuda or hip)
+      if: matrix.backend!='cuda' && matrix.backend!='ptx' && matrix.backend!='triton' && matrix.backend != 'hip'
       run: python -m pytest -n=auto test/ --durations=20
     - name: Run ONNX (only LLVM)
       if: matrix.backend == 'llvm'
@@ -471,6 +444,9 @@ jobs:
     - name: Run pytest (cuda)
       if: matrix.backend=='cuda'||matrix.backend=='ptx'||matrix.backend=='triton'
       run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors)' --ignore=test/external --ignore=test/models --durations=20
+    - name: Run pytest (hip)
+      if: matrix.backend=='hip'
+      run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py --durations=20
 
   #testunicorn:
   #  name: ARM64 unicorn Test
@@ -495,4 +471,4 @@ jobs:
   #    - name: Install dependencies
   #      run: pip install -e '.[testing,arm]' --extra-index-url https://download.pytorch.org/whl/cpu
   #    - name: Test arm
-  #      run: CI=1 ARM64=1 CLANG=1 python -m pytest -n=auto test/ -k 'not (test_nn.py and (test_conv_transpose2d or test_conv2d))' --ignore=test/models --ignore=test/test_speed_v_torch.py --ignore=test/test_net_speed.py --ignore=test/test_specific_conv.py --ignore=test/unit/test_disk_tensor.py
\ No newline at end of file
+  #      run: CI=1 ARM64=1 CLANG=1 python -m pytest -n=auto test/ -k 'not (test_nn.py and (test_conv_transpose2d or test_conv2d))' --ignore=test/models --ignore=test/test_speed_v_torch.py --ignore=test/test_net_speed.py --ignore=test/test_specific_conv.py --ignore=test/unit/test_disk_tensor.py
diff --git a/test/test_dtype_alu.py b/test/test_dtype_alu.py
index c14db2cb4f..e9b4a8111a 100644
--- a/test/test_dtype_alu.py
+++ b/test/test_dtype_alu.py
@@ -143,9 +143,10 @@ class TestDTypeALU(unittest.TestCase):
   @given(ht.int32, ht.int32, ht.float32, strat.sampled_from(integer_binary_operations), strat.sampled_from(binary_operations))
   def test_int32_midcast_float(self, a, b, c, op1, op2): universal_test_midcast(a, b, c, op1, op2, dtypes.int32, dtypes.float32)
 
-  # Metal and CUDACPU behave differently than numpy in CI for overflows
-  @given(strat.floats(width=32, min_value=0, max_value=10.0) if CI and (Device.DEFAULT == "METAL" or getenv("CUDACPU")) else ht.float32,
-         strat.floats(width=32, min_value=0, max_value=10.0) if CI and (Device.DEFAULT == "METAL" or getenv("CUDACPU")) else ht.float32,
+  # Metal and CUDACPU and HIP behave differently than numpy in CI for overflows
+  skip_overflow = CI and (Device.DEFAULT in ["METAL","HIP"] or getenv("CUDACPU"))
+  @given(strat.floats(width=32, min_value=0, max_value=10.0) if skip_overflow else ht.float32,
+         strat.floats(width=32, min_value=0, max_value=10.0) if skip_overflow else ht.float32,
          ht.int32, strat.sampled_from(binary_operations), strat.sampled_from(integer_binary_operations))
   def test_float_midcast_int32(self, a, b, c, op1, op2): universal_test_midcast(a, b, c, op1, op2, dtypes.float32, dtypes.int32)
 
diff --git a/test/test_hip_rdna3.py b/test/test_hip_rdna3.py
deleted file mode 100644
index 4e39d237a2..0000000000
--- a/test/test_hip_rdna3.py
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/usr/bin/env python
-import unittest
-import operator
-from tinygrad import Tensor, Device, dtypes
-from tinygrad.helpers import DEBUG, to_function_name
-from tinygrad.codegen.linearizer import Linearizer
-from tinygrad.renderer.cstyle import HIPRenderer
-from examples.beautiful_mnist import Model as MNIST
-from examples.hlb_cifar10 import SpeedyResNet
-
-from hypothesis import given, strategies as strat, settings
-settings.register_profile("my_profile", deadline=None)
-settings.load_profile("my_profile")
-print(settings.default)
-
-@unittest.skipIf(Device.DEFAULT != "HIP", reason="testing HIP->rdna3 compilation needs HIP=1")
-class TestHIPCompilationRDNA(unittest.TestCase):
-  def test_compile_hip_mnist(self):
-    model = MNIST()
-
-    input = Tensor.rand(512,1,28,28)
-    output = model(input)
-    output.numpy()
-
-  def test_compile_hip_speedyresnet(self):
-    W = Tensor.rand(12,3,2,2)
-    model = SpeedyResNet(W)
-
-    input = Tensor.rand(512, 3, 32, 32)
-    output = model(input)
-    output.numpy()
-
-  def test_compile_hip_speedyresnet_hf(self):
-    old_default_float = dtypes.default_float
-    dtypes.default_float = dtypes.float16
-
-    W = Tensor.rand(12,3,2,2)
-    model = SpeedyResNet(W)
-
-    input = Tensor.rand(512, 3, 32, 32)
-    output = model(input)
-    output.numpy()
-
-    dtypes.default_float = old_default_float
-
-def compile_ast_to_hip(out: Tensor):
-  from tinygrad.runtime.ops_hip import compile_hip
-
-  lin = Linearizer(out.lazydata.schedule()[-1].ast)
-  lin.hand_coded_optimizations()
-  lin.linearize()
-  code = HIPRenderer(to_function_name(lin.name), lin.uops)[0]
-  if DEBUG >= 4: print(code)
-  compile_hip(code)
-
-binary_operations = [operator.add, operator.sub, operator.mul]
-unary_operations = [Tensor.exp, Tensor.log, operator.neg, Tensor.sin, Tensor.sqrt, Tensor.reciprocal]
-float_dtypes = [dtypes.float16, dtypes.float32]
-
-@unittest.skipIf(Device.DEFAULT != "HIP", reason="testing HIP->rdna3 compilation needs HIP=1")
-class TestHIPALUCompilation(unittest.TestCase):
-  @given(strat.sampled_from(unary_operations), strat.sampled_from(float_dtypes))
-  def test_unary_ops(self, op, dtype):
-    a = Tensor.randn(4,4, dtype=dtype)
-    out = op(a)
-    compile_ast_to_hip(out)
-
-  @given(strat.sampled_from(binary_operations), strat.sampled_from(float_dtypes))
-  def test_binary_ops(self, op, dtype):
-    a = Tensor.randn(4,4, dtype=dtype)
-    b = Tensor.randn(4,4, dtype=dtype)
-    out = op(a,b)
-    compile_ast_to_hip(out)
-
-if __name__ == "__main__":
-  unittest.main()
diff --git a/tinygrad/runtime/ops_hip.py b/tinygrad/runtime/ops_hip.py
index 7225da2ffa..27be2f4229 100644
--- a/tinygrad/runtime/ops_hip.py
+++ b/tinygrad/runtime/ops_hip.py
@@ -4,13 +4,11 @@ from typing import Tuple, TypeVar, List, Any, cast, Set
 import tinygrad.runtime.autogen.hip as hip
 from tinygrad.helpers import DEBUG, getenv, init_c_var
 from tinygrad.helpers import from_mv, round_up, to_mv, colored, init_c_struct_t
-from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, BufferOptions, JITRunner, Device, Buffer, update_stats, Compiler
+from tinygrad.device import Compiled, LRUAllocator, BufferOptions, JITRunner, Device, Buffer, MallocAllocator, update_stats, Compiler
 from tinygrad.renderer.cstyle import HIPRenderer
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.runtime.compiler.hip_comgr import compile_hip
 
-# The default HIP stream is used for everything.
-MOCKHIP = getenv("MOCKHIP") # for CI. don't run kernels, only check if they compile
 
 class HIPCompiler(Compiler):
   linearizer_opts = LinearizerOptions("HIP")
@@ -38,7 +36,6 @@ class HIPProgram:
       asm = subprocess.check_output(["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], input=lib)
       print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
 
-    if MOCKHIP: return
     hip_set_device(self.device)
     self.module = init_c_var(hip.hipModule_t(), lambda x: check(hip.hipModuleLoadData(ctypes.byref(x), lib)))
     self.prg = init_c_var(hip.hipFunction_t(), lambda x: check(hip.hipModuleGetFunction(ctypes.byref(x), self.module, name.encode("utf-8"))))
@@ -47,7 +44,6 @@ class HIPProgram:
     if hasattr(self, 'module'): check(hip.hipModuleUnload(self.module))
 
   def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
-    if MOCKHIP: return float("inf")
     hip_set_device(self.device)
     if not hasattr(self, "vargs"):
       self.c_args = init_c_struct_t(tuple([(f'f{i}', hip.hipDeviceptr_t) for i in range(len(args))] +
@@ -134,13 +130,13 @@ class HIPAllocator(LRUAllocator):
 class HIPDevice(Compiled):
   def __init__(self, device:str=""):
     self.device = int(device.split(":")[1]) if ":" in device else 0
-    self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device))).gcnArchName.decode() if not MOCKHIP else "gfx1100" # noqa: E501
+    self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device))).gcnArchName.decode()
     self.pending_copyin: List[ctypes.c_void_p] = []
     self.track_cross_buffer: List[Any] = []
     self.peers: Set[int] = set()
 
     from tinygrad.runtime.graph.hip import HIPGraph
-    super().__init__(device, MallocAllocator if MOCKHIP else HIPAllocator(self), HIPCompiler(self.arch),
+    super().__init__(device, HIPAllocator(self), HIPCompiler(self.arch),
                      functools.partial(HIPProgram, self.device), HIPGraph)
   def synchronize(self):
     hip_set_device(self.device)
@@ -172,3 +168,18 @@ class HIPWaitEvent(JITRunner):
     hip_set_device(self.device.device)
     check(hip.hipStreamWaitValue32(None, rawbufs[0]._buf, 1, 1, 0xFFFFFFFF))
     update_stats(colored("wait", "RED"), 0, 0, {}, None, 1, jit, device=self.dname)
+
+if getenv("HIPCPU"):
+  hip = ctypes.CDLL("/usr/local/lib/libremu.so") # type: ignore[assignment]
+
+  class HIPProgram: # type: ignore[no-redef]
+    def __init__(self, name:str, lib:bytes):
+      self.name, self.lib = name, lib
+    def __call__(self, *args, global_size, local_size, vals=(), wait=False):
+      args = (*args, *vals)
+      hip.hipModuleLaunchKernel(self.lib, len(self.lib), *global_size, *local_size, 0, None, None,
+                                len(args), (ctypes.c_void_p * len(args))(*[ctypes.cast(x, ctypes.c_void_p) for x in args]))
+
+  class HIPDevice(Compiled): # type: ignore[no-redef]
+    def __init__(self, device=""):
+      super().__init__(device, MallocAllocator, HIPCompiler("gfx1100"), HIPProgram)
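
Reviewer note: the HIPCPU path above can be exercised locally without an AMD GPU. The following is a minimal sketch, not part of this PR: the file name check_hipcpu.py and the tiny add check are illustrative, and it assumes libremu.so has been installed to /usr/local/lib exactly as the "Install packages (hip)" step does, with the same HIP=1/HIPCPU=1 environment the test matrix exports (the matrix also sets FORWARD_ONLY=1 for the test suite, which is not needed here).

# check_hipcpu.py -- illustrative sketch only, not part of this diff.
# Assumes /usr/local/lib/libremu.so exists, as installed by the "Install packages (hip)" CI step.
import os
# Select the HIP backend and the remu-backed emulation path before importing tinygrad,
# mirroring the HIP=1/HIPCPU=1 env vars set in the workflow matrix.
os.environ["HIP"] = "1"
os.environ["HIPCPU"] = "1"

from tinygrad import Tensor, Device

# Mirrors the Device.DEFAULT assertion performed by the "Check Device.DEFAULT" CI step.
assert Device.DEFAULT == "HIP", Device.DEFAULT

# A tiny kernel: compiled via HIPCompiler("gfx1100") and launched through the HIPProgram
# stub defined under `if getenv("HIPCPU")` at the bottom of ops_hip.py.
print((Tensor([1.0, 2.0, 3.0]) + Tensor([4.0, 5.0, 6.0])).numpy())  # expected: [5. 7. 9.]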