From 9b02aef45a43c97655260299fdcf20fb92fb785e Mon Sep 17 00:00:00 2001
From: nimlgen <138685161+nimlgen@users.noreply.github.com>
Date: Tue, 14 May 2024 17:58:19 +0300
Subject: [PATCH] remove rhip (#4579)

* remove rhip

* remove hip runner
---
 .github/workflows/test.yml                  | 21 ++++++++-----------
 .../runtime => extra/backends}/ops_rhip.py  |  0
 test/helpers.py                             |  2 +-
 test/test_dtype_alu.py                      |  2 +-
 test/test_linearizer.py                     |  4 ++--
 test/test_randomness.py                     |  2 +-
 test/unit/test_disk_tensor.py               |  1 -
 7 files changed, 14 insertions(+), 18 deletions(-)
 rename {tinygrad/runtime => extra/backends}/ops_rhip.py (100%)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index d6cdf7c5fe..5064f1062a 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -332,7 +332,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        backend: [llvm, clang, gpu, cuda, hip, ptx, amd] #, triton]
+        backend: [llvm, clang, gpu, cuda, ptx, amd] #, triton]

     name: Tests on (${{ matrix.backend }})
     runs-on: ubuntu-latest
@@ -356,7 +356,7 @@
         path: ~/.cache/tinygrad/downloads/
         key: downloads-cache-${{ matrix.backend }}-${{ env.DOWNLOAD_CACHE_VERSION }}
     - name: Set env
-      run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'cuda' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\n' || matrix.backend == 'PTX' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nPTX=1' || matrix.backend == 'triton' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nTRITON=1\nTRITON_PTXAS_PATH=/usr/bin/ptxas' || matrix.backend == 'hip' && 'RHIP=1\nFORWARD_ONLY=1' || matrix.backend == 'amd' && 'AMD=1\nMOCKGPU=1\nFORWARD_ONLY=1' }}" >> $GITHUB_ENV
+      run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'cuda' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\n' || matrix.backend == 'PTX' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nPTX=1' || matrix.backend == 'triton' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nTRITON=1\nTRITON_PTXAS_PATH=/usr/bin/ptxas' || matrix.backend == 'amd' && 'AMD=1\nMOCKGPU=1\nFORWARD_ONLY=1' }}" >> $GITHUB_ENV
     - name: Install OpenCL
       if: matrix.backend == 'gpu'
       run: |
@@ -398,8 +398,8 @@
       run: |
         cd ${{ github.workspace }}/gpuocelot/ocelot/build
         sudo ninja install -d explain
-    - name: Install packages (hip)
-      if: matrix.backend == 'hip' || matrix.backend == 'amd'
+    - name: Install packages (amd)
+      if: matrix.backend == 'amd'
       run: |
         echo 'Acquire::http::Pipeline-Depth "5";' | sudo tee -a /etc/apt/apt.conf.d/99parallel
         wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
@@ -416,7 +416,7 @@
       run: pip install -e '.[testing${{matrix.backend=='llvm'&&',llvm'||matrix.backend=='cuda'&&',cuda'||matrix.backend=='ptx'&&',cuda'||matrix.backend=='triton'&&',triton'||''}}]' --extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/
     - name: Check Device.DEFAULT and print some source
      run: |
-        PYTHONPATH=${{ github.workspace }} python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['LLVM','CLANG','CUDA','GPU','RHIP','AMD'], Device.DEFAULT"
+        PYTHONPATH=${{ github.workspace }} python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['LLVM','CLANG','CUDA','GPU','AMD'], Device.DEFAULT"
         DEBUG=5 PYTHONPATH=${{ github.workspace }} FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
     - name: Verify OpenCL autogen
       if: matrix.backend == 'gpu'
       run: |
@@ -433,8 +433,8 @@
         ./autogen_stubs.sh nv
         diff /tmp/cuda.py.bak tinygrad/runtime/autogen/cuda.py
         diff /tmp/nv_gpu.py.bak tinygrad/runtime/autogen/nv_gpu.py
-    - name: Verify HIP autogen
-      if: matrix.backend == 'hip'
+    - name: Verify AMD autogen
+      if: matrix.backend == 'amd'
       run: |
         cp tinygrad/runtime/autogen/hsa.py /tmp/hsa.py.bak
         cp tinygrad/runtime/autogen/comgr.py /tmp/comgr.py.bak
@@ -442,8 +442,8 @@
         ./autogen_stubs.sh comgr
         diff /tmp/hsa.py.bak tinygrad/runtime/autogen/hsa.py
         diff /tmp/comgr.py.bak tinygrad/runtime/autogen/comgr.py
-    - name: Run pytest (not cuda or hip/amd)
-      if: matrix.backend!='cuda' && matrix.backend!='ptx' && matrix.backend!='triton' && matrix.backend != 'hip' && matrix.backend != 'amd'
+    - name: Run pytest (not cuda or amd)
+      if: matrix.backend!='cuda' && matrix.backend!='ptx' && matrix.backend!='triton' && matrix.backend != 'amd'
       run: python -m pytest -n=auto test/ --durations=20
     - name: Run ONNX (only LLVM)
       if: matrix.backend == 'llvm'
@@ -451,9 +451,6 @@
     - name: Run pytest (cuda)
       if: matrix.backend=='cuda'||matrix.backend=='ptx'||matrix.backend=='triton'
       run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors)' --ignore=test/external --ignore=test/models --durations=20
-    - name: Run pytest (hip)
-      if: matrix.backend=='hip'
-      run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/external/external_test_hip_compile.py --durations=20
     - name: Run pytest (amd)
       if: matrix.backend=='amd'
       run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/imported/test_indexing.py test/external/external_test_hcq.py --durations=20
diff --git a/tinygrad/runtime/ops_rhip.py b/extra/backends/ops_rhip.py
similarity index 100%
rename from tinygrad/runtime/ops_rhip.py
rename to extra/backends/ops_rhip.py
diff --git a/test/helpers.py b/test/helpers.py
index f1f39f9ce3..cf09bb6fe5 100644
--- a/test/helpers.py
+++ b/test/helpers.py
@@ -26,7 +26,7 @@ def assert_jit_cache_len(fxn, expected_len):
 def is_dtype_supported(dtype: DType, device: str = Device.DEFAULT):
   if dtype == dtypes.bfloat16:
     # NOTE: this requires bf16 buffer support
-    return device in {"RHIP", "HSA", "AMD"} or (device == "CUDA" and not CI and not getenv("PTX"))
+    return device in {"HSA", "AMD"} or (device == "CUDA" and not CI and not getenv("PTX"))
   if device in ["WEBGPU", "WEBGL"]: return dtype in [dtypes.float, dtypes.int32, dtypes.uint32]
   if device == "CUDA" and getenv("PTX") and dtype in (dtypes.int8, dtypes.uint8): return False
   # for CI GPU and OSX, cl_khr_fp16 isn't supported
diff --git a/test/test_dtype_alu.py b/test/test_dtype_alu.py
index 32f615dc9c..c3f8e27c00 100644
--- a/test/test_dtype_alu.py
+++ b/test/test_dtype_alu.py
@@ -145,7 +145,7 @@ class TestDTypeALU(unittest.TestCase):
   def test_int32_midcast_float(self, a, b, c, op1, op2):
     universal_test_midcast(a, b, c, op1, op2, dtypes.int32, dtypes.float32)
   # Metal and CUDACPU and HIP behave differently than numpy in CI for overflows
-  skip_overflow = CI and (Device.DEFAULT in {"RHIP", "HSA", "AMD"} or getenv("CUDACPU"))
+  skip_overflow = CI and (Device.DEFAULT in {"HSA", "AMD"} or getenv("CUDACPU"))
   @given(strat.floats(width=32, min_value=0, max_value=10.0) if skip_overflow else ht.float32,
          strat.floats(width=32, min_value=0, max_value=10.0) if skip_overflow else ht.float32,
          ht.int32, strat.sampled_from(binary_operations), strat.sampled_from(integer_binary_operations))
diff --git a/test/test_linearizer.py b/test/test_linearizer.py
index ff1536aabb..9c9654fff8 100644
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -292,7 +292,7 @@ class TestLinearizer(unittest.TestCase):
       # check correctness
       helper_tc_allclose(tc.dims[0]+pad, tc.dims[1]+pad, tc.dims[2]+pad, tc.dtype_in, tc.dtype_out, tc_opt=2)

-  @unittest.skipIf(CI and Device.DEFAULT in {"RHIP", "AMD"}, "RHIP/AMD CI is really slow here")
+  @unittest.skipIf(CI and Device.DEFAULT in {"AMD"}, "AMD CI is really slow here")
   def test_tensor_cores_multi_reduce(self):
     if not Device[Device.DEFAULT].renderer.has_tensor_cores: self.skipTest("device doesn't have tensor cores")
@@ -852,7 +852,7 @@ class TestKernelOpts(unittest.TestCase):
     ], apply_tc=True, atol=atol, rtol=rtol)

   def test_padto_matmul(self):
-    if CI and Device.DEFAULT in ["CUDA", "RHIP", "AMD"]: self.skipTest("super slow on CUDA and RHIP because of the big grid dims")
+    if CI and Device.DEFAULT in ["CUDA", "AMD"]: self.skipTest("super slow on CUDA and AMD because of the big grid dims")
     N = 17 * 17
     Tensor.manual_seed(289)
     a = Tensor.rand(N, N)
diff --git a/test/test_randomness.py b/test/test_randomness.py
index e66beb4637..47b8d02cba 100644
--- a/test/test_randomness.py
+++ b/test/test_randomness.py
@@ -104,7 +104,7 @@ class TestRandomness(unittest.TestCase):
     self.assertTrue(equal_distribution(Tensor.randn, torch.randn, lambda x: np.random.randn(*x)))

   @given(strat.sampled_from([dtypes.float, dtypes.float16, dtypes.bfloat16]))
-  @unittest.skipIf(Device.DEFAULT in ["HSA", "RHIP", "AMD"], "bfloat16 local buffer broken in HSA")
+  @unittest.skipIf(Device.DEFAULT in ["HSA", "AMD"], "bfloat16 local buffer broken in HSA")
   def test_randn_finite(self, default_float):
     if not is_dtype_supported(default_float): return
     old_default_float = dtypes.default_float
diff --git a/test/unit/test_disk_tensor.py b/test/unit/test_disk_tensor.py
index 4017aa6eaf..982059e196 100644
--- a/test/unit/test_disk_tensor.py
+++ b/test/unit/test_disk_tensor.py
@@ -284,7 +284,6 @@ class TestDiskTensor(unittest.TestCase):
     ret = t.to("CLANG").bitcast(dtypes.uint16) + 1
     assert ret.tolist() == [2827, 3341, 3855, 4369, 4883]

-  @unittest.skipIf(Device.DEFAULT == "RHIP", "no real HIP device exists in CI")
   def test_bf16_disk_write_read(self):
     t = Tensor([10000, -1, -1000, -10000, 20], dtype=dtypes.float32)
     t.to(f"disk:{temp('f32')}").realize()