diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index a432379b0e..250cedf037 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -375,7 +375,7 @@ jobs:
           path: ~/.cache/tinygrad/downloads/
           key: downloads-cache-${{ matrix.backend }}-${{ env.DOWNLOAD_CACHE_VERSION }}
       - name: Set env
-        run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'cuda' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\n' || matrix.backend == 'PTX' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nPTX=1' || matrix.backend == 'triton' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nTRITON=1\nTRITON_PTXAS_PATH=/usr/bin/ptxas' || matrix.backend == 'hip' && 'HIP=1\nHIPCPU=1\nFORWARD_ONLY=1' }}" >> $GITHUB_ENV
+        run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'cuda' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\n' || matrix.backend == 'PTX' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nPTX=1' || matrix.backend == 'triton' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nTRITON=1\nTRITON_PTXAS_PATH=/usr/bin/ptxas' || matrix.backend == 'hip' && 'RHIP=1\nFORWARD_ONLY=1' }}" >> $GITHUB_ENV
       - name: Install OpenCL
         if: matrix.backend == 'gpu'
         run: |
@@ -435,7 +435,7 @@ jobs:
         run: pip install -e '.[testing${{matrix.backend=='llvm'&&',llvm'||matrix.backend=='cuda'&&',cuda'||matrix.backend=='ptx'&&',cuda'||matrix.backend=='triton'&&',triton'||''}}]' --extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/
       - name: Check Device.DEFAULT and print some source
         run: |
-          python -c "from tinygrad import Device; assert Device.DEFAULT in ['LLVM','CLANG','CUDA','GPU', 'HIP'], Device.DEFAULT"
+          python -c "from tinygrad import Device; assert Device.DEFAULT in ['LLVM','CLANG','CUDA','GPU','RHIP'], Device.DEFAULT"
           DEBUG=5 PYTHONPATH=${{ github.workspace }} FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
       - name: Verify OpenCL autogen
         if: matrix.backend == 'gpu'
diff --git a/tinygrad/runtime/graph/hip.py b/extra/backends/graph_hip.py
similarity index 100%
rename from tinygrad/runtime/graph/hip.py
rename to extra/backends/graph_hip.py
diff --git a/tinygrad/runtime/ops_hip.py b/extra/backends/ops_hip.py
similarity index 100%
rename from tinygrad/runtime/ops_hip.py
rename to extra/backends/ops_hip.py
diff --git a/test/helpers.py b/test/helpers.py
index 273705bae5..ab198d5d5c 100644
--- a/test/helpers.py
+++ b/test/helpers.py
@@ -23,7 +23,7 @@ def assert_jit_cache_len(fxn, expected_len):
 def is_dtype_supported(dtype: DType, device: str = Device.DEFAULT):
   if dtype == dtypes.bfloat16:
     # NOTE: this requires bf16 buffer support
-    return device in ["HIP"]
+    return device in {"RHIP", "HSA"}
   if device in ["WEBGPU", "WEBGL"]: return dtype in [dtypes.float, dtypes.int32, dtypes.uint32]
   # for CI GPU, cl_khr_fp16 isn't supported
   # for CI LLVM, it segfaults because it can't link to the casting function
diff --git a/test/test_dtype_alu.py b/test/test_dtype_alu.py
index d196937c0e..93e581aeee 100644
--- a/test/test_dtype_alu.py
+++ b/test/test_dtype_alu.py
@@ -142,7 +142,7 @@ class TestDTypeALU(unittest.TestCase):
   def test_int32_midcast_float(self, a, b, c, op1, op2): universal_test_midcast(a, b, c, op1, op2, dtypes.int32, dtypes.float32)
 
   # Metal and CUDACPU and HIP behave differently than numpy in CI for overflows
-  skip_overflow = CI and (Device.DEFAULT == "HIP" or getenv("CUDACPU"))
+  skip_overflow = CI and (Device.DEFAULT in {"RHIP", "HSA"} or getenv("CUDACPU"))
   @given(strat.floats(width=32, min_value=0, max_value=10.0) if skip_overflow else ht.float32,
          strat.floats(width=32, min_value=0, max_value=10.0) if skip_overflow else ht.float32,
          ht.int32, strat.sampled_from(binary_operations), strat.sampled_from(integer_binary_operations))
diff --git a/test/test_linearizer.py b/test/test_linearizer.py
index 70bd2bb68e..7ab58c1222 100644
--- a/test/test_linearizer.py
+++ b/test/test_linearizer.py
@@ -706,7 +706,7 @@ class TestLinearizerOpts(unittest.TestCase):
       ], apply_tc=True, atol=atol, rtol=rtol)
 
   def test_padto_matmul(self):
-    if Device.DEFAULT in ["CUDA", "HIP"]: self.skipTest("super slow on CUDA and HIP because of the big grid dims")
+    if Device.DEFAULT in ["CUDA", "RHIP"]: self.skipTest("super slow on CUDA and RHIP because of the big grid dims")
     N = 17 * 17
     Tensor.manual_seed(289)
     a = Tensor.rand(N, N)
diff --git a/test/test_linearizer_overflows.py b/test/test_linearizer_overflows.py
index 6fdd7edf2f..c03de6094f 100644
--- a/test/test_linearizer_overflows.py
+++ b/test/test_linearizer_overflows.py
@@ -17,7 +17,6 @@ def _test_overflow(ast, opts):
   lin.linearize()
   bufs = bufs_from_lin(lin)
   print(bufs)
-  if bufs[0].device in {"HIP", "HSA"}: print([hex(x._buf.value) for x in bufs])
   time_linearizer(lin, bufs)
 
 # NOTE: if you want these to trigger, set launch bounds on HIP kernels
diff --git a/test/unit/test_disk_tensor.py b/test/unit/test_disk_tensor.py
index 154a514bff..50e08c2885 100644
--- a/test/unit/test_disk_tensor.py
+++ b/test/unit/test_disk_tensor.py
@@ -2,7 +2,7 @@ import pathlib, unittest
 import numpy as np
 from tinygrad import Tensor, Device, dtypes
 from tinygrad.nn.state import safe_load, safe_save, get_state_dict, torch_load
-from tinygrad.helpers import Timing, fetch, temp, getenv
+from tinygrad.helpers import Timing, fetch, temp
 from test.helpers import is_dtype_supported
 
 def compare_weights_both(url):
@@ -214,7 +214,7 @@ class TestDiskTensor(unittest.TestCase):
 
     np.testing.assert_array_equal(t.numpy(), np.array([3] * 10))
 
-  @unittest.skipIf(getenv("HIPCPU"), "no real HIP device exists in CI")
+  @unittest.skipIf(Device.DEFAULT == "RHIP", "no real HIP device exists in CI")
   def test_bf16_disk_write_read(self):
     t = Tensor([10000, -1, -1000, -10000, 20]).cast(dtypes.float32)
     t.to(f"disk:{temp('f32')}").realize()
diff --git a/tinygrad/realize.py b/tinygrad/realize.py
index 59ddb7df34..303acb3f81 100644
--- a/tinygrad/realize.py
+++ b/tinygrad/realize.py
@@ -31,10 +31,6 @@ def lower_schedule_item(si:ScheduleItem) -> Optional[JITRunner]:
   if si.ast[0].op is BufferOps.STORE: return Device[si.outputs[0].device].get_runner(*si.ast)
   assert len(si.ast) == 1 and len(si.outputs) == 1, "only ASTRunner supports multioutput"
   out, ast = si.outputs[0], si.ast[0]
-  if ast.op in {LoadOps.SYNC, LoadOps.WAIT, LoadOps.COPY} and out.device.startswith("HIP") and si.inputs[0].device.startswith("HIP"):
-    from tinygrad.runtime.ops_hip import HIPSyncEvent, HIPWaitEvent
-    if ast.op is LoadOps.SYNC: return HIPSyncEvent(out)
-    if ast.op is LoadOps.WAIT: return HIPWaitEvent(out.device)
   if ast.op in {LoadOps.SYNC, LoadOps.WAIT} and out.device.startswith("HSA") and si.inputs[0].device.startswith("HSA"):
     # Our HSA runtime handles synchronization
     if ast.op is LoadOps.SYNC: return None
diff --git a/tinygrad/runtime/ops_hsa.py b/tinygrad/runtime/ops_hsa.py
index aabb5e0856..dcef69a1ee 100644
--- a/tinygrad/runtime/ops_hsa.py
+++ b/tinygrad/runtime/ops_hsa.py
@@ -3,10 +3,11 @@ import ctypes, functools, subprocess, io, atexit, collections, json
 from typing import Tuple, TypeVar, List, Dict, Any
 import tinygrad.runtime.autogen.hsa as hsa
 from tinygrad.helpers import DEBUG, init_c_var, from_mv, round_up, to_mv, init_c_struct_t, getenv
-from tinygrad.device import Compiled, LRUAllocator, BufferOptions
+from tinygrad.device import Compiled, LRUAllocator, BufferOptions, Compiler
 from tinygrad.codegen.kernel import LinearizerOptions
-from tinygrad.runtime.ops_hip import HIPCompiler
 from tinygrad.runtime.driver.hsa import check, scan_agents, find_memory_pool, AQLQueue
+from tinygrad.renderer.cstyle import HIPRenderer
+from tinygrad.runtime.driver.hip_comgr import compile_hip
 
 PROFILE = getenv("PROFILE", 0)
 
@@ -40,8 +41,13 @@ class HSAProfiler:
     print(f"Saved HSA profile to {path}")
 Profiler = HSAProfiler()
 
-class HSACompiler(HIPCompiler):
+class HSACompiler(Compiler):  # compiler for the HSA backend: renders with HIPRenderer, compiles with compile_hip
   linearizer_opts = LinearizerOptions("HSA", has_tensor_cores=True, shared_max=65536)
+  def __init__(self, arch:str):  # arch: architecture string later forwarded to compile_hip
+    self.arch = arch
+    super().__init__(f"compile_hip_{self.arch}")  # arch-specific name for Compiler.__init__ (presumably a cache key; TODO confirm)
+  def render(self, name:str, uops) -> str: return HIPRenderer(name, uops)  # turn the uops into source text via HIPRenderer
+  def compile(self, src:str) -> bytes: return compile_hip(src, self.arch)  # build the rendered source for self.arch
 
 class HSAProgram:
   def __init__(self, device:HSADevice, name:str, lib:bytes):
diff --git a/tinygrad/runtime/ops_rhip.py b/tinygrad/runtime/ops_rhip.py
new file mode 100644
index 0000000000..7bf252ba59
--- /dev/null
+++ b/tinygrad/runtime/ops_rhip.py
@@ -0,0 +1,17 @@
+import ctypes
+from tinygrad.device import Compiled, MallocAllocator
+from tinygrad.runtime.ops_hsa import HSACompiler
+
+rhip = ctypes.CDLL("/usr/local/lib/libremu.so")  # loaded at import time from a fixed absolute path
+class RHIPProgram:  # callable wrapper that launches a compiled program through the rhip library handle
+  def __init__(self, name:str, lib:bytes):  # name: program name; lib: compiled program bytes
+    self.name, self.lib = name, lib
+  def __call__(self, *args, global_size, local_size, vals=(), wait=False):  # NOTE(review): wait is accepted but unused; returns None
+    args = (*args, *vals)  # positional args followed by vals, passed as one flat list
+    rhip.hipModuleLaunchKernel(self.lib, len(self.lib), *global_size, *local_size, 0, None, None,  # lib bytes, their length, launch dims
+                              len(args), (ctypes.c_void_p * len(args))(*[ctypes.cast(x, ctypes.c_void_p) for x in args]))  # args cast to void*
+
+class RHIPDevice(Compiled):  # Compiled device wired to MallocAllocator, HSACompiler and RHIPProgram
+  def __init__(self, device:str=""):  # device: device string; an optional ":<n>" part selects the index
+    self.device = int(device.split(":")[1]) if ":" in device else 0  # segment after the first ":" as int, else 0
+    super().__init__(device, MallocAllocator, HSACompiler("gfx1100"), RHIPProgram)  # arch is hard-coded to gfx1100