diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index f37a652f4d..0123e1b54c 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -292,55 +292,6 @@ jobs:
     - name: Test LLaMA compile speed
       run: PYTHONPATH="." METAL=1 python test/external/external_test_speed_llama.py
 
-  testhipcompilation:
-    name: HIP Compilation Tests
-    runs-on: ubuntu-latest
-    timeout-minutes: 20
-
-    steps:
-    - name: Checkout Code
-      uses: actions/checkout@v4
-    - name: Set up Python 3.11
-      uses: actions/setup-python@v5
-      with:
-        python-version: 3.11
-    - name: Cache python packages
-      uses: actions/cache@v3
-      with:
-        path: ${{ env.Python3_ROOT_DIR }}/lib/python3.11/site-packages
-        key: testing-packages-${{ hashFiles('**/setup.py') }}
-    - name: Cache downloads
-      uses: actions/cache@v3
-      with:
-        path: ~/.cache/tinygrad/downloads/
-        key: downloads-cache-hipcompilation-${{ env.DOWNLOAD_CACHE_VERSION }}
-    - name: Install HIP tools
-      run: |
-        echo 'Acquire::http::Pipeline-Depth "5";' | sudo tee -a /etc/apt/apt.conf.d/99parallel
-        wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
-        # ROCm repository for jammy
-        sudo tee /etc/apt/sources.list.d/rocm.list <<'EOF'
-        deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/debian jammy main
-        EOF
-        # Prefer packages from the rocm repository over system packages
-        echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
-        sudo apt update
-        sudo apt install --no-install-recommends --allow-unauthenticated -y rocm-hip-libraries hip-dev
-    - name: Install Python Dependencies
-      run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
-    - name: Verify HIP autogen
-      run: |
-        cp tinygrad/runtime/autogen/hip.py /tmp/hip.py.bak
-        cp tinygrad/runtime/autogen/comgr.py /tmp/comgr.py.bak
-        ./autogen_stubs.sh hip
-        diff /tmp/hip.py.bak tinygrad/runtime/autogen/hip.py
-        diff /tmp/comgr.py.bak tinygrad/runtime/autogen/comgr.py
-    - name: Test HIP compilation on RDNA3 [gfx1100]
-      # test/test_symbolic_ops.py can't run here, it was comparing empty memory
-      run: |
-        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/hip/lib
-        MOCKHIP=1 HIP=1 python -m pytest -s test/test_hip_rdna3.py
-
   # testwebgl:
   #   name: WebGL Tests
   #   runs-on: ubuntu-latest
@@ -378,7 +329,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        backend: [llvm, clang, gpu, cuda] #, triton] #, ptx]
+        backend: [llvm, clang, gpu, cuda, hip] #, triton] #, ptx]
 
     name: Tests on (${{ matrix.backend }})
     runs-on: ubuntu-latest
@@ -402,7 +353,7 @@ jobs:
         path: ~/.cache/tinygrad/downloads/
         key: downloads-cache-${{ matrix.backend }}-${{ env.DOWNLOAD_CACHE_VERSION }}
     - name: Set env
-      run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'cuda' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\n' || matrix.backend == 'PTX' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nPTX=1' || matrix.backend == 'triton' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nTRITON=1\nTRITON_PTXAS_PATH=/usr/bin/ptxas'}}" >> $GITHUB_ENV
+      run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'clang' && 'CLANG=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'cuda' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\n' || matrix.backend == 'PTX' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nPTX=1' || matrix.backend == 'triton' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nTRITON=1\nTRITON_PTXAS_PATH=/usr/bin/ptxas' || matrix.backend == 'hip' && 'HIP=1\nHIPCPU=1\nFORWARD_ONLY=1' }}" >> $GITHUB_ENV
     - name: Install OpenCL
       if: matrix.backend == 'gpu'
      run: |
@@ -444,11 +395,25 @@ jobs:
       run: |
         cd ${{ github.workspace }}/gpuocelot/ocelot/build
         sudo ninja install -d explain
+    - name: Install packages (hip)
+      if: matrix.backend == 'hip'
+      run: |
+        echo 'Acquire::http::Pipeline-Depth "5";' | sudo tee -a /etc/apt/apt.conf.d/99parallel
+        wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
+        sudo tee /etc/apt/sources.list.d/rocm.list <<'EOF'
+        deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/debian jammy main
+        EOF
+        echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
+        sudo apt update
+        sudo apt install --no-install-recommends --allow-unauthenticated -y rocm-hip-libraries hip-dev
+        curl -s https://api.github.com/repos/Qazalin/remu/releases/latest | \
+          jq -r '.assets[] | select(.name == "libremu.so").browser_download_url' | \
+          sudo xargs curl -L -o /usr/local/lib/libremu.so
     - name: Install dependencies
       run: pip install -e '.[testing${{matrix.backend=='llvm'&&',llvm'||matrix.backend=='cuda'&&',cuda'||matrix.backend=='ptx'&&',cuda'||matrix.backend=='triton'&&',triton'||''}}]' --extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/
     - name: Check Device.DEFAULT and print some source
       run: |
-        python -c "from tinygrad import Device; assert Device.DEFAULT in ['LLVM','CLANG','CUDA','GPU'], Device.DEFAULT"
+        python -c "from tinygrad import Device; assert Device.DEFAULT in ['LLVM','CLANG','CUDA','GPU', 'HIP'], Device.DEFAULT"
         DEBUG=5 PYTHONPATH=${{ github.workspace }} FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
     - name: Verify OpenCL autogen
       if: matrix.backend == 'gpu'
@@ -462,8 +427,16 @@ jobs:
         cp tinygrad/runtime/autogen/cuda.py /tmp/cuda.py.bak
         ./autogen_stubs.sh cuda
         diff /tmp/cuda.py.bak tinygrad/runtime/autogen/cuda.py
-    - name: Run pytest (not cuda)
-      if: matrix.backend!='cuda' && matrix.backend!='ptx' && matrix.backend!='triton'
+    - name: Verify HIP autogen
+      if: matrix.backend == 'hip'
+      run: |
+        cp tinygrad/runtime/autogen/hip.py /tmp/hip.py.bak
+        cp tinygrad/runtime/autogen/comgr.py /tmp/comgr.py.bak
+        ./autogen_stubs.sh hip
+        diff /tmp/hip.py.bak tinygrad/runtime/autogen/hip.py
+        diff /tmp/comgr.py.bak tinygrad/runtime/autogen/comgr.py
+    - name: Run pytest (not cuda or hip)
+      if: matrix.backend!='cuda' && matrix.backend!='ptx' && matrix.backend!='triton' && matrix.backend != 'hip'
       run: python -m pytest -n=auto test/ --durations=20
     - name: Run ONNX (only LLVM)
       if: matrix.backend == 'llvm'
@@ -471,6 +444,9 @@ jobs:
     - name: Run pytest (cuda)
       if: matrix.backend=='cuda'||matrix.backend=='ptx'||matrix.backend=='triton'
       run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors)' --ignore=test/external --ignore=test/models --durations=20
+    - name: Run pytest (hip)
+      if: matrix.backend=='hip'
+      run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py --durations=20
 
   #testunicorn:
   #  name: ARM64 unicorn Test
@@ -495,4 +471,4 @@ jobs:
   #    - name: Install dependencies
   #      run: pip install -e '.[testing,arm]' --extra-index-url https://download.pytorch.org/whl/cpu
   #    - name: Test arm
-  #      run: CI=1 ARM64=1 CLANG=1 python -m pytest -n=auto test/ -k 'not (test_nn.py and (test_conv_transpose2d or test_conv2d))' --ignore=test/models --ignore=test/test_speed_v_torch.py --ignore=test/test_net_speed.py --ignore=test/test_specific_conv.py --ignore=test/unit/test_disk_tensor.py
\ No newline at end of file
+  #      run: CI=1 ARM64=1 CLANG=1 python -m pytest -n=auto test/ -k 'not (test_nn.py and (test_conv_transpose2d or test_conv2d))' --ignore=test/models --ignore=test/test_speed_v_torch.py --ignore=test/test_net_speed.py --ignore=test/test_specific_conv.py --ignore=test/unit/test_disk_tensor.py
diff --git a/test/test_dtype_alu.py b/test/test_dtype_alu.py
index c14db2cb4f..e9b4a8111a 100644
--- a/test/test_dtype_alu.py
+++ b/test/test_dtype_alu.py
@@ -143,9 +143,10 @@ class TestDTypeALU(unittest.TestCase):
   @given(ht.int32, ht.int32, ht.float32, strat.sampled_from(integer_binary_operations), strat.sampled_from(binary_operations))
   def test_int32_midcast_float(self, a, b, c, op1, op2): universal_test_midcast(a, b, c, op1, op2, dtypes.int32, dtypes.float32)
 
-  # Metal and CUDACPU behave differently than numpy in CI for overflows
-  @given(strat.floats(width=32, min_value=0, max_value=10.0) if CI and (Device.DEFAULT == "METAL" or getenv("CUDACPU")) else ht.float32,
-         strat.floats(width=32, min_value=0, max_value=10.0) if CI and (Device.DEFAULT == "METAL" or getenv("CUDACPU")) else ht.float32,
+  # Metal and CUDACPU and HIP behave differently than numpy in CI for overflows
+  skip_overflow = CI and (Device.DEFAULT in ["METAL","HIP"] or getenv("CUDACPU"))
+  @given(strat.floats(width=32, min_value=0, max_value=10.0) if skip_overflow else ht.float32,
+         strat.floats(width=32, min_value=0, max_value=10.0) if skip_overflow else ht.float32,
          ht.int32, strat.sampled_from(binary_operations), strat.sampled_from(integer_binary_operations))
   def test_float_midcast_int32(self, a, b, c, op1, op2): universal_test_midcast(a, b, c, op1, op2, dtypes.float32, dtypes.int32)
 
diff --git a/test/test_hip_rdna3.py b/test/test_hip_rdna3.py
deleted file mode 100644
index 4e39d237a2..0000000000
--- a/test/test_hip_rdna3.py
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/usr/bin/env python
-import unittest
-import operator
-from tinygrad import Tensor, Device, dtypes
-from tinygrad.helpers import DEBUG, to_function_name
-from tinygrad.codegen.linearizer import Linearizer
-from tinygrad.renderer.cstyle import HIPRenderer
-from examples.beautiful_mnist import Model as MNIST
-from examples.hlb_cifar10 import SpeedyResNet
-
-from hypothesis import given, strategies as strat, settings
-settings.register_profile("my_profile", deadline=None)
-settings.load_profile("my_profile")
-print(settings.default)
-
-@unittest.skipIf(Device.DEFAULT != "HIP", reason="testing HIP->rdna3 compilation needs HIP=1")
-class TestHIPCompilationRDNA(unittest.TestCase):
-  def test_compile_hip_mnist(self):
-    model = MNIST()
-
-    input = Tensor.rand(512,1,28,28)
-    output = model(input)
-    output.numpy()
-
-  def test_compile_hip_speedyresnet(self):
-    W = Tensor.rand(12,3,2,2)
-    model = SpeedyResNet(W)
-
-    input = Tensor.rand(512, 3, 32, 32)
-    output = model(input)
-    output.numpy()
-
-  def test_compile_hip_speedyresnet_hf(self):
-    old_default_float = dtypes.default_float
-    dtypes.default_float = dtypes.float16
-
-    W = Tensor.rand(12,3,2,2)
-    model = SpeedyResNet(W)
-
-    input = Tensor.rand(512, 3, 32, 32)
-    output = model(input)
-    output.numpy()
-
-    dtypes.default_float = old_default_float
-
-def compile_ast_to_hip(out: Tensor):
-  from tinygrad.runtime.ops_hip import compile_hip
-
-  lin = Linearizer(out.lazydata.schedule()[-1].ast)
-  lin.hand_coded_optimizations()
-  lin.linearize()
-  code = HIPRenderer(to_function_name(lin.name), lin.uops)[0]
-  if DEBUG >= 4: print(code)
-  compile_hip(code)
-
-binary_operations = [operator.add, operator.sub, operator.mul]
-unary_operations = [Tensor.exp, Tensor.log, operator.neg, Tensor.sin, Tensor.sqrt, Tensor.reciprocal]
-float_dtypes = [dtypes.float16, dtypes.float32]
-
-@unittest.skipIf(Device.DEFAULT != "HIP", reason="testing HIP->rdna3 compilation needs HIP=1")
-class TestHIPALUCompilation(unittest.TestCase):
-  @given(strat.sampled_from(unary_operations), strat.sampled_from(float_dtypes))
-  def test_unary_ops(self, op, dtype):
-    a = Tensor.randn(4,4, dtype=dtype)
-    out = op(a)
-    compile_ast_to_hip(out)
-
-  @given(strat.sampled_from(binary_operations), strat.sampled_from(float_dtypes))
-  def test_binary_ops(self, op, dtype):
-    a = Tensor.randn(4,4, dtype=dtype)
-    b = Tensor.randn(4,4, dtype=dtype)
-    out = op(a,b)
-    compile_ast_to_hip(out)
-
-if __name__ == "__main__":
-  unittest.main()
diff --git a/tinygrad/runtime/ops_hip.py b/tinygrad/runtime/ops_hip.py
index 7225da2ffa..27be2f4229 100644
--- a/tinygrad/runtime/ops_hip.py
+++ b/tinygrad/runtime/ops_hip.py
@@ -4,13 +4,11 @@ from typing import Tuple, TypeVar, List, Any, cast, Set
 import tinygrad.runtime.autogen.hip as hip
 from tinygrad.helpers import DEBUG, getenv, init_c_var
 from tinygrad.helpers import from_mv, round_up, to_mv, colored, init_c_struct_t
-from tinygrad.device import Compiled, LRUAllocator, MallocAllocator, BufferOptions, JITRunner, Device, Buffer, update_stats, Compiler
+from tinygrad.device import Compiled, LRUAllocator, BufferOptions, JITRunner, Device, Buffer, MallocAllocator, update_stats, Compiler
 from tinygrad.renderer.cstyle import HIPRenderer
 from tinygrad.codegen.kernel import LinearizerOptions
 from tinygrad.runtime.compiler.hip_comgr import compile_hip
 
-# The default HIP stream is used for everything.
-MOCKHIP = getenv("MOCKHIP") # for CI. don't run kernels, only check if they compile
 
 class HIPCompiler(Compiler):
   linearizer_opts = LinearizerOptions("HIP")
@@ -38,7 +36,6 @@ class HIPProgram:
       asm = subprocess.check_output(["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], input=lib)
       print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
 
-    if MOCKHIP: return
     hip_set_device(self.device)
     self.module = init_c_var(hip.hipModule_t(), lambda x: check(hip.hipModuleLoadData(ctypes.byref(x), lib)))
     self.prg = init_c_var(hip.hipFunction_t(), lambda x: check(hip.hipModuleGetFunction(ctypes.byref(x), self.module, name.encode("utf-8"))))
@@ -47,7 +44,6 @@ class HIPProgram:
     if hasattr(self, 'module'): check(hip.hipModuleUnload(self.module))
 
   def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
-    if MOCKHIP: return float("inf")
     hip_set_device(self.device)
     if not hasattr(self, "vargs"):
       self.c_args = init_c_struct_t(tuple([(f'f{i}', hip.hipDeviceptr_t) for i in range(len(args))] +
@@ -134,13 +130,13 @@ class HIPAllocator(LRUAllocator):
 class HIPDevice(Compiled):
   def __init__(self, device:str=""):
     self.device = int(device.split(":")[1]) if ":" in device else 0
-    self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device))).gcnArchName.decode() if not MOCKHIP else "gfx1100" # noqa: E501
+    self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device))).gcnArchName.decode()
     self.pending_copyin: List[ctypes.c_void_p] = []
     self.track_cross_buffer: List[Any] = []
     self.peers: Set[int] = set()
 
     from tinygrad.runtime.graph.hip import HIPGraph
-    super().__init__(device, MallocAllocator if MOCKHIP else HIPAllocator(self), HIPCompiler(self.arch),
+    super().__init__(device, HIPAllocator(self), HIPCompiler(self.arch),
                      functools.partial(HIPProgram, self.device), HIPGraph)
   def synchronize(self):
     hip_set_device(self.device)
@@ -172,3 +168,18 @@ class HIPWaitEvent(JITRunner):
     hip_set_device(self.device.device)
     check(hip.hipStreamWaitValue32(None, rawbufs[0]._buf, 1, 1, 0xFFFFFFFF))
     update_stats(colored("wait", "RED"), 0, 0, {}, None, 1, jit, device=self.dname)
+
+if getenv("HIPCPU"):
+  hip = ctypes.CDLL("/usr/local/lib/libremu.so") # type: ignore[assignment]
+
+  class HIPProgram: # type: ignore[no-redef]
+    def __init__(self, name:str, lib:bytes):
+      self.name, self.lib = name, lib
+    def __call__(self, *args, global_size, local_size, vals=(), wait=False):
+      args = (*args, *vals)
+      hip.hipModuleLaunchKernel(self.lib, len(self.lib), *global_size, *local_size, 0, None, None,
+                                len(args), (ctypes.c_void_p * len(args))(*[ctypes.cast(x, ctypes.c_void_p) for x in args]))
+
+  class HIPDevice(Compiled): # type: ignore[no-redef]
+    def __init__(self, device=""):
+      super().__init__(device, MallocAllocator, HIPCompiler("gfx1100"), HIPProgram)
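
Reviewer note: the HIPCPU path above can be exercised locally without an AMD GPU. The following is a minimal sketch, not part of this PR: the file name check_hipcpu.py and the tiny add check are illustrative, and it assumes libremu.so has been installed to /usr/local/lib exactly as the "Install packages (hip)" step does, with the same HIP=1/HIPCPU=1 environment the test matrix exports (the matrix also sets FORWARD_ONLY=1 for the test suite, which is not needed here).

# check_hipcpu.py -- illustrative sketch only, not part of this diff.
# Assumes /usr/local/lib/libremu.so exists, as installed by the "Install packages (hip)" CI step.
import os
# Select the HIP backend and the remu-backed emulation path before importing tinygrad,
# mirroring the HIP=1/HIPCPU=1 env vars set in the workflow matrix.
os.environ["HIP"] = "1"
os.environ["HIPCPU"] = "1"

from tinygrad import Tensor, Device

# Mirrors the Device.DEFAULT assertion performed by the "Check Device.DEFAULT" CI step.
assert Device.DEFAULT == "HIP", Device.DEFAULT

# A tiny kernel: compiled via HIPCompiler("gfx1100") and launched through the HIPProgram
# stub defined under `if getenv("HIPCPU")` at the bottom of ops_hip.py.
print((Tensor([1.0, 2.0, 3.0]) + Tensor([4.0, 5.0, 6.0])).numpy())  # expected: [5. 7. 9.]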