diff --git a/test/test_multitensor.py b/test/test_multitensor.py
index 0fcf4081ea..b5c37eea67 100644
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@@ -148,7 +148,7 @@ class TestMultiTensor(unittest.TestCase):
       a,b = _test_allreduce(Tensor.rand(256, 256))
       np.testing.assert_almost_equal(a.numpy(), b.numpy(), decimal=5)
 
-  @unittest.skipIf(Device.DEFAULT in {"NV", "AMD"}, "not supported in HCQ")
+  @unittest.skipIf(Device.DEFAULT in {"NV", "AMD"}, "not supported in HCQ #4817")
   def test_copy_jit(self):
     @TinyJit
     def copy_tensor(x:Tensor): return (x.to(f"{x.device.split(':')[0]}:1") + 1)
@@ -157,6 +157,7 @@ class TestMultiTensor(unittest.TestCase):
       x = copy_tensor(t)
       np.testing.assert_equal((t+1).numpy(), x.numpy())
 
+  @unittest.skipIf(Device.DEFAULT in {"NV", "AMD"}, "not supported in HCQ #4817")
   def test_allreduce_naive_jit(self):
     with Context(RING=0):
       jit_allreduce = TinyJit(_test_allreduce)
@@ -164,6 +165,7 @@ class TestMultiTensor(unittest.TestCase):
         a,b = jit_allreduce(Tensor.rand(256, 256))
         np.testing.assert_almost_equal(a.numpy(), b.numpy(), decimal=5)
 
+  @unittest.skipIf(Device.DEFAULT in {"NV", "AMD"}, "not supported in HCQ #4817")
   def test_allreduce_ring_jit(self):
     with Context(RING=2):
       jit_allreduce = TinyJit(_test_allreduce)
diff --git a/test/test_subbuffer.py b/test/test_subbuffer.py
index cd0f42ef39..b20da9a1b4 100644
--- a/test/test_subbuffer.py
+++ b/test/test_subbuffer.py
@@ -1,6 +1,6 @@
 import unittest
 from tinygrad import Device, dtypes, Tensor
-from tinygrad.helpers import CI
+from tinygrad.helpers import getenv
 from tinygrad.device import Buffer
 from tinygrad.lazy import view_supported_devices
 
@@ -41,11 +41,11 @@ class TestSubBuffer(unittest.TestCase):
     out = (vt + 100).tolist()
     assert out == [102, 103]
 
-  @unittest.skipIf(Device.DEFAULT != "CUDA" or CI, "only CUDA")
+  @unittest.skipIf(Device.DEFAULT not in {"CUDA", "NV", "AMD"} or getenv("CUDACPU"), "only NV, AMD, CUDA but not CUDACPU")
   def test_subbuffer_transfer(self):
     t = Tensor.arange(0, 10, dtype=dtypes.uint8).realize()
     vt = t[2:5].contiguous().realize()
-    out = vt.to("CUDA:1").realize().tolist()
+    out = vt.to(f"{Device.DEFAULT}:1").realize().tolist()
     assert out == [2, 3, 4]
 
 if __name__ == '__main__':
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index c3496d43a5..889f469b24 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -22,7 +22,7 @@ def create_lazybuffer(device:str, st:ShapeTracker, dtype:DType, op:Optional[Op]=
   if enable_cache: lazycache[cache_key] = ret
   return ret
 
-view_supported_devices = {"LLVM", "CLANG", "CUDA", "DISK"}
+view_supported_devices = {"LLVM", "CLANG", "CUDA", "NV", "AMD", "DISK"}
 class LazyBuffer:
   def __init__(self, device:str, st:ShapeTracker, dtype:DType, op:Optional[Op]=None, arg:Any=None, srcs:Tuple[LazyBuffer, ...]=(),
diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py
index 4cea89ecb7..bf43a85e65 100644
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -406,6 +406,8 @@ class AMDAllocator(LRUAllocator):
     HWPM4Queue().wait(src_dev.timeline_signal, src_dev.timeline_value).submit(dest_dev)
     src_dev.timeline_value += 1
 
+  def offset(self, buf, size:int, offset:int): return type(buf)(va_addr=buf.va_addr + offset, size=size)
+
 MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
 class AMDDevice(Compiled):
   kfd:int = -1
diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py
index 31bfd9dab4..2f5047c30c 100644
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@@ -363,6 +363,8 @@ class NVAllocator(LRUAllocator):
     HWComputeQueue().wait(src_dev.timeline_signal, src_dev.timeline_value).submit(dest_dev)
     src_dev.timeline_value += 1
 
+  def offset(self, buf, size:int, offset:int): return type(buf)(base=buf.base + offset, va_addr=buf.va_addr + offset, length=size)
+
 MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
 class NVDevice(Compiled):
   root = None
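
For reference, a minimal usage sketch (not part of the patch) of the cross-device sub-buffer transfer this enables, mirroring `test_subbuffer_transfer`. It assumes the default backend is in `view_supported_devices` (e.g. NV or AMD) and that a second device (`f"{Device.DEFAULT}:1"`) is available.

```python
from tinygrad import Device, Tensor, dtypes

t = Tensor.arange(0, 10, dtype=dtypes.uint8).realize()   # parent buffer on device 0
vt = t[2:5].contiguous().realize()                       # realized as a view (sub-buffer) of t's allocation
out = vt.to(f"{Device.DEFAULT}:1").realize().tolist()    # transfer starts at the view's offset via the new Allocator.offset
assert out == [2, 3, 4]
```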