diff --git a/test/test_multitensor.py b/test/test_multitensor.py
index 0fcf4081ea..b5c37eea67 100644
--- a/test/test_multitensor.py
+++ b/test/test_multitensor.py
@@ -148,7 +148,7 @@ class TestMultiTensor(unittest.TestCase):
       a,b = _test_allreduce(Tensor.rand(256, 256))
       np.testing.assert_almost_equal(a.numpy(), b.numpy(), decimal=5)
 
-  @unittest.skipIf(Device.DEFAULT in {"NV", "AMD"}, "not supported in HCQ")
+  @unittest.skipIf(Device.DEFAULT in {"NV", "AMD"}, "not supported in HCQ #4817")
   def test_copy_jit(self):
     @TinyJit
     def copy_tensor(x:Tensor): return (x.to(f"{x.device.split(':')[0]}:1") + 1)
@@ -157,6 +157,7 @@ class TestMultiTensor(unittest.TestCase):
       x = copy_tensor(t)
       np.testing.assert_equal((t+1).numpy(), x.numpy())
 
+  @unittest.skipIf(Device.DEFAULT in {"NV", "AMD"}, "not supported in HCQ #4817")
   def test_allreduce_naive_jit(self):
     with Context(RING=0):
       jit_allreduce = TinyJit(_test_allreduce)
@@ -164,6 +165,7 @@ class TestMultiTensor(unittest.TestCase):
         a,b = jit_allreduce(Tensor.rand(256, 256))
         np.testing.assert_almost_equal(a.numpy(), b.numpy(), decimal=5)
 
+  @unittest.skipIf(Device.DEFAULT in {"NV", "AMD"}, "not supported in HCQ #4817")
   def test_allreduce_ring_jit(self):
     with Context(RING=2):
       jit_allreduce = TinyJit(_test_allreduce)
diff --git a/test/test_subbuffer.py b/test/test_subbuffer.py
index cd0f42ef39..b20da9a1b4 100644
--- a/test/test_subbuffer.py
+++ b/test/test_subbuffer.py
@@ -1,6 +1,6 @@
 import unittest
 from tinygrad import Device, dtypes, Tensor
-from tinygrad.helpers import CI
+from tinygrad.helpers import getenv
 from tinygrad.device import Buffer
 from tinygrad.lazy import view_supported_devices
 
@@ -41,11 +41,11 @@ class TestSubBuffer(unittest.TestCase):
     out = (vt + 100).tolist()
     assert out == [102, 103]
 
-  @unittest.skipIf(Device.DEFAULT != "CUDA" or CI, "only CUDA")
+  @unittest.skipIf(Device.DEFAULT not in {"CUDA", "NV", "AMD"} or getenv("CUDACPU"), "only NV, AMD, CUDA but not CUDACPU")
   def test_subbuffer_transfer(self):
     t = Tensor.arange(0, 10, dtype=dtypes.uint8).realize()
     vt = t[2:5].contiguous().realize()
-    out = vt.to("CUDA:1").realize().tolist()
+    out = vt.to(f"{Device.DEFAULT}:1").realize().tolist()
     assert out == [2, 3, 4]
 
 if __name__ == '__main__':
diff --git a/tinygrad/lazy.py b/tinygrad/lazy.py
index c3496d43a5..889f469b24 100644
--- a/tinygrad/lazy.py
+++ b/tinygrad/lazy.py
@@ -22,7 +22,7 @@ def create_lazybuffer(device:str, st:ShapeTracker, dtype:DType, op:Optional[Op]=
   if enable_cache: lazycache[cache_key] = ret
   return ret
 
-view_supported_devices = {"LLVM", "CLANG", "CUDA", "DISK"}
+view_supported_devices = {"LLVM", "CLANG", "CUDA", "NV", "AMD", "DISK"}
 class LazyBuffer:
   def __init__(self, device:str, st:ShapeTracker, dtype:DType, op:Optional[Op]=None, arg:Any=None, srcs:Tuple[LazyBuffer, ...]=(),
diff --git a/tinygrad/runtime/ops_amd.py b/tinygrad/runtime/ops_amd.py
index 4cea89ecb7..bf43a85e65 100644
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -406,6 +406,8 @@ class AMDAllocator(LRUAllocator):
     HWPM4Queue().wait(src_dev.timeline_signal, src_dev.timeline_value).submit(dest_dev)
     src_dev.timeline_value += 1
 
+  def offset(self, buf, size:int, offset:int): return type(buf)(va_addr=buf.va_addr + offset, size=size)
+
 MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
 class AMDDevice(Compiled):
   kfd:int = -1
diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py
index 31bfd9dab4..2f5047c30c 100644
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@@ -363,6 +363,8 @@ class NVAllocator(LRUAllocator):
     HWComputeQueue().wait(src_dev.timeline_signal, src_dev.timeline_value).submit(dest_dev)
     src_dev.timeline_value += 1
 
+  def offset(self, buf, size:int, offset:int): return type(buf)(base=buf.base + offset, va_addr=buf.va_addr + offset, length=size)
+
 MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
 class NVDevice(Compiled):
   root = None
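
For reference, a minimal usage sketch (not part of the patch) of the cross-device sub-buffer transfer this enables, mirroring `test_subbuffer_transfer`. It assumes the default backend is in `view_supported_devices` (e.g. NV or AMD) and that a second device (`f"{Device.DEFAULT}:1"`) is available.

```python
from tinygrad import Device, Tensor, dtypes

t = Tensor.arange(0, 10, dtype=dtypes.uint8).realize()   # parent buffer on device 0
vt = t[2:5].contiguous().realize()                       # realized as a view (sub-buffer) of t's allocation
out = vt.to(f"{Device.DEFAULT}:1").realize().tolist()    # transfer starts at the view's offset via the new Allocator.offset
assert out == [2, 3, 4]
```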