more work on kfd (#4079)

* more work on kfd

* fix multitensor test on kfd

* stuff
This commit is contained in:
George Hotz
2024-04-05 08:36:36 -07:00
committed by GitHub
parent e7ff5102cf
commit a337922c44
3 changed files with 42 additions and 11 deletions

28
test/external/fuzz_kfd.py vendored Normal file
View File

@@ -0,0 +1,28 @@
#!/usr/bin/env python3
import random
from tqdm import trange
from typing import List
from tinygrad import Device
from tinygrad.runtime.ops_kfd import KFDDevice, HWCopyQueue, HWComputeQueue
if __name__ == "__main__":
  # Fuzz driver: repeatedly issues cross-queue copies between randomly chosen
  # buffers, synchronising a compute queue and a copy queue through signals.
  devices: List[KFDDevice] = [Device[f"KFD:{idx}"] for idx in range(6)]
  print(f"got {len(devices)} devices")

  # Build a pool of 100 (owning device, buffer) pairs; each buffer gets a
  # random size in [1, 10000] and is allocated through its device's allocator.
  pool = []
  for _ in range(100):
    owner = random.choice(devices)
    pool.append((owner, owner.allocator.alloc(random.randint(1, 10000))))

  for _ in trange(100000):
    d1, b1 = random.choice(pool)
    # The second device is drawn but only its buffer is used below.
    d2, b2 = random.choice(pool)

    # Map b2 on d1 before d1's copy queue touches it.
    d1._gpu_map(b2)

    # The compute queue raises a fresh signal that the copy queue waits on.
    compute_q = HWComputeQueue()
    sync_sig = KFDDevice._get_signal(10)
    compute_q.signal(sync_sig)

    copy_q = HWCopyQueue()
    copy_q.wait(sync_sig)
    # Copy only as many bytes as fit in both buffers.
    copy_q.copy(b1.va_addr, b2.va_addr, min(b1.size, b2.size))

    # Set the completion signal's value before the copy queue signals it.
    d1.completion_signal.value = 1
    copy_q.signal(d1.completion_signal)
    copy_q.submit(d1)

    # The compute queue is submitted only after the copy queue, and itself
    # waits on the completion signal.
    compute_q.wait(d1.completion_signal)
    compute_q.submit(d1)

    # Block the host until the completion signal's event is reported.
    KFDDevice._wait_on(d1.completion_signal.event_id)

View File

@@ -115,9 +115,8 @@ class TestMultiTensor(unittest.TestCase):
fn = f(n)
np.testing.assert_allclose(fX.numpy(), fn, rtol=1e-6, atol=1e-6)
@unittest.skipIf(CI and Device.DEFAULT == "CLANG", "clang is slow")
@unittest.skip("slow")
def test_fuzz_allreduce(self):
random.seed(41)
for it in range(100):
for n in range(2, 4+1):
@@ -132,7 +131,6 @@ class TestMultiTensor(unittest.TestCase):
assert mean_err < 1e-6, f"big mean error, iteration {it}_{n}"
assert max_err < 1e-6, f"big max error, iteration {it}_{n}"
def _test_matmul_shard_axis(self, shard_x, shard_w, device):
X = Tensor.kaiming_uniform(N, N).realize()
W = Tensor.kaiming_uniform(N, N).realize()