mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-04-29 03:00:14 -04:00
add _alloc_signal/_free_signal to hcq (#5264)
* add _alloc_signal/_free_signal api
* oops, revert this
* linter
This commit is contained in:
16
test/external/external_test_hcq.py
vendored
16
test/external/external_test_hcq.py
vendored
@@ -51,7 +51,7 @@ class TestHCQ(unittest.TestCase):
|
||||
TestHCQ.d0.synchronize() # wait for copyins to complete
|
||||
|
||||
def test_run_1000_times_one_submit(self):
|
||||
temp_signal, temp_value = TestHCQ.d0._get_signal(value=0), 0
|
||||
temp_signal, temp_value = TestHCQ.d0._alloc_signal(value=0), 0
|
||||
q = TestHCQ.compute_queue()
|
||||
for _ in range(1000):
|
||||
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.p.global_size, TestHCQ.runner.p.local_size)
|
||||
@@ -69,7 +69,7 @@ class TestHCQ(unittest.TestCase):
|
||||
assert (val:=TestHCQ.a.lazydata.buffer.as_buffer().cast("f")[0]) == 2000.0, f"got val {val}"
|
||||
|
||||
def test_run_1000_times(self):
|
||||
temp_signal = TestHCQ.d0._get_signal(value=0)
|
||||
temp_signal = TestHCQ.d0._alloc_signal(value=0)
|
||||
q = TestHCQ.compute_queue()
|
||||
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.p.global_size, TestHCQ.runner.p.local_size)
|
||||
q.signal(temp_signal, 2).wait(temp_signal, 2)
|
||||
@@ -84,7 +84,7 @@ class TestHCQ(unittest.TestCase):
|
||||
assert (val:=TestHCQ.a.lazydata.buffer.as_buffer().cast("f")[0]) == 2000.0, f"got val {val}"
|
||||
|
||||
def test_run_to_3(self):
|
||||
temp_signal = TestHCQ.d0._get_signal(value=0)
|
||||
temp_signal = TestHCQ.d0._alloc_signal(value=0)
|
||||
q = TestHCQ.compute_queue()
|
||||
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.p.global_size, TestHCQ.runner.p.local_size)
|
||||
q.signal(temp_signal, 1).wait(temp_signal, 1)
|
||||
@@ -109,7 +109,7 @@ class TestHCQ(unittest.TestCase):
|
||||
|
||||
@unittest.skipUnless(Device.DEFAULT == "NV", "Only NV supports bind")
|
||||
def test_bind_run(self):
|
||||
temp_signal = TestHCQ.d0._get_signal(value=0)
|
||||
temp_signal = TestHCQ.d0._alloc_signal(value=0)
|
||||
q = TestHCQ.compute_queue()
|
||||
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.p.global_size, TestHCQ.runner.p.local_size)
|
||||
q.signal(temp_signal, 2).wait(temp_signal, 2)
|
||||
@@ -141,7 +141,7 @@ class TestHCQ(unittest.TestCase):
|
||||
|
||||
@unittest.skipIf(CI, "Can't handle async update on CPU")
|
||||
def test_wait_signal(self):
|
||||
temp_signal = TestHCQ.d0._get_signal(value=0)
|
||||
temp_signal = TestHCQ.d0._alloc_signal(value=0)
|
||||
TestHCQ.compute_queue().wait(temp_signal, value=1).signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
|
||||
with self.assertRaises(RuntimeError):
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value, timeout=50)
|
||||
@@ -152,7 +152,7 @@ class TestHCQ(unittest.TestCase):
|
||||
|
||||
@unittest.skipIf(CI, "Can't handle async update on CPU")
|
||||
def test_wait_copy_signal(self):
|
||||
temp_signal = TestHCQ.d0._get_signal(value=0)
|
||||
temp_signal = TestHCQ.d0._alloc_signal(value=0)
|
||||
TestHCQ.copy_queue().wait(temp_signal, value=1).signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
|
||||
with self.assertRaises(RuntimeError):
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value, timeout=50)
|
||||
@@ -267,7 +267,7 @@ class TestHCQ(unittest.TestCase):
|
||||
q = TestHCQ.compute_queue()
|
||||
qc = TestHCQ.copy_queue()
|
||||
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.p.global_size, TestHCQ.runner.p.local_size) # b = [1, 2]
|
||||
q.signal(sig:=TestHCQ.d0._get_signal(value=0), value=1)
|
||||
q.signal(sig:=TestHCQ.d0._alloc_signal(value=0), value=1)
|
||||
qc.wait(sig, value=1)
|
||||
qc.copy(TestHCQ.a.lazydata.buffer._buf.va_addr, TestHCQ.b.lazydata.buffer._buf.va_addr, 8)
|
||||
qc.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
|
||||
@@ -282,7 +282,7 @@ class TestHCQ(unittest.TestCase):
|
||||
d1 = Device[f"{Device.DEFAULT}:1"]
|
||||
q1 = TestHCQ.compute_queue()
|
||||
q2 = TestHCQ.compute_queue()
|
||||
q1.signal(sig:=TestHCQ.d0._get_signal(value=0), value=0xfff)
|
||||
q1.signal(sig:=TestHCQ.d0._alloc_signal(value=0), value=0xfff)
|
||||
q2.wait(sig, value=0xfff)
|
||||
q2.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
|
||||
q2.submit(TestHCQ.d0)
|
||||
|
||||
2
test/external/fuzz_kfd.py
vendored
2
test/external/fuzz_kfd.py
vendored
@@ -16,7 +16,7 @@ if __name__ == "__main__":
|
||||
d2, b2 = random.choice(buffers)
|
||||
d1._gpu_map(b2)
|
||||
q = HWComputeQueue()
|
||||
q.signal(sig:=AMDDevice._get_signal(10))
|
||||
q.signal(sig:=AMDDevice._alloc_signal(10))
|
||||
qc = HWCopyQueue()
|
||||
qc.wait(sig)
|
||||
qc.copy(b1.va_addr, b2.va_addr, min(b1.size, b2.size))
|
||||
|
||||
@@ -48,7 +48,7 @@ class TestHCQ(unittest.TestCase):
|
||||
def test_signal_update(self):
|
||||
for queue_type in [TestHCQ.d0.hw_compute_queue_t]:
|
||||
with self.subTest(name=str(queue_type)):
|
||||
q = queue_type().signal(fake_signal := TestHCQ.d0._get_signal(), 0x1000)
|
||||
q = queue_type().signal(fake_signal := TestHCQ.d0._alloc_signal(), 0x1000)
|
||||
|
||||
q.update_signal(0, signal=TestHCQ.d0.timeline_signal, value=TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
|
||||
@@ -58,26 +58,26 @@ class TestHCQ(unittest.TestCase):
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
|
||||
TestHCQ.d0.timeline_value += 1
|
||||
|
||||
TestHCQ.d0.signals_pool.append(fake_signal)
|
||||
TestHCQ.d0._free_signal(fake_signal)
|
||||
|
||||
# Test wait
|
||||
def test_wait(self):
|
||||
for queue_type in [TestHCQ.d0.hw_compute_queue_t, TestHCQ.d0.hw_copy_queue_t]:
|
||||
with self.subTest(name=str(queue_type)):
|
||||
fake_signal = TestHCQ.d0._get_signal()
|
||||
fake_signal = TestHCQ.d0._alloc_signal()
|
||||
TestHCQ.d0._set_signal(fake_signal, 1)
|
||||
queue_type().wait(fake_signal, 1) \
|
||||
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
|
||||
TestHCQ.d0.timeline_value += 1
|
||||
|
||||
TestHCQ.d0.signals_pool.append(fake_signal)
|
||||
TestHCQ.d0._free_signal(fake_signal)
|
||||
|
||||
@unittest.skipIf(MOCKGPU, "Can't handle async update on MOCKGPU for now")
|
||||
def test_wait_late_set(self):
|
||||
for queue_type in [TestHCQ.d0.hw_compute_queue_t, TestHCQ.d0.hw_copy_queue_t]:
|
||||
with self.subTest(name=str(queue_type)):
|
||||
fake_signal = TestHCQ.d0._get_signal()
|
||||
fake_signal = TestHCQ.d0._alloc_signal()
|
||||
queue_type().wait(fake_signal, 1) \
|
||||
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
|
||||
|
||||
@@ -89,12 +89,12 @@ class TestHCQ(unittest.TestCase):
|
||||
|
||||
TestHCQ.d0.timeline_value += 1
|
||||
|
||||
TestHCQ.d0.signals_pool.append(fake_signal)
|
||||
TestHCQ.d0._free_signal(fake_signal)
|
||||
|
||||
def test_wait_update(self):
|
||||
for queue_type in [TestHCQ.d0.hw_compute_queue_t, TestHCQ.d0.hw_copy_queue_t]:
|
||||
with self.subTest(name=str(queue_type)):
|
||||
fake_signal = TestHCQ.d0._get_signal()
|
||||
fake_signal = TestHCQ.d0._alloc_signal()
|
||||
q = queue_type().wait(TestHCQ.d0.timeline_signal, 0xffffffff).signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
|
||||
|
||||
TestHCQ.d0._set_signal(fake_signal, 0x30)
|
||||
@@ -103,7 +103,7 @@ class TestHCQ(unittest.TestCase):
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
|
||||
TestHCQ.d0.timeline_value += 1
|
||||
|
||||
TestHCQ.d0.signals_pool.append(fake_signal)
|
||||
TestHCQ.d0._free_signal(fake_signal)
|
||||
|
||||
# Test exec
|
||||
def test_exec_one_kernel(self):
|
||||
@@ -167,7 +167,7 @@ class TestHCQ(unittest.TestCase):
|
||||
with self.subTest(name=str(queue_type)):
|
||||
if not hasattr(queue_type(), 'bind'): self.skipTest("queue does not support bind api")
|
||||
|
||||
fake_signal = TestHCQ.d0._get_signal()
|
||||
fake_signal = TestHCQ.d0._alloc_signal()
|
||||
q = queue_type().wait(TestHCQ.d0.timeline_signal, 0xffffffff).signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
|
||||
q.bind(TestHCQ.d0)
|
||||
|
||||
@@ -177,13 +177,13 @@ class TestHCQ(unittest.TestCase):
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value)
|
||||
TestHCQ.d0.timeline_value += 1
|
||||
|
||||
TestHCQ.d0.signals_pool.append(fake_signal)
|
||||
TestHCQ.d0._free_signal(fake_signal)
|
||||
|
||||
# Test multidevice
|
||||
def test_multidevice_signal_wait(self):
|
||||
d1 = Device[f"{Device.DEFAULT}:1"]
|
||||
|
||||
TestHCQ.d0.hw_copy_queue_t().signal(sig:=TestHCQ.d0._get_signal(value=0), value=0xfff) \
|
||||
TestHCQ.d0.hw_copy_queue_t().signal(sig:=TestHCQ.d0._alloc_signal(value=0), value=0xfff) \
|
||||
.signal(TestHCQ.d0.timeline_signal, TestHCQ.d0.timeline_value).submit(TestHCQ.d0)
|
||||
|
||||
d1.hw_copy_queue_t().wait(sig, value=0xfff) \
|
||||
@@ -195,13 +195,13 @@ class TestHCQ(unittest.TestCase):
|
||||
d1._wait_signal(d1.timeline_signal, d1.timeline_value)
|
||||
d1.timeline_value += 1
|
||||
|
||||
TestHCQ.d0.signals_pool.append(sig)
|
||||
TestHCQ.d0._free_signal(sig)
|
||||
|
||||
# Test profile api
|
||||
def test_speed_exec_time(self):
|
||||
TestHCQ.d0._prof_setup()
|
||||
|
||||
sig_st, sig_en = TestHCQ.d0._get_signal(), TestHCQ.d0._get_signal()
|
||||
sig_st, sig_en = TestHCQ.d0._alloc_signal(), TestHCQ.d0._alloc_signal()
|
||||
TestHCQ.d0.hw_compute_queue_t().timestamp(sig_st) \
|
||||
.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.p.global_size, TestHCQ.runner.p.local_size) \
|
||||
.timestamp(sig_en) \
|
||||
@@ -212,7 +212,8 @@ class TestHCQ(unittest.TestCase):
|
||||
|
||||
et = TestHCQ.d0._gpu2cpu_time(TestHCQ.d0._read_timestamp(sig_en), True) - TestHCQ.d0._gpu2cpu_time(TestHCQ.d0._read_timestamp(sig_st), True)
|
||||
|
||||
TestHCQ.d0.signals_pool += [sig_st, sig_en]
|
||||
TestHCQ.d0._free_signal(sig_st)
|
||||
TestHCQ.d0._free_signal(sig_en)
|
||||
|
||||
print(f"exec kernel time: {et:.2f} us")
|
||||
assert 1 <= et <= (2000 if CI else 20)
|
||||
@@ -225,7 +226,7 @@ class TestHCQ(unittest.TestCase):
|
||||
a = Buffer(Device.DEFAULT, SZ, dtypes.uint8, options=BufferOptions(nolru=True)).allocate()
|
||||
b = Buffer(Device.DEFAULT, SZ, dtypes.uint8, options=BufferOptions(nolru=True)).allocate()
|
||||
|
||||
sig_st, sig_en = TestHCQ.d0._get_signal(), TestHCQ.d0._get_signal()
|
||||
sig_st, sig_en = TestHCQ.d0._alloc_signal(), TestHCQ.d0._alloc_signal()
|
||||
TestHCQ.d0.hw_copy_queue_t().timestamp(sig_st) \
|
||||
.copy(a._buf.va_addr, b._buf.va_addr, SZ) \
|
||||
.timestamp(sig_en) \
|
||||
@@ -237,7 +238,8 @@ class TestHCQ(unittest.TestCase):
|
||||
et = TestHCQ.d0._gpu2cpu_time(TestHCQ.d0._read_timestamp(sig_en), True) - TestHCQ.d0._gpu2cpu_time(TestHCQ.d0._read_timestamp(sig_st), True)
|
||||
et_ms = et / 1e3
|
||||
|
||||
TestHCQ.d0.signals_pool += [sig_st, sig_en]
|
||||
TestHCQ.d0._free_signal(sig_st)
|
||||
TestHCQ.d0._free_signal(sig_en)
|
||||
|
||||
gb_s = ((SZ / 1e9) / et_ms) * 1e3
|
||||
print(f"same device copy: {et_ms:.2f} ms, {gb_s:.2f} GB/s")
|
||||
@@ -251,7 +253,7 @@ class TestHCQ(unittest.TestCase):
|
||||
a = Buffer(Device.DEFAULT, SZ, dtypes.uint8, options=BufferOptions(nolru=True)).allocate()
|
||||
TestHCQ.d0._gpu_map(b._buf)
|
||||
|
||||
sig_st, sig_en = TestHCQ.d0._get_signal(), TestHCQ.d0._get_signal()
|
||||
sig_st, sig_en = TestHCQ.d0._alloc_signal(), TestHCQ.d0._alloc_signal()
|
||||
TestHCQ.d0.hw_copy_queue_t().timestamp(sig_st) \
|
||||
.copy(a._buf.va_addr, b._buf.va_addr, SZ) \
|
||||
.timestamp(sig_en) \
|
||||
@@ -263,7 +265,8 @@ class TestHCQ(unittest.TestCase):
|
||||
et = TestHCQ.d0._gpu2cpu_time(TestHCQ.d0._read_timestamp(sig_en), True) - TestHCQ.d0._gpu2cpu_time(TestHCQ.d0._read_timestamp(sig_st), True)
|
||||
et_ms = et / 1e3
|
||||
|
||||
TestHCQ.d0.signals_pool += [sig_st, sig_en]
|
||||
TestHCQ.d0._free_signal(sig_st)
|
||||
TestHCQ.d0._free_signal(sig_en)
|
||||
|
||||
gb_s = ((SZ / 1e9) / et_ms) * 1e3
|
||||
print(f"cross device copy: {et_ms:.2f} ms, {gb_s:.2f} GB/s")
|
||||
|
||||
Reference in New Issue
Block a user