mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-10 23:48:01 -05:00
WIP PM4 Support (#4110)
* pm4 kernel launch works * disable USE_THREAD_DIMENSIONS * add kernel code * work on real pm4 * pm4 signal * same * gate pm4 * hcq tests pass * ops passes * pm4 is closer * pm4 debug (#4165) * start debug tests passing * prg * smth * hdp flush * cleaner 1 * do not need this * logs not need * small things * linter * remove AQL * test hcq * fix tests * it's subtracting, it shouldn't be -1 * pm4 changes (#4251) * not need this anymore * sdma signal with non atomic --------- Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
This commit is contained in:
@@ -128,6 +128,7 @@ generate_hsa() {
|
||||
/opt/rocm/include/hsa/hsa_ext_amd.h \
|
||||
/opt/rocm/include/hsa/amd_hsa_signal.h \
|
||||
/opt/rocm/include/hsa/amd_hsa_queue.h \
|
||||
/opt/rocm/include/hsa/amd_hsa_kernel_code.h \
|
||||
/opt/rocm/include/hsa/hsa_ext_finalize.h /opt/rocm/include/hsa/hsa_ext_image.h \
|
||||
/opt/rocm/include/hsa/hsa_ven_amd_aqlprofile.h \
|
||||
--clang-args="-I/opt/rocm/include" \
|
||||
|
||||
157
extra/hip_gpu_driver/test_pm4.py
Normal file
157
extra/hip_gpu_driver/test_pm4.py
Normal file
@@ -0,0 +1,157 @@
|
||||
import time
|
||||
from hexdump import hexdump
|
||||
from tinygrad import Tensor, Device
|
||||
import tinygrad.runtime.autogen.amd_gpu as amd_gpu
|
||||
import tinygrad.runtime.autogen.kfd as kfd
|
||||
import tinygrad.runtime.autogen.hsa as hsa
|
||||
from tinygrad.engine.schedule import create_schedule
|
||||
from tinygrad.runtime.ops_kfd import kio, KFDProgram
|
||||
from tinygrad.helpers import to_mv
|
||||
|
||||
DISPATCH_INIT_VALUE = 0x21 | 0x8000
|
||||
|
||||
#mmCOMPUTE_START_X = 0x2e04
|
||||
#mmCOMPUTE_PGM_LO = 0x2e0c
|
||||
|
||||
BASE_ADDR = 0x00001260
|
||||
PACKET3_SET_SH_REG_START = 0x2c00
|
||||
SUB = PACKET3_SET_SH_REG_START - BASE_ADDR
|
||||
|
||||
regCOMPUTE_PGM_LO = 0x1bac - SUB
|
||||
regCOMPUTE_START_X = 0x1ba4 - SUB
|
||||
regCOMPUTE_NUM_THREAD_X = 0x1ba7 - SUB
|
||||
regCOMPUTE_USER_DATA_0 = 0x1be0 - SUB
|
||||
regCOMPUTE_USER_DATA_8 = 0x1be8 - SUB
|
||||
|
||||
regCOMPUTE_PGM_RSRC1 = 0x1bb2 - SUB
|
||||
regCOMPUTE_PGM_RSRC2 = 0x1bb3 - SUB
|
||||
|
||||
# DEBUG=6 python3 extra/hip_gpu_driver/test_pm4.py
|
||||
# sudo umr -i 1 -s amd744c.gfx1100 --sbank 1 1 2 | grep regCOMPUTE
|
||||
|
||||
# 0x00009025
|
||||
|
||||
COMPUTE_SHADER_EN = 1
|
||||
USE_THREAD_DIMENSIONS = 1 << 5
|
||||
CS_W32_EN = 1 << 15
|
||||
|
||||
def format_struct(s):
|
||||
sdats = []
|
||||
for field_name, field_type in s._fields_:
|
||||
dat = getattr(s, field_name)
|
||||
if isinstance(dat, int): sdats.append(f"{field_name}:0x{dat:X}")
|
||||
else: sdats.append(f"{field_name}:{dat}")
|
||||
return sdats
|
||||
|
||||
if __name__ == "__main__":
|
||||
dev = Device["KFD"]
|
||||
|
||||
a = Tensor([0.,1.,2.], device="KFD").realize()
|
||||
b = a + 7
|
||||
b.lazydata.buffer.allocate()
|
||||
si = create_schedule([b.lazydata])[-1]
|
||||
runner = dev.get_runner(*si.ast)
|
||||
prg: KFDProgram = runner.clprg
|
||||
print("device initted")
|
||||
|
||||
# Compute Queue
|
||||
|
||||
gart_compute = dev._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
eop_buffer = dev._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
||||
compute_ring = dev._gpu_alloc(0x800000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
ctx_save_restore_address = dev._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
||||
compute_queue = kio.create_queue(dev.kfd, ring_base_address=compute_ring.va_addr, ring_size=compute_ring.size, gpu_id=dev.gpu_id,
|
||||
queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
||||
#eop_buffer_address=eop_buffer.va_addr, eop_buffer_size=eop_buffer.size,
|
||||
#ctx_save_restore_address=ctx_save_restore_address.va_addr, ctx_save_restore_size=ctx_save_restore_address.size,
|
||||
#ctl_stack_size = 0xa000,
|
||||
write_pointer_address=gart_compute.va_addr, read_pointer_address=gart_compute.va_addr+8)
|
||||
compute_doorbell = to_mv(dev.doorbells + compute_queue.doorbell_offset - dev.doorbells_base, 4).cast("I")
|
||||
|
||||
#scratch = dev._gpu_alloc(0x10000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
||||
ka = to_mv(dev.kernargs_ptr, 0x10).cast("Q")
|
||||
ka[0] = b.lazydata.buffer._buf.va_addr
|
||||
ka[1] = a.lazydata.buffer._buf.va_addr
|
||||
|
||||
compute_read_pointer = to_mv(compute_queue.read_pointer_address, 8).cast("Q")
|
||||
compute_write_pointer = to_mv(compute_queue.write_pointer_address, 8).cast("Q")
|
||||
|
||||
hexdump(to_mv(prg.handle, 0x40))
|
||||
code = hsa.amd_kernel_code_t.from_address(prg.handle)
|
||||
|
||||
#print(format_struct(code))
|
||||
#print("code")
|
||||
#hexdump(to_mv(code_ptr, 0x100))
|
||||
#runner.local_size = [2,1,1]
|
||||
|
||||
print(runner.local_size, runner.global_size)
|
||||
|
||||
#pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), mmCOMPUTE_PGM_LO,
|
||||
# prg.handle&0xFFFFFFFF, prg.handle>>32, 0, 0, (scratch.va_addr>>8)&0xFFFFFFFF, scratch.va_addr>>40]
|
||||
code_ptr = (prg.handle + code.kernel_code_entry_byte_offset) >> 8
|
||||
pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), regCOMPUTE_PGM_LO, code_ptr&0xFFFFFFFF, code_ptr>>32, 0, 0, 0, 0]
|
||||
pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_PGM_RSRC1, code.compute_pgm_rsrc1, code.compute_pgm_rsrc2]
|
||||
pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_USER_DATA_0, dev.kernargs_ptr&0xFFFFFFFF, dev.kernargs_ptr>>32]
|
||||
#pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_USER_DATA_0, 0, 0]
|
||||
pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), regCOMPUTE_START_X, 0,0,0,
|
||||
runner.local_size[0],runner.local_size[1],runner.local_size[2],0,0]
|
||||
# disabled USE_THREAD_DIMENSIONS
|
||||
pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3),
|
||||
runner.global_size[0],runner.global_size[1],runner.global_size[2], CS_W32_EN | COMPUTE_SHADER_EN]
|
||||
|
||||
#pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_NOP, 0x3fff)]*0x200
|
||||
|
||||
"""
|
||||
addr=0x0
|
||||
sz=(1 << 64)-1
|
||||
gli=0
|
||||
glv=0
|
||||
glk=0
|
||||
gl1=0
|
||||
gl2=0
|
||||
pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0,
|
||||
sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
|
||||
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \
|
||||
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
|
||||
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)]
|
||||
print(pm4_cmd)
|
||||
"""
|
||||
|
||||
wptr = 0
|
||||
pm4_buffer_view = to_mv(compute_ring.va_addr, compute_ring.size).cast("I")
|
||||
|
||||
for j in range(0x80000):
|
||||
for i, value in enumerate(pm4_cmd): pm4_buffer_view[wptr+i] = value
|
||||
wptr += len(pm4_cmd)
|
||||
|
||||
compute_write_pointer[0] = wptr
|
||||
compute_doorbell[0] = wptr
|
||||
for k in range(10):
|
||||
done = compute_read_pointer[0] == compute_write_pointer[0]
|
||||
print(compute_read_pointer[0], compute_write_pointer[0], done)
|
||||
if done: break
|
||||
time.sleep(0.01)
|
||||
break
|
||||
#break
|
||||
|
||||
#print(compute_read_pointer[0])
|
||||
#time.sleep(0.05)
|
||||
#print(compute_read_pointer[0])
|
||||
|
||||
#time.sleep(100)
|
||||
|
||||
print(a.numpy())
|
||||
print(b.numpy())
|
||||
exit(0)
|
||||
|
||||
#pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), mmCOMPUTE_PGM_LO, 0,0,0,1,1,1,0,0]
|
||||
|
||||
|
||||
#pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, )]
|
||||
|
||||
|
||||
#pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0,
|
||||
# sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
|
||||
# amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \
|
||||
# amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
|
||||
# amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)]
|
||||
43
extra/hip_gpu_driver/test_sdma_fun.py
Normal file
43
extra/hip_gpu_driver/test_sdma_fun.py
Normal file
@@ -0,0 +1,43 @@
|
||||
import ctypes, mmap, time
|
||||
from tinygrad.runtime.ops_kfd import KFDDevice, kio, sdma_pkts, libc
|
||||
import tinygrad.runtime.autogen.amd_sdma as amd_sdma
|
||||
import tinygrad.runtime.autogen.kfd as kfd
|
||||
from tinygrad.helpers import to_mv
|
||||
|
||||
if __name__ == "__main__":
|
||||
dev = KFDDevice()
|
||||
|
||||
sdma_ring = dev._gpu_alloc(1 << 22, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
|
||||
gart = dev._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
sdma_queue = kio.create_queue(KFDDevice.kfd,
|
||||
ring_base_address=sdma_ring.va_addr, ring_size=sdma_ring.size, gpu_id=dev.gpu_id,
|
||||
queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
||||
write_pointer_address=gart.va_addr + 0x100, read_pointer_address=gart.va_addr + 0x108)
|
||||
|
||||
doorbells_base = sdma_queue.doorbell_offset & (~0xfff)
|
||||
doorbells = libc.mmap(0, 8192, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, doorbells_base)
|
||||
|
||||
sdma_read_pointer = to_mv(sdma_queue.read_pointer_address, 8).cast("Q")
|
||||
sdma_write_pointer = to_mv(sdma_queue.write_pointer_address, 8).cast("Q")
|
||||
sdma_doorbell = to_mv(doorbells + sdma_queue.doorbell_offset - doorbells_base, 4).cast("I")
|
||||
|
||||
test_write_page = dev._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
|
||||
cmd = sdma_pkts.timestamp(op=amd_sdma.SDMA_OP_TIMESTAMP, sub_op=amd_sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL, addr=test_write_page.va_addr)
|
||||
|
||||
sdma_doorbell_value = 0
|
||||
def blit_sdma_command(cmd):
|
||||
ctypes.memmove(sdma_ring.va_addr + (sdma_doorbell_value % sdma_ring.size), ctypes.addressof(cmd), sz:=ctypes.sizeof(cmd))
|
||||
return sz
|
||||
|
||||
while True:
|
||||
sdma_doorbell_value += blit_sdma_command(cmd)
|
||||
sdma_write_pointer[0] = sdma_doorbell_value
|
||||
sdma_doorbell[0] = sdma_doorbell_value
|
||||
while sdma_read_pointer[0] != sdma_write_pointer[0]: continue
|
||||
tm = to_mv(test_write_page.va_addr, 0x1000).cast("Q")[0]/1e8
|
||||
print(f"{tm:.3f} s @ 0x{sdma_ring.va_addr + (sdma_doorbell_value % sdma_ring.size):X} R:0x{sdma_queue.read_pointer_address:X} W:0x{sdma_queue.write_pointer_address:X}")
|
||||
time.sleep(0.01)
|
||||
|
||||
|
||||
|
||||
|
||||
41
test/external/external_test_hcq.py
vendored
41
test/external/external_test_hcq.py
vendored
@@ -2,7 +2,7 @@ import unittest, ctypes, struct, time
|
||||
from tinygrad import Device, Tensor, dtypes
|
||||
from tinygrad.buffer import Buffer, BufferOptions
|
||||
from tinygrad.engine.schedule import create_schedule
|
||||
from tinygrad.runtime.ops_kfd import KFDDevice, HWCopyQueue, HWComputeQueue
|
||||
from tinygrad.runtime.ops_kfd import KFDDevice, HWCopyQueue, HWPM4Queue
|
||||
|
||||
def _time_queue(q, d):
|
||||
st = time.perf_counter()
|
||||
@@ -15,7 +15,7 @@ class TestHCQ(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(self):
|
||||
TestHCQ.d0: KFDDevice = Device["KFD"]
|
||||
TestHCQ.d1: KFDDevice = Device["KFD:1"]
|
||||
#TestHCQ.d1: KFDDevice = Device["KFD:1"]
|
||||
TestHCQ.a = Tensor([0.,1.], device="KFD").realize()
|
||||
TestHCQ.b = self.a + 1
|
||||
si = create_schedule([self.b.lazydata])[-1]
|
||||
@@ -26,13 +26,14 @@ class TestHCQ(unittest.TestCase):
|
||||
TestHCQ.addr2 = struct.pack("QQ", TestHCQ.a.lazydata.buffer._buf.va_addr, TestHCQ.b.lazydata.buffer._buf.va_addr)
|
||||
ctypes.memmove(TestHCQ.d0.kernargs_ptr, TestHCQ.addr, len(TestHCQ.addr))
|
||||
ctypes.memmove(TestHCQ.d0.kernargs_ptr+len(TestHCQ.addr), TestHCQ.addr2, len(TestHCQ.addr2))
|
||||
TestHCQ.compute_queue = HWPM4Queue
|
||||
|
||||
def setUp(self):
|
||||
TestHCQ.a.lazydata.buffer.copyin(memoryview(bytearray(struct.pack("ff", 0, 1))))
|
||||
TestHCQ.b.lazydata.buffer.copyin(memoryview(bytearray(struct.pack("ff", 0, 0))))
|
||||
|
||||
def test_run_1000_times_one_submit(self):
|
||||
q = HWComputeQueue()
|
||||
q = TestHCQ.compute_queue()
|
||||
for _ in range(1000):
|
||||
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size)
|
||||
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr+len(TestHCQ.addr), TestHCQ.runner.global_size, TestHCQ.runner.local_size)
|
||||
@@ -42,20 +43,21 @@ class TestHCQ(unittest.TestCase):
|
||||
assert (val:=TestHCQ.a.lazydata.buffer.as_buffer().cast("f")[0]) == 2000.0, f"got val {val}"
|
||||
|
||||
def test_run_1000_times(self):
|
||||
q = HWComputeQueue()
|
||||
q = TestHCQ.compute_queue()
|
||||
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size)
|
||||
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr+len(TestHCQ.addr), TestHCQ.runner.global_size,
|
||||
TestHCQ.runner.local_size, TestHCQ.d0.completion_signal)
|
||||
for _ in range(1000):
|
||||
q.submit(TestHCQ.d0)
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal)
|
||||
TestHCQ.d0.completion_signal.value = 1
|
||||
# confirm signal was reset
|
||||
with self.assertRaises(RuntimeError):
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=50)
|
||||
assert (val:=TestHCQ.a.lazydata.buffer.as_buffer().cast("f")[0]) == 2000.0, f"got val {val}"
|
||||
|
||||
def test_run_to_3(self):
|
||||
q = HWComputeQueue()
|
||||
q = TestHCQ.compute_queue()
|
||||
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size)
|
||||
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr+len(TestHCQ.addr), TestHCQ.runner.global_size, TestHCQ.runner.local_size)
|
||||
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size, TestHCQ.d0.completion_signal)
|
||||
@@ -65,12 +67,12 @@ class TestHCQ(unittest.TestCase):
|
||||
|
||||
def test_wait_signal(self):
|
||||
TestHCQ.d0.completion_signal.value = 1
|
||||
HWComputeQueue().wait(TestHCQ.d0.completion_signal).signal(TestHCQ.d0.completion_signal).submit(TestHCQ.d0)
|
||||
TestHCQ.compute_queue().wait(TestHCQ.d0.completion_signal).signal(TestHCQ.d0.completion_signal).submit(TestHCQ.d0)
|
||||
with self.assertRaises(RuntimeError):
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=50)
|
||||
# clean up
|
||||
TestHCQ.d0.completion_signal.value = 0
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=1000)
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=1000, skip_check=True)
|
||||
|
||||
def test_wait_copy_signal(self):
|
||||
TestHCQ.d0.completion_signal.value = 1
|
||||
@@ -79,25 +81,26 @@ class TestHCQ(unittest.TestCase):
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=50)
|
||||
# clean up
|
||||
TestHCQ.d0.completion_signal.value = 0
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=1000)
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=1000, skip_check=True)
|
||||
|
||||
def test_run_normal(self):
|
||||
q = HWComputeQueue()
|
||||
q = TestHCQ.compute_queue()
|
||||
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size, TestHCQ.d0.completion_signal)
|
||||
q.submit(TestHCQ.d0)
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal)
|
||||
assert (val:=TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[0]) == 1.0, f"got val {val}"
|
||||
|
||||
def test_submit_empty_queues(self):
|
||||
HWComputeQueue().submit(TestHCQ.d0)
|
||||
TestHCQ.compute_queue().submit(TestHCQ.d0)
|
||||
HWCopyQueue().submit(TestHCQ.d0)
|
||||
|
||||
def test_signal_timeout(self):
|
||||
TestHCQ.d0.completion_signal.value = 1
|
||||
with self.assertRaises(RuntimeError):
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=50)
|
||||
|
||||
def test_signal(self):
|
||||
HWComputeQueue().signal(TestHCQ.d0.completion_signal).submit(TestHCQ.d0)
|
||||
TestHCQ.compute_queue().signal(TestHCQ.d0.completion_signal).submit(TestHCQ.d0)
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal)
|
||||
|
||||
def test_copy_signal(self):
|
||||
@@ -105,7 +108,7 @@ class TestHCQ(unittest.TestCase):
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal)
|
||||
|
||||
def test_run_signal(self):
|
||||
q = HWComputeQueue()
|
||||
q = TestHCQ.compute_queue()
|
||||
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size)
|
||||
q.signal(TestHCQ.d0.completion_signal)
|
||||
q.submit(TestHCQ.d0)
|
||||
@@ -117,9 +120,10 @@ class TestHCQ(unittest.TestCase):
|
||||
q.copy(TestHCQ.a.lazydata.buffer._buf.va_addr, TestHCQ.b.lazydata.buffer._buf.va_addr, 8)
|
||||
q.copy(TestHCQ.b.lazydata.buffer._buf.va_addr, TestHCQ.a.lazydata.buffer._buf.va_addr, 8)
|
||||
q.signal(TestHCQ.d0.completion_signal)
|
||||
for _ in range(1000):
|
||||
for i in range(1000):
|
||||
q.submit(TestHCQ.d0)
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal)
|
||||
TestHCQ.d0.completion_signal.value = 1
|
||||
# confirm signal was reset
|
||||
with self.assertRaises(RuntimeError):
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=50)
|
||||
@@ -147,8 +151,8 @@ class TestHCQ(unittest.TestCase):
|
||||
|
||||
def test_cross_device_copy_bandwidth(self):
|
||||
SZ = 2_000_000_000
|
||||
a = Buffer("KFD", SZ, dtypes.uint8, options=BufferOptions(nolru=True)).allocate()
|
||||
b = Buffer("KFD:1", SZ, dtypes.uint8, options=BufferOptions(nolru=True)).allocate()
|
||||
a = Buffer("KFD", SZ, dtypes.uint8, options=BufferOptions(nolru=True)).allocate()
|
||||
TestHCQ.d0._gpu_map(b._buf)
|
||||
q = HWCopyQueue()
|
||||
q.copy(a._buf.va_addr, b._buf.va_addr, SZ)
|
||||
@@ -158,7 +162,7 @@ class TestHCQ(unittest.TestCase):
|
||||
assert gb_s > 2 and gb_s < 50
|
||||
|
||||
def test_interleave_compute_and_copy(self):
|
||||
q = HWComputeQueue()
|
||||
q = TestHCQ.compute_queue()
|
||||
qc = HWCopyQueue()
|
||||
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size) # b = [1, 2]
|
||||
q.signal(sig:=KFDDevice._get_signal(10))
|
||||
@@ -173,12 +177,13 @@ class TestHCQ(unittest.TestCase):
|
||||
assert (val:=TestHCQ.a.lazydata.buffer.as_buffer().cast("f")[0]) == 1.0, f"got val {val}"
|
||||
|
||||
def test_cross_device_signal(self):
|
||||
q1 = HWComputeQueue()
|
||||
q2 = HWComputeQueue()
|
||||
d1 = Device["KFD:1"]
|
||||
q1 = TestHCQ.compute_queue()
|
||||
q2 = TestHCQ.compute_queue()
|
||||
q1.signal(TestHCQ.d0.completion_signal)
|
||||
q2.wait(TestHCQ.d0.completion_signal)
|
||||
q2.submit(TestHCQ.d0)
|
||||
q1.submit(TestHCQ.d1)
|
||||
q1.submit(d1)
|
||||
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal)
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -3720,6 +3720,540 @@ struct_amd_signal_s._fields_ = [
|
||||
]
|
||||
|
||||
amd_signal_t = struct_amd_signal_s
|
||||
amd_kernel_code_version32_t = ctypes.c_uint32
|
||||
|
||||
# values for enumeration 'amd_kernel_code_version_t'
|
||||
amd_kernel_code_version_t__enumvalues = {
|
||||
1: 'AMD_KERNEL_CODE_VERSION_MAJOR',
|
||||
1: 'AMD_KERNEL_CODE_VERSION_MINOR',
|
||||
}
|
||||
AMD_KERNEL_CODE_VERSION_MAJOR = 1
|
||||
AMD_KERNEL_CODE_VERSION_MINOR = 1
|
||||
amd_kernel_code_version_t = ctypes.c_uint32 # enum
|
||||
amd_machine_kind16_t = ctypes.c_uint16
|
||||
|
||||
# values for enumeration 'amd_machine_kind_t'
|
||||
amd_machine_kind_t__enumvalues = {
|
||||
0: 'AMD_MACHINE_KIND_UNDEFINED',
|
||||
1: 'AMD_MACHINE_KIND_AMDGPU',
|
||||
}
|
||||
AMD_MACHINE_KIND_UNDEFINED = 0
|
||||
AMD_MACHINE_KIND_AMDGPU = 1
|
||||
amd_machine_kind_t = ctypes.c_uint32 # enum
|
||||
amd_machine_version16_t = ctypes.c_uint16
|
||||
|
||||
# values for enumeration 'amd_float_round_mode_t'
|
||||
amd_float_round_mode_t__enumvalues = {
|
||||
0: 'AMD_FLOAT_ROUND_MODE_NEAREST_EVEN',
|
||||
1: 'AMD_FLOAT_ROUND_MODE_PLUS_INFINITY',
|
||||
2: 'AMD_FLOAT_ROUND_MODE_MINUS_INFINITY',
|
||||
3: 'AMD_FLOAT_ROUND_MODE_ZERO',
|
||||
}
|
||||
AMD_FLOAT_ROUND_MODE_NEAREST_EVEN = 0
|
||||
AMD_FLOAT_ROUND_MODE_PLUS_INFINITY = 1
|
||||
AMD_FLOAT_ROUND_MODE_MINUS_INFINITY = 2
|
||||
AMD_FLOAT_ROUND_MODE_ZERO = 3
|
||||
amd_float_round_mode_t = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'amd_float_denorm_mode_t'
|
||||
amd_float_denorm_mode_t__enumvalues = {
|
||||
0: 'AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE_OUTPUT',
|
||||
1: 'AMD_FLOAT_DENORM_MODE_FLUSH_OUTPUT',
|
||||
2: 'AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE',
|
||||
3: 'AMD_FLOAT_DENORM_MODE_NO_FLUSH',
|
||||
}
|
||||
AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE_OUTPUT = 0
|
||||
AMD_FLOAT_DENORM_MODE_FLUSH_OUTPUT = 1
|
||||
AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE = 2
|
||||
AMD_FLOAT_DENORM_MODE_NO_FLUSH = 3
|
||||
amd_float_denorm_mode_t = ctypes.c_uint32 # enum
|
||||
amd_compute_pgm_rsrc_one32_t = ctypes.c_uint32
|
||||
|
||||
# values for enumeration 'amd_compute_pgm_rsrc_one_t'
|
||||
amd_compute_pgm_rsrc_one_t__enumvalues = {
|
||||
0: 'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT',
|
||||
6: 'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH',
|
||||
63: 'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT',
|
||||
6: 'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT',
|
||||
4: 'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH',
|
||||
960: 'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT',
|
||||
10: 'AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY_SHIFT',
|
||||
2: 'AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY_WIDTH',
|
||||
3072: 'AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY',
|
||||
12: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32_SHIFT',
|
||||
2: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32_WIDTH',
|
||||
12288: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32',
|
||||
14: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64_SHIFT',
|
||||
2: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64_WIDTH',
|
||||
49152: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64',
|
||||
16: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32_SHIFT',
|
||||
2: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32_WIDTH',
|
||||
196608: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32',
|
||||
18: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64_SHIFT',
|
||||
2: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64_WIDTH',
|
||||
786432: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64',
|
||||
20: 'AMD_COMPUTE_PGM_RSRC_ONE_PRIV_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_ONE_PRIV_WIDTH',
|
||||
1048576: 'AMD_COMPUTE_PGM_RSRC_ONE_PRIV',
|
||||
21: 'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP_WIDTH',
|
||||
2097152: 'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP',
|
||||
22: 'AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE_WIDTH',
|
||||
4194304: 'AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE',
|
||||
23: 'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE_WIDTH',
|
||||
8388608: 'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE',
|
||||
24: 'AMD_COMPUTE_PGM_RSRC_ONE_BULKY_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_ONE_BULKY_WIDTH',
|
||||
16777216: 'AMD_COMPUTE_PGM_RSRC_ONE_BULKY',
|
||||
25: 'AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER_WIDTH',
|
||||
33554432: 'AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER',
|
||||
26: 'AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1_SHIFT',
|
||||
6: 'AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1_WIDTH',
|
||||
-67108864: 'AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1',
|
||||
}
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT = 0
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH = 6
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT = 63
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT = 6
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH = 4
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT = 960
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY_SHIFT = 10
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY_WIDTH = 2
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY = 3072
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32_SHIFT = 12
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32_WIDTH = 2
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32 = 12288
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64_SHIFT = 14
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64_WIDTH = 2
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64 = 49152
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32_SHIFT = 16
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32_WIDTH = 2
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32 = 196608
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64_SHIFT = 18
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64_WIDTH = 2
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64 = 786432
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_PRIV_SHIFT = 20
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_PRIV_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_PRIV = 1048576
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP_SHIFT = 21
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP = 2097152
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE_SHIFT = 22
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE = 4194304
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE_SHIFT = 23
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE = 8388608
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_BULKY_SHIFT = 24
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_BULKY_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_BULKY = 16777216
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER_SHIFT = 25
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER = 33554432
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1_SHIFT = 26
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1_WIDTH = 6
|
||||
AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1 = -67108864
|
||||
amd_compute_pgm_rsrc_one_t = ctypes.c_int32 # enum
|
||||
|
||||
# values for enumeration 'amd_system_vgpr_workitem_id_t'
|
||||
amd_system_vgpr_workitem_id_t__enumvalues = {
|
||||
0: 'AMD_SYSTEM_VGPR_WORKITEM_ID_X',
|
||||
1: 'AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y',
|
||||
2: 'AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y_Z',
|
||||
3: 'AMD_SYSTEM_VGPR_WORKITEM_ID_UNDEFINED',
|
||||
}
|
||||
AMD_SYSTEM_VGPR_WORKITEM_ID_X = 0
|
||||
AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y = 1
|
||||
AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y_Z = 2
|
||||
AMD_SYSTEM_VGPR_WORKITEM_ID_UNDEFINED = 3
|
||||
amd_system_vgpr_workitem_id_t = ctypes.c_uint32 # enum
|
||||
amd_compute_pgm_rsrc_two32_t = ctypes.c_uint32
|
||||
|
||||
# values for enumeration 'amd_compute_pgm_rsrc_two_t'
|
||||
amd_compute_pgm_rsrc_two_t__enumvalues = {
|
||||
0: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET_WIDTH',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT_SHIFT',
|
||||
5: 'AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT_WIDTH',
|
||||
62: 'AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT',
|
||||
6: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER_WIDTH',
|
||||
64: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER',
|
||||
7: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X_WIDTH',
|
||||
128: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X',
|
||||
8: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y_WIDTH',
|
||||
256: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y',
|
||||
9: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z_WIDTH',
|
||||
512: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z',
|
||||
10: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO_WIDTH',
|
||||
1024: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO',
|
||||
11: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID_SHIFT',
|
||||
2: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID_WIDTH',
|
||||
6144: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID',
|
||||
13: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH_WIDTH',
|
||||
8192: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH',
|
||||
14: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION_WIDTH',
|
||||
16384: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION',
|
||||
15: 'AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE_SHIFT',
|
||||
9: 'AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE_WIDTH',
|
||||
16744448: 'AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE',
|
||||
24: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION_WIDTH',
|
||||
16777216: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION',
|
||||
25: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE_WIDTH',
|
||||
33554432: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE',
|
||||
26: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO_WIDTH',
|
||||
67108864: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO',
|
||||
27: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW_WIDTH',
|
||||
134217728: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW',
|
||||
28: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW_WIDTH',
|
||||
268435456: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW',
|
||||
29: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT_WIDTH',
|
||||
536870912: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT',
|
||||
30: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO_WIDTH',
|
||||
1073741824: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO',
|
||||
31: 'AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1_SHIFT',
|
||||
1: 'AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1_WIDTH',
|
||||
-2147483648: 'AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1',
|
||||
}
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET_SHIFT = 0
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 1
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT_SHIFT = 1
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT_WIDTH = 5
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT = 62
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER_SHIFT = 6
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER = 64
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X_SHIFT = 7
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X = 128
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y_SHIFT = 8
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y = 256
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z_SHIFT = 9
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z = 512
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO_SHIFT = 10
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO = 1024
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID_SHIFT = 11
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID_WIDTH = 2
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID = 6144
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH_SHIFT = 13
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH = 8192
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION_SHIFT = 14
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION = 16384
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE_SHIFT = 15
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE_WIDTH = 9
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE = 16744448
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION_SHIFT = 24
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION = 16777216
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE_SHIFT = 25
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE = 33554432
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO_SHIFT = 26
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO = 67108864
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW_SHIFT = 27
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW = 134217728
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW_SHIFT = 28
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW = 268435456
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT_SHIFT = 29
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT = 536870912
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO_SHIFT = 30
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO = 1073741824
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1_SHIFT = 31
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1_WIDTH = 1
|
||||
AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1 = -2147483648
|
||||
amd_compute_pgm_rsrc_two_t = ctypes.c_int32 # enum
|
||||
|
||||
# values for enumeration 'amd_element_byte_size_t'
|
||||
amd_element_byte_size_t__enumvalues = {
|
||||
0: 'AMD_ELEMENT_BYTE_SIZE_2',
|
||||
1: 'AMD_ELEMENT_BYTE_SIZE_4',
|
||||
2: 'AMD_ELEMENT_BYTE_SIZE_8',
|
||||
3: 'AMD_ELEMENT_BYTE_SIZE_16',
|
||||
}
|
||||
AMD_ELEMENT_BYTE_SIZE_2 = 0
|
||||
AMD_ELEMENT_BYTE_SIZE_4 = 1
|
||||
AMD_ELEMENT_BYTE_SIZE_8 = 2
|
||||
AMD_ELEMENT_BYTE_SIZE_16 = 3
|
||||
amd_element_byte_size_t = ctypes.c_uint32 # enum
|
||||
amd_kernel_code_properties32_t = ctypes.c_uint32
|
||||
|
||||
# values for enumeration 'amd_kernel_code_properties_t'
|
||||
amd_kernel_code_properties_t__enumvalues = {
|
||||
0: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT',
|
||||
1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH',
|
||||
1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER',
|
||||
1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR_SHIFT',
|
||||
1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR_WIDTH',
|
||||
2: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR',
|
||||
2: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR_SHIFT',
|
||||
1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR_WIDTH',
|
||||
4: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR',
|
||||
3: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT',
|
||||
1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH',
|
||||
8: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR',
|
||||
4: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID_SHIFT',
|
||||
1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID_WIDTH',
|
||||
16: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID',
|
||||
5: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT',
|
||||
1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH',
|
||||
32: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT',
|
||||
6: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT',
|
||||
1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH',
|
||||
64: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE',
|
||||
7: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT',
|
||||
1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH',
|
||||
128: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X',
|
||||
8: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT',
|
||||
1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH',
|
||||
256: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y',
|
||||
9: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT',
|
||||
1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH',
|
||||
512: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z',
|
||||
10: 'AMD_KERNEL_CODE_PROPERTIES_RESERVED1_SHIFT',
|
||||
6: 'AMD_KERNEL_CODE_PROPERTIES_RESERVED1_WIDTH',
|
||||
64512: 'AMD_KERNEL_CODE_PROPERTIES_RESERVED1',
|
||||
16: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS_SHIFT',
|
||||
1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS_WIDTH',
|
||||
65536: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS',
|
||||
17: 'AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE_SHIFT',
|
||||
2: 'AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE_WIDTH',
|
||||
393216: 'AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE',
|
||||
19: 'AMD_KERNEL_CODE_PROPERTIES_IS_PTR64_SHIFT',
|
||||
1: 'AMD_KERNEL_CODE_PROPERTIES_IS_PTR64_WIDTH',
|
||||
524288: 'AMD_KERNEL_CODE_PROPERTIES_IS_PTR64',
|
||||
20: 'AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK_SHIFT',
|
||||
1: 'AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK_WIDTH',
|
||||
1048576: 'AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK',
|
||||
21: 'AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED_SHIFT',
|
||||
1: 'AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED_WIDTH',
|
||||
2097152: 'AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED',
|
||||
22: 'AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED_SHIFT',
|
||||
1: 'AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED_WIDTH',
|
||||
4194304: 'AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED',
|
||||
23: 'AMD_KERNEL_CODE_PROPERTIES_RESERVED2_SHIFT',
|
||||
9: 'AMD_KERNEL_CODE_PROPERTIES_RESERVED2_WIDTH',
|
||||
-8388608: 'AMD_KERNEL_CODE_PROPERTIES_RESERVED2',
|
||||
}
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = 1
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR = 2
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR = 4
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR = 8
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID = 16
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT = 32
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = 64
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = 128
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = 256
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = 512
|
||||
AMD_KERNEL_CODE_PROPERTIES_RESERVED1_SHIFT = 10
|
||||
AMD_KERNEL_CODE_PROPERTIES_RESERVED1_WIDTH = 6
|
||||
AMD_KERNEL_CODE_PROPERTIES_RESERVED1 = 64512
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS_SHIFT = 16
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1
|
||||
AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS = 65536
|
||||
AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE_SHIFT = 17
|
||||
AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE_WIDTH = 2
|
||||
AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE = 393216
|
||||
AMD_KERNEL_CODE_PROPERTIES_IS_PTR64_SHIFT = 19
|
||||
AMD_KERNEL_CODE_PROPERTIES_IS_PTR64_WIDTH = 1
|
||||
AMD_KERNEL_CODE_PROPERTIES_IS_PTR64 = 524288
|
||||
AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK_SHIFT = 20
|
||||
AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK_WIDTH = 1
|
||||
AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK = 1048576
|
||||
AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED_SHIFT = 21
|
||||
AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED_WIDTH = 1
|
||||
AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED = 2097152
|
||||
AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED_SHIFT = 22
|
||||
AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED_WIDTH = 1
|
||||
AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED = 4194304
|
||||
AMD_KERNEL_CODE_PROPERTIES_RESERVED2_SHIFT = 23
|
||||
AMD_KERNEL_CODE_PROPERTIES_RESERVED2_WIDTH = 9
|
||||
AMD_KERNEL_CODE_PROPERTIES_RESERVED2 = -8388608
|
||||
amd_kernel_code_properties_t = ctypes.c_int32 # enum
|
||||
amd_powertwo8_t = ctypes.c_ubyte
|
||||
|
||||
# values for enumeration 'amd_powertwo_t'
|
||||
amd_powertwo_t__enumvalues = {
|
||||
0: 'AMD_POWERTWO_1',
|
||||
1: 'AMD_POWERTWO_2',
|
||||
2: 'AMD_POWERTWO_4',
|
||||
3: 'AMD_POWERTWO_8',
|
||||
4: 'AMD_POWERTWO_16',
|
||||
5: 'AMD_POWERTWO_32',
|
||||
6: 'AMD_POWERTWO_64',
|
||||
7: 'AMD_POWERTWO_128',
|
||||
8: 'AMD_POWERTWO_256',
|
||||
}
|
||||
AMD_POWERTWO_1 = 0
|
||||
AMD_POWERTWO_2 = 1
|
||||
AMD_POWERTWO_4 = 2
|
||||
AMD_POWERTWO_8 = 3
|
||||
AMD_POWERTWO_16 = 4
|
||||
AMD_POWERTWO_32 = 5
|
||||
AMD_POWERTWO_64 = 6
|
||||
AMD_POWERTWO_128 = 7
|
||||
AMD_POWERTWO_256 = 8
|
||||
amd_powertwo_t = ctypes.c_uint32 # enum
|
||||
amd_enabled_control_directive64_t = ctypes.c_uint64
|
||||
|
||||
# values for enumeration 'amd_enabled_control_directive_t'
|
||||
amd_enabled_control_directive_t__enumvalues = {
|
||||
1: 'AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_BREAK_EXCEPTIONS',
|
||||
2: 'AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_DETECT_EXCEPTIONS',
|
||||
4: 'AMD_ENABLED_CONTROL_DIRECTIVE_MAX_DYNAMIC_GROUP_SIZE',
|
||||
8: 'AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_GRID_SIZE',
|
||||
16: 'AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_WORKGROUP_SIZE',
|
||||
32: 'AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_DIM',
|
||||
64: 'AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_GRID_SIZE',
|
||||
128: 'AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_WORKGROUP_SIZE',
|
||||
256: 'AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRE_NO_PARTIAL_WORKGROUPS',
|
||||
}
|
||||
AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_BREAK_EXCEPTIONS = 1
|
||||
AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_DETECT_EXCEPTIONS = 2
|
||||
AMD_ENABLED_CONTROL_DIRECTIVE_MAX_DYNAMIC_GROUP_SIZE = 4
|
||||
AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_GRID_SIZE = 8
|
||||
AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_WORKGROUP_SIZE = 16
|
||||
AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_DIM = 32
|
||||
AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_GRID_SIZE = 64
|
||||
AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_WORKGROUP_SIZE = 128
|
||||
AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRE_NO_PARTIAL_WORKGROUPS = 256
|
||||
amd_enabled_control_directive_t = ctypes.c_uint32 # enum
|
||||
amd_exception_kind16_t = ctypes.c_uint16
|
||||
|
||||
# values for enumeration 'amd_exception_kind_t'
|
||||
amd_exception_kind_t__enumvalues = {
|
||||
1: 'AMD_EXCEPTION_KIND_INVALID_OPERATION',
|
||||
2: 'AMD_EXCEPTION_KIND_DIVISION_BY_ZERO',
|
||||
4: 'AMD_EXCEPTION_KIND_OVERFLOW',
|
||||
8: 'AMD_EXCEPTION_KIND_UNDERFLOW',
|
||||
16: 'AMD_EXCEPTION_KIND_INEXACT',
|
||||
}
|
||||
AMD_EXCEPTION_KIND_INVALID_OPERATION = 1
|
||||
AMD_EXCEPTION_KIND_DIVISION_BY_ZERO = 2
|
||||
AMD_EXCEPTION_KIND_OVERFLOW = 4
|
||||
AMD_EXCEPTION_KIND_UNDERFLOW = 8
|
||||
AMD_EXCEPTION_KIND_INEXACT = 16
|
||||
amd_exception_kind_t = ctypes.c_uint32 # enum
|
||||
class struct_amd_control_directives_s(Structure):
|
||||
pass
|
||||
|
||||
struct_amd_control_directives_s._pack_ = 1 # source:False
|
||||
struct_amd_control_directives_s._fields_ = [
|
||||
('enabled_control_directives', ctypes.c_uint64),
|
||||
('enable_break_exceptions', ctypes.c_uint16),
|
||||
('enable_detect_exceptions', ctypes.c_uint16),
|
||||
('max_dynamic_group_size', ctypes.c_uint32),
|
||||
('max_flat_grid_size', ctypes.c_uint64),
|
||||
('max_flat_workgroup_size', ctypes.c_uint32),
|
||||
('required_dim', ctypes.c_ubyte),
|
||||
('reserved1', ctypes.c_ubyte * 3),
|
||||
('required_grid_size', ctypes.c_uint64 * 3),
|
||||
('required_workgroup_size', ctypes.c_uint32 * 3),
|
||||
('reserved2', ctypes.c_ubyte * 60),
|
||||
]
|
||||
|
||||
amd_control_directives_t = struct_amd_control_directives_s
|
||||
class struct_amd_kernel_code_s(Structure):
|
||||
pass
|
||||
|
||||
struct_amd_kernel_code_s._pack_ = 1 # source:False
|
||||
struct_amd_kernel_code_s._fields_ = [
|
||||
('amd_kernel_code_version_major', ctypes.c_uint32),
|
||||
('amd_kernel_code_version_minor', ctypes.c_uint32),
|
||||
('amd_machine_kind', ctypes.c_uint16),
|
||||
('amd_machine_version_major', ctypes.c_uint16),
|
||||
('amd_machine_version_minor', ctypes.c_uint16),
|
||||
('amd_machine_version_stepping', ctypes.c_uint16),
|
||||
('kernel_code_entry_byte_offset', ctypes.c_int64),
|
||||
('kernel_code_prefetch_byte_offset', ctypes.c_int64),
|
||||
('kernel_code_prefetch_byte_size', ctypes.c_uint64),
|
||||
('max_scratch_backing_memory_byte_size', ctypes.c_uint64),
|
||||
('compute_pgm_rsrc1', ctypes.c_uint32),
|
||||
('compute_pgm_rsrc2', ctypes.c_uint32),
|
||||
('kernel_code_properties', ctypes.c_uint32),
|
||||
('workitem_private_segment_byte_size', ctypes.c_uint32),
|
||||
('workgroup_group_segment_byte_size', ctypes.c_uint32),
|
||||
('gds_segment_byte_size', ctypes.c_uint32),
|
||||
('kernarg_segment_byte_size', ctypes.c_uint64),
|
||||
('workgroup_fbarrier_count', ctypes.c_uint32),
|
||||
('wavefront_sgpr_count', ctypes.c_uint16),
|
||||
('workitem_vgpr_count', ctypes.c_uint16),
|
||||
('reserved_vgpr_first', ctypes.c_uint16),
|
||||
('reserved_vgpr_count', ctypes.c_uint16),
|
||||
('reserved_sgpr_first', ctypes.c_uint16),
|
||||
('reserved_sgpr_count', ctypes.c_uint16),
|
||||
('debug_wavefront_private_segment_offset_sgpr', ctypes.c_uint16),
|
||||
('debug_private_segment_buffer_sgpr', ctypes.c_uint16),
|
||||
('kernarg_segment_alignment', ctypes.c_ubyte),
|
||||
('group_segment_alignment', ctypes.c_ubyte),
|
||||
('private_segment_alignment', ctypes.c_ubyte),
|
||||
('wavefront_size', ctypes.c_ubyte),
|
||||
('call_convention', ctypes.c_int32),
|
||||
('reserved1', ctypes.c_ubyte * 12),
|
||||
('runtime_loader_kernel_symbol', ctypes.c_uint64),
|
||||
('control_directives', amd_control_directives_t),
|
||||
]
|
||||
|
||||
amd_kernel_code_t = struct_amd_kernel_code_s
|
||||
class struct_amd_runtime_loader_debug_info_s(Structure):
|
||||
pass
|
||||
|
||||
struct_amd_runtime_loader_debug_info_s._pack_ = 1 # source:False
|
||||
struct_amd_runtime_loader_debug_info_s._fields_ = [
|
||||
('elf_raw', ctypes.POINTER(None)),
|
||||
('elf_size', ctypes.c_uint64),
|
||||
('kernel_name', ctypes.POINTER(ctypes.c_char)),
|
||||
('owning_segment', ctypes.POINTER(None)),
|
||||
]
|
||||
|
||||
amd_runtime_loader_debug_info_t = struct_amd_runtime_loader_debug_info_s
|
||||
class struct_BrigModuleHeader(Structure):
|
||||
pass
|
||||
|
||||
@@ -4166,7 +4700,187 @@ struct_hsa_ven_amd_aqlprofile_1_00_pfn_s._fields_ = [
|
||||
hsa_ven_amd_aqlprofile_1_00_pfn_t = struct_hsa_ven_amd_aqlprofile_1_00_pfn_s
|
||||
hsa_ven_amd_aqlprofile_pfn_t = struct_hsa_ven_amd_aqlprofile_1_00_pfn_s
|
||||
__all__ = \
|
||||
['AMD_QUEUE_PROPERTIES_ENABLE_PROFILING',
|
||||
['AMD_COMPUTE_PGM_RSRC_ONE_BULKY',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_BULKY_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_BULKY_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_PRIV',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_PRIV_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_PRIV_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1_WIDTH',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT_SHIFT',
|
||||
'AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT_WIDTH',
|
||||
'AMD_ELEMENT_BYTE_SIZE_16', 'AMD_ELEMENT_BYTE_SIZE_2',
|
||||
'AMD_ELEMENT_BYTE_SIZE_4', 'AMD_ELEMENT_BYTE_SIZE_8',
|
||||
'AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_BREAK_EXCEPTIONS',
|
||||
'AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_DETECT_EXCEPTIONS',
|
||||
'AMD_ENABLED_CONTROL_DIRECTIVE_MAX_DYNAMIC_GROUP_SIZE',
|
||||
'AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_GRID_SIZE',
|
||||
'AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_WORKGROUP_SIZE',
|
||||
'AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_DIM',
|
||||
'AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_GRID_SIZE',
|
||||
'AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_WORKGROUP_SIZE',
|
||||
'AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRE_NO_PARTIAL_WORKGROUPS',
|
||||
'AMD_EXCEPTION_KIND_DIVISION_BY_ZERO',
|
||||
'AMD_EXCEPTION_KIND_INEXACT',
|
||||
'AMD_EXCEPTION_KIND_INVALID_OPERATION',
|
||||
'AMD_EXCEPTION_KIND_OVERFLOW', 'AMD_EXCEPTION_KIND_UNDERFLOW',
|
||||
'AMD_FLOAT_DENORM_MODE_FLUSH_OUTPUT',
|
||||
'AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE',
|
||||
'AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE_OUTPUT',
|
||||
'AMD_FLOAT_DENORM_MODE_NO_FLUSH',
|
||||
'AMD_FLOAT_ROUND_MODE_MINUS_INFINITY',
|
||||
'AMD_FLOAT_ROUND_MODE_NEAREST_EVEN',
|
||||
'AMD_FLOAT_ROUND_MODE_PLUS_INFINITY', 'AMD_FLOAT_ROUND_MODE_ZERO',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS_SHIFT',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS_WIDTH',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID_SHIFT',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID_WIDTH',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR_SHIFT',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR_WIDTH',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR_SHIFT',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR_WIDTH',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED_SHIFT',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED_WIDTH',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK_SHIFT',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK_WIDTH',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_IS_PTR64',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_IS_PTR64_SHIFT',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_IS_PTR64_WIDTH',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED_SHIFT',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED_WIDTH',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE_SHIFT',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE_WIDTH',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_RESERVED1',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_RESERVED1_SHIFT',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_RESERVED1_WIDTH',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_RESERVED2',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_RESERVED2_SHIFT',
|
||||
'AMD_KERNEL_CODE_PROPERTIES_RESERVED2_WIDTH',
|
||||
'AMD_KERNEL_CODE_VERSION_MAJOR', 'AMD_KERNEL_CODE_VERSION_MINOR',
|
||||
'AMD_MACHINE_KIND_AMDGPU', 'AMD_MACHINE_KIND_UNDEFINED',
|
||||
'AMD_POWERTWO_1', 'AMD_POWERTWO_128', 'AMD_POWERTWO_16',
|
||||
'AMD_POWERTWO_2', 'AMD_POWERTWO_256', 'AMD_POWERTWO_32',
|
||||
'AMD_POWERTWO_4', 'AMD_POWERTWO_64', 'AMD_POWERTWO_8',
|
||||
'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING',
|
||||
'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_SHIFT',
|
||||
'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_WIDTH',
|
||||
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER',
|
||||
@@ -4186,9 +4900,13 @@ __all__ = \
|
||||
'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_WIDTH',
|
||||
'AMD_SIGNAL_KIND_DOORBELL', 'AMD_SIGNAL_KIND_INVALID',
|
||||
'AMD_SIGNAL_KIND_LEGACY_DOORBELL', 'AMD_SIGNAL_KIND_USER',
|
||||
'BrigModule_t', 'HSA_ACCESS_PERMISSION_NONE',
|
||||
'HSA_ACCESS_PERMISSION_RO', 'HSA_ACCESS_PERMISSION_RW',
|
||||
'HSA_ACCESS_PERMISSION_WO', 'HSA_AGENT_FEATURE_AGENT_DISPATCH',
|
||||
'AMD_SYSTEM_VGPR_WORKITEM_ID_UNDEFINED',
|
||||
'AMD_SYSTEM_VGPR_WORKITEM_ID_X',
|
||||
'AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y',
|
||||
'AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y_Z', 'BrigModule_t',
|
||||
'HSA_ACCESS_PERMISSION_NONE', 'HSA_ACCESS_PERMISSION_RO',
|
||||
'HSA_ACCESS_PERMISSION_RW', 'HSA_ACCESS_PERMISSION_WO',
|
||||
'HSA_AGENT_FEATURE_AGENT_DISPATCH',
|
||||
'HSA_AGENT_FEATURE_KERNEL_DISPATCH',
|
||||
'HSA_AGENT_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES',
|
||||
'HSA_AGENT_INFO_CACHE_SIZE',
|
||||
@@ -4609,11 +5327,22 @@ __all__ = \
|
||||
'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK',
|
||||
'HSA_WAIT_STATE_ACTIVE', 'HSA_WAIT_STATE_BLOCKED',
|
||||
'HSA_WAVEFRONT_INFO_SIZE', 'MEMORY_TYPE_NONE',
|
||||
'MEMORY_TYPE_PINNED', 'amd_queue_properties32_t',
|
||||
'amd_queue_properties_t', 'amd_queue_t', 'amd_signal_kind64_t',
|
||||
'amd_signal_kind_t', 'amd_signal_t',
|
||||
'c__EA_hsa_access_permission_t', 'c__EA_hsa_agent_feature_t',
|
||||
'c__EA_hsa_agent_info_t',
|
||||
'MEMORY_TYPE_PINNED', 'amd_compute_pgm_rsrc_one32_t',
|
||||
'amd_compute_pgm_rsrc_one_t', 'amd_compute_pgm_rsrc_two32_t',
|
||||
'amd_compute_pgm_rsrc_two_t', 'amd_control_directives_t',
|
||||
'amd_element_byte_size_t', 'amd_enabled_control_directive64_t',
|
||||
'amd_enabled_control_directive_t', 'amd_exception_kind16_t',
|
||||
'amd_exception_kind_t', 'amd_float_denorm_mode_t',
|
||||
'amd_float_round_mode_t', 'amd_kernel_code_properties32_t',
|
||||
'amd_kernel_code_properties_t', 'amd_kernel_code_t',
|
||||
'amd_kernel_code_version32_t', 'amd_kernel_code_version_t',
|
||||
'amd_machine_kind16_t', 'amd_machine_kind_t',
|
||||
'amd_machine_version16_t', 'amd_powertwo8_t', 'amd_powertwo_t',
|
||||
'amd_queue_properties32_t', 'amd_queue_properties_t',
|
||||
'amd_queue_t', 'amd_runtime_loader_debug_info_t',
|
||||
'amd_signal_kind64_t', 'amd_signal_kind_t', 'amd_signal_t',
|
||||
'amd_system_vgpr_workitem_id_t', 'c__EA_hsa_access_permission_t',
|
||||
'c__EA_hsa_agent_feature_t', 'c__EA_hsa_agent_info_t',
|
||||
'c__EA_hsa_amd_agent_memory_pool_info_t',
|
||||
'c__EA_hsa_amd_copy_direction_t',
|
||||
'c__EA_hsa_amd_hw_exception_reset_cause_t',
|
||||
@@ -4982,8 +5711,9 @@ __all__ = \
|
||||
'hsa_wait_state_t__enumvalues', 'hsa_wavefront_get_info',
|
||||
'hsa_wavefront_info_t', 'hsa_wavefront_info_t__enumvalues',
|
||||
'hsa_wavefront_t', 'int32_t', 'kAqlProfileLib', 'size_t',
|
||||
'struct_BrigModuleHeader', 'struct_amd_queue_s',
|
||||
'struct_amd_signal_s',
|
||||
'struct_BrigModuleHeader', 'struct_amd_control_directives_s',
|
||||
'struct_amd_kernel_code_s', 'struct_amd_queue_s',
|
||||
'struct_amd_runtime_loader_debug_info_s', 'struct_amd_signal_s',
|
||||
'struct_c__SA_hsa_ext_amd_aql_pm4_packet_t',
|
||||
'struct_c__SA_hsa_ven_amd_aqlprofile_descriptor_t',
|
||||
'struct_c__SA_hsa_ven_amd_aqlprofile_event_t',
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
from __future__ import annotations
|
||||
from typing import Tuple, List, Any
|
||||
import os, fcntl, ctypes, functools, re, pathlib, mmap, struct, errno
|
||||
import os, fcntl, ctypes, functools, re, pathlib, mmap, struct, errno, subprocess
|
||||
from tinygrad.device import Compiled, LRUAllocator, Compiler, CompilerOptions
|
||||
from tinygrad.buffer import BufferOptions
|
||||
from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up
|
||||
from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, DEBUG
|
||||
from tinygrad.renderer.cstyle import HIPRenderer
|
||||
from tinygrad.runtime.driver.hip_comgr import compile_hip
|
||||
import tinygrad.runtime.autogen.kfd as kfd
|
||||
@@ -101,42 +101,191 @@ SHF_ALLOC = 0x2
|
||||
EMPTY_SIGNAL = hsa.hsa_signal_t()
|
||||
SIGNAL_VALUE_OFFSET = getattr(hsa.amd_signal_t, 'value').offset
|
||||
|
||||
class HWComputeQueue:
|
||||
BASE_ADDR = 0x00001260
|
||||
PACKET3_SET_SH_REG_START = 0x2c00
|
||||
SUB = PACKET3_SET_SH_REG_START - BASE_ADDR
|
||||
|
||||
regCOMPUTE_PGM_LO = 0x1bac - SUB
|
||||
regCOMPUTE_PGM_RSRC1 = 0x1bb2 - SUB
|
||||
regCOMPUTE_USER_DATA_0 = 0x1be0 - SUB
|
||||
regCOMPUTE_START_X = 0x1ba4 - SUB
|
||||
regCOMPUTE_TMPRING_SIZE = 0x1bb8 - SUB
|
||||
regCOMPUTE_RESOURCE_LIMITS = 0x1bb5 - SUB
|
||||
regCOMPUTE_RESTART_X = 0x1bbb - SUB
|
||||
regCOMPUTE_STATIC_THREAD_MGMT_SE0 = 0x1bb6 - SUB
|
||||
regCOMPUTE_STATIC_THREAD_MGMT_SE2 = 0x1bb9 - SUB
|
||||
regCOMPUTE_STATIC_THREAD_MGMT_SE4 = 0x1bcb - SUB
|
||||
|
||||
regBIF_BX_PF1_GPU_HDP_FLUSH_REQ = 0x0106
|
||||
regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0107
|
||||
|
||||
# VGT_EVENT_TYPE in navi10_enum.h
|
||||
CACHE_FLUSH_AND_INV_TS_EVENT = 0x14
|
||||
CS_PARTIAL_FLUSH = 0x7
|
||||
|
||||
COMPUTE_SHADER_EN = 1
|
||||
FORCE_START_AT_000 = 1 << 2
|
||||
CS_W32_EN = 1 << 15
|
||||
|
||||
def format_struct(s):
|
||||
sdats = []
|
||||
for field_name, field_type in s._fields_:
|
||||
dat = getattr(s, field_name)
|
||||
if isinstance(dat, int): sdats.append(f"{field_name}:0x{dat:X}")
|
||||
else: sdats.append(f"{field_name}:{dat}")
|
||||
return sdats
|
||||
|
||||
"""
|
||||
regCOMPUTE_PGM_RSRC1 0 0x1bb2 12 0 0
|
||||
VGPRS 0 5
|
||||
SGPRS 6 9
|
||||
PRIORITY 10 11
|
||||
FLOAT_MODE 12 19
|
||||
PRIV 20 20
|
||||
DX10_CLAMP 21 21
|
||||
IEEE_MODE 23 23
|
||||
BULKY 24 24
|
||||
FP16_OVFL 26 26
|
||||
WGP_MODE 29 29
|
||||
MEM_ORDERED 30 30
|
||||
FWD_PROGRESS 31 31
|
||||
regCOMPUTE_PGM_RSRC2 0 0x1bb3 11 0 0
|
||||
SCRATCH_EN 0 0
|
||||
USER_SGPR 1 5
|
||||
TRAP_PRESENT 6 6
|
||||
TGID_X_EN 7 7
|
||||
TGID_Y_EN 8 8
|
||||
TGID_Z_EN 9 9
|
||||
TG_SIZE_EN 10 10
|
||||
TIDIG_COMP_CNT 11 12
|
||||
EXCP_EN_MSB 13 14
|
||||
LDS_SIZE 15 23
|
||||
EXCP_EN 24 30
|
||||
regCOMPUTE_RESOURCE_LIMITS 0 0x1bb5 6 0 0
|
||||
WAVES_PER_SH 0 9
|
||||
TG_PER_CU 12 15
|
||||
LOCK_THRESHOLD 16 21
|
||||
SIMD_DEST_CNTL 22 22
|
||||
FORCE_SIMD_DIST 23 23
|
||||
CU_GROUP_COUNT 24 26
|
||||
"""
|
||||
|
||||
class HWPM4Queue:
|
||||
def __init__(self): self.q = []
|
||||
|
||||
def hdp_flush(self):
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
|
||||
amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | amd_gpu.WAIT_REG_MEM_FUNCTION(3) | amd_gpu.WAIT_REG_MEM_ENGINE(0),
|
||||
regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE, 0x0, 0x0, 0x20]
|
||||
|
||||
def invalidate_cache(self):
|
||||
# overkill?
|
||||
addr=0x0
|
||||
sz=(1 << 64)-1
|
||||
gli=1
|
||||
glv=1
|
||||
glk=1
|
||||
gl1=1
|
||||
gl2=1
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, #0x80000000,
|
||||
sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
|
||||
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \
|
||||
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
|
||||
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)]
|
||||
return self
|
||||
|
||||
def exec(self, prg:KFDProgram, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), completion_signal=None):
|
||||
if completion_signal is not None: completion_signal.value = 1
|
||||
self.q.append(hsa.hsa_kernel_dispatch_packet_t(
|
||||
header=DISPATCH_KERNEL_HEADER, setup=DISPATCH_KERNEL_SETUP,
|
||||
workgroup_size_x=local_size[0], workgroup_size_y=local_size[1], workgroup_size_z=local_size[2],
|
||||
grid_size_x=global_size[0]*local_size[0], grid_size_y=global_size[1]*local_size[1], grid_size_z=global_size[2]*local_size[2],
|
||||
kernel_object=prg.handle, group_segment_size=prg.group_segment_size, private_segment_size=prg.private_segment_size, kernarg_address=kernargs,
|
||||
completion_signal=hsa.hsa_signal_t(ctypes.addressof(completion_signal)) if completion_signal is not None else EMPTY_SIGNAL))
|
||||
self.hdp_flush()
|
||||
self.invalidate_cache()
|
||||
code = hsa.amd_kernel_code_t.from_address(prg.handle) # NOTE: this is wrong, it's not this object
|
||||
assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32
|
||||
assert code.workitem_private_segment_byte_size == 0
|
||||
assert code.max_scratch_backing_memory_byte_size == 0
|
||||
assert code.kernel_code_prefetch_byte_size == 0
|
||||
#assert (mod:=(prg.group_segment_size%32)) == 0, f"group_segment_size is {prg.group_segment_size} mod is {mod}"
|
||||
#assert prg.private_segment_size == 0
|
||||
#for s in format_struct(code): print(s)
|
||||
#print(hex(code.compute_pgm_rsrc1), hex(code.compute_pgm_rsrc2))
|
||||
rsrc1, rsrc2 = code.compute_pgm_rsrc1, code.compute_pgm_rsrc2
|
||||
|
||||
# this is required
|
||||
lds_size = ((prg.group_segment_size+63)//64)&0x1FF
|
||||
assert lds_size <= 0x80 # larger numbers stall the GPU
|
||||
|
||||
# lds_size = 0x1ff
|
||||
|
||||
# rsrc2 |= ((((prg.group_segment_size+63)//64)&0x1FF) << 15)
|
||||
# rsrc2 |= 0x7f << 15
|
||||
# rsrc2 |= 0x1ff << 15
|
||||
|
||||
# rsrc2 |= ((prg.group_segment_size+31)//64) << 15
|
||||
# rsrc2 |= (prg.group_segment_size//32) << 15
|
||||
# user_sgpr =
|
||||
|
||||
prog_addr = (prg.handle + code.kernel_code_entry_byte_offset) >> 8
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), regCOMPUTE_PGM_LO, prog_addr&0xFFFFFFFF, prog_addr>>32, 0, 0,
|
||||
(prg.device.scratch.va_addr>>8)&0xFFFFFFFF, prg.device.scratch.va_addr>>40]
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_PGM_RSRC1, rsrc1, rsrc2 | (lds_size << 15)]
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), regCOMPUTE_TMPRING_SIZE, 0x00200200] # (waveSize << 12) | (numWaves)
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), regCOMPUTE_RESTART_X, 0,0,0,0]
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_STATIC_THREAD_MGMT_SE0, 0xFFFFFFFF,0xFFFFFFFF]
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_STATIC_THREAD_MGMT_SE2, 0xFFFFFFFF,0xFFFFFFFF]
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), regCOMPUTE_STATIC_THREAD_MGMT_SE4, 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF]
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_USER_DATA_0, kernargs&0xFFFFFFFF, kernargs>>32]
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), regCOMPUTE_START_X, 0,0,0, local_size[0],local_size[1],local_size[2],0,0]
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), regCOMPUTE_RESOURCE_LIMITS, 0]
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3),
|
||||
global_size[0],global_size[1],global_size[2], CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
|
||||
|
||||
# have to self wait since flush doesn't work
|
||||
self.signal(sig:=KFDDevice._get_signal())
|
||||
self.wait(sig)
|
||||
|
||||
if completion_signal: self.signal(completion_signal)
|
||||
return self
|
||||
|
||||
def signal(self, signal:hsa.amd_signal_t):
|
||||
def wait(self, signal:hsa.amd_signal_t, value=0):
|
||||
addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
|
||||
amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(3) | amd_gpu.WAIT_REG_MEM_ENGINE(0),
|
||||
addr&0xFFFFFFFF, addr>>32, value, 0xffffffff, 4]
|
||||
return self
|
||||
|
||||
def signal(self, signal:hsa.amd_signal_t, value=0):
|
||||
#assert signal.value == 0, f"entering signal without it being set to 0, but {signal.value}"
|
||||
signal.value = 1
|
||||
self.q.append(hsa.hsa_barrier_and_packet_t(header=BARRIER_HEADER, completion_signal=hsa.hsa_signal_t(ctypes.addressof(signal))))
|
||||
return self
|
||||
|
||||
def wait(self, signal:hsa.amd_signal_t):
|
||||
sig = hsa.hsa_barrier_and_packet_t(header=BARRIER_HEADER)
|
||||
sig.dep_signal[0] = hsa.hsa_signal_t(ctypes.addressof(signal))
|
||||
self.q.append(sig)
|
||||
# NOTE: this needs an EOP buffer on the queue or it will NULL pointer
|
||||
addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
|
||||
# event_index__mec_release_mem__end_of_pipe = 5
|
||||
# event_index__mec_release_mem__shader_done = 6
|
||||
amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | \
|
||||
amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | \
|
||||
amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | \
|
||||
amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ,
|
||||
amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(1) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(2) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
|
||||
addr&0xFFFFFFFF, addr>>32,
|
||||
value&0xFFFFFFFF, value>>32, 0]
|
||||
if signal.event_mailbox_ptr != 0:
|
||||
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
|
||||
# event_index__mec_release_mem__end_of_pipe = 5
|
||||
# event_index__mec_release_mem__shader_done = 6
|
||||
amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | \
|
||||
amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | \
|
||||
amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | \
|
||||
amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ,
|
||||
amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(1) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(2) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
|
||||
signal.event_mailbox_ptr&0xFFFFFFFF, signal.event_mailbox_ptr>>32,
|
||||
signal.event_id&0xFFFFFFFF, signal.event_id>>32,
|
||||
signal.event_id]
|
||||
return self
|
||||
|
||||
def submit(self, device:KFDDevice):
|
||||
if not len(self.q): return
|
||||
write_ptr, read_ptr = device.amd_aql_queue.write_dispatch_id, device.amd_aql_queue.read_dispatch_id
|
||||
if (len(self.q)+write_ptr-read_ptr)*AQL_PACKET_SIZE > device.aql_ring.size: raise RuntimeError("AQL queue overrun")
|
||||
for cmd in self.q:
|
||||
ring_addr = device.aql_ring.va_addr + (write_ptr*AQL_PACKET_SIZE) % device.aql_ring.size
|
||||
ctypes.memmove(ring_addr, ctypes.addressof(cmd), AQL_PACKET_SIZE)
|
||||
write_ptr += 1
|
||||
# TODO: add CPU memory barrier here
|
||||
device.amd_aql_queue.write_dispatch_id = write_ptr
|
||||
device.aql_doorbell[0] = device.aql_doorbell_value + len(self.q) - 1
|
||||
device.aql_doorbell_value += len(self.q)
|
||||
wptr = device.pm4_write_pointer[0]
|
||||
pm4_buffer_view = to_mv(device.pm4_ring.va_addr, device.pm4_ring.size).cast("I")
|
||||
for i, value in enumerate(self.q): pm4_buffer_view[(wptr+i)%(device.pm4_ring.size//4)] = value
|
||||
device.pm4_write_pointer[0] = wptr + len(self.q)
|
||||
device.pm4_doorbell[0] = wptr + len(self.q)
|
||||
return self
|
||||
|
||||
# prebuilt sdma packets
|
||||
@@ -180,10 +329,10 @@ class HWCopyQueue:
|
||||
self.q.append(sdma_cache_wb)
|
||||
return self
|
||||
|
||||
def signal(self, signal:hsa.amd_signal_t):
|
||||
def signal(self, signal:hsa.amd_signal_t, value=0):
|
||||
#assert signal.value == 0
|
||||
signal.value = 1
|
||||
self.q.append(sdma_pkts.atomic(op=amd_gpu.SDMA_OP_ATOMIC, operation=amd_gpu.SDMA_ATOMIC_ADD64,
|
||||
addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET, src_data=(1<<64)-1))
|
||||
self.q.append(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET, data=value))
|
||||
if signal.event_mailbox_ptr != 0:
|
||||
self.q.append(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=signal.event_mailbox_ptr, data=signal.event_id))
|
||||
self.q.append(sdma_pkts.trap(op=amd_gpu.SDMA_OP_TRAP, int_ctx=signal.event_id))
|
||||
@@ -200,6 +349,10 @@ class KFDProgram:
|
||||
# TODO; this API needs the type signature of the function and global_size/local_size
|
||||
self.device, self.name, self.lib = device, name, lib
|
||||
|
||||
if DEBUG >= 6:
|
||||
asm = subprocess.check_output(["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], input=lib)
|
||||
print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
|
||||
|
||||
_phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
|
||||
sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]
|
||||
|
||||
@@ -237,13 +390,13 @@ class KFDProgram:
|
||||
for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
|
||||
for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])
|
||||
|
||||
HWComputeQueue().exec(self, self.device.kernargs_ptr, global_size, local_size,
|
||||
self.device.completion_signal if wait else None).submit(self.device)
|
||||
HWPM4Queue().exec(self, self.device.kernargs_ptr, global_size, local_size,
|
||||
self.device.completion_signal if wait else None).submit(self.device)
|
||||
self.device.kernargs_ptr += self.kernargs_segment_size
|
||||
|
||||
if wait:
|
||||
self.device._wait_signal(self.device.completion_signal)
|
||||
assert (wp:=self.device.amd_aql_queue.write_dispatch_id) == (rp:=self.device.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
|
||||
#assert (wp:=self.device.amd_aql_queue.write_dispatch_id) == (rp:=self.device.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
|
||||
return (self.device.completion_signal.end_ts-self.device.completion_signal.start_ts)/1e8
|
||||
|
||||
class KFDAllocator(LRUAllocator):
|
||||
@@ -301,9 +454,9 @@ class KFDAllocator(LRUAllocator):
|
||||
|
||||
def transfer(self, dest, src, sz:int, src_dev:KFDDevice, dest_dev:KFDDevice):
|
||||
dest_dev._gpu_map(src)
|
||||
q = HWComputeQueue().signal(sig := KFDDevice._get_signal())
|
||||
q = HWPM4Queue().signal(sig := KFDDevice._get_signal())
|
||||
HWCopyQueue().wait(sig).copy(dest.va_addr, src.va_addr, sz).signal(sigc := KFDDevice._get_signal()).submit(dest_dev)
|
||||
HWComputeQueue().wait(sigc).submit(dest_dev)
|
||||
HWPM4Queue().wait(sigc).submit(dest_dev)
|
||||
q.wait(sigc).submit(src_dev)
|
||||
|
||||
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
|
||||
@@ -351,7 +504,9 @@ class KFDDevice(Compiled):
|
||||
num = KFDDevice.signal_number
|
||||
KFDDevice.signal_number += 1
|
||||
if KFDDevice.signal_number == SIGNAL_COUNT: KFDDevice.signal_number = 16
|
||||
#print("signal", num)
|
||||
ret = hsa.amd_signal_t.from_address(KFDDevice.signals_page.va_addr + SIGNAL_SIZE*num)
|
||||
ret.value = 0
|
||||
ret.kind = hsa.AMD_SIGNAL_KIND_USER
|
||||
if sync_event is not None:
|
||||
ret.event_mailbox_ptr = KFDDevice.event_page.va_addr + sync_event.event_slot_index*8
|
||||
@@ -359,13 +514,17 @@ class KFDDevice(Compiled):
|
||||
return ret
|
||||
|
||||
@classmethod
|
||||
def _wait_signal(self, signal:hsa.amd_signal_t, timeout=60000):
|
||||
def _wait_signal(self, signal:hsa.amd_signal_t, timeout=10000, skip_check=False):
|
||||
assert signal.event_id != 0, "can't wait on this signal"
|
||||
evt_arr = (kfd.struct_kfd_event_data * 1)()
|
||||
evt_arr[0].event_id = signal.event_id
|
||||
ret = kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=timeout)
|
||||
if ret.wait_result != 0: raise RuntimeError(f"wait_result: {ret.wait_result}, {timeout} ms TIMEOUT!")
|
||||
|
||||
#val = signal.value
|
||||
#while val != 0: val = signal.value
|
||||
assert skip_check or signal.value == 0, f"not set to 0, but {signal.value}"
|
||||
|
||||
def __init__(self, device:str=""):
|
||||
if KFDDevice.kfd == -1:
|
||||
KFDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
|
||||
@@ -391,69 +550,52 @@ class KFDDevice(Compiled):
|
||||
self.signal_sdma = KFDDevice._get_signal(self.device_id*2+1, sync_event=kio.create_event(KFDDevice.kfd, auto_reset=1))
|
||||
|
||||
self.gart_aql = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
self.gart_sdma = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
self.aql_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
self.eop_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
||||
self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
||||
self.kernargs_ptr = self.kernargs.va_addr
|
||||
self.ctx_save_restore_address = self._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
||||
|
||||
# AQL Queue
|
||||
self.amd_aql_queue = hsa.amd_queue_t.from_address(self.gart_aql.va_addr)
|
||||
self.amd_aql_queue.write_dispatch_id = 0
|
||||
self.amd_aql_queue.read_dispatch_id = 0
|
||||
self.amd_aql_queue.read_dispatch_id_field_base_byte_offset = getattr(hsa.amd_queue_t, 'read_dispatch_id').offset
|
||||
self.amd_aql_queue.queue_properties = hsa.AMD_QUEUE_PROPERTIES_IS_PTR64 | hsa.AMD_QUEUE_PROPERTIES_ENABLE_PROFILING
|
||||
|
||||
self.amd_aql_queue.max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
|
||||
self.amd_aql_queue.max_wave_id = self.properties['max_waves_per_simd'] * self.properties['simd_per_cu'] - 1
|
||||
|
||||
# scratch setup
|
||||
max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
|
||||
max_wave_id = self.properties['max_waves_per_simd'] * self.properties['simd_per_cu'] - 1
|
||||
self.max_private_segment_size = 4096
|
||||
wave_scratch_len = round_up(((self.amd_aql_queue.max_wave_id + 1) * self.max_private_segment_size), 256) # gfx11 requires alignment of 256
|
||||
self.scratch_len = (self.amd_aql_queue.max_cu_id + 1) * self.properties['max_slots_scratch_cu'] * wave_scratch_len
|
||||
wave_scratch_len = round_up(((max_wave_id + 1) * self.max_private_segment_size), 256) # gfx11 requires alignment of 256
|
||||
self.scratch_len = (max_cu_id + 1) * self.properties['max_slots_scratch_cu'] * wave_scratch_len
|
||||
self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
||||
self.amd_aql_queue.scratch_backing_memory_location = self.scratch.va_addr
|
||||
self.amd_aql_queue.scratch_backing_memory_byte_size = self.scratch_len
|
||||
self.amd_aql_queue.scratch_wave64_lane_byte_size = self.max_private_segment_size * (self.amd_aql_queue.max_wave_id + 1) // 64
|
||||
self.amd_aql_queue.scratch_resource_descriptor[0] = self.scratch.va_addr & 0xFFFFFFFF
|
||||
self.amd_aql_queue.scratch_resource_descriptor[1] = ((self.scratch.va_addr >> 32) & 0xFFFF) | (1 << 30) # va_hi | SWIZZLE_ENABLE
|
||||
self.amd_aql_queue.scratch_resource_descriptor[2] = self.scratch_len & 0xFFFFFFFF
|
||||
self.amd_aql_queue.scratch_resource_descriptor[3] = 0x20814fac # FORMAT=BUF_FORMAT_32_UINT,OOB_SELECT=2,ADD_TID_ENABLE=1,TYPE=SQ_RSRC_BUF,SQ_SELs
|
||||
engines = self.properties['array_count'] // self.properties['simd_arrays_per_engine']
|
||||
self.amd_aql_queue.compute_tmpring_size = (wave_scratch_len // 256) << 12 | (self.scratch_len // (wave_scratch_len * engines))
|
||||
|
||||
self.aql_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.aql_ring.va_addr, ring_size=self.aql_ring.size, gpu_id=self.gpu_id,
|
||||
queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
||||
eop_buffer_address=self.eop_buffer.va_addr, eop_buffer_size=self.eop_buffer.size,
|
||||
ctx_save_restore_address=self.ctx_save_restore_address.va_addr, ctx_save_restore_size=self.ctx_save_restore_address.size,
|
||||
ctl_stack_size = 0xa000,
|
||||
write_pointer_address=self.gart_aql.va_addr + getattr(hsa.amd_queue_t, 'write_dispatch_id').offset,
|
||||
read_pointer_address=self.gart_aql.va_addr + getattr(hsa.amd_queue_t, 'read_dispatch_id').offset)
|
||||
|
||||
self.doorbells_base = self.aql_queue.doorbell_offset & (~0x1fff) # doorbell is two pages
|
||||
self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, self.doorbells_base)
|
||||
self.aql_doorbell = to_mv(self.doorbells + self.aql_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
|
||||
self.aql_doorbell_value = 0
|
||||
|
||||
# SDMA Queue
|
||||
self.gart_sdma = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
self.sdma_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
self.sdma_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.sdma_ring.va_addr, ring_size=self.sdma_ring.size, gpu_id=self.gpu_id,
|
||||
queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
||||
write_pointer_address=self.gart_sdma.va_addr, read_pointer_address=self.gart_sdma.va_addr+8)
|
||||
|
||||
# doorbell page
|
||||
self.doorbells_base = self.sdma_queue.doorbell_offset & (~0x1fff) # doorbell is two pages
|
||||
self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, self.doorbells_base)
|
||||
|
||||
self.sdma_read_pointer = to_mv(self.sdma_queue.read_pointer_address, 8).cast("Q")
|
||||
self.sdma_write_pointer = to_mv(self.sdma_queue.write_pointer_address, 8).cast("Q")
|
||||
self.sdma_doorbell = to_mv(self.doorbells + self.sdma_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
|
||||
self.sdma_doorbell_value = 0
|
||||
|
||||
# PM4 stuff
|
||||
self.pm4_indirect_buf = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
pm4_indirect_cmd = (ctypes.c_uint32*13)(amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), self.pm4_indirect_buf.va_addr & 0xffffffff,
|
||||
(self.pm4_indirect_buf.va_addr>>32) & 0xffffffff, 8 | amd_gpu.INDIRECT_BUFFER_VALID, 0xa)
|
||||
ctypes.memmove(ctypes.addressof(pm4_cmds:=(ctypes.c_uint16*27)(1))+2, ctypes.addressof(pm4_indirect_cmd), ctypes.sizeof(pm4_indirect_cmd))
|
||||
self.pm4_packet = hsa.hsa_ext_amd_aql_pm4_packet_t(header=VENDOR_HEADER, pm4_command=pm4_cmds,
|
||||
completion_signal=hsa.hsa_signal_t(ctypes.addressof(self.completion_signal)))
|
||||
# PM4 Queue
|
||||
self.pm4_ctx_save_restore_address = self._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
||||
self.eop_pm4_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
|
||||
self.gart_pm4 = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
self.pm4_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
|
||||
self.pm4_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.pm4_ring.va_addr, ring_size=self.pm4_ring.size, gpu_id=self.gpu_id,
|
||||
queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
|
||||
eop_buffer_address=self.eop_pm4_buffer.va_addr, eop_buffer_size=self.eop_pm4_buffer.size,
|
||||
# TODO: are these needed? (i know eop is)
|
||||
ctx_save_restore_address=self.pm4_ctx_save_restore_address.va_addr, ctx_save_restore_size=self.pm4_ctx_save_restore_address.size,
|
||||
ctl_stack_size = 0xa000,
|
||||
write_pointer_address=self.gart_pm4.va_addr, read_pointer_address=self.gart_pm4.va_addr+8)
|
||||
|
||||
self.pm4_read_pointer = to_mv(self.pm4_queue.read_pointer_address, 8).cast("Q")
|
||||
self.pm4_write_pointer = to_mv(self.pm4_queue.write_pointer_address, 8).cast("Q")
|
||||
self.pm4_doorbell = to_mv(self.doorbells + self.pm4_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
|
||||
|
||||
super().__init__(device, KFDAllocator(self), KFDCompiler(self.arch), functools.partial(KFDProgram, self))
|
||||
|
||||
@@ -469,23 +611,16 @@ class KFDDevice(Compiled):
|
||||
q.submit(self)
|
||||
|
||||
def _submit_cache_inv(self, addr=0x0, sz=(1 << 64)-1, gli=0, glv=0, glk=0, gl1=0, gl2=0):
|
||||
pm4_buffer_view = to_mv(self.pm4_indirect_buf.va_addr, 0x1000).cast("I")
|
||||
pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0,
|
||||
sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
|
||||
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \
|
||||
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
|
||||
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)]
|
||||
for i, value in enumerate(pm4_cmd): pm4_buffer_view[i] = value
|
||||
q = HWComputeQueue()
|
||||
q.q.append(self.pm4_packet)
|
||||
q.submit(self)
|
||||
HWPM4Queue().invalidate_cache().signal(self.completion_signal).submit(self)
|
||||
self._wait_signal(self.completion_signal)
|
||||
assert (wp:=self.amd_aql_queue.write_dispatch_id) == (rp:=self.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
|
||||
assert (wp:=(self.pm4_write_pointer[0]%(self.pm4_ring.size//4))) == (rp:=self.pm4_read_pointer[0]), \
|
||||
f"didn't run {wp} != {rp} len {self.pm4_ring.size//4}"
|
||||
|
||||
def synchronize(self):
|
||||
HWComputeQueue().signal(self.completion_signal).submit(self)
|
||||
HWPM4Queue().signal(self.completion_signal).submit(self)
|
||||
self._wait_signal(self.completion_signal)
|
||||
assert (wp:=self.amd_aql_queue.write_dispatch_id) == (rp:=self.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
|
||||
assert (wp:=(self.pm4_write_pointer[0]%(self.pm4_ring.size//4))) == (rp:=self.pm4_read_pointer[0]), \
|
||||
f"didn't run {wp} != {rp} len {self.pm4_ring.size//4}"
|
||||
|
||||
# reset kernargs
|
||||
self.kernargs_ptr = self.kernargs.va_addr
|
||||
|
||||
Reference in New Issue
Block a user