WIP PM4 Support (#4110)

* pm4 kernel launch works

* disable USE_THREAD_DIMENSIONS

* add kernel code

* work on real pm4

* pm4 signal

* same

* gate pm4

* hcq tests pass

* ops passes

* pm4 is closer

* pm4 debug (#4165)

* start debug tests passing

* prg

* smth

* hdp flush

* cleaner 1

* do not need this

* logs not needed

* small things

* linter

* remove AQL

* test hcq

* fix tests

* it's subtracting, it shouldn't be -1

* pm4 changes (#4251)

* do not need this anymore

* sdma signal with non atomic

---------

Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
This commit is contained in:
George Hotz
2024-04-23 08:31:27 +04:00
committed by GitHub
parent 3f6c7ca8bf
commit 2ae4f45272
6 changed files with 1193 additions and 122 deletions

View File

@@ -128,6 +128,7 @@ generate_hsa() {
/opt/rocm/include/hsa/hsa_ext_amd.h \
/opt/rocm/include/hsa/amd_hsa_signal.h \
/opt/rocm/include/hsa/amd_hsa_queue.h \
/opt/rocm/include/hsa/amd_hsa_kernel_code.h \
/opt/rocm/include/hsa/hsa_ext_finalize.h /opt/rocm/include/hsa/hsa_ext_image.h \
/opt/rocm/include/hsa/hsa_ven_amd_aqlprofile.h \
--clang-args="-I/opt/rocm/include" \

View File

@@ -0,0 +1,157 @@
import time
from hexdump import hexdump
from tinygrad import Tensor, Device
import tinygrad.runtime.autogen.amd_gpu as amd_gpu
import tinygrad.runtime.autogen.kfd as kfd
import tinygrad.runtime.autogen.hsa as hsa
from tinygrad.engine.schedule import create_schedule
from tinygrad.runtime.ops_kfd import kio, KFDProgram
from tinygrad.helpers import to_mv
# Seed value for COMPUTE_DISPATCH_INITIATOR (unused below; kept for reference).
DISPATCH_INIT_VALUE = 0x21 | 0x8000
#mmCOMPUTE_START_X = 0x2e04
#mmCOMPUTE_PGM_LO = 0x2e0c
# NOTE(review): PACKET3_SET_SH_REG takes register offsets relative to
# PACKET3_SET_SH_REG_START, while the reg* values below start from absolute
# addresses rebased at BASE_ADDR — SUB converts between the two bases.
# Confirm against the gfx11 register headers.
BASE_ADDR = 0x00001260
PACKET3_SET_SH_REG_START = 0x2c00
SUB = PACKET3_SET_SH_REG_START - BASE_ADDR
regCOMPUTE_PGM_LO = 0x1bac - SUB
regCOMPUTE_START_X = 0x1ba4 - SUB
regCOMPUTE_NUM_THREAD_X = 0x1ba7 - SUB
regCOMPUTE_USER_DATA_0 = 0x1be0 - SUB
regCOMPUTE_USER_DATA_8 = 0x1be8 - SUB
regCOMPUTE_PGM_RSRC1 = 0x1bb2 - SUB
regCOMPUTE_PGM_RSRC2 = 0x1bb3 - SUB
# DEBUG=6 python3 extra/hip_gpu_driver/test_pm4.py
# sudo umr -i 1 -s amd744c.gfx1100 --sbank 1 1 2 | grep regCOMPUTE
# 0x00009025
# Bits for the COMPUTE_DISPATCH_INITIATOR dword of PACKET3_DISPATCH_DIRECT (see use below).
COMPUTE_SHADER_EN = 1
USE_THREAD_DIMENSIONS = 1 << 5
CS_W32_EN = 1 << 15
def format_struct(s):
  """Render a ctypes structure as a list of "name:value" strings.

  Integer fields are shown in uppercase hex (0x-prefixed); everything else
  falls back to its default string form.
  """
  rendered = []
  for name, _ftype in s._fields_:
    val = getattr(s, name)
    rendered.append(f"{name}:0x{val:X}" if isinstance(val, int) else f"{name}:{val}")
  return rendered
if __name__ == "__main__":
  # Hand-rolled PM4 compute dispatch on the KFD backend: build a tiny schedule
  # (b = a + 7) so we get a compiled kernel (prg) plus real GPU buffers, then
  # submit the dispatch ourselves through a raw compute queue.
  dev = Device["KFD"]
  a = Tensor([0.,1.,2.], device="KFD").realize()
  b = a + 7
  b.lazydata.buffer.allocate()
  si = create_schedule([b.lazydata])[-1]
  runner = dev.get_runner(*si.ast)
  prg: KFDProgram = runner.clprg
  print("device initted")

  # Compute Queue
  # GTT (host-visible) page for the queue read/write pointers, a VRAM EOP
  # buffer, and an 8MB GTT ring the CP fetches PM4 packets from.
  gart_compute = dev._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
  eop_buffer = dev._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
  compute_ring = dev._gpu_alloc(0x800000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
  ctx_save_restore_address = dev._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
  compute_queue = kio.create_queue(dev.kfd, ring_base_address=compute_ring.va_addr, ring_size=compute_ring.size, gpu_id=dev.gpu_id,
    queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
    #eop_buffer_address=eop_buffer.va_addr, eop_buffer_size=eop_buffer.size,
    #ctx_save_restore_address=ctx_save_restore_address.va_addr, ctx_save_restore_size=ctx_save_restore_address.size,
    #ctl_stack_size = 0xa000,
    write_pointer_address=gart_compute.va_addr, read_pointer_address=gart_compute.va_addr+8)

  # 32-bit view of this queue's doorbell; writing the new wptr here kicks the CP.
  compute_doorbell = to_mv(dev.doorbells + compute_queue.doorbell_offset - dev.doorbells_base, 4).cast("I")

  #scratch = dev._gpu_alloc(0x10000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)

  # Kernel arguments: two 64-bit device pointers (out then in) at the kernarg area.
  ka = to_mv(dev.kernargs_ptr, 0x10).cast("Q")
  ka[0] = b.lazydata.buffer._buf.va_addr
  ka[1] = a.lazydata.buffer._buf.va_addr

  compute_read_pointer = to_mv(compute_queue.read_pointer_address, 8).cast("Q")
  compute_write_pointer = to_mv(compute_queue.write_pointer_address, 8).cast("Q")

  hexdump(to_mv(prg.handle, 0x40))
  # prg.handle points at an amd_kernel_code_t header; the ISA entry point is
  # kernel_code_entry_byte_offset bytes past it (used for code_ptr below).
  code = hsa.amd_kernel_code_t.from_address(prg.handle)
  #print(format_struct(code))
  #print("code")
  #hexdump(to_mv(code_ptr, 0x100))
  #runner.local_size = [2,1,1]
  print(runner.local_size, runner.global_size)

  #pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), mmCOMPUTE_PGM_LO,
  # prg.handle&0xFFFFFFFF, prg.handle>>32, 0, 0, (scratch.va_addr>>8)&0xFFFFFFFF, scratch.va_addr>>40]

  # Program the shader address (>>8: COMPUTE_PGM_LO holds a 256-byte-aligned address),
  # the rsrc words from the kernel code object, kernarg pointer in USER_DATA_0/1,
  # then thread dims and the dispatch itself.
  code_ptr = (prg.handle + code.kernel_code_entry_byte_offset) >> 8
  pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), regCOMPUTE_PGM_LO, code_ptr&0xFFFFFFFF, code_ptr>>32, 0, 0, 0, 0]
  pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_PGM_RSRC1, code.compute_pgm_rsrc1, code.compute_pgm_rsrc2]
  pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_USER_DATA_0, dev.kernargs_ptr&0xFFFFFFFF, dev.kernargs_ptr>>32]
  #pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_USER_DATA_0, 0, 0]
  pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), regCOMPUTE_START_X, 0,0,0,
    runner.local_size[0],runner.local_size[1],runner.local_size[2],0,0]
  # disabled USE_THREAD_DIMENSIONS
  pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3),
    runner.global_size[0],runner.global_size[1],runner.global_size[2], CS_W32_EN | COMPUTE_SHADER_EN]
  #pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_NOP, 0x3fff)]*0x200

  # Dead reference snippet for an ACQUIRE_MEM cache-invalidate packet.
  """
addr=0x0
sz=(1 << 64)-1
gli=0
glv=0
glk=0
gl1=0
gl2=0
pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0,
sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)]
print(pm4_cmd)
"""

  # Submit loop: blit the packet into the ring (wptr counts dwords), publish the
  # new write pointer, ring the doorbell, then poll until rptr catches up.
  # The trailing break means only one iteration actually runs.
  wptr = 0
  pm4_buffer_view = to_mv(compute_ring.va_addr, compute_ring.size).cast("I")
  for j in range(0x80000):
    for i, value in enumerate(pm4_cmd): pm4_buffer_view[wptr+i] = value
    wptr += len(pm4_cmd)
    compute_write_pointer[0] = wptr
    compute_doorbell[0] = wptr
    for k in range(10):
      done = compute_read_pointer[0] == compute_write_pointer[0]
      print(compute_read_pointer[0], compute_write_pointer[0], done)
      if done: break
      time.sleep(0.01)
    break
  #break
  #print(compute_read_pointer[0])
  #time.sleep(0.05)
  #print(compute_read_pointer[0])
  #time.sleep(100)

  # If the dispatch worked, b should print [7. 8. 9.].
  print(a.numpy())
  print(b.numpy())
  exit(0)
  #pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), mmCOMPUTE_PGM_LO, 0,0,0,1,1,1,0,0]
  #pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, )]
  #pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0,
  # sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
  # amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \
  # amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
  # amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)]

View File

@@ -0,0 +1,43 @@
import ctypes, mmap, time
from tinygrad.runtime.ops_kfd import KFDDevice, kio, sdma_pkts, libc
import tinygrad.runtime.autogen.amd_sdma as amd_sdma
import tinygrad.runtime.autogen.kfd as kfd
from tinygrad.helpers import to_mv
if __name__ == "__main__":
  # Standalone SDMA smoke test: submit GET_GLOBAL timestamp packets on a raw
  # SDMA queue and print the counter value each round.
  dev = KFDDevice()
  # Ring in host memory (USERPTR); rptr/wptr live in a GTT page.
  sdma_ring = dev._gpu_alloc(1 << 22, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
  gart = dev._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
  sdma_queue = kio.create_queue(KFDDevice.kfd,
    ring_base_address=sdma_ring.va_addr, ring_size=sdma_ring.size, gpu_id=dev.gpu_id,
    queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
    write_pointer_address=gart.va_addr + 0x100, read_pointer_address=gart.va_addr + 0x108)

  # Doorbells must be mmapped page-aligned: mask the offset down, map two pages,
  # then index back in with the residual offset.
  doorbells_base = sdma_queue.doorbell_offset & (~0xfff)
  doorbells = libc.mmap(0, 8192, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, doorbells_base)
  sdma_read_pointer = to_mv(sdma_queue.read_pointer_address, 8).cast("Q")
  sdma_write_pointer = to_mv(sdma_queue.write_pointer_address, 8).cast("Q")
  sdma_doorbell = to_mv(doorbells + sdma_queue.doorbell_offset - doorbells_base, 4).cast("I")

  # Page the SDMA engine writes the 64-bit global timestamp into.
  test_write_page = dev._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
  cmd = sdma_pkts.timestamp(op=amd_sdma.SDMA_OP_TIMESTAMP, sub_op=amd_sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL, addr=test_write_page.va_addr)

  # SDMA doorbell/wptr count bytes, not dwords; ring offset wraps via modulo.
  sdma_doorbell_value = 0
  def blit_sdma_command(cmd):
    # Copy the packet to the current ring position and return its byte size.
    ctypes.memmove(sdma_ring.va_addr + (sdma_doorbell_value % sdma_ring.size), ctypes.addressof(cmd), sz:=ctypes.sizeof(cmd))
    return sz

  while True:
    # Submit one timestamp packet, spin until the engine consumes it, then print.
    sdma_doorbell_value += blit_sdma_command(cmd)
    sdma_write_pointer[0] = sdma_doorbell_value
    sdma_doorbell[0] = sdma_doorbell_value
    while sdma_read_pointer[0] != sdma_write_pointer[0]: continue
    # /1e8: counter appears to tick at 100MHz — TODO confirm against the SDMA docs.
    tm = to_mv(test_write_page.va_addr, 0x1000).cast("Q")[0]/1e8
    print(f"{tm:.3f} s @ 0x{sdma_ring.va_addr + (sdma_doorbell_value % sdma_ring.size):X} R:0x{sdma_queue.read_pointer_address:X} W:0x{sdma_queue.write_pointer_address:X}")
    time.sleep(0.01)

View File

@@ -2,7 +2,7 @@ import unittest, ctypes, struct, time
from tinygrad import Device, Tensor, dtypes
from tinygrad.buffer import Buffer, BufferOptions
from tinygrad.engine.schedule import create_schedule
from tinygrad.runtime.ops_kfd import KFDDevice, HWCopyQueue, HWComputeQueue
from tinygrad.runtime.ops_kfd import KFDDevice, HWCopyQueue, HWPM4Queue
def _time_queue(q, d):
st = time.perf_counter()
@@ -15,7 +15,7 @@ class TestHCQ(unittest.TestCase):
@classmethod
def setUpClass(self):
TestHCQ.d0: KFDDevice = Device["KFD"]
TestHCQ.d1: KFDDevice = Device["KFD:1"]
#TestHCQ.d1: KFDDevice = Device["KFD:1"]
TestHCQ.a = Tensor([0.,1.], device="KFD").realize()
TestHCQ.b = self.a + 1
si = create_schedule([self.b.lazydata])[-1]
@@ -26,13 +26,14 @@ class TestHCQ(unittest.TestCase):
TestHCQ.addr2 = struct.pack("QQ", TestHCQ.a.lazydata.buffer._buf.va_addr, TestHCQ.b.lazydata.buffer._buf.va_addr)
ctypes.memmove(TestHCQ.d0.kernargs_ptr, TestHCQ.addr, len(TestHCQ.addr))
ctypes.memmove(TestHCQ.d0.kernargs_ptr+len(TestHCQ.addr), TestHCQ.addr2, len(TestHCQ.addr2))
TestHCQ.compute_queue = HWPM4Queue
def setUp(self):
TestHCQ.a.lazydata.buffer.copyin(memoryview(bytearray(struct.pack("ff", 0, 1))))
TestHCQ.b.lazydata.buffer.copyin(memoryview(bytearray(struct.pack("ff", 0, 0))))
def test_run_1000_times_one_submit(self):
q = HWComputeQueue()
q = TestHCQ.compute_queue()
for _ in range(1000):
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size)
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr+len(TestHCQ.addr), TestHCQ.runner.global_size, TestHCQ.runner.local_size)
@@ -42,20 +43,21 @@ class TestHCQ(unittest.TestCase):
assert (val:=TestHCQ.a.lazydata.buffer.as_buffer().cast("f")[0]) == 2000.0, f"got val {val}"
def test_run_1000_times(self):
q = HWComputeQueue()
q = TestHCQ.compute_queue()
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size)
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr+len(TestHCQ.addr), TestHCQ.runner.global_size,
TestHCQ.runner.local_size, TestHCQ.d0.completion_signal)
for _ in range(1000):
q.submit(TestHCQ.d0)
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal)
TestHCQ.d0.completion_signal.value = 1
# confirm signal was reset
with self.assertRaises(RuntimeError):
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=50)
assert (val:=TestHCQ.a.lazydata.buffer.as_buffer().cast("f")[0]) == 2000.0, f"got val {val}"
def test_run_to_3(self):
q = HWComputeQueue()
q = TestHCQ.compute_queue()
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size)
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr+len(TestHCQ.addr), TestHCQ.runner.global_size, TestHCQ.runner.local_size)
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size, TestHCQ.d0.completion_signal)
@@ -65,12 +67,12 @@ class TestHCQ(unittest.TestCase):
def test_wait_signal(self):
TestHCQ.d0.completion_signal.value = 1
HWComputeQueue().wait(TestHCQ.d0.completion_signal).signal(TestHCQ.d0.completion_signal).submit(TestHCQ.d0)
TestHCQ.compute_queue().wait(TestHCQ.d0.completion_signal).signal(TestHCQ.d0.completion_signal).submit(TestHCQ.d0)
with self.assertRaises(RuntimeError):
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=50)
# clean up
TestHCQ.d0.completion_signal.value = 0
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=1000)
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=1000, skip_check=True)
def test_wait_copy_signal(self):
TestHCQ.d0.completion_signal.value = 1
@@ -79,25 +81,26 @@ class TestHCQ(unittest.TestCase):
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=50)
# clean up
TestHCQ.d0.completion_signal.value = 0
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=1000)
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=1000, skip_check=True)
def test_run_normal(self):
q = HWComputeQueue()
q = TestHCQ.compute_queue()
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size, TestHCQ.d0.completion_signal)
q.submit(TestHCQ.d0)
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal)
assert (val:=TestHCQ.b.lazydata.buffer.as_buffer().cast("f")[0]) == 1.0, f"got val {val}"
def test_submit_empty_queues(self):
HWComputeQueue().submit(TestHCQ.d0)
TestHCQ.compute_queue().submit(TestHCQ.d0)
HWCopyQueue().submit(TestHCQ.d0)
def test_signal_timeout(self):
TestHCQ.d0.completion_signal.value = 1
with self.assertRaises(RuntimeError):
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=50)
def test_signal(self):
HWComputeQueue().signal(TestHCQ.d0.completion_signal).submit(TestHCQ.d0)
TestHCQ.compute_queue().signal(TestHCQ.d0.completion_signal).submit(TestHCQ.d0)
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal)
def test_copy_signal(self):
@@ -105,7 +108,7 @@ class TestHCQ(unittest.TestCase):
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal)
def test_run_signal(self):
q = HWComputeQueue()
q = TestHCQ.compute_queue()
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size)
q.signal(TestHCQ.d0.completion_signal)
q.submit(TestHCQ.d0)
@@ -117,9 +120,10 @@ class TestHCQ(unittest.TestCase):
q.copy(TestHCQ.a.lazydata.buffer._buf.va_addr, TestHCQ.b.lazydata.buffer._buf.va_addr, 8)
q.copy(TestHCQ.b.lazydata.buffer._buf.va_addr, TestHCQ.a.lazydata.buffer._buf.va_addr, 8)
q.signal(TestHCQ.d0.completion_signal)
for _ in range(1000):
for i in range(1000):
q.submit(TestHCQ.d0)
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal)
TestHCQ.d0.completion_signal.value = 1
# confirm signal was reset
with self.assertRaises(RuntimeError):
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal, timeout=50)
@@ -147,8 +151,8 @@ class TestHCQ(unittest.TestCase):
def test_cross_device_copy_bandwidth(self):
SZ = 2_000_000_000
a = Buffer("KFD", SZ, dtypes.uint8, options=BufferOptions(nolru=True)).allocate()
b = Buffer("KFD:1", SZ, dtypes.uint8, options=BufferOptions(nolru=True)).allocate()
a = Buffer("KFD", SZ, dtypes.uint8, options=BufferOptions(nolru=True)).allocate()
TestHCQ.d0._gpu_map(b._buf)
q = HWCopyQueue()
q.copy(a._buf.va_addr, b._buf.va_addr, SZ)
@@ -158,7 +162,7 @@ class TestHCQ(unittest.TestCase):
assert gb_s > 2 and gb_s < 50
def test_interleave_compute_and_copy(self):
q = HWComputeQueue()
q = TestHCQ.compute_queue()
qc = HWCopyQueue()
q.exec(TestHCQ.runner.clprg, TestHCQ.d0.kernargs_ptr, TestHCQ.runner.global_size, TestHCQ.runner.local_size) # b = [1, 2]
q.signal(sig:=KFDDevice._get_signal(10))
@@ -173,12 +177,13 @@ class TestHCQ(unittest.TestCase):
assert (val:=TestHCQ.a.lazydata.buffer.as_buffer().cast("f")[0]) == 1.0, f"got val {val}"
def test_cross_device_signal(self):
q1 = HWComputeQueue()
q2 = HWComputeQueue()
d1 = Device["KFD:1"]
q1 = TestHCQ.compute_queue()
q2 = TestHCQ.compute_queue()
q1.signal(TestHCQ.d0.completion_signal)
q2.wait(TestHCQ.d0.completion_signal)
q2.submit(TestHCQ.d0)
q1.submit(TestHCQ.d1)
q1.submit(d1)
TestHCQ.d0._wait_signal(TestHCQ.d0.completion_signal)
if __name__ == "__main__":

View File

@@ -3720,6 +3720,540 @@ struct_amd_signal_s._fields_ = [
]
amd_signal_t = struct_amd_signal_s
amd_kernel_code_version32_t = ctypes.c_uint32
# values for enumeration 'amd_kernel_code_version_t'
# NOTE(review): machine-generated (ctypeslib). Several enum members share the
# same value, so duplicate dict keys below collapse to the last entry — here
# key 1 keeps only 'AMD_KERNEL_CODE_VERSION_MINOR'. The flat module-level
# constants below are unaffected; just don't use __enumvalues for reverse
# lookup of aliased members. Same pattern recurs throughout this generated file.
amd_kernel_code_version_t__enumvalues = {
    1: 'AMD_KERNEL_CODE_VERSION_MAJOR',
    1: 'AMD_KERNEL_CODE_VERSION_MINOR',
}
AMD_KERNEL_CODE_VERSION_MAJOR = 1
AMD_KERNEL_CODE_VERSION_MINOR = 1
amd_kernel_code_version_t = ctypes.c_uint32 # enum
amd_machine_kind16_t = ctypes.c_uint16
# values for enumeration 'amd_machine_kind_t'
amd_machine_kind_t__enumvalues = {
0: 'AMD_MACHINE_KIND_UNDEFINED',
1: 'AMD_MACHINE_KIND_AMDGPU',
}
AMD_MACHINE_KIND_UNDEFINED = 0
AMD_MACHINE_KIND_AMDGPU = 1
amd_machine_kind_t = ctypes.c_uint32 # enum
amd_machine_version16_t = ctypes.c_uint16
# values for enumeration 'amd_float_round_mode_t'
amd_float_round_mode_t__enumvalues = {
0: 'AMD_FLOAT_ROUND_MODE_NEAREST_EVEN',
1: 'AMD_FLOAT_ROUND_MODE_PLUS_INFINITY',
2: 'AMD_FLOAT_ROUND_MODE_MINUS_INFINITY',
3: 'AMD_FLOAT_ROUND_MODE_ZERO',
}
AMD_FLOAT_ROUND_MODE_NEAREST_EVEN = 0
AMD_FLOAT_ROUND_MODE_PLUS_INFINITY = 1
AMD_FLOAT_ROUND_MODE_MINUS_INFINITY = 2
AMD_FLOAT_ROUND_MODE_ZERO = 3
amd_float_round_mode_t = ctypes.c_uint32 # enum
# values for enumeration 'amd_float_denorm_mode_t'
amd_float_denorm_mode_t__enumvalues = {
0: 'AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE_OUTPUT',
1: 'AMD_FLOAT_DENORM_MODE_FLUSH_OUTPUT',
2: 'AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE',
3: 'AMD_FLOAT_DENORM_MODE_NO_FLUSH',
}
AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE_OUTPUT = 0
AMD_FLOAT_DENORM_MODE_FLUSH_OUTPUT = 1
AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE = 2
AMD_FLOAT_DENORM_MODE_NO_FLUSH = 3
amd_float_denorm_mode_t = ctypes.c_uint32 # enum
amd_compute_pgm_rsrc_one32_t = ctypes.c_uint32
# values for enumeration 'amd_compute_pgm_rsrc_one_t'
amd_compute_pgm_rsrc_one_t__enumvalues = {
0: 'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT',
6: 'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH',
63: 'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT',
6: 'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT',
4: 'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH',
960: 'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT',
10: 'AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY_SHIFT',
2: 'AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY_WIDTH',
3072: 'AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY',
12: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32_SHIFT',
2: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32_WIDTH',
12288: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32',
14: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64_SHIFT',
2: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64_WIDTH',
49152: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64',
16: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32_SHIFT',
2: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32_WIDTH',
196608: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32',
18: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64_SHIFT',
2: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64_WIDTH',
786432: 'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64',
20: 'AMD_COMPUTE_PGM_RSRC_ONE_PRIV_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_ONE_PRIV_WIDTH',
1048576: 'AMD_COMPUTE_PGM_RSRC_ONE_PRIV',
21: 'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP_WIDTH',
2097152: 'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP',
22: 'AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE_WIDTH',
4194304: 'AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE',
23: 'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE_WIDTH',
8388608: 'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE',
24: 'AMD_COMPUTE_PGM_RSRC_ONE_BULKY_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_ONE_BULKY_WIDTH',
16777216: 'AMD_COMPUTE_PGM_RSRC_ONE_BULKY',
25: 'AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER_WIDTH',
33554432: 'AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER',
26: 'AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1_SHIFT',
6: 'AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1_WIDTH',
-67108864: 'AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1',
}
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT = 0
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH = 6
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT = 63
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT = 6
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH = 4
AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT = 960
AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY_SHIFT = 10
AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY_WIDTH = 2
AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY = 3072
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32_SHIFT = 12
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32_WIDTH = 2
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32 = 12288
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64_SHIFT = 14
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64_WIDTH = 2
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64 = 49152
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32_SHIFT = 16
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32_WIDTH = 2
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32 = 196608
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64_SHIFT = 18
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64_WIDTH = 2
AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64 = 786432
AMD_COMPUTE_PGM_RSRC_ONE_PRIV_SHIFT = 20
AMD_COMPUTE_PGM_RSRC_ONE_PRIV_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_ONE_PRIV = 1048576
AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP_SHIFT = 21
AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP = 2097152
AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE_SHIFT = 22
AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE = 4194304
AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE_SHIFT = 23
AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE = 8388608
AMD_COMPUTE_PGM_RSRC_ONE_BULKY_SHIFT = 24
AMD_COMPUTE_PGM_RSRC_ONE_BULKY_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_ONE_BULKY = 16777216
AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER_SHIFT = 25
AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER = 33554432
AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1_SHIFT = 26
AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1_WIDTH = 6
AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1 = -67108864
amd_compute_pgm_rsrc_one_t = ctypes.c_int32 # enum
# values for enumeration 'amd_system_vgpr_workitem_id_t'
amd_system_vgpr_workitem_id_t__enumvalues = {
0: 'AMD_SYSTEM_VGPR_WORKITEM_ID_X',
1: 'AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y',
2: 'AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y_Z',
3: 'AMD_SYSTEM_VGPR_WORKITEM_ID_UNDEFINED',
}
AMD_SYSTEM_VGPR_WORKITEM_ID_X = 0
AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y = 1
AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y_Z = 2
AMD_SYSTEM_VGPR_WORKITEM_ID_UNDEFINED = 3
amd_system_vgpr_workitem_id_t = ctypes.c_uint32 # enum
amd_compute_pgm_rsrc_two32_t = ctypes.c_uint32
# values for enumeration 'amd_compute_pgm_rsrc_two_t'
amd_compute_pgm_rsrc_two_t__enumvalues = {
0: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET_WIDTH',
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET',
1: 'AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT_SHIFT',
5: 'AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT_WIDTH',
62: 'AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT',
6: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER_WIDTH',
64: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER',
7: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X_WIDTH',
128: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X',
8: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y_WIDTH',
256: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y',
9: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z_WIDTH',
512: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z',
10: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO_WIDTH',
1024: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO',
11: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID_SHIFT',
2: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID_WIDTH',
6144: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID',
13: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH_WIDTH',
8192: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH',
14: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION_WIDTH',
16384: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION',
15: 'AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE_SHIFT',
9: 'AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE_WIDTH',
16744448: 'AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE',
24: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION_WIDTH',
16777216: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION',
25: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE_WIDTH',
33554432: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE',
26: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO_WIDTH',
67108864: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO',
27: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW_WIDTH',
134217728: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW',
28: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW_WIDTH',
268435456: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW',
29: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT_WIDTH',
536870912: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT',
30: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO_WIDTH',
1073741824: 'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO',
31: 'AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1_SHIFT',
1: 'AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1_WIDTH',
-2147483648: 'AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1',
}
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET_SHIFT = 0
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 1
AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT_SHIFT = 1
AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT_WIDTH = 5
AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT = 62
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER_SHIFT = 6
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER = 64
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X_SHIFT = 7
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X = 128
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y_SHIFT = 8
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y = 256
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z_SHIFT = 9
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z = 512
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO_SHIFT = 10
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO = 1024
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID_SHIFT = 11
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID_WIDTH = 2
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID = 6144
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH_SHIFT = 13
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH = 8192
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION_SHIFT = 14
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION = 16384
AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE_SHIFT = 15
AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE_WIDTH = 9
AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE = 16744448
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION_SHIFT = 24
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION = 16777216
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE_SHIFT = 25
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE = 33554432
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO_SHIFT = 26
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO = 67108864
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW_SHIFT = 27
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW = 134217728
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW_SHIFT = 28
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW = 268435456
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT_SHIFT = 29
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT = 536870912
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO_SHIFT = 30
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO = 1073741824
AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1_SHIFT = 31
AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1_WIDTH = 1
AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1 = -2147483648
amd_compute_pgm_rsrc_two_t = ctypes.c_int32 # enum
# values for enumeration 'amd_element_byte_size_t'
amd_element_byte_size_t__enumvalues = {
0: 'AMD_ELEMENT_BYTE_SIZE_2',
1: 'AMD_ELEMENT_BYTE_SIZE_4',
2: 'AMD_ELEMENT_BYTE_SIZE_8',
3: 'AMD_ELEMENT_BYTE_SIZE_16',
}
AMD_ELEMENT_BYTE_SIZE_2 = 0
AMD_ELEMENT_BYTE_SIZE_4 = 1
AMD_ELEMENT_BYTE_SIZE_8 = 2
AMD_ELEMENT_BYTE_SIZE_16 = 3
amd_element_byte_size_t = ctypes.c_uint32 # enum
# 32-bit storage type for the kernel_code_properties bitmask field.
amd_kernel_code_properties32_t = ctypes.c_uint32
# values for enumeration 'amd_kernel_code_properties_t'
# NOTE(review): this dict (as emitted by ctypeslib) contains many duplicate
# keys — e.g. the key 1 appears five times below — so Python keeps only the
# LAST name per value. It is NOT a faithful value->name map; the flat
# constants after the dict are the authoritative values.
amd_kernel_code_properties_t__enumvalues = {
    0: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT',
    1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH',
    1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER',
    1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR_SHIFT',
    1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR_WIDTH',
    2: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR',
    2: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR_SHIFT',
    1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR_WIDTH',
    4: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR',
    3: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT',
    1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH',
    8: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR',
    4: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID_SHIFT',
    1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID_WIDTH',
    16: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID',
    5: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT',
    1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH',
    32: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT',
    6: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT',
    1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH',
    64: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE',
    7: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT',
    1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH',
    128: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X',
    8: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT',
    1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH',
    256: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y',
    9: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT',
    1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH',
    512: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z',
    10: 'AMD_KERNEL_CODE_PROPERTIES_RESERVED1_SHIFT',
    6: 'AMD_KERNEL_CODE_PROPERTIES_RESERVED1_WIDTH',
    64512: 'AMD_KERNEL_CODE_PROPERTIES_RESERVED1',
    16: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS_SHIFT',
    1: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS_WIDTH',
    65536: 'AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS',
    17: 'AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE_SHIFT',
    2: 'AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE_WIDTH',
    393216: 'AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE',
    19: 'AMD_KERNEL_CODE_PROPERTIES_IS_PTR64_SHIFT',
    1: 'AMD_KERNEL_CODE_PROPERTIES_IS_PTR64_WIDTH',
    524288: 'AMD_KERNEL_CODE_PROPERTIES_IS_PTR64',
    20: 'AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK_SHIFT',
    1: 'AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK_WIDTH',
    1048576: 'AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK',
    21: 'AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED_SHIFT',
    1: 'AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED_WIDTH',
    2097152: 'AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED',
    22: 'AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED_SHIFT',
    1: 'AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED_WIDTH',
    4194304: 'AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED',
    23: 'AMD_KERNEL_CODE_PROPERTIES_RESERVED2_SHIFT',
    9: 'AMD_KERNEL_CODE_PROPERTIES_RESERVED2_WIDTH',
    -8388608: 'AMD_KERNEL_CODE_PROPERTIES_RESERVED2',
}
# Flat bit-field constants for the kernel_code_properties mask.
# Pattern: *_SHIFT is the bit offset, *_WIDTH the bit count, and the bare
# name is the mask value (WIDTH ones shifted left by SHIFT).
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = 1
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR = 2
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR = 4
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR = 8
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID = 16
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT = 32
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = 64
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = 128
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = 256
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = 512
AMD_KERNEL_CODE_PROPERTIES_RESERVED1_SHIFT = 10
AMD_KERNEL_CODE_PROPERTIES_RESERVED1_WIDTH = 6
AMD_KERNEL_CODE_PROPERTIES_RESERVED1 = 64512
AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS_SHIFT = 16
AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1
AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS = 65536
AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE_SHIFT = 17
AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE_WIDTH = 2
AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE = 393216
AMD_KERNEL_CODE_PROPERTIES_IS_PTR64_SHIFT = 19
AMD_KERNEL_CODE_PROPERTIES_IS_PTR64_WIDTH = 1
AMD_KERNEL_CODE_PROPERTIES_IS_PTR64 = 524288
AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK_SHIFT = 20
AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK_WIDTH = 1
AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK = 1048576
AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED_SHIFT = 21
AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED_WIDTH = 1
AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED = 2097152
AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED_SHIFT = 22
AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED_WIDTH = 1
AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED = 4194304
AMD_KERNEL_CODE_PROPERTIES_RESERVED2_SHIFT = 23
AMD_KERNEL_CODE_PROPERTIES_RESERVED2_WIDTH = 9
# Negative because the generator emitted the top bits as a signed 32-bit
# value (mask 0xFF800000 interpreted as int32).
AMD_KERNEL_CODE_PROPERTIES_RESERVED2 = -8388608
amd_kernel_code_properties_t = ctypes.c_int32 # enum
# 8-bit storage type for power-of-two alignment fields (e.g. the
# *_segment_alignment bytes in amd_kernel_code_t below).
amd_powertwo8_t = ctypes.c_ubyte
# values for enumeration 'amd_powertwo_t'
# Log2 encoding: value n represents 2**n (AMD_POWERTWO_1 = 0 -> 1 byte, etc).
amd_powertwo_t__enumvalues = {
    0: 'AMD_POWERTWO_1',
    1: 'AMD_POWERTWO_2',
    2: 'AMD_POWERTWO_4',
    3: 'AMD_POWERTWO_8',
    4: 'AMD_POWERTWO_16',
    5: 'AMD_POWERTWO_32',
    6: 'AMD_POWERTWO_64',
    7: 'AMD_POWERTWO_128',
    8: 'AMD_POWERTWO_256',
}
AMD_POWERTWO_1 = 0
AMD_POWERTWO_2 = 1
AMD_POWERTWO_4 = 2
AMD_POWERTWO_8 = 3
AMD_POWERTWO_16 = 4
AMD_POWERTWO_32 = 5
AMD_POWERTWO_64 = 6
AMD_POWERTWO_128 = 7
AMD_POWERTWO_256 = 8
amd_powertwo_t = ctypes.c_uint32 # enum
# 64-bit storage type for the enabled_control_directives bitmask (see the
# first field of amd_control_directives_t below).
amd_enabled_control_directive64_t = ctypes.c_uint64
# values for enumeration 'amd_enabled_control_directive_t'
# Single-bit flags; each marks one control directive as enabled.
amd_enabled_control_directive_t__enumvalues = {
    1: 'AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_BREAK_EXCEPTIONS',
    2: 'AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_DETECT_EXCEPTIONS',
    4: 'AMD_ENABLED_CONTROL_DIRECTIVE_MAX_DYNAMIC_GROUP_SIZE',
    8: 'AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_GRID_SIZE',
    16: 'AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_WORKGROUP_SIZE',
    32: 'AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_DIM',
    64: 'AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_GRID_SIZE',
    128: 'AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_WORKGROUP_SIZE',
    256: 'AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRE_NO_PARTIAL_WORKGROUPS',
}
AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_BREAK_EXCEPTIONS = 1
AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_DETECT_EXCEPTIONS = 2
AMD_ENABLED_CONTROL_DIRECTIVE_MAX_DYNAMIC_GROUP_SIZE = 4
AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_GRID_SIZE = 8
AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_WORKGROUP_SIZE = 16
AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_DIM = 32
AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_GRID_SIZE = 64
AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_WORKGROUP_SIZE = 128
AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRE_NO_PARTIAL_WORKGROUPS = 256
amd_enabled_control_directive_t = ctypes.c_uint32 # enum
# 16-bit storage type for exception-kind bitmasks.
amd_exception_kind16_t = ctypes.c_uint16
# values for enumeration 'amd_exception_kind_t'
# Single-bit flags for the IEEE-754 exception kinds.
amd_exception_kind_t__enumvalues = {
    1: 'AMD_EXCEPTION_KIND_INVALID_OPERATION',
    2: 'AMD_EXCEPTION_KIND_DIVISION_BY_ZERO',
    4: 'AMD_EXCEPTION_KIND_OVERFLOW',
    8: 'AMD_EXCEPTION_KIND_UNDERFLOW',
    16: 'AMD_EXCEPTION_KIND_INEXACT',
}
AMD_EXCEPTION_KIND_INVALID_OPERATION = 1
AMD_EXCEPTION_KIND_DIVISION_BY_ZERO = 2
AMD_EXCEPTION_KIND_OVERFLOW = 4
AMD_EXCEPTION_KIND_UNDERFLOW = 8
AMD_EXCEPTION_KIND_INEXACT = 16
amd_exception_kind_t = ctypes.c_uint32 # enum
# ctypes mirror of amd_control_directives_t from amd_hsa_kernel_code.h.
# Declared with `pass` first and fields assigned afterwards — the standard
# ctypeslib pattern that allows self-/forward-referencing structs.
class struct_amd_control_directives_s(Structure):
    pass
struct_amd_control_directives_s._pack_ = 1 # source:False
struct_amd_control_directives_s._fields_ = [
    # Bitmask of amd_enabled_control_directive_t flags selecting which of
    # the fields below are in effect.
    ('enabled_control_directives', ctypes.c_uint64),
    ('enable_break_exceptions', ctypes.c_uint16),
    ('enable_detect_exceptions', ctypes.c_uint16),
    ('max_dynamic_group_size', ctypes.c_uint32),
    ('max_flat_grid_size', ctypes.c_uint64),
    ('max_flat_workgroup_size', ctypes.c_uint32),
    ('required_dim', ctypes.c_ubyte),
    ('reserved1', ctypes.c_ubyte * 3),
    ('required_grid_size', ctypes.c_uint64 * 3),
    ('required_workgroup_size', ctypes.c_uint32 * 3),
    # Padding to the fixed struct size defined by the C header.
    ('reserved2', ctypes.c_ubyte * 60),
]
amd_control_directives_t = struct_amd_control_directives_s
# ctypes mirror of amd_kernel_code_t from amd_hsa_kernel_code.h: the kernel
# code-object header. ops_kfd reads this (via from_address on a kernel
# handle) to obtain compute_pgm_rsrc1/2 and segment sizes for PM4 dispatch.
class struct_amd_kernel_code_s(Structure):
    pass
struct_amd_kernel_code_s._pack_ = 1 # source:False
struct_amd_kernel_code_s._fields_ = [
    ('amd_kernel_code_version_major', ctypes.c_uint32),
    ('amd_kernel_code_version_minor', ctypes.c_uint32),
    ('amd_machine_kind', ctypes.c_uint16),
    ('amd_machine_version_major', ctypes.c_uint16),
    ('amd_machine_version_minor', ctypes.c_uint16),
    ('amd_machine_version_stepping', ctypes.c_uint16),
    # Signed byte offset from this header to the kernel entry point.
    ('kernel_code_entry_byte_offset', ctypes.c_int64),
    ('kernel_code_prefetch_byte_offset', ctypes.c_int64),
    ('kernel_code_prefetch_byte_size', ctypes.c_uint64),
    ('max_scratch_backing_memory_byte_size', ctypes.c_uint64),
    # Raw COMPUTE_PGM_RSRC1/RSRC2 register values for the kernel; bit
    # layouts are the AMD_COMPUTE_PGM_RSRC_ONE/TWO_* constants above.
    ('compute_pgm_rsrc1', ctypes.c_uint32),
    ('compute_pgm_rsrc2', ctypes.c_uint32),
    # Bitmask of AMD_KERNEL_CODE_PROPERTIES_* flags.
    ('kernel_code_properties', ctypes.c_uint32),
    ('workitem_private_segment_byte_size', ctypes.c_uint32),
    ('workgroup_group_segment_byte_size', ctypes.c_uint32),
    ('gds_segment_byte_size', ctypes.c_uint32),
    ('kernarg_segment_byte_size', ctypes.c_uint64),
    ('workgroup_fbarrier_count', ctypes.c_uint32),
    ('wavefront_sgpr_count', ctypes.c_uint16),
    ('workitem_vgpr_count', ctypes.c_uint16),
    ('reserved_vgpr_first', ctypes.c_uint16),
    ('reserved_vgpr_count', ctypes.c_uint16),
    ('reserved_sgpr_first', ctypes.c_uint16),
    ('reserved_sgpr_count', ctypes.c_uint16),
    ('debug_wavefront_private_segment_offset_sgpr', ctypes.c_uint16),
    ('debug_private_segment_buffer_sgpr', ctypes.c_uint16),
    # Alignments are amd_powertwo8_t values (log2 encoding, see above).
    ('kernarg_segment_alignment', ctypes.c_ubyte),
    ('group_segment_alignment', ctypes.c_ubyte),
    ('private_segment_alignment', ctypes.c_ubyte),
    ('wavefront_size', ctypes.c_ubyte),
    ('call_convention', ctypes.c_int32),
    ('reserved1', ctypes.c_ubyte * 12),
    ('runtime_loader_kernel_symbol', ctypes.c_uint64),
    ('control_directives', amd_control_directives_t),
]
amd_kernel_code_t = struct_amd_kernel_code_s
# ctypes mirror of amd_runtime_loader_debug_info_t: debugger-facing record
# describing a loaded kernel's ELF image and name.
class struct_amd_runtime_loader_debug_info_s(Structure):
    pass
struct_amd_runtime_loader_debug_info_s._pack_ = 1 # source:False
struct_amd_runtime_loader_debug_info_s._fields_ = [
    ('elf_raw', ctypes.POINTER(None)),          # void* to the raw ELF bytes
    ('elf_size', ctypes.c_uint64),
    ('kernel_name', ctypes.POINTER(ctypes.c_char)),
    ('owning_segment', ctypes.POINTER(None)),   # void* to the owning loader segment
]
amd_runtime_loader_debug_info_t = struct_amd_runtime_loader_debug_info_s
class struct_BrigModuleHeader(Structure):
pass
@@ -4166,7 +4700,187 @@ struct_hsa_ven_amd_aqlprofile_1_00_pfn_s._fields_ = [
hsa_ven_amd_aqlprofile_1_00_pfn_t = struct_hsa_ven_amd_aqlprofile_1_00_pfn_s
hsa_ven_amd_aqlprofile_pfn_t = struct_hsa_ven_amd_aqlprofile_1_00_pfn_s
__all__ = \
['AMD_QUEUE_PROPERTIES_ENABLE_PROFILING',
['AMD_COMPUTE_PGM_RSRC_ONE_BULKY',
'AMD_COMPUTE_PGM_RSRC_ONE_BULKY_SHIFT',
'AMD_COMPUTE_PGM_RSRC_ONE_BULKY_WIDTH',
'AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER',
'AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER_SHIFT',
'AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER_WIDTH',
'AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE',
'AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE_SHIFT',
'AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE_WIDTH',
'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP',
'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP_SHIFT',
'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP_WIDTH',
'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE',
'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE_SHIFT',
'AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE_WIDTH',
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64',
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64_SHIFT',
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64_WIDTH',
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32',
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32_SHIFT',
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32_WIDTH',
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64',
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64_SHIFT',
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64_WIDTH',
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32',
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32_SHIFT',
'AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32_WIDTH',
'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT',
'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT',
'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH',
'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT',
'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT',
'AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH',
'AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY',
'AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY_SHIFT',
'AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY_WIDTH',
'AMD_COMPUTE_PGM_RSRC_ONE_PRIV',
'AMD_COMPUTE_PGM_RSRC_ONE_PRIV_SHIFT',
'AMD_COMPUTE_PGM_RSRC_ONE_PRIV_WIDTH',
'AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1',
'AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1_SHIFT',
'AMD_COMPUTE_PGM_RSRC_ONE_RESERVED1_WIDTH',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH_SHIFT',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH_WIDTH',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE_SHIFT',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE_WIDTH',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO_SHIFT',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO_WIDTH',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT_SHIFT',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT_WIDTH',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION_SHIFT',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION_WIDTH',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW_SHIFT',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW_WIDTH',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW_SHIFT',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW_WIDTH',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO_SHIFT',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO_WIDTH',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION_SHIFT',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION_WIDTH',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET_SHIFT',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET_WIDTH',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X_SHIFT',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X_WIDTH',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y_SHIFT',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y_WIDTH',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z_SHIFT',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z_WIDTH',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO_SHIFT',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO_WIDTH',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER_SHIFT',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER_WIDTH',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID_SHIFT',
'AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID_WIDTH',
'AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE',
'AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE_SHIFT',
'AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE_WIDTH',
'AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1',
'AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1_SHIFT',
'AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1_WIDTH',
'AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT',
'AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT_SHIFT',
'AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT_WIDTH',
'AMD_ELEMENT_BYTE_SIZE_16', 'AMD_ELEMENT_BYTE_SIZE_2',
'AMD_ELEMENT_BYTE_SIZE_4', 'AMD_ELEMENT_BYTE_SIZE_8',
'AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_BREAK_EXCEPTIONS',
'AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_DETECT_EXCEPTIONS',
'AMD_ENABLED_CONTROL_DIRECTIVE_MAX_DYNAMIC_GROUP_SIZE',
'AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_GRID_SIZE',
'AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_WORKGROUP_SIZE',
'AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_DIM',
'AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_GRID_SIZE',
'AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_WORKGROUP_SIZE',
'AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRE_NO_PARTIAL_WORKGROUPS',
'AMD_EXCEPTION_KIND_DIVISION_BY_ZERO',
'AMD_EXCEPTION_KIND_INEXACT',
'AMD_EXCEPTION_KIND_INVALID_OPERATION',
'AMD_EXCEPTION_KIND_OVERFLOW', 'AMD_EXCEPTION_KIND_UNDERFLOW',
'AMD_FLOAT_DENORM_MODE_FLUSH_OUTPUT',
'AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE',
'AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE_OUTPUT',
'AMD_FLOAT_DENORM_MODE_NO_FLUSH',
'AMD_FLOAT_ROUND_MODE_MINUS_INFINITY',
'AMD_FLOAT_ROUND_MODE_NEAREST_EVEN',
'AMD_FLOAT_ROUND_MODE_PLUS_INFINITY', 'AMD_FLOAT_ROUND_MODE_ZERO',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS_SHIFT',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS_WIDTH',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID_SHIFT',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID_WIDTH',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR_SHIFT',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR_WIDTH',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR_SHIFT',
'AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR_WIDTH',
'AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED',
'AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED_SHIFT',
'AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED_WIDTH',
'AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK',
'AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK_SHIFT',
'AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK_WIDTH',
'AMD_KERNEL_CODE_PROPERTIES_IS_PTR64',
'AMD_KERNEL_CODE_PROPERTIES_IS_PTR64_SHIFT',
'AMD_KERNEL_CODE_PROPERTIES_IS_PTR64_WIDTH',
'AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED',
'AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED_SHIFT',
'AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED_WIDTH',
'AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE',
'AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE_SHIFT',
'AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE_WIDTH',
'AMD_KERNEL_CODE_PROPERTIES_RESERVED1',
'AMD_KERNEL_CODE_PROPERTIES_RESERVED1_SHIFT',
'AMD_KERNEL_CODE_PROPERTIES_RESERVED1_WIDTH',
'AMD_KERNEL_CODE_PROPERTIES_RESERVED2',
'AMD_KERNEL_CODE_PROPERTIES_RESERVED2_SHIFT',
'AMD_KERNEL_CODE_PROPERTIES_RESERVED2_WIDTH',
'AMD_KERNEL_CODE_VERSION_MAJOR', 'AMD_KERNEL_CODE_VERSION_MINOR',
'AMD_MACHINE_KIND_AMDGPU', 'AMD_MACHINE_KIND_UNDEFINED',
'AMD_POWERTWO_1', 'AMD_POWERTWO_128', 'AMD_POWERTWO_16',
'AMD_POWERTWO_2', 'AMD_POWERTWO_256', 'AMD_POWERTWO_32',
'AMD_POWERTWO_4', 'AMD_POWERTWO_64', 'AMD_POWERTWO_8',
'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING',
'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_SHIFT',
'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_WIDTH',
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER',
@@ -4186,9 +4900,13 @@ __all__ = \
'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_WIDTH',
'AMD_SIGNAL_KIND_DOORBELL', 'AMD_SIGNAL_KIND_INVALID',
'AMD_SIGNAL_KIND_LEGACY_DOORBELL', 'AMD_SIGNAL_KIND_USER',
'BrigModule_t', 'HSA_ACCESS_PERMISSION_NONE',
'HSA_ACCESS_PERMISSION_RO', 'HSA_ACCESS_PERMISSION_RW',
'HSA_ACCESS_PERMISSION_WO', 'HSA_AGENT_FEATURE_AGENT_DISPATCH',
'AMD_SYSTEM_VGPR_WORKITEM_ID_UNDEFINED',
'AMD_SYSTEM_VGPR_WORKITEM_ID_X',
'AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y',
'AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y_Z', 'BrigModule_t',
'HSA_ACCESS_PERMISSION_NONE', 'HSA_ACCESS_PERMISSION_RO',
'HSA_ACCESS_PERMISSION_RW', 'HSA_ACCESS_PERMISSION_WO',
'HSA_AGENT_FEATURE_AGENT_DISPATCH',
'HSA_AGENT_FEATURE_KERNEL_DISPATCH',
'HSA_AGENT_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES',
'HSA_AGENT_INFO_CACHE_SIZE',
@@ -4609,11 +5327,22 @@ __all__ = \
'HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK',
'HSA_WAIT_STATE_ACTIVE', 'HSA_WAIT_STATE_BLOCKED',
'HSA_WAVEFRONT_INFO_SIZE', 'MEMORY_TYPE_NONE',
'MEMORY_TYPE_PINNED', 'amd_queue_properties32_t',
'amd_queue_properties_t', 'amd_queue_t', 'amd_signal_kind64_t',
'amd_signal_kind_t', 'amd_signal_t',
'c__EA_hsa_access_permission_t', 'c__EA_hsa_agent_feature_t',
'c__EA_hsa_agent_info_t',
'MEMORY_TYPE_PINNED', 'amd_compute_pgm_rsrc_one32_t',
'amd_compute_pgm_rsrc_one_t', 'amd_compute_pgm_rsrc_two32_t',
'amd_compute_pgm_rsrc_two_t', 'amd_control_directives_t',
'amd_element_byte_size_t', 'amd_enabled_control_directive64_t',
'amd_enabled_control_directive_t', 'amd_exception_kind16_t',
'amd_exception_kind_t', 'amd_float_denorm_mode_t',
'amd_float_round_mode_t', 'amd_kernel_code_properties32_t',
'amd_kernel_code_properties_t', 'amd_kernel_code_t',
'amd_kernel_code_version32_t', 'amd_kernel_code_version_t',
'amd_machine_kind16_t', 'amd_machine_kind_t',
'amd_machine_version16_t', 'amd_powertwo8_t', 'amd_powertwo_t',
'amd_queue_properties32_t', 'amd_queue_properties_t',
'amd_queue_t', 'amd_runtime_loader_debug_info_t',
'amd_signal_kind64_t', 'amd_signal_kind_t', 'amd_signal_t',
'amd_system_vgpr_workitem_id_t', 'c__EA_hsa_access_permission_t',
'c__EA_hsa_agent_feature_t', 'c__EA_hsa_agent_info_t',
'c__EA_hsa_amd_agent_memory_pool_info_t',
'c__EA_hsa_amd_copy_direction_t',
'c__EA_hsa_amd_hw_exception_reset_cause_t',
@@ -4982,8 +5711,9 @@ __all__ = \
'hsa_wait_state_t__enumvalues', 'hsa_wavefront_get_info',
'hsa_wavefront_info_t', 'hsa_wavefront_info_t__enumvalues',
'hsa_wavefront_t', 'int32_t', 'kAqlProfileLib', 'size_t',
'struct_BrigModuleHeader', 'struct_amd_queue_s',
'struct_amd_signal_s',
'struct_BrigModuleHeader', 'struct_amd_control_directives_s',
'struct_amd_kernel_code_s', 'struct_amd_queue_s',
'struct_amd_runtime_loader_debug_info_s', 'struct_amd_signal_s',
'struct_c__SA_hsa_ext_amd_aql_pm4_packet_t',
'struct_c__SA_hsa_ven_amd_aqlprofile_descriptor_t',
'struct_c__SA_hsa_ven_amd_aqlprofile_event_t',

View File

@@ -1,9 +1,9 @@
from __future__ import annotations
from typing import Tuple, List, Any
import os, fcntl, ctypes, functools, re, pathlib, mmap, struct, errno
import os, fcntl, ctypes, functools, re, pathlib, mmap, struct, errno, subprocess
from tinygrad.device import Compiled, LRUAllocator, Compiler, CompilerOptions
from tinygrad.buffer import BufferOptions
from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up
from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, DEBUG
from tinygrad.renderer.cstyle import HIPRenderer
from tinygrad.runtime.driver.hip_comgr import compile_hip
import tinygrad.runtime.autogen.kfd as kfd
@@ -101,42 +101,191 @@ SHF_ALLOC = 0x2
EMPTY_SIGNAL = hsa.hsa_signal_t()
SIGNAL_VALUE_OFFSET = getattr(hsa.amd_signal_t, 'value').offset
class HWComputeQueue:
BASE_ADDR = 0x00001260
PACKET3_SET_SH_REG_START = 0x2c00
SUB = PACKET3_SET_SH_REG_START - BASE_ADDR
regCOMPUTE_PGM_LO = 0x1bac - SUB
regCOMPUTE_PGM_RSRC1 = 0x1bb2 - SUB
regCOMPUTE_USER_DATA_0 = 0x1be0 - SUB
regCOMPUTE_START_X = 0x1ba4 - SUB
regCOMPUTE_TMPRING_SIZE = 0x1bb8 - SUB
regCOMPUTE_RESOURCE_LIMITS = 0x1bb5 - SUB
regCOMPUTE_RESTART_X = 0x1bbb - SUB
regCOMPUTE_STATIC_THREAD_MGMT_SE0 = 0x1bb6 - SUB
regCOMPUTE_STATIC_THREAD_MGMT_SE2 = 0x1bb9 - SUB
regCOMPUTE_STATIC_THREAD_MGMT_SE4 = 0x1bcb - SUB
regBIF_BX_PF1_GPU_HDP_FLUSH_REQ = 0x0106
regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0107
# VGT_EVENT_TYPE in navi10_enum.h
CACHE_FLUSH_AND_INV_TS_EVENT = 0x14
CS_PARTIAL_FLUSH = 0x7
COMPUTE_SHADER_EN = 1
FORCE_START_AT_000 = 1 << 2
CS_W32_EN = 1 << 15
def format_struct(s):
sdats = []
for field_name, field_type in s._fields_:
dat = getattr(s, field_name)
if isinstance(dat, int): sdats.append(f"{field_name}:0x{dat:X}")
else: sdats.append(f"{field_name}:{dat}")
return sdats
"""
regCOMPUTE_PGM_RSRC1 0 0x1bb2 12 0 0
VGPRS 0 5
SGPRS 6 9
PRIORITY 10 11
FLOAT_MODE 12 19
PRIV 20 20
DX10_CLAMP 21 21
IEEE_MODE 23 23
BULKY 24 24
FP16_OVFL 26 26
WGP_MODE 29 29
MEM_ORDERED 30 30
FWD_PROGRESS 31 31
regCOMPUTE_PGM_RSRC2 0 0x1bb3 11 0 0
SCRATCH_EN 0 0
USER_SGPR 1 5
TRAP_PRESENT 6 6
TGID_X_EN 7 7
TGID_Y_EN 8 8
TGID_Z_EN 9 9
TG_SIZE_EN 10 10
TIDIG_COMP_CNT 11 12
EXCP_EN_MSB 13 14
LDS_SIZE 15 23
EXCP_EN 24 30
regCOMPUTE_RESOURCE_LIMITS 0 0x1bb5 6 0 0
WAVES_PER_SH 0 9
TG_PER_CU 12 15
LOCK_THRESHOLD 16 21
SIMD_DEST_CNTL 22 22
FORCE_SIMD_DIST 23 23
CU_GROUP_COUNT 24 26
"""
class HWPM4Queue:
def __init__(self): self.q = []
def hdp_flush(self):
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | amd_gpu.WAIT_REG_MEM_FUNCTION(3) | amd_gpu.WAIT_REG_MEM_ENGINE(0),
regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE, 0x0, 0x0, 0x20]
def invalidate_cache(self):
# overkill?
addr=0x0
sz=(1 << 64)-1
gli=1
glv=1
glk=1
gl1=1
gl2=1
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, #0x80000000,
sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)]
return self
def exec(self, prg:KFDProgram, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), completion_signal=None):
if completion_signal is not None: completion_signal.value = 1
self.q.append(hsa.hsa_kernel_dispatch_packet_t(
header=DISPATCH_KERNEL_HEADER, setup=DISPATCH_KERNEL_SETUP,
workgroup_size_x=local_size[0], workgroup_size_y=local_size[1], workgroup_size_z=local_size[2],
grid_size_x=global_size[0]*local_size[0], grid_size_y=global_size[1]*local_size[1], grid_size_z=global_size[2]*local_size[2],
kernel_object=prg.handle, group_segment_size=prg.group_segment_size, private_segment_size=prg.private_segment_size, kernarg_address=kernargs,
completion_signal=hsa.hsa_signal_t(ctypes.addressof(completion_signal)) if completion_signal is not None else EMPTY_SIGNAL))
self.hdp_flush()
self.invalidate_cache()
code = hsa.amd_kernel_code_t.from_address(prg.handle) # NOTE: this is wrong, it's not this object
assert code.kernel_code_properties & 0x400 == 0x400 # ENABLE_WAVEFRONT_SIZE32
assert code.workitem_private_segment_byte_size == 0
assert code.max_scratch_backing_memory_byte_size == 0
assert code.kernel_code_prefetch_byte_size == 0
#assert (mod:=(prg.group_segment_size%32)) == 0, f"group_segment_size is {prg.group_segment_size} mod is {mod}"
#assert prg.private_segment_size == 0
#for s in format_struct(code): print(s)
#print(hex(code.compute_pgm_rsrc1), hex(code.compute_pgm_rsrc2))
rsrc1, rsrc2 = code.compute_pgm_rsrc1, code.compute_pgm_rsrc2
# this is required
lds_size = ((prg.group_segment_size+63)//64)&0x1FF
assert lds_size <= 0x80 # larger numbers stall the GPU
# lds_size = 0x1ff
# rsrc2 |= ((((prg.group_segment_size+63)//64)&0x1FF) << 15)
# rsrc2 |= 0x7f << 15
# rsrc2 |= 0x1ff << 15
# rsrc2 |= ((prg.group_segment_size+31)//64) << 15
# rsrc2 |= (prg.group_segment_size//32) << 15
# user_sgpr =
prog_addr = (prg.handle + code.kernel_code_entry_byte_offset) >> 8
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), regCOMPUTE_PGM_LO, prog_addr&0xFFFFFFFF, prog_addr>>32, 0, 0,
(prg.device.scratch.va_addr>>8)&0xFFFFFFFF, prg.device.scratch.va_addr>>40]
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_PGM_RSRC1, rsrc1, rsrc2 | (lds_size << 15)]
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), regCOMPUTE_TMPRING_SIZE, 0x00200200] # (waveSize << 12) | (numWaves)
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), regCOMPUTE_RESTART_X, 0,0,0,0]
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_STATIC_THREAD_MGMT_SE0, 0xFFFFFFFF,0xFFFFFFFF]
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_STATIC_THREAD_MGMT_SE2, 0xFFFFFFFF,0xFFFFFFFF]
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), regCOMPUTE_STATIC_THREAD_MGMT_SE4, 0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF]
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_USER_DATA_0, kernargs&0xFFFFFFFF, kernargs>>32]
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), regCOMPUTE_START_X, 0,0,0, local_size[0],local_size[1],local_size[2],0,0]
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), regCOMPUTE_RESOURCE_LIMITS, 0]
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3),
global_size[0],global_size[1],global_size[2], CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
# have to self wait since flush doesn't work
self.signal(sig:=KFDDevice._get_signal())
self.wait(sig)
if completion_signal: self.signal(completion_signal)
return self
def signal(self, signal:hsa.amd_signal_t):
def wait(self, signal:hsa.amd_signal_t, value=0):
addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(3) | amd_gpu.WAIT_REG_MEM_ENGINE(0),
addr&0xFFFFFFFF, addr>>32, value, 0xffffffff, 4]
return self
def signal(self, signal:hsa.amd_signal_t, value=0):
#assert signal.value == 0, f"entering signal without it being set to 0, but {signal.value}"
signal.value = 1
self.q.append(hsa.hsa_barrier_and_packet_t(header=BARRIER_HEADER, completion_signal=hsa.hsa_signal_t(ctypes.addressof(signal))))
return self
def wait(self, signal:hsa.amd_signal_t):
sig = hsa.hsa_barrier_and_packet_t(header=BARRIER_HEADER)
sig.dep_signal[0] = hsa.hsa_signal_t(ctypes.addressof(signal))
self.q.append(sig)
# NOTE: this needs an EOP buffer on the queue or it will NULL pointer
addr = ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
# event_index__mec_release_mem__end_of_pipe = 5
# event_index__mec_release_mem__shader_done = 6
amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | \
amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | \
amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | \
amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ,
amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(1) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(2) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
addr&0xFFFFFFFF, addr>>32,
value&0xFFFFFFFF, value>>32, 0]
if signal.event_mailbox_ptr != 0:
self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
# event_index__mec_release_mem__end_of_pipe = 5
# event_index__mec_release_mem__shader_done = 6
amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | \
amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | \
amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | \
amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ,
amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(1) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(2) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
signal.event_mailbox_ptr&0xFFFFFFFF, signal.event_mailbox_ptr>>32,
signal.event_id&0xFFFFFFFF, signal.event_id>>32,
signal.event_id]
return self
def submit(self, device:KFDDevice):
if not len(self.q): return
write_ptr, read_ptr = device.amd_aql_queue.write_dispatch_id, device.amd_aql_queue.read_dispatch_id
if (len(self.q)+write_ptr-read_ptr)*AQL_PACKET_SIZE > device.aql_ring.size: raise RuntimeError("AQL queue overrun")
for cmd in self.q:
ring_addr = device.aql_ring.va_addr + (write_ptr*AQL_PACKET_SIZE) % device.aql_ring.size
ctypes.memmove(ring_addr, ctypes.addressof(cmd), AQL_PACKET_SIZE)
write_ptr += 1
# TODO: add CPU memory barrier here
device.amd_aql_queue.write_dispatch_id = write_ptr
device.aql_doorbell[0] = device.aql_doorbell_value + len(self.q) - 1
device.aql_doorbell_value += len(self.q)
wptr = device.pm4_write_pointer[0]
pm4_buffer_view = to_mv(device.pm4_ring.va_addr, device.pm4_ring.size).cast("I")
for i, value in enumerate(self.q): pm4_buffer_view[(wptr+i)%(device.pm4_ring.size//4)] = value
device.pm4_write_pointer[0] = wptr + len(self.q)
device.pm4_doorbell[0] = wptr + len(self.q)
return self
# prebuilt sdma packets
@@ -180,10 +329,10 @@ class HWCopyQueue:
self.q.append(sdma_cache_wb)
return self
def signal(self, signal:hsa.amd_signal_t):
def signal(self, signal:hsa.amd_signal_t, value=0):
#assert signal.value == 0
signal.value = 1
self.q.append(sdma_pkts.atomic(op=amd_gpu.SDMA_OP_ATOMIC, operation=amd_gpu.SDMA_ATOMIC_ADD64,
addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET, src_data=(1<<64)-1))
self.q.append(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=ctypes.addressof(signal) + SIGNAL_VALUE_OFFSET, data=value))
if signal.event_mailbox_ptr != 0:
self.q.append(sdma_pkts.fence(op=amd_gpu.SDMA_OP_FENCE, mtype=3, addr=signal.event_mailbox_ptr, data=signal.event_id))
self.q.append(sdma_pkts.trap(op=amd_gpu.SDMA_OP_TRAP, int_ctx=signal.event_id))
@@ -200,6 +349,10 @@ class KFDProgram:
# TODO; this API needs the type signature of the function and global_size/local_size
self.device, self.name, self.lib = device, name, lib
if DEBUG >= 6:
asm = subprocess.check_output(["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], input=lib)
print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
_phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]
@@ -237,13 +390,13 @@ class KFDProgram:
for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])
HWComputeQueue().exec(self, self.device.kernargs_ptr, global_size, local_size,
self.device.completion_signal if wait else None).submit(self.device)
HWPM4Queue().exec(self, self.device.kernargs_ptr, global_size, local_size,
self.device.completion_signal if wait else None).submit(self.device)
self.device.kernargs_ptr += self.kernargs_segment_size
if wait:
self.device._wait_signal(self.device.completion_signal)
assert (wp:=self.device.amd_aql_queue.write_dispatch_id) == (rp:=self.device.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
#assert (wp:=self.device.amd_aql_queue.write_dispatch_id) == (rp:=self.device.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
return (self.device.completion_signal.end_ts-self.device.completion_signal.start_ts)/1e8
class KFDAllocator(LRUAllocator):
@@ -301,9 +454,9 @@ class KFDAllocator(LRUAllocator):
def transfer(self, dest, src, sz:int, src_dev:KFDDevice, dest_dev:KFDDevice):
dest_dev._gpu_map(src)
q = HWComputeQueue().signal(sig := KFDDevice._get_signal())
q = HWPM4Queue().signal(sig := KFDDevice._get_signal())
HWCopyQueue().wait(sig).copy(dest.va_addr, src.va_addr, sz).signal(sigc := KFDDevice._get_signal()).submit(dest_dev)
HWComputeQueue().wait(sigc).submit(dest_dev)
HWPM4Queue().wait(sigc).submit(dest_dev)
q.wait(sigc).submit(src_dev)
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
@@ -351,7 +504,9 @@ class KFDDevice(Compiled):
num = KFDDevice.signal_number
KFDDevice.signal_number += 1
if KFDDevice.signal_number == SIGNAL_COUNT: KFDDevice.signal_number = 16
#print("signal", num)
ret = hsa.amd_signal_t.from_address(KFDDevice.signals_page.va_addr + SIGNAL_SIZE*num)
ret.value = 0
ret.kind = hsa.AMD_SIGNAL_KIND_USER
if sync_event is not None:
ret.event_mailbox_ptr = KFDDevice.event_page.va_addr + sync_event.event_slot_index*8
@@ -359,13 +514,17 @@ class KFDDevice(Compiled):
return ret
@classmethod
def _wait_signal(self, signal:hsa.amd_signal_t, timeout=60000):
def _wait_signal(self, signal:hsa.amd_signal_t, timeout=10000, skip_check=False):
assert signal.event_id != 0, "can't wait on this signal"
evt_arr = (kfd.struct_kfd_event_data * 1)()
evt_arr[0].event_id = signal.event_id
ret = kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=timeout)
if ret.wait_result != 0: raise RuntimeError(f"wait_result: {ret.wait_result}, {timeout} ms TIMEOUT!")
#val = signal.value
#while val != 0: val = signal.value
assert skip_check or signal.value == 0, f"not set to 0, but {signal.value}"
def __init__(self, device:str=""):
if KFDDevice.kfd == -1:
KFDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
@@ -391,69 +550,52 @@ class KFDDevice(Compiled):
self.signal_sdma = KFDDevice._get_signal(self.device_id*2+1, sync_event=kio.create_event(KFDDevice.kfd, auto_reset=1))
self.gart_aql = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
self.gart_sdma = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
self.aql_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
self.eop_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.kernargs_ptr = self.kernargs.va_addr
self.ctx_save_restore_address = self._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
# AQL Queue
self.amd_aql_queue = hsa.amd_queue_t.from_address(self.gart_aql.va_addr)
self.amd_aql_queue.write_dispatch_id = 0
self.amd_aql_queue.read_dispatch_id = 0
self.amd_aql_queue.read_dispatch_id_field_base_byte_offset = getattr(hsa.amd_queue_t, 'read_dispatch_id').offset
self.amd_aql_queue.queue_properties = hsa.AMD_QUEUE_PROPERTIES_IS_PTR64 | hsa.AMD_QUEUE_PROPERTIES_ENABLE_PROFILING
self.amd_aql_queue.max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
self.amd_aql_queue.max_wave_id = self.properties['max_waves_per_simd'] * self.properties['simd_per_cu'] - 1
# scratch setup
max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
max_wave_id = self.properties['max_waves_per_simd'] * self.properties['simd_per_cu'] - 1
self.max_private_segment_size = 4096
wave_scratch_len = round_up(((self.amd_aql_queue.max_wave_id + 1) * self.max_private_segment_size), 256) # gfx11 requires alignment of 256
self.scratch_len = (self.amd_aql_queue.max_cu_id + 1) * self.properties['max_slots_scratch_cu'] * wave_scratch_len
wave_scratch_len = round_up(((max_wave_id + 1) * self.max_private_segment_size), 256) # gfx11 requires alignment of 256
self.scratch_len = (max_cu_id + 1) * self.properties['max_slots_scratch_cu'] * wave_scratch_len
self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.amd_aql_queue.scratch_backing_memory_location = self.scratch.va_addr
self.amd_aql_queue.scratch_backing_memory_byte_size = self.scratch_len
self.amd_aql_queue.scratch_wave64_lane_byte_size = self.max_private_segment_size * (self.amd_aql_queue.max_wave_id + 1) // 64
self.amd_aql_queue.scratch_resource_descriptor[0] = self.scratch.va_addr & 0xFFFFFFFF
self.amd_aql_queue.scratch_resource_descriptor[1] = ((self.scratch.va_addr >> 32) & 0xFFFF) | (1 << 30) # va_hi | SWIZZLE_ENABLE
self.amd_aql_queue.scratch_resource_descriptor[2] = self.scratch_len & 0xFFFFFFFF
self.amd_aql_queue.scratch_resource_descriptor[3] = 0x20814fac # FORMAT=BUF_FORMAT_32_UINT,OOB_SELECT=2,ADD_TID_ENABLE=1,TYPE=SQ_RSRC_BUF,SQ_SELs
engines = self.properties['array_count'] // self.properties['simd_arrays_per_engine']
self.amd_aql_queue.compute_tmpring_size = (wave_scratch_len // 256) << 12 | (self.scratch_len // (wave_scratch_len * engines))
self.aql_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.aql_ring.va_addr, ring_size=self.aql_ring.size, gpu_id=self.gpu_id,
queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
eop_buffer_address=self.eop_buffer.va_addr, eop_buffer_size=self.eop_buffer.size,
ctx_save_restore_address=self.ctx_save_restore_address.va_addr, ctx_save_restore_size=self.ctx_save_restore_address.size,
ctl_stack_size = 0xa000,
write_pointer_address=self.gart_aql.va_addr + getattr(hsa.amd_queue_t, 'write_dispatch_id').offset,
read_pointer_address=self.gart_aql.va_addr + getattr(hsa.amd_queue_t, 'read_dispatch_id').offset)
self.doorbells_base = self.aql_queue.doorbell_offset & (~0x1fff) # doorbell is two pages
self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, self.doorbells_base)
self.aql_doorbell = to_mv(self.doorbells + self.aql_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
self.aql_doorbell_value = 0
# SDMA Queue
self.gart_sdma = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
self.sdma_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
self.sdma_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.sdma_ring.va_addr, ring_size=self.sdma_ring.size, gpu_id=self.gpu_id,
queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
write_pointer_address=self.gart_sdma.va_addr, read_pointer_address=self.gart_sdma.va_addr+8)
# doorbell page
self.doorbells_base = self.sdma_queue.doorbell_offset & (~0x1fff) # doorbell is two pages
self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, self.doorbells_base)
self.sdma_read_pointer = to_mv(self.sdma_queue.read_pointer_address, 8).cast("Q")
self.sdma_write_pointer = to_mv(self.sdma_queue.write_pointer_address, 8).cast("Q")
self.sdma_doorbell = to_mv(self.doorbells + self.sdma_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
self.sdma_doorbell_value = 0
# PM4 stuff
self.pm4_indirect_buf = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
pm4_indirect_cmd = (ctypes.c_uint32*13)(amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), self.pm4_indirect_buf.va_addr & 0xffffffff,
(self.pm4_indirect_buf.va_addr>>32) & 0xffffffff, 8 | amd_gpu.INDIRECT_BUFFER_VALID, 0xa)
ctypes.memmove(ctypes.addressof(pm4_cmds:=(ctypes.c_uint16*27)(1))+2, ctypes.addressof(pm4_indirect_cmd), ctypes.sizeof(pm4_indirect_cmd))
self.pm4_packet = hsa.hsa_ext_amd_aql_pm4_packet_t(header=VENDOR_HEADER, pm4_command=pm4_cmds,
completion_signal=hsa.hsa_signal_t(ctypes.addressof(self.completion_signal)))
# PM4 Queue
self.pm4_ctx_save_restore_address = self._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.eop_pm4_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.gart_pm4 = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
self.pm4_ring = self._gpu_alloc(0x100000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
self.pm4_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.pm4_ring.va_addr, ring_size=self.pm4_ring.size, gpu_id=self.gpu_id,
queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
eop_buffer_address=self.eop_pm4_buffer.va_addr, eop_buffer_size=self.eop_pm4_buffer.size,
# TODO: are these needed? (i know eop is)
ctx_save_restore_address=self.pm4_ctx_save_restore_address.va_addr, ctx_save_restore_size=self.pm4_ctx_save_restore_address.size,
ctl_stack_size = 0xa000,
write_pointer_address=self.gart_pm4.va_addr, read_pointer_address=self.gart_pm4.va_addr+8)
self.pm4_read_pointer = to_mv(self.pm4_queue.read_pointer_address, 8).cast("Q")
self.pm4_write_pointer = to_mv(self.pm4_queue.write_pointer_address, 8).cast("Q")
self.pm4_doorbell = to_mv(self.doorbells + self.pm4_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
super().__init__(device, KFDAllocator(self), KFDCompiler(self.arch), functools.partial(KFDProgram, self))
@@ -469,23 +611,16 @@ class KFDDevice(Compiled):
q.submit(self)
def _submit_cache_inv(self, addr=0x0, sz=(1 << 64)-1, gli=0, glv=0, glk=0, gl1=0, gl2=0):
pm4_buffer_view = to_mv(self.pm4_indirect_buf.va_addr, 0x1000).cast("I")
pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0,
sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)]
for i, value in enumerate(pm4_cmd): pm4_buffer_view[i] = value
q = HWComputeQueue()
q.q.append(self.pm4_packet)
q.submit(self)
HWPM4Queue().invalidate_cache().signal(self.completion_signal).submit(self)
self._wait_signal(self.completion_signal)
assert (wp:=self.amd_aql_queue.write_dispatch_id) == (rp:=self.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
assert (wp:=(self.pm4_write_pointer[0]%(self.pm4_ring.size//4))) == (rp:=self.pm4_read_pointer[0]), \
f"didn't run {wp} != {rp} len {self.pm4_ring.size//4}"
def synchronize(self):
HWComputeQueue().signal(self.completion_signal).submit(self)
HWPM4Queue().signal(self.completion_signal).submit(self)
self._wait_signal(self.completion_signal)
assert (wp:=self.amd_aql_queue.write_dispatch_id) == (rp:=self.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
assert (wp:=(self.pm4_write_pointer[0]%(self.pm4_ring.size//4))) == (rp:=self.pm4_read_pointer[0]), \
f"didn't run {wp} != {rp} len {self.pm4_ring.size//4}"
# reset kernargs
self.kernargs_ptr = self.kernargs.va_addr