mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-02-11 07:05:04 -05:00
kfd driver wip (#3912)
* kfd driver wip * cleanups * kfd almost ready to ring doorbell * ding dong? * issues with signals * something * works * ops kfd * add amd_signal_t * works...sometimes * program runs * _gpu_alloc cleanup * cleanups * work * header + enable profiling (#3959) * header + enable profiling * just cleaner * measure * only local time domain * remove old comments * fix with master * elf parsing (#3965) * elf parsing * fix kernels with private * not used * clean up * clean up 2 * add flags * kfd sdma (#3970) * working sdma * remove driver, shorter * all commands we might need * svm * kfd remove hardcoded values (#4007) * remove hardcoded values * match above line * 7k lines + revert hsa * update that from origin * fix sdma reg gen * not the updated SDMA * compiler_opts * don't require kfd_ioctl * get ioctls from python * get ioctls from python * remove build_sdma_command * merge into 64-bit fields * shorter * fix property spelling and off by one --------- Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
This commit is contained in:
1430
tinygrad/runtime/autogen/amd_sdma.py
Normal file
1430
tinygrad/runtime/autogen/amd_sdma.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -3588,6 +3588,128 @@ try:
|
||||
hsa_amd_vmem_get_alloc_properties_from_handle.argtypes = [hsa_amd_vmem_alloc_handle_t, ctypes.POINTER(struct_hsa_amd_memory_pool_s), ctypes.POINTER(c__EA_hsa_amd_memory_type_t)]
|
||||
except AttributeError:
|
||||
pass
|
||||
amd_queue_properties32_t = ctypes.c_uint32
|
||||
|
||||
# values for enumeration 'amd_queue_properties_t'
|
||||
amd_queue_properties_t__enumvalues = {
|
||||
0: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_SHIFT',
|
||||
1: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_WIDTH',
|
||||
1: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER',
|
||||
1: 'AMD_QUEUE_PROPERTIES_IS_PTR64_SHIFT',
|
||||
1: 'AMD_QUEUE_PROPERTIES_IS_PTR64_WIDTH',
|
||||
2: 'AMD_QUEUE_PROPERTIES_IS_PTR64',
|
||||
2: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_SHIFT',
|
||||
1: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_WIDTH',
|
||||
4: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS',
|
||||
3: 'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_SHIFT',
|
||||
1: 'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_WIDTH',
|
||||
8: 'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING',
|
||||
4: 'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_SHIFT',
|
||||
1: 'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_WIDTH',
|
||||
16: 'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE',
|
||||
5: 'AMD_QUEUE_PROPERTIES_RESERVED1_SHIFT',
|
||||
27: 'AMD_QUEUE_PROPERTIES_RESERVED1_WIDTH',
|
||||
-32: 'AMD_QUEUE_PROPERTIES_RESERVED1',
|
||||
}
|
||||
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_SHIFT = 0
|
||||
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_WIDTH = 1
|
||||
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER = 1
|
||||
AMD_QUEUE_PROPERTIES_IS_PTR64_SHIFT = 1
|
||||
AMD_QUEUE_PROPERTIES_IS_PTR64_WIDTH = 1
|
||||
AMD_QUEUE_PROPERTIES_IS_PTR64 = 2
|
||||
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_SHIFT = 2
|
||||
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_WIDTH = 1
|
||||
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS = 4
|
||||
AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_SHIFT = 3
|
||||
AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_WIDTH = 1
|
||||
AMD_QUEUE_PROPERTIES_ENABLE_PROFILING = 8
|
||||
AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_SHIFT = 4
|
||||
AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_WIDTH = 1
|
||||
AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE = 16
|
||||
AMD_QUEUE_PROPERTIES_RESERVED1_SHIFT = 5
|
||||
AMD_QUEUE_PROPERTIES_RESERVED1_WIDTH = 27
|
||||
AMD_QUEUE_PROPERTIES_RESERVED1 = -32
|
||||
amd_queue_properties_t = ctypes.c_int32 # enum
|
||||
class struct_amd_queue_s(Structure):
|
||||
pass
|
||||
|
||||
struct_amd_queue_s._pack_ = 1 # source:False
|
||||
struct_amd_queue_s._fields_ = [
|
||||
('hsa_queue', hsa_queue_t),
|
||||
('reserved1', ctypes.c_uint32 * 4),
|
||||
('write_dispatch_id', ctypes.c_uint64),
|
||||
('group_segment_aperture_base_hi', ctypes.c_uint32),
|
||||
('private_segment_aperture_base_hi', ctypes.c_uint32),
|
||||
('max_cu_id', ctypes.c_uint32),
|
||||
('max_wave_id', ctypes.c_uint32),
|
||||
('max_legacy_doorbell_dispatch_id_plus_1', ctypes.c_uint64),
|
||||
('legacy_doorbell_lock', ctypes.c_uint32),
|
||||
('reserved2', ctypes.c_uint32 * 9),
|
||||
('read_dispatch_id', ctypes.c_uint64),
|
||||
('read_dispatch_id_field_base_byte_offset', ctypes.c_uint32),
|
||||
('compute_tmpring_size', ctypes.c_uint32),
|
||||
('scratch_resource_descriptor', ctypes.c_uint32 * 4),
|
||||
('scratch_backing_memory_location', ctypes.c_uint64),
|
||||
('scratch_backing_memory_byte_size', ctypes.c_uint64),
|
||||
('scratch_wave64_lane_byte_size', ctypes.c_uint32),
|
||||
('queue_properties', ctypes.c_uint32),
|
||||
('reserved3', ctypes.c_uint32 * 2),
|
||||
('queue_inactive_signal', hsa_signal_t),
|
||||
('reserved4', ctypes.c_uint32 * 14),
|
||||
]
|
||||
|
||||
amd_queue_t = struct_amd_queue_s
|
||||
amd_signal_kind64_t = ctypes.c_int64
|
||||
|
||||
# values for enumeration 'amd_signal_kind_t'
|
||||
amd_signal_kind_t__enumvalues = {
|
||||
0: 'AMD_SIGNAL_KIND_INVALID',
|
||||
1: 'AMD_SIGNAL_KIND_USER',
|
||||
-1: 'AMD_SIGNAL_KIND_DOORBELL',
|
||||
-2: 'AMD_SIGNAL_KIND_LEGACY_DOORBELL',
|
||||
}
|
||||
AMD_SIGNAL_KIND_INVALID = 0
|
||||
AMD_SIGNAL_KIND_USER = 1
|
||||
AMD_SIGNAL_KIND_DOORBELL = -1
|
||||
AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2
|
||||
amd_signal_kind_t = ctypes.c_int32 # enum
|
||||
class struct_amd_signal_s(Structure):
|
||||
pass
|
||||
|
||||
class union_amd_signal_s_0(Union):
|
||||
pass
|
||||
|
||||
union_amd_signal_s_0._pack_ = 1 # source:False
|
||||
union_amd_signal_s_0._fields_ = [
|
||||
('value', ctypes.c_int64),
|
||||
('legacy_hardware_doorbell_ptr', ctypes.POINTER(ctypes.c_uint32)),
|
||||
('hardware_doorbell_ptr', ctypes.POINTER(ctypes.c_uint64)),
|
||||
]
|
||||
|
||||
class union_amd_signal_s_1(Union):
|
||||
pass
|
||||
|
||||
union_amd_signal_s_1._pack_ = 1 # source:False
|
||||
union_amd_signal_s_1._fields_ = [
|
||||
('queue_ptr', ctypes.POINTER(struct_amd_queue_s)),
|
||||
('reserved2', ctypes.c_uint64),
|
||||
]
|
||||
|
||||
struct_amd_signal_s._pack_ = 1 # source:False
|
||||
struct_amd_signal_s._anonymous_ = ('_0', '_1',)
|
||||
struct_amd_signal_s._fields_ = [
|
||||
('kind', ctypes.c_int64),
|
||||
('_0', union_amd_signal_s_0),
|
||||
('event_mailbox_ptr', ctypes.c_uint64),
|
||||
('event_id', ctypes.c_uint32),
|
||||
('reserved1', ctypes.c_uint32),
|
||||
('start_ts', ctypes.c_uint64),
|
||||
('end_ts', ctypes.c_uint64),
|
||||
('_1', union_amd_signal_s_1),
|
||||
('reserved3', ctypes.c_uint32 * 2),
|
||||
]
|
||||
|
||||
amd_signal_t = struct_amd_signal_s
|
||||
class struct_BrigModuleHeader(Structure):
|
||||
pass
|
||||
|
||||
@@ -3713,7 +3835,27 @@ struct_hsa_ext_finalizer_1_00_pfn_s._fields_ = [
|
||||
|
||||
hsa_ext_finalizer_1_00_pfn_t = struct_hsa_ext_finalizer_1_00_pfn_s
|
||||
__all__ = \
|
||||
['BrigModule_t', 'HSA_ACCESS_PERMISSION_NONE',
|
||||
['AMD_QUEUE_PROPERTIES_ENABLE_PROFILING',
|
||||
'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_SHIFT',
|
||||
'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_WIDTH',
|
||||
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER',
|
||||
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS',
|
||||
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_SHIFT',
|
||||
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_WIDTH',
|
||||
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_SHIFT',
|
||||
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_WIDTH',
|
||||
'AMD_QUEUE_PROPERTIES_IS_PTR64',
|
||||
'AMD_QUEUE_PROPERTIES_IS_PTR64_SHIFT',
|
||||
'AMD_QUEUE_PROPERTIES_IS_PTR64_WIDTH',
|
||||
'AMD_QUEUE_PROPERTIES_RESERVED1',
|
||||
'AMD_QUEUE_PROPERTIES_RESERVED1_SHIFT',
|
||||
'AMD_QUEUE_PROPERTIES_RESERVED1_WIDTH',
|
||||
'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE',
|
||||
'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_SHIFT',
|
||||
'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_WIDTH',
|
||||
'AMD_SIGNAL_KIND_DOORBELL', 'AMD_SIGNAL_KIND_INVALID',
|
||||
'AMD_SIGNAL_KIND_LEGACY_DOORBELL', 'AMD_SIGNAL_KIND_USER',
|
||||
'BrigModule_t', 'HSA_ACCESS_PERMISSION_NONE',
|
||||
'HSA_ACCESS_PERMISSION_RO', 'HSA_ACCESS_PERMISSION_RW',
|
||||
'HSA_ACCESS_PERMISSION_WO', 'HSA_AGENT_FEATURE_AGENT_DISPATCH',
|
||||
'HSA_AGENT_FEATURE_KERNEL_DISPATCH',
|
||||
@@ -4082,8 +4224,10 @@ __all__ = \
|
||||
'HSA_VARIABLE_SEGMENT_READONLY', 'HSA_WAIT_STATE_ACTIVE',
|
||||
'HSA_WAIT_STATE_BLOCKED', 'HSA_WAVEFRONT_INFO_SIZE',
|
||||
'MEMORY_TYPE_NONE', 'MEMORY_TYPE_PINNED',
|
||||
'c__EA_hsa_access_permission_t', 'c__EA_hsa_agent_feature_t',
|
||||
'c__EA_hsa_agent_info_t',
|
||||
'amd_queue_properties32_t', 'amd_queue_properties_t',
|
||||
'amd_queue_t', 'amd_signal_kind64_t', 'amd_signal_kind_t',
|
||||
'amd_signal_t', 'c__EA_hsa_access_permission_t',
|
||||
'c__EA_hsa_agent_feature_t', 'c__EA_hsa_agent_info_t',
|
||||
'c__EA_hsa_amd_agent_memory_pool_info_t',
|
||||
'c__EA_hsa_amd_copy_direction_t',
|
||||
'c__EA_hsa_amd_hw_exception_reset_cause_t',
|
||||
@@ -4422,6 +4566,7 @@ __all__ = \
|
||||
'hsa_wait_state_t__enumvalues', 'hsa_wavefront_get_info',
|
||||
'hsa_wavefront_info_t', 'hsa_wavefront_info_t__enumvalues',
|
||||
'hsa_wavefront_t', 'int32_t', 'size_t', 'struct_BrigModuleHeader',
|
||||
'struct_amd_queue_s', 'struct_amd_signal_s',
|
||||
'struct_hsa_agent_dispatch_packet_s', 'struct_hsa_agent_s',
|
||||
'struct_hsa_amd_barrier_value_packet_s', 'struct_hsa_amd_event_s',
|
||||
'struct_hsa_amd_gpu_hw_exception_info_s',
|
||||
@@ -4455,4 +4600,5 @@ __all__ = \
|
||||
'struct_hsa_queue_s', 'struct_hsa_region_s',
|
||||
'struct_hsa_signal_group_s', 'struct_hsa_signal_s',
|
||||
'struct_hsa_wavefront_s', 'uint16_t', 'uint32_t', 'uint64_t',
|
||||
'union_amd_signal_s_0', 'union_amd_signal_s_1',
|
||||
'union_hsa_amd_event_s_0']
|
||||
|
||||
812
tinygrad/runtime/autogen/kfd.py
Normal file
812
tinygrad/runtime/autogen/kfd.py
Normal file
@@ -0,0 +1,812 @@
|
||||
# mypy: ignore-errors
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# TARGET arch is: []
|
||||
# WORD_SIZE is: 8
|
||||
# POINTER_SIZE is: 8
|
||||
# LONGDOUBLE_SIZE is: 16
|
||||
#
|
||||
import ctypes, os
|
||||
|
||||
|
||||
class AsDictMixin:
|
||||
@classmethod
|
||||
def as_dict(cls, self):
|
||||
result = {}
|
||||
if not isinstance(self, AsDictMixin):
|
||||
# not a structure, assume it's already a python object
|
||||
return self
|
||||
if not hasattr(cls, "_fields_"):
|
||||
return result
|
||||
# sys.version_info >= (3, 5)
|
||||
# for (field, *_) in cls._fields_: # noqa
|
||||
for field_tuple in cls._fields_: # noqa
|
||||
field = field_tuple[0]
|
||||
if field.startswith('PADDING_'):
|
||||
continue
|
||||
value = getattr(self, field)
|
||||
type_ = type(value)
|
||||
if hasattr(value, "_length_") and hasattr(value, "_type_"):
|
||||
# array
|
||||
if not hasattr(type_, "as_dict"):
|
||||
value = [v for v in value]
|
||||
else:
|
||||
type_ = type_._type_
|
||||
value = [type_.as_dict(v) for v in value]
|
||||
elif hasattr(value, "contents") and hasattr(value, "_type_"):
|
||||
# pointer
|
||||
try:
|
||||
if not hasattr(type_, "as_dict"):
|
||||
value = value.contents
|
||||
else:
|
||||
type_ = type_._type_
|
||||
value = type_.as_dict(value.contents)
|
||||
except ValueError:
|
||||
# nullptr
|
||||
value = None
|
||||
elif isinstance(value, AsDictMixin):
|
||||
# other structure
|
||||
value = type_.as_dict(value)
|
||||
result[field] = value
|
||||
return result
|
||||
|
||||
|
||||
class Structure(ctypes.Structure, AsDictMixin):
|
||||
|
||||
def __init__(self, *args, **kwds):
|
||||
# We don't want to use positional arguments fill PADDING_* fields
|
||||
|
||||
args = dict(zip(self.__class__._field_names_(), args))
|
||||
args.update(kwds)
|
||||
super(Structure, self).__init__(**args)
|
||||
|
||||
@classmethod
|
||||
def _field_names_(cls):
|
||||
if hasattr(cls, '_fields_'):
|
||||
return (f[0] for f in cls._fields_ if not f[0].startswith('PADDING'))
|
||||
else:
|
||||
return ()
|
||||
|
||||
@classmethod
|
||||
def get_type(cls, field):
|
||||
for f in cls._fields_:
|
||||
if f[0] == field:
|
||||
return f[1]
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def bind(cls, bound_fields):
|
||||
fields = {}
|
||||
for name, type_ in cls._fields_:
|
||||
if hasattr(type_, "restype"):
|
||||
if name in bound_fields:
|
||||
if bound_fields[name] is None:
|
||||
fields[name] = type_()
|
||||
else:
|
||||
# use a closure to capture the callback from the loop scope
|
||||
fields[name] = (
|
||||
type_((lambda callback: lambda *args: callback(*args))(
|
||||
bound_fields[name]))
|
||||
)
|
||||
del bound_fields[name]
|
||||
else:
|
||||
# default callback implementation (does nothing)
|
||||
try:
|
||||
default_ = type_(0).restype().value
|
||||
except TypeError:
|
||||
default_ = None
|
||||
fields[name] = type_((
|
||||
lambda default_: lambda *args: default_)(default_))
|
||||
else:
|
||||
# not a callback function, use default initialization
|
||||
if name in bound_fields:
|
||||
fields[name] = bound_fields[name]
|
||||
del bound_fields[name]
|
||||
else:
|
||||
fields[name] = type_()
|
||||
if len(bound_fields) != 0:
|
||||
raise ValueError(
|
||||
"Cannot bind the following unknown callback(s) {}.{}".format(
|
||||
cls.__name__, bound_fields.keys()
|
||||
))
|
||||
return cls(**fields)
|
||||
|
||||
|
||||
class Union(ctypes.Union, AsDictMixin):
|
||||
pass
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
KFD_IOCTL_H_INCLUDED = True # macro
|
||||
KFD_IOCTL_MAJOR_VERSION = 1 # macro
|
||||
KFD_IOCTL_MINOR_VERSION = 6 # macro
|
||||
KFD_IOC_QUEUE_TYPE_COMPUTE = 0x0 # macro
|
||||
KFD_IOC_QUEUE_TYPE_SDMA = 0x1 # macro
|
||||
KFD_IOC_QUEUE_TYPE_COMPUTE_AQL = 0x2 # macro
|
||||
KFD_IOC_QUEUE_TYPE_SDMA_XGMI = 0x3 # macro
|
||||
KFD_MAX_QUEUE_PERCENTAGE = 100 # macro
|
||||
KFD_MAX_QUEUE_PRIORITY = 15 # macro
|
||||
KFD_IOC_CACHE_POLICY_COHERENT = 0 # macro
|
||||
KFD_IOC_CACHE_POLICY_NONCOHERENT = 1 # macro
|
||||
NUM_OF_SUPPORTED_GPUS = 7 # macro
|
||||
MAX_ALLOWED_NUM_POINTS = 100 # macro
|
||||
MAX_ALLOWED_AW_BUFF_SIZE = 4096 # macro
|
||||
MAX_ALLOWED_WAC_BUFF_SIZE = 128 # macro
|
||||
KFD_IOC_EVENT_SIGNAL = 0 # macro
|
||||
KFD_IOC_EVENT_NODECHANGE = 1 # macro
|
||||
KFD_IOC_EVENT_DEVICESTATECHANGE = 2 # macro
|
||||
KFD_IOC_EVENT_HW_EXCEPTION = 3 # macro
|
||||
KFD_IOC_EVENT_SYSTEM_EVENT = 4 # macro
|
||||
KFD_IOC_EVENT_DEBUG_EVENT = 5 # macro
|
||||
KFD_IOC_EVENT_PROFILE_EVENT = 6 # macro
|
||||
KFD_IOC_EVENT_QUEUE_EVENT = 7 # macro
|
||||
KFD_IOC_EVENT_MEMORY = 8 # macro
|
||||
KFD_IOC_WAIT_RESULT_COMPLETE = 0 # macro
|
||||
KFD_IOC_WAIT_RESULT_TIMEOUT = 1 # macro
|
||||
KFD_IOC_WAIT_RESULT_FAIL = 2 # macro
|
||||
KFD_SIGNAL_EVENT_LIMIT = 4096 # macro
|
||||
KFD_HW_EXCEPTION_WHOLE_GPU_RESET = 0 # macro
|
||||
KFD_HW_EXCEPTION_PER_ENGINE_RESET = 1 # macro
|
||||
KFD_HW_EXCEPTION_GPU_HANG = 0 # macro
|
||||
KFD_HW_EXCEPTION_ECC = 1 # macro
|
||||
KFD_MEM_ERR_NO_RAS = 0 # macro
|
||||
KFD_MEM_ERR_SRAM_ECC = 1 # macro
|
||||
KFD_MEM_ERR_POISON_CONSUMED = 2 # macro
|
||||
KFD_MEM_ERR_GPU_HANG = 3 # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_VRAM = (1<<0) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_GTT = (1<<1) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_USERPTR = (1<<2) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL = (1<<3) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP = (1<<4) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE = (1<<31) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE = (1<<30) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC = (1<<29) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE = (1<<28) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM = (1<<27) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_COHERENT = (1<<26) # macro
|
||||
KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED = (1<<25) # macro
|
||||
# def KFD_SMI_EVENT_MASK_FROM_INDEX(i): # macro
|
||||
# return (1<<((i)-1))
|
||||
KFD_IOCTL_SVM_FLAG_HOST_ACCESS = 0x00000001 # macro
|
||||
KFD_IOCTL_SVM_FLAG_COHERENT = 0x00000002 # macro
|
||||
KFD_IOCTL_SVM_FLAG_HIVE_LOCAL = 0x00000004 # macro
|
||||
KFD_IOCTL_SVM_FLAG_GPU_RO = 0x00000008 # macro
|
||||
KFD_IOCTL_SVM_FLAG_GPU_EXEC = 0x00000010 # macro
|
||||
KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY = 0x00000020 # macro
|
||||
AMDKFD_IOCTL_BASE = 'K' # macro
|
||||
# def AMDKFD_IO(nr): # macro
|
||||
# return _IO('K',nr)
|
||||
# def AMDKFD_IOR(nr, type): # macro
|
||||
# return _IOR('K',nr,type)
|
||||
# def AMDKFD_IOW(nr, type): # macro
|
||||
# return _IOW('K',nr,type)
|
||||
# def AMDKFD_IOWR(nr, type): # macro
|
||||
# return _IOWR('K',nr,type)
|
||||
# AMDKFD_IOC_GET_VERSION = _IOR('K',nr,type) ( 0x01 , struct kfd_ioctl_get_version_args ) # macro
|
||||
# AMDKFD_IOC_CREATE_QUEUE = _IOWR('K',nr,type) ( 0x02 , struct kfd_ioctl_create_queue_args ) # macro
|
||||
# AMDKFD_IOC_DESTROY_QUEUE = _IOWR('K',nr,type) ( 0x03 , struct kfd_ioctl_destroy_queue_args ) # macro
|
||||
# AMDKFD_IOC_SET_MEMORY_POLICY = _IOW('K',nr,type) ( 0x04 , struct kfd_ioctl_set_memory_policy_args ) # macro
|
||||
# AMDKFD_IOC_GET_CLOCK_COUNTERS = _IOWR('K',nr,type) ( 0x05 , struct kfd_ioctl_get_clock_counters_args ) # macro
|
||||
# AMDKFD_IOC_GET_PROCESS_APERTURES = _IOR('K',nr,type) ( 0x06 , struct kfd_ioctl_get_process_apertures_args ) # macro
|
||||
# AMDKFD_IOC_UPDATE_QUEUE = _IOW('K',nr,type) ( 0x07 , struct kfd_ioctl_update_queue_args ) # macro
|
||||
# AMDKFD_IOC_CREATE_EVENT = _IOWR('K',nr,type) ( 0x08 , struct kfd_ioctl_create_event_args ) # macro
|
||||
# AMDKFD_IOC_DESTROY_EVENT = _IOW('K',nr,type) ( 0x09 , struct kfd_ioctl_destroy_event_args ) # macro
|
||||
# AMDKFD_IOC_SET_EVENT = _IOW('K',nr,type) ( 0x0A , struct kfd_ioctl_set_event_args ) # macro
|
||||
# AMDKFD_IOC_RESET_EVENT = _IOW('K',nr,type) ( 0x0B , struct kfd_ioctl_reset_event_args ) # macro
|
||||
# AMDKFD_IOC_WAIT_EVENTS = _IOWR('K',nr,type) ( 0x0C , struct kfd_ioctl_wait_events_args ) # macro
|
||||
# AMDKFD_IOC_DBG_REGISTER = _IOW('K',nr,type) ( 0x0D , struct kfd_ioctl_dbg_register_args ) # macro
|
||||
# AMDKFD_IOC_DBG_UNREGISTER = _IOW('K',nr,type) ( 0x0E , struct kfd_ioctl_dbg_unregister_args ) # macro
|
||||
# AMDKFD_IOC_DBG_ADDRESS_WATCH = _IOW('K',nr,type) ( 0x0F , struct kfd_ioctl_dbg_address_watch_args ) # macro
|
||||
# AMDKFD_IOC_DBG_WAVE_CONTROL = _IOW('K',nr,type) ( 0x10 , struct kfd_ioctl_dbg_wave_control_args ) # macro
|
||||
# AMDKFD_IOC_SET_SCRATCH_BACKING_VA = _IOWR('K',nr,type) ( 0x11 , struct kfd_ioctl_set_scratch_backing_va_args ) # macro
|
||||
# AMDKFD_IOC_GET_TILE_CONFIG = _IOWR('K',nr,type) ( 0x12 , struct kfd_ioctl_get_tile_config_args ) # macro
|
||||
# AMDKFD_IOC_SET_TRAP_HANDLER = _IOW('K',nr,type) ( 0x13 , struct kfd_ioctl_set_trap_handler_args ) # macro
|
||||
# AMDKFD_IOC_GET_PROCESS_APERTURES_NEW = _IOWR('K',nr,type) ( 0x14 , struct kfd_ioctl_get_process_apertures_new_args ) # macro
|
||||
# AMDKFD_IOC_ACQUIRE_VM = _IOW('K',nr,type) ( 0x15 , struct kfd_ioctl_acquire_vm_args ) # macro
|
||||
# AMDKFD_IOC_ALLOC_MEMORY_OF_GPU = _IOWR('K',nr,type) ( 0x16 , struct kfd_ioctl_alloc_memory_of_gpu_args ) # macro
|
||||
# AMDKFD_IOC_FREE_MEMORY_OF_GPU = _IOW('K',nr,type) ( 0x17 , struct kfd_ioctl_free_memory_of_gpu_args ) # macro
|
||||
# AMDKFD_IOC_MAP_MEMORY_TO_GPU = _IOWR('K',nr,type) ( 0x18 , struct kfd_ioctl_map_memory_to_gpu_args ) # macro
|
||||
# AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU = _IOWR('K',nr,type) ( 0x19 , struct kfd_ioctl_unmap_memory_from_gpu_args ) # macro
|
||||
# AMDKFD_IOC_SET_CU_MASK = _IOW('K',nr,type) ( 0x1A , struct kfd_ioctl_set_cu_mask_args ) # macro
|
||||
# AMDKFD_IOC_GET_QUEUE_WAVE_STATE = _IOWR('K',nr,type) ( 0x1B , struct kfd_ioctl_get_queue_wave_state_args ) # macro
|
||||
# AMDKFD_IOC_GET_DMABUF_INFO = _IOWR('K',nr,type) ( 0x1C , struct kfd_ioctl_get_dmabuf_info_args ) # macro
|
||||
# AMDKFD_IOC_IMPORT_DMABUF = _IOWR('K',nr,type) ( 0x1D , struct kfd_ioctl_import_dmabuf_args ) # macro
|
||||
# AMDKFD_IOC_ALLOC_QUEUE_GWS = _IOWR('K',nr,type) ( 0x1E , struct kfd_ioctl_alloc_queue_gws_args ) # macro
|
||||
# AMDKFD_IOC_SMI_EVENTS = _IOWR('K',nr,type) ( 0x1F , struct kfd_ioctl_smi_events_args ) # macro
|
||||
# AMDKFD_IOC_SVM = _IOWR('K',nr,type) ( 0x20 , struct kfd_ioctl_svm_args ) # macro
|
||||
# AMDKFD_IOC_SET_XNACK_MODE = _IOWR('K',nr,type) ( 0x21 , struct kfd_ioctl_set_xnack_mode_args ) # macro
|
||||
AMDKFD_COMMAND_START = 0x01 # macro
|
||||
AMDKFD_COMMAND_END = 0x22 # macro
|
||||
class struct_kfd_ioctl_get_version_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_get_version_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_get_version_args._fields_ = [
|
||||
('major_version', ctypes.c_uint32),
|
||||
('minor_version', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_create_queue_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_create_queue_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_create_queue_args._fields_ = [
|
||||
('ring_base_address', ctypes.c_uint64),
|
||||
('write_pointer_address', ctypes.c_uint64),
|
||||
('read_pointer_address', ctypes.c_uint64),
|
||||
('doorbell_offset', ctypes.c_uint64),
|
||||
('ring_size', ctypes.c_uint32),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('queue_type', ctypes.c_uint32),
|
||||
('queue_percentage', ctypes.c_uint32),
|
||||
('queue_priority', ctypes.c_uint32),
|
||||
('queue_id', ctypes.c_uint32),
|
||||
('eop_buffer_address', ctypes.c_uint64),
|
||||
('eop_buffer_size', ctypes.c_uint64),
|
||||
('ctx_save_restore_address', ctypes.c_uint64),
|
||||
('ctx_save_restore_size', ctypes.c_uint32),
|
||||
('ctl_stack_size', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_destroy_queue_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_destroy_queue_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_destroy_queue_args._fields_ = [
|
||||
('queue_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_update_queue_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_update_queue_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_update_queue_args._fields_ = [
|
||||
('ring_base_address', ctypes.c_uint64),
|
||||
('queue_id', ctypes.c_uint32),
|
||||
('ring_size', ctypes.c_uint32),
|
||||
('queue_percentage', ctypes.c_uint32),
|
||||
('queue_priority', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_set_cu_mask_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_set_cu_mask_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_set_cu_mask_args._fields_ = [
|
||||
('queue_id', ctypes.c_uint32),
|
||||
('num_cu_mask', ctypes.c_uint32),
|
||||
('cu_mask_ptr', ctypes.c_uint64),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_get_queue_wave_state_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_get_queue_wave_state_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_get_queue_wave_state_args._fields_ = [
|
||||
('ctl_stack_address', ctypes.c_uint64),
|
||||
('ctl_stack_used_size', ctypes.c_uint32),
|
||||
('save_area_used_size', ctypes.c_uint32),
|
||||
('queue_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_set_memory_policy_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_set_memory_policy_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_set_memory_policy_args._fields_ = [
|
||||
('alternate_aperture_base', ctypes.c_uint64),
|
||||
('alternate_aperture_size', ctypes.c_uint64),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('default_policy', ctypes.c_uint32),
|
||||
('alternate_policy', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_get_clock_counters_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_get_clock_counters_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_get_clock_counters_args._fields_ = [
|
||||
('gpu_clock_counter', ctypes.c_uint64),
|
||||
('cpu_clock_counter', ctypes.c_uint64),
|
||||
('system_clock_counter', ctypes.c_uint64),
|
||||
('system_clock_freq', ctypes.c_uint64),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_process_device_apertures(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_process_device_apertures._pack_ = 1 # source:False
|
||||
struct_kfd_process_device_apertures._fields_ = [
|
||||
('lds_base', ctypes.c_uint64),
|
||||
('lds_limit', ctypes.c_uint64),
|
||||
('scratch_base', ctypes.c_uint64),
|
||||
('scratch_limit', ctypes.c_uint64),
|
||||
('gpuvm_base', ctypes.c_uint64),
|
||||
('gpuvm_limit', ctypes.c_uint64),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_get_process_apertures_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_get_process_apertures_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_get_process_apertures_args._fields_ = [
|
||||
('process_apertures', struct_kfd_process_device_apertures * 7),
|
||||
('num_of_nodes', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_get_process_apertures_new_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_get_process_apertures_new_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_get_process_apertures_new_args._fields_ = [
|
||||
('kfd_process_device_apertures_ptr', ctypes.c_uint64),
|
||||
('num_of_nodes', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_dbg_register_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_dbg_register_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_dbg_register_args._fields_ = [
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_dbg_unregister_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_dbg_unregister_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_dbg_unregister_args._fields_ = [
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_dbg_address_watch_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_dbg_address_watch_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_dbg_address_watch_args._fields_ = [
|
||||
('content_ptr', ctypes.c_uint64),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('buf_size_in_bytes', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_dbg_wave_control_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_dbg_wave_control_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_dbg_wave_control_args._fields_ = [
|
||||
('content_ptr', ctypes.c_uint64),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('buf_size_in_bytes', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_create_event_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_create_event_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_create_event_args._fields_ = [
|
||||
('event_page_offset', ctypes.c_uint64),
|
||||
('event_trigger_data', ctypes.c_uint32),
|
||||
('event_type', ctypes.c_uint32),
|
||||
('auto_reset', ctypes.c_uint32),
|
||||
('node_id', ctypes.c_uint32),
|
||||
('event_id', ctypes.c_uint32),
|
||||
('event_slot_index', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_destroy_event_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_destroy_event_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_destroy_event_args._fields_ = [
|
||||
('event_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_set_event_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_set_event_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_set_event_args._fields_ = [
|
||||
('event_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_reset_event_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_reset_event_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_reset_event_args._fields_ = [
|
||||
('event_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_memory_exception_failure(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_memory_exception_failure._pack_ = 1 # source:False
|
||||
struct_kfd_memory_exception_failure._fields_ = [
|
||||
('NotPresent', ctypes.c_uint32),
|
||||
('ReadOnly', ctypes.c_uint32),
|
||||
('NoExecute', ctypes.c_uint32),
|
||||
('imprecise', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_hsa_memory_exception_data(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_hsa_memory_exception_data._pack_ = 1 # source:False
|
||||
struct_kfd_hsa_memory_exception_data._fields_ = [
|
||||
('failure', struct_kfd_memory_exception_failure),
|
||||
('va', ctypes.c_uint64),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('ErrorType', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_hsa_hw_exception_data(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_hsa_hw_exception_data._pack_ = 1 # source:False
|
||||
struct_kfd_hsa_hw_exception_data._fields_ = [
|
||||
('reset_type', ctypes.c_uint32),
|
||||
('reset_cause', ctypes.c_uint32),
|
||||
('memory_lost', ctypes.c_uint32),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_event_data(Structure):
|
||||
pass
|
||||
|
||||
class union_kfd_event_data_0(Union):
|
||||
pass
|
||||
|
||||
union_kfd_event_data_0._pack_ = 1 # source:False
|
||||
union_kfd_event_data_0._fields_ = [
|
||||
('memory_exception_data', struct_kfd_hsa_memory_exception_data),
|
||||
('hw_exception_data', struct_kfd_hsa_hw_exception_data),
|
||||
('PADDING_0', ctypes.c_ubyte * 16),
|
||||
]
|
||||
|
||||
struct_kfd_event_data._pack_ = 1 # source:False
|
||||
struct_kfd_event_data._anonymous_ = ('_0',)
|
||||
struct_kfd_event_data._fields_ = [
|
||||
('_0', union_kfd_event_data_0),
|
||||
('kfd_event_data_ext', ctypes.c_uint64),
|
||||
('event_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_wait_events_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_wait_events_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_wait_events_args._fields_ = [
|
||||
('events_ptr', ctypes.c_uint64),
|
||||
('num_events', ctypes.c_uint32),
|
||||
('wait_for_all', ctypes.c_uint32),
|
||||
('timeout', ctypes.c_uint32),
|
||||
('wait_result', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_set_scratch_backing_va_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_set_scratch_backing_va_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_set_scratch_backing_va_args._fields_ = [
|
||||
('va_addr', ctypes.c_uint64),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_get_tile_config_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_get_tile_config_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_get_tile_config_args._fields_ = [
|
||||
('tile_config_ptr', ctypes.c_uint64),
|
||||
('macro_tile_config_ptr', ctypes.c_uint64),
|
||||
('num_tile_configs', ctypes.c_uint32),
|
||||
('num_macro_tile_configs', ctypes.c_uint32),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('gb_addr_config', ctypes.c_uint32),
|
||||
('num_banks', ctypes.c_uint32),
|
||||
('num_ranks', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_set_trap_handler_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_set_trap_handler_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_set_trap_handler_args._fields_ = [
|
||||
('tba_addr', ctypes.c_uint64),
|
||||
('tma_addr', ctypes.c_uint64),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
('pad', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_acquire_vm_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_acquire_vm_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_acquire_vm_args._fields_ = [
|
||||
('drm_fd', ctypes.c_uint32),
|
||||
('gpu_id', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_alloc_memory_of_gpu_args(Structure):
    # ctypes mirror of struct kfd_ioctl_alloc_memory_of_gpu_args (linux kfd_ioctl.h)
    _pack_ = 1
    _fields_ = [
        ('va_addr', ctypes.c_uint64),
        ('size', ctypes.c_uint64),
        ('handle', ctypes.c_uint64),      # filled in by the kernel
        ('mmap_offset', ctypes.c_uint64),
        ('gpu_id', ctypes.c_uint32),
        ('flags', ctypes.c_uint32),       # KFD_IOC_ALLOC_MEM_FLAGS_*
    ]
|
||||
|
||||
class struct_kfd_ioctl_free_memory_of_gpu_args(Structure):
    # ctypes mirror of struct kfd_ioctl_free_memory_of_gpu_args (linux kfd_ioctl.h)
    _pack_ = 1
    _fields_ = [
        ('handle', ctypes.c_uint64),
    ]
|
||||
|
||||
class struct_kfd_ioctl_map_memory_to_gpu_args(Structure):
    # ctypes mirror of struct kfd_ioctl_map_memory_to_gpu_args (linux kfd_ioctl.h)
    _pack_ = 1
    _fields_ = [
        ('handle', ctypes.c_uint64),
        ('device_ids_array_ptr', ctypes.c_uint64),  # user pointer to u32 gpu ids
        ('n_devices', ctypes.c_uint32),
        ('n_success', ctypes.c_uint32),             # filled in by the kernel
    ]
|
||||
|
||||
class struct_kfd_ioctl_unmap_memory_from_gpu_args(Structure):
    # ctypes mirror of struct kfd_ioctl_unmap_memory_from_gpu_args (linux kfd_ioctl.h);
    # same layout as the map variant above
    _pack_ = 1
    _fields_ = [
        ('handle', ctypes.c_uint64),
        ('device_ids_array_ptr', ctypes.c_uint64),
        ('n_devices', ctypes.c_uint32),
        ('n_success', ctypes.c_uint32),
    ]
|
||||
|
||||
class struct_kfd_ioctl_alloc_queue_gws_args(Structure):
    # ctypes mirror of struct kfd_ioctl_alloc_queue_gws_args (linux kfd_ioctl.h)
    _pack_ = 1
    _fields_ = [
        ('queue_id', ctypes.c_uint32),
        ('num_gws', ctypes.c_uint32),
        ('first_gws', ctypes.c_uint32),
        ('pad', ctypes.c_uint32),
    ]
|
||||
|
||||
class struct_kfd_ioctl_get_dmabuf_info_args(Structure):
    # ctypes mirror of struct kfd_ioctl_get_dmabuf_info_args (linux kfd_ioctl.h)
    _pack_ = 1
    _fields_ = [
        ('size', ctypes.c_uint64),
        ('metadata_ptr', ctypes.c_uint64),
        ('metadata_size', ctypes.c_uint32),
        ('gpu_id', ctypes.c_uint32),
        ('flags', ctypes.c_uint32),
        ('dmabuf_fd', ctypes.c_uint32),
    ]
|
||||
|
||||
class struct_kfd_ioctl_import_dmabuf_args(Structure):
    # ctypes mirror of struct kfd_ioctl_import_dmabuf_args (linux kfd_ioctl.h)
    _pack_ = 1
    _fields_ = [
        ('va_addr', ctypes.c_uint64),
        ('handle', ctypes.c_uint64),
        ('gpu_id', ctypes.c_uint32),
        ('dmabuf_fd', ctypes.c_uint32),
    ]
|
||||
|
||||
|
||||
# enum kfd_smi_event (linux kfd_ioctl.h)
KFD_SMI_EVENT_NONE = 0
KFD_SMI_EVENT_VMFAULT = 1
KFD_SMI_EVENT_THERMAL_THROTTLE = 2
KFD_SMI_EVENT_GPU_PRE_RESET = 3
KFD_SMI_EVENT_GPU_POST_RESET = 4
# reverse map: enum value -> enumerator name
kfd_smi_event__enumvalues = {
    KFD_SMI_EVENT_NONE: 'KFD_SMI_EVENT_NONE',
    KFD_SMI_EVENT_VMFAULT: 'KFD_SMI_EVENT_VMFAULT',
    KFD_SMI_EVENT_THERMAL_THROTTLE: 'KFD_SMI_EVENT_THERMAL_THROTTLE',
    KFD_SMI_EVENT_GPU_PRE_RESET: 'KFD_SMI_EVENT_GPU_PRE_RESET',
    KFD_SMI_EVENT_GPU_POST_RESET: 'KFD_SMI_EVENT_GPU_POST_RESET',
}
kfd_smi_event = ctypes.c_uint32  # enum
|
||||
class struct_kfd_ioctl_smi_events_args(Structure):
    # ctypes mirror of struct kfd_ioctl_smi_events_args (linux kfd_ioctl.h)
    _pack_ = 1
    _fields_ = [
        ('gpuid', ctypes.c_uint32),
        ('anon_fd', ctypes.c_uint32),   # filled in by the kernel
    ]
|
||||
|
||||
|
||||
# enum kfd_mmio_remap (linux kfd_ioctl.h)
KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL = 0
KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL = 4
kfd_mmio_remap__enumvalues = {
    KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL: 'KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL',
    KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL: 'KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL',
}
kfd_mmio_remap = ctypes.c_uint32  # enum

# enum kfd_ioctl_svm_op (linux kfd_ioctl.h)
KFD_IOCTL_SVM_OP_SET_ATTR = 0
KFD_IOCTL_SVM_OP_GET_ATTR = 1
kfd_ioctl_svm_op__enumvalues = {
    KFD_IOCTL_SVM_OP_SET_ATTR: 'KFD_IOCTL_SVM_OP_SET_ATTR',
    KFD_IOCTL_SVM_OP_GET_ATTR: 'KFD_IOCTL_SVM_OP_GET_ATTR',
}
kfd_ioctl_svm_op = ctypes.c_uint32  # enum

# enum kfd_ioctl_svm_location (linux kfd_ioctl.h); UNDEFINED is 0xffffffff
KFD_IOCTL_SVM_LOCATION_SYSMEM = 0
KFD_IOCTL_SVM_LOCATION_UNDEFINED = 4294967295
kfd_ioctl_svm_location__enumvalues = {
    KFD_IOCTL_SVM_LOCATION_SYSMEM: 'KFD_IOCTL_SVM_LOCATION_SYSMEM',
    KFD_IOCTL_SVM_LOCATION_UNDEFINED: 'KFD_IOCTL_SVM_LOCATION_UNDEFINED',
}
kfd_ioctl_svm_location = ctypes.c_uint32  # enum

# enum kfd_ioctl_svm_attr_type (linux kfd_ioctl.h)
KFD_IOCTL_SVM_ATTR_PREFERRED_LOC = 0
KFD_IOCTL_SVM_ATTR_PREFETCH_LOC = 1
KFD_IOCTL_SVM_ATTR_ACCESS = 2
KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE = 3
KFD_IOCTL_SVM_ATTR_NO_ACCESS = 4
KFD_IOCTL_SVM_ATTR_SET_FLAGS = 5
KFD_IOCTL_SVM_ATTR_CLR_FLAGS = 6
KFD_IOCTL_SVM_ATTR_GRANULARITY = 7
kfd_ioctl_svm_attr_type__enumvalues = {
    KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: 'KFD_IOCTL_SVM_ATTR_PREFERRED_LOC',
    KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: 'KFD_IOCTL_SVM_ATTR_PREFETCH_LOC',
    KFD_IOCTL_SVM_ATTR_ACCESS: 'KFD_IOCTL_SVM_ATTR_ACCESS',
    KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: 'KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE',
    KFD_IOCTL_SVM_ATTR_NO_ACCESS: 'KFD_IOCTL_SVM_ATTR_NO_ACCESS',
    KFD_IOCTL_SVM_ATTR_SET_FLAGS: 'KFD_IOCTL_SVM_ATTR_SET_FLAGS',
    KFD_IOCTL_SVM_ATTR_CLR_FLAGS: 'KFD_IOCTL_SVM_ATTR_CLR_FLAGS',
    KFD_IOCTL_SVM_ATTR_GRANULARITY: 'KFD_IOCTL_SVM_ATTR_GRANULARITY',
}
kfd_ioctl_svm_attr_type = ctypes.c_uint32  # enum
|
||||
class struct_kfd_ioctl_svm_attribute(Structure):
    # ctypes mirror of struct kfd_ioctl_svm_attribute (linux kfd_ioctl.h)
    _pack_ = 1
    _fields_ = [
        ('type', ctypes.c_uint32),   # one of KFD_IOCTL_SVM_ATTR_*
        ('value', ctypes.c_uint32),
    ]
|
||||
|
||||
class struct_kfd_ioctl_svm_args(Structure):
|
||||
pass
|
||||
|
||||
struct_kfd_ioctl_svm_args._pack_ = 1 # source:False
|
||||
struct_kfd_ioctl_svm_args._fields_ = [
|
||||
('start_addr', ctypes.c_uint64),
|
||||
('size', ctypes.c_uint64),
|
||||
('op', ctypes.c_uint32),
|
||||
('nattr', ctypes.c_uint32),
|
||||
('attrs', struct_kfd_ioctl_svm_attribute * 0),
|
||||
]
|
||||
|
||||
class struct_kfd_ioctl_set_xnack_mode_args(Structure):
    # ctypes mirror of struct kfd_ioctl_set_xnack_mode_args (linux kfd_ioctl.h)
    _pack_ = 1
    _fields_ = [
        ('xnack_enabled', ctypes.c_int32),
    ]
|
||||
|
||||
__all__ = \
|
||||
['AMDKFD_COMMAND_END', 'AMDKFD_COMMAND_START',
|
||||
'AMDKFD_IOCTL_BASE', 'KFD_HW_EXCEPTION_ECC',
|
||||
'KFD_HW_EXCEPTION_GPU_HANG', 'KFD_HW_EXCEPTION_PER_ENGINE_RESET',
|
||||
'KFD_HW_EXCEPTION_WHOLE_GPU_RESET', 'KFD_IOCTL_H_INCLUDED',
|
||||
'KFD_IOCTL_MAJOR_VERSION', 'KFD_IOCTL_MINOR_VERSION',
|
||||
'KFD_IOCTL_SVM_ATTR_ACCESS', 'KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE',
|
||||
'KFD_IOCTL_SVM_ATTR_CLR_FLAGS', 'KFD_IOCTL_SVM_ATTR_GRANULARITY',
|
||||
'KFD_IOCTL_SVM_ATTR_NO_ACCESS',
|
||||
'KFD_IOCTL_SVM_ATTR_PREFERRED_LOC',
|
||||
'KFD_IOCTL_SVM_ATTR_PREFETCH_LOC', 'KFD_IOCTL_SVM_ATTR_SET_FLAGS',
|
||||
'KFD_IOCTL_SVM_FLAG_COHERENT', 'KFD_IOCTL_SVM_FLAG_GPU_EXEC',
|
||||
'KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY', 'KFD_IOCTL_SVM_FLAG_GPU_RO',
|
||||
'KFD_IOCTL_SVM_FLAG_HIVE_LOCAL', 'KFD_IOCTL_SVM_FLAG_HOST_ACCESS',
|
||||
'KFD_IOCTL_SVM_LOCATION_SYSMEM',
|
||||
'KFD_IOCTL_SVM_LOCATION_UNDEFINED', 'KFD_IOCTL_SVM_OP_GET_ATTR',
|
||||
'KFD_IOCTL_SVM_OP_SET_ATTR',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_COHERENT',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_GTT',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_USERPTR', 'KFD_IOC_ALLOC_MEM_FLAGS_VRAM',
|
||||
'KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE',
|
||||
'KFD_IOC_CACHE_POLICY_COHERENT',
|
||||
'KFD_IOC_CACHE_POLICY_NONCOHERENT', 'KFD_IOC_EVENT_DEBUG_EVENT',
|
||||
'KFD_IOC_EVENT_DEVICESTATECHANGE', 'KFD_IOC_EVENT_HW_EXCEPTION',
|
||||
'KFD_IOC_EVENT_MEMORY', 'KFD_IOC_EVENT_NODECHANGE',
|
||||
'KFD_IOC_EVENT_PROFILE_EVENT', 'KFD_IOC_EVENT_QUEUE_EVENT',
|
||||
'KFD_IOC_EVENT_SIGNAL', 'KFD_IOC_EVENT_SYSTEM_EVENT',
|
||||
'KFD_IOC_QUEUE_TYPE_COMPUTE', 'KFD_IOC_QUEUE_TYPE_COMPUTE_AQL',
|
||||
'KFD_IOC_QUEUE_TYPE_SDMA', 'KFD_IOC_QUEUE_TYPE_SDMA_XGMI',
|
||||
'KFD_IOC_WAIT_RESULT_COMPLETE', 'KFD_IOC_WAIT_RESULT_FAIL',
|
||||
'KFD_IOC_WAIT_RESULT_TIMEOUT', 'KFD_MAX_QUEUE_PERCENTAGE',
|
||||
'KFD_MAX_QUEUE_PRIORITY', 'KFD_MEM_ERR_GPU_HANG',
|
||||
'KFD_MEM_ERR_NO_RAS', 'KFD_MEM_ERR_POISON_CONSUMED',
|
||||
'KFD_MEM_ERR_SRAM_ECC', 'KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL',
|
||||
'KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL', 'KFD_SIGNAL_EVENT_LIMIT',
|
||||
'KFD_SMI_EVENT_GPU_POST_RESET', 'KFD_SMI_EVENT_GPU_PRE_RESET',
|
||||
'KFD_SMI_EVENT_NONE', 'KFD_SMI_EVENT_THERMAL_THROTTLE',
|
||||
'KFD_SMI_EVENT_VMFAULT', 'MAX_ALLOWED_AW_BUFF_SIZE',
|
||||
'MAX_ALLOWED_NUM_POINTS', 'MAX_ALLOWED_WAC_BUFF_SIZE',
|
||||
'NUM_OF_SUPPORTED_GPUS', 'kfd_ioctl_svm_attr_type',
|
||||
'kfd_ioctl_svm_location', 'kfd_ioctl_svm_op', 'kfd_mmio_remap',
|
||||
'kfd_smi_event', 'struct_kfd_event_data',
|
||||
'struct_kfd_hsa_hw_exception_data',
|
||||
'struct_kfd_hsa_memory_exception_data',
|
||||
'struct_kfd_ioctl_acquire_vm_args',
|
||||
'struct_kfd_ioctl_alloc_memory_of_gpu_args',
|
||||
'struct_kfd_ioctl_alloc_queue_gws_args',
|
||||
'struct_kfd_ioctl_create_event_args',
|
||||
'struct_kfd_ioctl_create_queue_args',
|
||||
'struct_kfd_ioctl_dbg_address_watch_args',
|
||||
'struct_kfd_ioctl_dbg_register_args',
|
||||
'struct_kfd_ioctl_dbg_unregister_args',
|
||||
'struct_kfd_ioctl_dbg_wave_control_args',
|
||||
'struct_kfd_ioctl_destroy_event_args',
|
||||
'struct_kfd_ioctl_destroy_queue_args',
|
||||
'struct_kfd_ioctl_free_memory_of_gpu_args',
|
||||
'struct_kfd_ioctl_get_clock_counters_args',
|
||||
'struct_kfd_ioctl_get_dmabuf_info_args',
|
||||
'struct_kfd_ioctl_get_process_apertures_args',
|
||||
'struct_kfd_ioctl_get_process_apertures_new_args',
|
||||
'struct_kfd_ioctl_get_queue_wave_state_args',
|
||||
'struct_kfd_ioctl_get_tile_config_args',
|
||||
'struct_kfd_ioctl_get_version_args',
|
||||
'struct_kfd_ioctl_import_dmabuf_args',
|
||||
'struct_kfd_ioctl_map_memory_to_gpu_args',
|
||||
'struct_kfd_ioctl_reset_event_args',
|
||||
'struct_kfd_ioctl_set_cu_mask_args',
|
||||
'struct_kfd_ioctl_set_event_args',
|
||||
'struct_kfd_ioctl_set_memory_policy_args',
|
||||
'struct_kfd_ioctl_set_scratch_backing_va_args',
|
||||
'struct_kfd_ioctl_set_trap_handler_args',
|
||||
'struct_kfd_ioctl_set_xnack_mode_args',
|
||||
'struct_kfd_ioctl_smi_events_args', 'struct_kfd_ioctl_svm_args',
|
||||
'struct_kfd_ioctl_svm_attribute',
|
||||
'struct_kfd_ioctl_unmap_memory_from_gpu_args',
|
||||
'struct_kfd_ioctl_update_queue_args',
|
||||
'struct_kfd_ioctl_wait_events_args',
|
||||
'struct_kfd_memory_exception_failure',
|
||||
'struct_kfd_process_device_apertures', 'union_kfd_event_data_0']
|
||||
333
tinygrad/runtime/ops_kfd.py
Normal file
333
tinygrad/runtime/ops_kfd.py
Normal file
@@ -0,0 +1,333 @@
|
||||
from __future__ import annotations
|
||||
from typing import Tuple
|
||||
import os, fcntl, ctypes, functools, re, pathlib, mmap, struct
|
||||
from tinygrad.device import Compiled, LRUAllocator, Compiler, BufferOptions, CompilerOptions
|
||||
from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up
|
||||
from tinygrad.renderer.cstyle import HIPRenderer
|
||||
from tinygrad.runtime.driver.hip_comgr import compile_hip
|
||||
import tinygrad.runtime.autogen.kfd as kfd
|
||||
import tinygrad.runtime.autogen.hsa as hsa
|
||||
import tinygrad.runtime.autogen.amd_sdma as amd_sdma
|
||||
# optional ioctl tracing: importing the shim hooks ioctl calls for debugging (side-effect import)
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl  # noqa: F401

# raw libc mmap — NOTE(review): presumably used instead of the stdlib mmap module to get
# MAP_FIXED mappings at caller-chosen addresses and the mapped pointer back as an int
libc = ctypes.CDLL("libc.so.6")
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
libc.mmap.restype = ctypes.c_void_p
|
||||
|
||||
def node_sysfs_path(node_id, file):
  """Path of `file` under the KFD topology sysfs directory for node `node_id`."""
  return f"/sys/devices/virtual/kfd/kfd/topology/nodes/{node_id}/{file}"
|
||||
|
||||
def kfd_ioctl(idir, nr, user_struct, fd, made_struct=None, **kwargs):
  """Issue an AMDKFD ioctl on `fd` and return the argument struct (kernel fills fields in-place).

  Either pass a prebuilt `made_struct`, or `kwargs` used to construct a `user_struct`.
  """
  args = made_struct or user_struct(**kwargs)
  # linux _IOC encoding: direction in bits 30-31, struct size in 16-29, magic 'K' in 8-15, nr in 0-7
  request = (idir<<30) | (ctypes.sizeof(args)<<16) | (ord('K')<<8) | nr
  if (ret:=fcntl.ioctl(fd, request, args)) != 0: raise RuntimeError(f"ioctl returned {ret}")
  return args
|
||||
|
||||
def ioctls_from_header():
  """Build a namespace of ioctl wrappers from the autogenerated kfd module.

  The autogen file keeps each AMDKFD_IOC_* macro as a comment; scrape those to recover
  (name, direction, nr, arg struct) and bind each to kfd_ioctl via functools.partial.
  Returns a type whose attributes are the lowercased ioctl names (e.g. kio.create_queue).
  """
  hdrpy = (pathlib.Path(__file__).parent / "autogen" / "kfd.py").read_text()
  pattern = r'# (AMDKFD_IOC_[A-Z0-9_]+)\s=\s_(IOW?R?).*\(( 0x[0-9a-fA-F]+) ,\s+struct\s([A-Za-z0-9_]+)\s+\)'
  matches = re.findall(pattern, hdrpy, re.MULTILINE)
  idirs = {"IOW": 1, "IOR": 2, "IOWR": 3}  # _IOC direction bits
  fxns = {name.replace("AMDKFD_IOC_", "").lower():
          functools.partial(kfd_ioctl, idirs[idir], int(nr, 16), getattr(kfd, "struct_"+sname))
          for name, idir, nr, sname in matches}
  return type("KIO", (object, ), fxns)
kio = ioctls_from_header()
|
||||
|
||||
def create_sdma_packets():
  """Flatten the autogenerated SDMA_PKT_*_TAG ctypes structs into keyword-constructible ones.

  The autogen structs model each packet dword as a union of a bitfield struct and a raw u32;
  here the unions are unwrapped to their bitfields so packets can be built as
  sdma_pkts.name(field=..., ...). Consecutive _31_0/_63_32 address halves are merged into a
  single 64-bit field. Layout is asserted unchanged against the original struct.
  """
  # TODO: clean up this, if we want to keep it
  structs = {}
  for name,pkt in [(name,s) for name,s in amd_sdma.__dict__.items() if name.startswith("struct_SDMA_PKT_") and name.endswith("_TAG")]:
    names = set()
    fields = []
    for pkt_fields in pkt._fields_:
      if not pkt_fields[0].endswith("_UNION"): fields.append(pkt_fields)
      else:
        # unions wrap their bitfield struct in an anonymous member named '_0'
        assert pkt_fields[1]._fields_[0][0] == '_0'
        for union_fields in pkt_fields[1]._fields_[0][1]._fields_:
          fname = union_fields[0]
          # disambiguate duplicate field names by prefixing the union name
          if fname in names: fname = pkt_fields[0]+fname
          names.add(fname)
          if fname.endswith("_63_32") and fields[-1][0].endswith("_31_0"):
            fields[-1] = tuple([fname[:-6], ctypes.c_ulong, 64]) # merge together 64-bit fields
          else:
            fields.append(tuple([fname, *union_fields[1:]]))
    # strip the 'struct_SDMA_PKT_' prefix and '_TAG' suffix
    new_name = name[16:-4].lower()
    structs[new_name] = init_c_struct_t(tuple(fields))
    assert ctypes.sizeof(structs[new_name]) == ctypes.sizeof(pkt), f"{ctypes.sizeof(structs[new_name])} != {ctypes.sizeof(pkt)}"
  return type("SDMA_PKTS", (object, ), structs)
sdma_pkts = create_sdma_packets()
|
||||
|
||||
class KFDCompiler(Compiler):
  """Compiler for the KFD backend: render uops to HIP C++ and compile with comgr."""
  compiler_opts = CompilerOptions("KFD", has_tensor_cores=True, shared_max=65536)

  def __init__(self, arch:str):
    self.arch = arch
    super().__init__(f"compile_hip_{self.arch}")  # cache key includes the target arch

  def render(self, name:str, uops) -> str:
    return HIPRenderer(name, uops)

  def compile(self, src:str) -> bytes:
    return compile_hip(src, self.arch)
|
||||
|
||||
# size of one AQL packet, derived from the hsa struct rather than hardcoded
AQL_PACKET_SIZE = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t)
# max bytes a single SDMA copy_linear packet moves; larger copies are chunked in _submit_sdma
SDMA_MAX_COPY_SIZE = 0x400000

# AQL kernel dispatch setup/header words: 3 grid dimensions, barrier bit set,
# system-scope acquire and release fences, packet type KERNEL_DISPATCH
DISPATCH_KERNEL_SETUP = 3 << hsa.HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
DISPATCH_KERNEL_HEADER = 1 << hsa.HSA_PACKET_HEADER_BARRIER
DISPATCH_KERNEL_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE
DISPATCH_KERNEL_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE
DISPATCH_KERNEL_HEADER |= hsa.HSA_PACKET_TYPE_KERNEL_DISPATCH << hsa.HSA_PACKET_HEADER_TYPE

# ELF section header constants (elf.h): sh_type and sh_flags values used by the loader below
SHT_PROGBITS = 0x1
SHF_ALLOC = 0x2
|
||||
|
||||
class KFDProgram:
  """A compiled kernel, loaded into device VRAM and dispatched over the device's AQL queue."""
  def __init__(self, device:KFDDevice, name:str, lib:bytes):
    # TODO; this API needs the type signature of the function and global_size/local_size
    self.device, self.name, self.lib = device, name, lib

    # minimal ELF64 parse: header fields from offset 0x20, then the section header table
    _phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
    sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]

    # image size = furthest end (sh_addr sh[3] + sh_size sh[5]) of any PROGBITS section, page aligned
    lib_gpu_size = round_up(max(sh[5]+sh[3] for sh in sections if sh[1] == SHT_PROGBITS), 0x1000)
    self.lib_gpu = self.device._gpu_alloc(lib_gpu_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
    lib_gpu_view = to_mv(self.lib_gpu.va_addr, lib_gpu_size)

    # copy every allocatable PROGBITS section to its address within the image
    for _, sh_type, sh_flags, sh_addr, sh_offset, sh_size, _, _, _ in sections:
      if sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC: lib_gpu_view[sh_addr:sh_addr+sh_size] = self.lib[sh_offset:sh_offset+sh_size]

    # the lowest allocatable PROGBITS address is used as the kernel handle;
    # the three u32s read there are the group/private/kernarg segment sizes
    entry_point = min(sh[3] for sh in sections if sh[1] == SHT_PROGBITS and sh[2] & SHF_ALLOC)
    self.handle = self.lib_gpu.va_addr + entry_point
    self.group_segment_size = lib_gpu_view.cast("I")[entry_point//4]
    self.private_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 1]
    self.kernargs_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 2]
    assert self.private_segment_size <= self.device.max_private_segment_size, \
      f"{self.private_segment_size=} > {self.device.max_private_segment_size=}"

  # NOTE: no programs are ever freed
  def __del__(self): kio.free_memory_of_gpu(KFDDevice.kfd, handle=self.lib_gpu.handle)

  def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
    """Dispatch the kernel and block on its completion event.

    args are buffers (anything with va_addr), vals are plain ints. If wait=True, returns
    elapsed seconds computed from the completion signal's start/end timestamps.
    """
    if not hasattr(self, "args_struct_t"):
      # kernarg layout: one pointer per buffer arg followed by one i32 per int val
      self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
                                                 [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
      if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size:
        raise RuntimeError(f"KFDProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
    args_st = self.args_struct_t.from_address(self.device.kernargs.va_addr)
    for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
    for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])

    self.device.completion_signal.value = 1 # reset the signal before call
    packet = hsa.hsa_kernel_dispatch_packet_t.from_address(self.device.aql_ring.va_addr +
                                                           (self.device.aql_doorbell_value*AQL_PACKET_SIZE) % self.device.aql_ring.size)
    packet.workgroup_size_x, packet.workgroup_size_y, packet.workgroup_size_z = local_size
    packet.reserved0 = 0
    packet.grid_size_x, packet.grid_size_y, packet.grid_size_z = tuple(g*l for g,l in zip(global_size, local_size))
    packet.kernel_object = self.handle
    packet.kernarg_address = self.device.kernargs.va_addr
    packet.group_segment_size = self.group_segment_size
    packet.private_segment_size = self.private_segment_size # what is this and why doesn't it work? (see TestOps.test_dilated_conv_transpose2d)
    packet.reserved2 = 0
    packet.completion_signal = hsa.hsa_signal_t(ctypes.addressof(self.device.completion_signal))
    packet.setup = DISPATCH_KERNEL_SETUP
    packet.header = DISPATCH_KERNEL_HEADER  # NOTE(review): header written last — presumably publishes the packet to the firmware

    # one pending packet + ring doorbell
    self.device.amd_aql_queue.write_dispatch_id = self.device.aql_doorbell_value + 1
    self.device.aql_doorbell[0] = self.device.aql_doorbell_value
    self.device.aql_doorbell_value += 1

    # block until the completion signal's event fires (timeout in ms)
    evt_arr = (kfd.struct_kfd_event_data * 1)()
    evt_arr[0].event_id = self.device.completion_signal.event_id
    kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000)

    assert (wp:=self.device.amd_aql_queue.write_dispatch_id) == (rp:=self.device.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
    if wait: return (self.device.completion_signal.end_ts-self.device.completion_signal.start_ts)/1e9
|
||||
|
||||
class KFDAllocator(LRUAllocator):
  """LRU allocator backed by KFD GPU memory; copies go through the device's SDMA queue."""
  def __init__(self, device:KFDDevice):
    self.device = device
    super().__init__()

  def _alloc(self, size:int, options:BufferOptions):
    # host-visible buffers come from userptr (system) memory, everything else from VRAM
    if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
    else: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)

  def _wait_on_completion(self):
    # block until the device's completion signal event fires (timeout in ms)
    evt_arr = (kfd.struct_kfd_event_data * 1)()
    evt_arr[0].event_id = self.device.completion_signal.event_id
    kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000)

  def copyin(self, dest, src: memoryview):
    # TODO: need to make the address visible to gpu and pass it directly to sdma.
    self.device._map_userptr_to_gpu(ctypes.addressof(from_mv(src).contents), src.nbytes)
    self.device.completion_signal.value = 1  # arm the signal before submitting
    self.device._submit_sdma(dest.va_addr, ctypes.addressof(from_mv(src).contents), src.nbytes, completion_signal=self.device.completion_signal)
    self._wait_on_completion()

  def copyout(self, dest:memoryview, src):
    self.device._map_userptr_to_gpu(ctypes.addressof(from_mv(dest).contents), dest.nbytes)
    self.device.completion_signal.value = 1  # arm the signal before submitting
    self.device._submit_sdma(ctypes.addressof(from_mv(dest).contents), src.va_addr, dest.nbytes, completion_signal=self.device.completion_signal)
    self._wait_on_completion()
|
||||
|
||||
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400  # linux <sys/mman.h> flags used with raw libc.mmap below
|
||||
class KFDDevice(Compiled):
|
||||
kfd:int = -1
|
||||
|
||||
  def _map_userptr_to_gpu(self, addr, size):
    # Make a host address range GPU-accessible via the SVM ioctl (reuses the prebuilt
    # map_uptr2gpu_struct whose attrs were set up in __init__). Range is page-aligned:
    # start rounded down, size rounded up to cover [addr, addr+size).
    self.map_uptr2gpu_struct.start_addr = addr&~0xfff
    self.map_uptr2gpu_struct.size = round_up(size+addr-(addr&~0xfff), 0x1000)
    kio.svm(self.kfd, made_struct=self.map_uptr2gpu_struct)
|
||||
|
||||
  def _gpu_alloc(self, size:int, flags:int, uncached=False, public=False, map_to_gpu=True):
    # Reserve a VA range with mmap, register it with KFD via alloc_memory_of_gpu, and
    # (optionally) map it on this GPU. Returns the alloc_memory_of_gpu args struct
    # (va_addr/size/handle/mmap_offset populated).
    flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
    if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED
    if public: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC
    if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR:
      # userptr: host pages are the backing store, map them read/write now
      buf = addr = libc.mmap(0, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
    else:
      # device memory: only reserve the VA range (no protections, NORESERVE); real mapping comes below
      buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
    assert addr != 0xffffffffffffffff  # MAP_FAILED
    mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
    if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
      # map the device memory into the reserved range through the DRM render node fd
      buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset)
      assert buf != 0xffffffffffffffff
      assert addr == buf == mem.va_addr
    if map_to_gpu:
      arr = (ctypes.c_int32 * 1)(self.gpu_id)
      stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
      assert stm.n_success == 1
    return mem
|
||||
|
||||
  def __init__(self, device:str=""):
    # device string is "KFD:n"; n selects the topology node (node 0 is the CPU, so +1 below)
    if KFDDevice.kfd == -1: KFDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
    self.device_id = int(device.split(":")[1]) if ":" in device else 0
    with open(node_sysfs_path(self.device_id+1, "gpu_id"), "r") as f: self.gpu_id = int(f.read())
    with open(node_sysfs_path(self.device_id+1, "properties"), "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
    self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
    self.arch = f"gfx{self.properties['gfx_target_version']//100}"
    # bind this process's VM (via the DRM fd) to the GPU
    kio.acquire_vm(KFDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)

    # allocations used by the queues and signals
    self.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
    self.sync_event = kio.create_event(KFDDevice.kfd, event_page_offset=self.event_page.handle, auto_reset=1)
    self.eop_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
    self.aql_ring = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
    self.signals_page = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
    self.gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
    self.kernargs = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
    self.ctx_save_restore_address = self._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)

    # one user-mode amd_signal_t, wired to the KFD event so waits can use wait_events
    self.completion_signal = hsa.amd_signal_t.from_address(self.signals_page.va_addr)
    self.completion_signal.value = 1
    self.completion_signal.kind = hsa.AMD_SIGNAL_KIND_USER
    self.completion_signal.event_mailbox_ptr = self.event_page.va_addr + self.sync_event.event_slot_index*8
    self.completion_signal.event_id = self.sync_event.event_id

    # AQL Queue: the amd_queue_t control block lives in the GART allocation
    self.amd_aql_queue = hsa.amd_queue_t.from_address(self.gart.va_addr)
    self.amd_aql_queue.write_dispatch_id = 0
    self.amd_aql_queue.read_dispatch_id = 0
    self.amd_aql_queue.read_dispatch_id_field_base_byte_offset = getattr(hsa.amd_queue_t, 'read_dispatch_id').offset
    self.amd_aql_queue.queue_properties = hsa.AMD_QUEUE_PROPERTIES_IS_PTR64 | hsa.AMD_QUEUE_PROPERTIES_ENABLE_PROFILING

    # CU/wave counts derived from the sysfs properties
    self.amd_aql_queue.max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
    self.amd_aql_queue.max_wave_id = self.properties['max_waves_per_simd'] * self.properties['simd_per_cu'] - 1

    # scratch setup: fixed per-wave private segment, backing memory sized for all CUs/waves
    self.max_private_segment_size = 256
    self.scratch_len = self.max_private_segment_size * (self.amd_aql_queue.max_cu_id + 1) * (self.amd_aql_queue.max_wave_id + 1)
    self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
    self.amd_aql_queue.scratch_backing_memory_location = self.scratch.va_addr
    self.amd_aql_queue.scratch_backing_memory_byte_size = self.scratch_len
    self.amd_aql_queue.scratch_wave64_lane_byte_size = self.max_private_segment_size * (self.amd_aql_queue.max_wave_id + 1) // 64
    self.amd_aql_queue.scratch_resource_descriptor[0] = self.scratch.va_addr & 0xFFFFFFFF
    self.amd_aql_queue.scratch_resource_descriptor[1] = ((self.scratch.va_addr >> 32) & 0xFFFF) | (1 << 30) # va_hi | SWIZZLE_ENABLE
    self.amd_aql_queue.scratch_resource_descriptor[2] = self.scratch_len & 0xFFFFFFFF
    self.amd_aql_queue.scratch_resource_descriptor[3] = 0x20814fac # FORMAT=BUF_FORMAT_32_UINT,OOB_SELECT=2,ADD_TID_ENABLE=1,TYPE=SQ_RSRC_BUF,SQ_SELs
    wave_scratch = (((self.amd_aql_queue.max_wave_id + 1) * self.max_private_segment_size + 255) // 256)
    self.amd_aql_queue.compute_tmpring_size = wave_scratch << 12 | (self.amd_aql_queue.max_cu_id + 1)

    # create the hardware AQL queue; read/write pointers live inside the amd_queue_t in GART
    self.aql_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.aql_ring.va_addr, ring_size=self.aql_ring.size, gpu_id=self.gpu_id,
      queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
      eop_buffer_address=self.eop_buffer.va_addr, eop_buffer_size=self.eop_buffer.size,
      ctx_save_restore_address=self.ctx_save_restore_address.va_addr, ctx_save_restore_size=self.ctx_save_restore_address.size,
      ctl_stack_size = 0xa000,
      write_pointer_address=self.gart.va_addr + getattr(hsa.amd_queue_t, 'write_dispatch_id').offset,
      read_pointer_address=self.gart.va_addr + getattr(hsa.amd_queue_t, 'read_dispatch_id').offset)

    # map the doorbell page (page-aligned) and take a u32 view of this queue's doorbell
    self.doorbells_base = self.aql_queue.doorbell_offset & (~0xfff)
    self.doorbells = libc.mmap(0, 8192, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, self.doorbells_base)
    self.aql_doorbell = to_mv(self.doorbells + self.aql_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
    self.aql_doorbell_value = 0

    # SDMA Queue: 1MB ring in userptr memory; rd/wr pointers parked at fixed GART offsets
    self.sdma_ring = self._gpu_alloc(1 << 20, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
    self.sdma_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.sdma_ring.va_addr, ring_size=self.sdma_ring.size, gpu_id=self.gpu_id,
      queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
      write_pointer_address=self.gart.va_addr + 0x100, read_pointer_address=self.gart.va_addr + 0x108)
    self.sdma_read_pointer = to_mv(self.sdma_queue.read_pointer_address, 8).cast("Q")
    self.sdma_write_pointer = to_mv(self.sdma_queue.write_pointer_address, 8).cast("Q")
    self.sdma_doorbell = to_mv(self.doorbells + self.sdma_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
    self.sdma_doorbell_value = 0

    # prebuilt packets reused by every _submit_sdma call
    self.sdma_flush_hdp_pkt = sdma_pkts.hdp_flush(0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0)
    self.sdma_cache_inv = sdma_pkts.gcr(op=amd_sdma.SDMA_OP_GCR, sub_op=amd_sdma.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
                                        GCR_CONTROL_GL2_INV=1, GCR_CONTROL_GL1_INV=1, GCR_CONTROL_GLV_INV=1, GCR_CONTROL_GLK_INV=1,
                                        GCR_CONTROL_GL2_RANGE=0)
    self.sdma_cache_wb = sdma_pkts.gcr(op=amd_sdma.SDMA_OP_GCR, sub_op=amd_sdma.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
                                       GCR_CONTROL_GL2_RANGE=0)

    # Helpers: svm_args with 2 real attribute slots (flexible array member replaced),
    # preconfigured for _map_userptr_to_gpu: set COHERENT flag + grant this gpu in-place access
    map_uptr2gpu_struct_t = init_c_struct_t(tuple(kfd.struct_kfd_ioctl_svm_args._fields_[:-1]+[('attrs', kfd.struct_kfd_ioctl_svm_attribute*2)])) # type: ignore
    self.map_uptr2gpu_struct = map_uptr2gpu_struct_t(nattr=2, op=0x0)
    self.map_uptr2gpu_struct.attrs[0].type = kfd.KFD_IOCTL_SVM_ATTR_SET_FLAGS
    self.map_uptr2gpu_struct.attrs[0].value = kfd.KFD_IOCTL_SVM_FLAG_COHERENT
    self.map_uptr2gpu_struct.attrs[1].type = kfd.KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE
    self.map_uptr2gpu_struct.attrs[1].value = self.gpu_id
    super().__init__(device, KFDAllocator(self), KFDCompiler(self.arch), functools.partial(KFDProgram, self))
|
||||
|
||||
def _submit_sdma(self, dest, src, copy_size, wait_signals=None, completion_signal=None):
  """Enqueue an SDMA copy of copy_size bytes from src to dest and ring the SDMA doorbell.

  dest/src are GPU virtual addresses. wait_signals is an optional iterable of hsa.amd_signal_t
  the engine busy-polls to zero before starting. completion_signal (hsa.amd_signal_t), if given,
  gets start_ts/end_ts global timestamps around the copy, its value atomically decremented on
  completion, and — when it carries an event mailbox — a fence write plus trap to wake waiters.
  """
  def blit_sdma_command(cmd):
    # Copy the packed command struct into the ring at the current write offset and advance it.
    # NOTE(review): offset is (doorbell_value % ring.size); assumes a command never straddles
    # the ring wrap point, otherwise memmove would overrun the buffer — confirm ring sizing.
    ctypes.memmove(self.sdma_ring.va_addr + (self.sdma_doorbell_value % self.sdma_ring.size), ctypes.addressof(cmd), sz:=ctypes.sizeof(cmd))
    self.sdma_doorbell_value += sz

  if wait_signals is not None:
    # NOTE: we check only low 32 bits to be zeroed, we don't use higher values for signals
    for sig in wait_signals:
      poll_addr = ctypes.addressof(sig) + getattr(hsa.amd_signal_t, 'value').offset
      # func=0x3 is "equal": poll until (mem & mask) == value (i.e. the signal hits zero).
      blit_sdma_command(sdma_pkts.poll_regmem(op=amd_sdma.SDMA_OP_POLL_REGMEM, mem_poll=1, func=0x3, addr=poll_addr,
                                              value=0, mask=0xffffffff, interval=0x04, retry_count=0xfff))

  if completion_signal is not None:
    # Record the copy start time into the signal's start_ts field.
    blit_sdma_command(sdma_pkts.timestamp(op=amd_sdma.SDMA_OP_TIMESTAMP, sub_op=amd_sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL,
                                          addr=ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'start_ts').offset))
  # Flush HDP and invalidate GPU caches so the engine sees up-to-date memory before copying.
  blit_sdma_command(self.sdma_flush_hdp_pkt)
  blit_sdma_command(self.sdma_cache_inv)

  # A single copy_linear packet moves at most SDMA_MAX_COPY_SIZE bytes; split larger copies.
  copied = 0
  copies_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
  for _ in range(copies_commands):
    step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
    # count field is bytes-minus-one per the SDMA linear-copy packet format.
    blit_sdma_command(sdma_pkts.copy_linear(op=amd_sdma.SDMA_OP_COPY, sub_op=amd_sdma.SDMA_SUBOP_COPY_LINEAR,
                                            count=step_copy_size-1, src_addr=src+copied, dst_addr=dest+copied))
    copied += step_copy_size

  # Write back caches so the copied data is visible to other agents.
  blit_sdma_command(self.sdma_cache_wb)
  if completion_signal is not None:
    # Record the copy end time into the signal's end_ts field.
    blit_sdma_command(sdma_pkts.timestamp(op=amd_sdma.SDMA_OP_TIMESTAMP, sub_op=amd_sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL,
                                          addr=ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'end_ts').offset))

  if completion_signal is not None:
    signal_addr = ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'value').offset
    # ADD64 of (1<<64)-1 is a two's-complement -1: atomically decrement the signal value.
    blit_sdma_command(sdma_pkts.atomic(op=amd_sdma.SDMA_OP_ATOMIC, operation=amd_sdma.SDMA_ATOMIC_ADD64, addr=signal_addr, src_data=(1<<64)-1))
    if completion_signal.event_mailbox_ptr != 0:
      # Write the event id into the mailbox, then trap to raise the interrupt that wakes waiters.
      blit_sdma_command(sdma_pkts.fence(op=amd_sdma.SDMA_OP_FENCE, mtype=3, addr=completion_signal.event_mailbox_ptr,
                                        data=completion_signal.event_id))
      blit_sdma_command(sdma_pkts.trap(op=amd_sdma.SDMA_OP_TRAP, int_ctx=completion_signal.event_id))

  # Publish the new write pointer, then ring the doorbell to kick the engine.
  self.sdma_write_pointer[0] = self.sdma_doorbell_value
  self.sdma_doorbell[0] = self.sdma_doorbell_value
|
||||
Reference in New Issue
Block a user