kfd driver wip (#3912)

* kfd driver wip

* cleanups

* kfd almost ready to ring doorbell

* ding dong?

* issues with signals

* something

* works

* ops kfd

* add amd_signal_t

* works...sometimes

* program runs

* _gpu_alloc cleanup

* cleanups

* work

* header + enable profiling (#3959)

* header + enable profiling

* just cleaner

* measure

* only local time domain

* remove old comments

* fix with master

* elf parsing (#3965)

* elf parsing

* fix kernels with private

* not used

* clean up

* clean up 2

* add flags

* kfd sdma (#3970)

* working sdma

* remove driver, shorter

* all commands we might need

* svm

* kfd remove hardcoded values (#4007)

* remove hardcoded values

* match above line

* 7k lines + revert hsa

* update that from origin

* fix sdma reg gen

* not the updated SDMA

* compiler_opts

* don't require kfd_ioctl

* get ioctls from python

* get ioctls from python

* remove build_sdma_command

* merge into 64-bit fields

* shorter

* fix property spelling and off by one

---------

Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
George Hotz
2024-03-30 15:08:12 -07:00
committed by GitHub
parent bee8eeae55
commit 2abb474d43
11 changed files with 3527 additions and 3040 deletions

File diff suppressed because it is too large

tinygrad/runtime/autogen/hsa.py

@@ -3588,6 +3588,128 @@ try:
hsa_amd_vmem_get_alloc_properties_from_handle.argtypes = [hsa_amd_vmem_alloc_handle_t, ctypes.POINTER(struct_hsa_amd_memory_pool_s), ctypes.POINTER(c__EA_hsa_amd_memory_type_t)]
except AttributeError:
pass
amd_queue_properties32_t = ctypes.c_uint32
# values for enumeration 'amd_queue_properties_t'
amd_queue_properties_t__enumvalues = {
0: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_SHIFT',
1: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_WIDTH',
1: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER',
1: 'AMD_QUEUE_PROPERTIES_IS_PTR64_SHIFT',
1: 'AMD_QUEUE_PROPERTIES_IS_PTR64_WIDTH',
2: 'AMD_QUEUE_PROPERTIES_IS_PTR64',
2: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_SHIFT',
1: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_WIDTH',
4: 'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS',
3: 'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_SHIFT',
1: 'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_WIDTH',
8: 'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING',
4: 'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_SHIFT',
1: 'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_WIDTH',
16: 'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE',
5: 'AMD_QUEUE_PROPERTIES_RESERVED1_SHIFT',
27: 'AMD_QUEUE_PROPERTIES_RESERVED1_WIDTH',
-32: 'AMD_QUEUE_PROPERTIES_RESERVED1',
}
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_SHIFT = 0
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_WIDTH = 1
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER = 1
AMD_QUEUE_PROPERTIES_IS_PTR64_SHIFT = 1
AMD_QUEUE_PROPERTIES_IS_PTR64_WIDTH = 1
AMD_QUEUE_PROPERTIES_IS_PTR64 = 2
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_SHIFT = 2
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_WIDTH = 1
AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS = 4
AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_SHIFT = 3
AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_WIDTH = 1
AMD_QUEUE_PROPERTIES_ENABLE_PROFILING = 8
AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_SHIFT = 4
AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_WIDTH = 1
AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE = 16
AMD_QUEUE_PROPERTIES_RESERVED1_SHIFT = 5
AMD_QUEUE_PROPERTIES_RESERVED1_WIDTH = 27
AMD_QUEUE_PROPERTIES_RESERVED1 = -32
amd_queue_properties_t = ctypes.c_int32 # enum
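# NOTE: the bindings generator flattens each SHIFT/WIDTH/mask triplet into one enum, so
# the __enumvalues dict above has duplicate keys and later entries overwrite earlier ones.
# The module-level constants below are the usable form: each flag is a WIDTH-bit mask
# shifted left by SHIFT, e.g. AMD_QUEUE_PROPERTIES_ENABLE_PROFILING == 1 << 3 == 8.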
class struct_amd_queue_s(Structure):
pass
struct_amd_queue_s._pack_ = 1 # source:False
struct_amd_queue_s._fields_ = [
('hsa_queue', hsa_queue_t),
('reserved1', ctypes.c_uint32 * 4),
('write_dispatch_id', ctypes.c_uint64),
('group_segment_aperture_base_hi', ctypes.c_uint32),
('private_segment_aperture_base_hi', ctypes.c_uint32),
('max_cu_id', ctypes.c_uint32),
('max_wave_id', ctypes.c_uint32),
('max_legacy_doorbell_dispatch_id_plus_1', ctypes.c_uint64),
('legacy_doorbell_lock', ctypes.c_uint32),
('reserved2', ctypes.c_uint32 * 9),
('read_dispatch_id', ctypes.c_uint64),
('read_dispatch_id_field_base_byte_offset', ctypes.c_uint32),
('compute_tmpring_size', ctypes.c_uint32),
('scratch_resource_descriptor', ctypes.c_uint32 * 4),
('scratch_backing_memory_location', ctypes.c_uint64),
('scratch_backing_memory_byte_size', ctypes.c_uint64),
('scratch_wave64_lane_byte_size', ctypes.c_uint32),
('queue_properties', ctypes.c_uint32),
('reserved3', ctypes.c_uint32 * 2),
('queue_inactive_signal', hsa_signal_t),
('reserved4', ctypes.c_uint32 * 14),
]
amd_queue_t = struct_amd_queue_s
amd_signal_kind64_t = ctypes.c_int64
# values for enumeration 'amd_signal_kind_t'
amd_signal_kind_t__enumvalues = {
0: 'AMD_SIGNAL_KIND_INVALID',
1: 'AMD_SIGNAL_KIND_USER',
-1: 'AMD_SIGNAL_KIND_DOORBELL',
-2: 'AMD_SIGNAL_KIND_LEGACY_DOORBELL',
}
AMD_SIGNAL_KIND_INVALID = 0
AMD_SIGNAL_KIND_USER = 1
AMD_SIGNAL_KIND_DOORBELL = -1
AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2
amd_signal_kind_t = ctypes.c_int32 # enum
class struct_amd_signal_s(Structure):
pass
class union_amd_signal_s_0(Union):
pass
union_amd_signal_s_0._pack_ = 1 # source:False
union_amd_signal_s_0._fields_ = [
('value', ctypes.c_int64),
('legacy_hardware_doorbell_ptr', ctypes.POINTER(ctypes.c_uint32)),
('hardware_doorbell_ptr', ctypes.POINTER(ctypes.c_uint64)),
]
class union_amd_signal_s_1(Union):
pass
union_amd_signal_s_1._pack_ = 1 # source:False
union_amd_signal_s_1._fields_ = [
('queue_ptr', ctypes.POINTER(struct_amd_queue_s)),
('reserved2', ctypes.c_uint64),
]
struct_amd_signal_s._pack_ = 1 # source:False
struct_amd_signal_s._anonymous_ = ('_0', '_1',)
struct_amd_signal_s._fields_ = [
('kind', ctypes.c_int64),
('_0', union_amd_signal_s_0),
('event_mailbox_ptr', ctypes.c_uint64),
('event_id', ctypes.c_uint32),
('reserved1', ctypes.c_uint32),
('start_ts', ctypes.c_uint64),
('end_ts', ctypes.c_uint64),
('_1', union_amd_signal_s_1),
('reserved3', ctypes.c_uint32 * 2),
]
amd_signal_t = struct_amd_signal_s
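# NOTE: amd_signal_t mirrors the HSA runtime's in-memory signal ABI: `kind` selects the
# live union member (value for user signals, a doorbell pointer otherwise), start_ts/end_ts
# hold profiling timestamps, and event_mailbox_ptr/event_id name the KFD event to fire.
# ops_kfd.py below builds one by hand with kind=AMD_SIGNAL_KIND_USER.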
class struct_BrigModuleHeader(Structure):
pass
@@ -3713,7 +3835,27 @@ struct_hsa_ext_finalizer_1_00_pfn_s._fields_ = [
hsa_ext_finalizer_1_00_pfn_t = struct_hsa_ext_finalizer_1_00_pfn_s
__all__ = \
['BrigModule_t', 'HSA_ACCESS_PERMISSION_NONE',
['AMD_QUEUE_PROPERTIES_ENABLE_PROFILING',
'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_SHIFT',
'AMD_QUEUE_PROPERTIES_ENABLE_PROFILING_WIDTH',
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER',
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS',
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_SHIFT',
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS_WIDTH',
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_SHIFT',
'AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_WIDTH',
'AMD_QUEUE_PROPERTIES_IS_PTR64',
'AMD_QUEUE_PROPERTIES_IS_PTR64_SHIFT',
'AMD_QUEUE_PROPERTIES_IS_PTR64_WIDTH',
'AMD_QUEUE_PROPERTIES_RESERVED1',
'AMD_QUEUE_PROPERTIES_RESERVED1_SHIFT',
'AMD_QUEUE_PROPERTIES_RESERVED1_WIDTH',
'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE',
'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_SHIFT',
'AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE_WIDTH',
'AMD_SIGNAL_KIND_DOORBELL', 'AMD_SIGNAL_KIND_INVALID',
'AMD_SIGNAL_KIND_LEGACY_DOORBELL', 'AMD_SIGNAL_KIND_USER',
'BrigModule_t', 'HSA_ACCESS_PERMISSION_NONE',
'HSA_ACCESS_PERMISSION_RO', 'HSA_ACCESS_PERMISSION_RW',
'HSA_ACCESS_PERMISSION_WO', 'HSA_AGENT_FEATURE_AGENT_DISPATCH',
'HSA_AGENT_FEATURE_KERNEL_DISPATCH',
@@ -4082,8 +4224,10 @@ __all__ = \
'HSA_VARIABLE_SEGMENT_READONLY', 'HSA_WAIT_STATE_ACTIVE',
'HSA_WAIT_STATE_BLOCKED', 'HSA_WAVEFRONT_INFO_SIZE',
'MEMORY_TYPE_NONE', 'MEMORY_TYPE_PINNED',
'c__EA_hsa_access_permission_t', 'c__EA_hsa_agent_feature_t',
'c__EA_hsa_agent_info_t',
'amd_queue_properties32_t', 'amd_queue_properties_t',
'amd_queue_t', 'amd_signal_kind64_t', 'amd_signal_kind_t',
'amd_signal_t', 'c__EA_hsa_access_permission_t',
'c__EA_hsa_agent_feature_t', 'c__EA_hsa_agent_info_t',
'c__EA_hsa_amd_agent_memory_pool_info_t',
'c__EA_hsa_amd_copy_direction_t',
'c__EA_hsa_amd_hw_exception_reset_cause_t',
@@ -4422,6 +4566,7 @@ __all__ = \
'hsa_wait_state_t__enumvalues', 'hsa_wavefront_get_info',
'hsa_wavefront_info_t', 'hsa_wavefront_info_t__enumvalues',
'hsa_wavefront_t', 'int32_t', 'size_t', 'struct_BrigModuleHeader',
'struct_amd_queue_s', 'struct_amd_signal_s',
'struct_hsa_agent_dispatch_packet_s', 'struct_hsa_agent_s',
'struct_hsa_amd_barrier_value_packet_s', 'struct_hsa_amd_event_s',
'struct_hsa_amd_gpu_hw_exception_info_s',
@@ -4455,4 +4600,5 @@ __all__ = \
'struct_hsa_queue_s', 'struct_hsa_region_s',
'struct_hsa_signal_group_s', 'struct_hsa_signal_s',
'struct_hsa_wavefront_s', 'uint16_t', 'uint32_t', 'uint64_t',
'union_amd_signal_s_0', 'union_amd_signal_s_1',
'union_hsa_amd_event_s_0']

tinygrad/runtime/autogen/kfd.py (new file)

@@ -0,0 +1,812 @@
# mypy: ignore-errors
# -*- coding: utf-8 -*-
#
# TARGET arch is: []
# WORD_SIZE is: 8
# POINTER_SIZE is: 8
# LONGDOUBLE_SIZE is: 16
#
import ctypes, os
class AsDictMixin:
@classmethod
def as_dict(cls, self):
result = {}
if not isinstance(self, AsDictMixin):
# not a structure, assume it's already a python object
return self
if not hasattr(cls, "_fields_"):
return result
# sys.version_info >= (3, 5)
# for (field, *_) in cls._fields_: # noqa
for field_tuple in cls._fields_: # noqa
field = field_tuple[0]
if field.startswith('PADDING_'):
continue
value = getattr(self, field)
type_ = type(value)
if hasattr(value, "_length_") and hasattr(value, "_type_"):
# array
if not hasattr(type_, "as_dict"):
value = [v for v in value]
else:
type_ = type_._type_
value = [type_.as_dict(v) for v in value]
elif hasattr(value, "contents") and hasattr(value, "_type_"):
# pointer
try:
if not hasattr(type_, "as_dict"):
value = value.contents
else:
type_ = type_._type_
value = type_.as_dict(value.contents)
except ValueError:
# nullptr
value = None
elif isinstance(value, AsDictMixin):
# other structure
value = type_.as_dict(value)
result[field] = value
return result
class Structure(ctypes.Structure, AsDictMixin):
def __init__(self, *args, **kwds):
# We don't want to use positional arguments to fill PADDING_* fields
args = dict(zip(self.__class__._field_names_(), args))
args.update(kwds)
super(Structure, self).__init__(**args)
@classmethod
def _field_names_(cls):
if hasattr(cls, '_fields_'):
return (f[0] for f in cls._fields_ if not f[0].startswith('PADDING'))
else:
return ()
@classmethod
def get_type(cls, field):
for f in cls._fields_:
if f[0] == field:
return f[1]
return None
@classmethod
def bind(cls, bound_fields):
fields = {}
for name, type_ in cls._fields_:
if hasattr(type_, "restype"):
if name in bound_fields:
if bound_fields[name] is None:
fields[name] = type_()
else:
# use a closure to capture the callback from the loop scope
fields[name] = (
type_((lambda callback: lambda *args: callback(*args))(
bound_fields[name]))
)
del bound_fields[name]
else:
# default callback implementation (does nothing)
try:
default_ = type_(0).restype().value
except TypeError:
default_ = None
fields[name] = type_((
lambda default_: lambda *args: default_)(default_))
else:
# not a callback function, use default initialization
if name in bound_fields:
fields[name] = bound_fields[name]
del bound_fields[name]
else:
fields[name] = type_()
if len(bound_fields) != 0:
raise ValueError(
"Cannot bind the following unknown callback(s) {}.{}".format(
cls.__name__, bound_fields.keys()
))
return cls(**fields)
class Union(ctypes.Union, AsDictMixin):
pass
KFD_IOCTL_H_INCLUDED = True # macro
KFD_IOCTL_MAJOR_VERSION = 1 # macro
KFD_IOCTL_MINOR_VERSION = 6 # macro
KFD_IOC_QUEUE_TYPE_COMPUTE = 0x0 # macro
KFD_IOC_QUEUE_TYPE_SDMA = 0x1 # macro
KFD_IOC_QUEUE_TYPE_COMPUTE_AQL = 0x2 # macro
KFD_IOC_QUEUE_TYPE_SDMA_XGMI = 0x3 # macro
KFD_MAX_QUEUE_PERCENTAGE = 100 # macro
KFD_MAX_QUEUE_PRIORITY = 15 # macro
KFD_IOC_CACHE_POLICY_COHERENT = 0 # macro
KFD_IOC_CACHE_POLICY_NONCOHERENT = 1 # macro
NUM_OF_SUPPORTED_GPUS = 7 # macro
MAX_ALLOWED_NUM_POINTS = 100 # macro
MAX_ALLOWED_AW_BUFF_SIZE = 4096 # macro
MAX_ALLOWED_WAC_BUFF_SIZE = 128 # macro
KFD_IOC_EVENT_SIGNAL = 0 # macro
KFD_IOC_EVENT_NODECHANGE = 1 # macro
KFD_IOC_EVENT_DEVICESTATECHANGE = 2 # macro
KFD_IOC_EVENT_HW_EXCEPTION = 3 # macro
KFD_IOC_EVENT_SYSTEM_EVENT = 4 # macro
KFD_IOC_EVENT_DEBUG_EVENT = 5 # macro
KFD_IOC_EVENT_PROFILE_EVENT = 6 # macro
KFD_IOC_EVENT_QUEUE_EVENT = 7 # macro
KFD_IOC_EVENT_MEMORY = 8 # macro
KFD_IOC_WAIT_RESULT_COMPLETE = 0 # macro
KFD_IOC_WAIT_RESULT_TIMEOUT = 1 # macro
KFD_IOC_WAIT_RESULT_FAIL = 2 # macro
KFD_SIGNAL_EVENT_LIMIT = 4096 # macro
KFD_HW_EXCEPTION_WHOLE_GPU_RESET = 0 # macro
KFD_HW_EXCEPTION_PER_ENGINE_RESET = 1 # macro
KFD_HW_EXCEPTION_GPU_HANG = 0 # macro
KFD_HW_EXCEPTION_ECC = 1 # macro
KFD_MEM_ERR_NO_RAS = 0 # macro
KFD_MEM_ERR_SRAM_ECC = 1 # macro
KFD_MEM_ERR_POISON_CONSUMED = 2 # macro
KFD_MEM_ERR_GPU_HANG = 3 # macro
KFD_IOC_ALLOC_MEM_FLAGS_VRAM = (1<<0) # macro
KFD_IOC_ALLOC_MEM_FLAGS_GTT = (1<<1) # macro
KFD_IOC_ALLOC_MEM_FLAGS_USERPTR = (1<<2) # macro
KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL = (1<<3) # macro
KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP = (1<<4) # macro
KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE = (1<<31) # macro
KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE = (1<<30) # macro
KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC = (1<<29) # macro
KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE = (1<<28) # macro
KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM = (1<<27) # macro
KFD_IOC_ALLOC_MEM_FLAGS_COHERENT = (1<<26) # macro
KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED = (1<<25) # macro
# def KFD_SMI_EVENT_MASK_FROM_INDEX(i): # macro
# return (1<<((i)-1))
KFD_IOCTL_SVM_FLAG_HOST_ACCESS = 0x00000001 # macro
KFD_IOCTL_SVM_FLAG_COHERENT = 0x00000002 # macro
KFD_IOCTL_SVM_FLAG_HIVE_LOCAL = 0x00000004 # macro
KFD_IOCTL_SVM_FLAG_GPU_RO = 0x00000008 # macro
KFD_IOCTL_SVM_FLAG_GPU_EXEC = 0x00000010 # macro
KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY = 0x00000020 # macro
AMDKFD_IOCTL_BASE = 'K' # macro
# def AMDKFD_IO(nr): # macro
# return _IO('K',nr)
# def AMDKFD_IOR(nr, type): # macro
# return _IOR('K',nr,type)
# def AMDKFD_IOW(nr, type): # macro
# return _IOW('K',nr,type)
# def AMDKFD_IOWR(nr, type): # macro
# return _IOWR('K',nr,type)
# AMDKFD_IOC_GET_VERSION = _IOR('K',nr,type) ( 0x01 , struct kfd_ioctl_get_version_args ) # macro
# AMDKFD_IOC_CREATE_QUEUE = _IOWR('K',nr,type) ( 0x02 , struct kfd_ioctl_create_queue_args ) # macro
# AMDKFD_IOC_DESTROY_QUEUE = _IOWR('K',nr,type) ( 0x03 , struct kfd_ioctl_destroy_queue_args ) # macro
# AMDKFD_IOC_SET_MEMORY_POLICY = _IOW('K',nr,type) ( 0x04 , struct kfd_ioctl_set_memory_policy_args ) # macro
# AMDKFD_IOC_GET_CLOCK_COUNTERS = _IOWR('K',nr,type) ( 0x05 , struct kfd_ioctl_get_clock_counters_args ) # macro
# AMDKFD_IOC_GET_PROCESS_APERTURES = _IOR('K',nr,type) ( 0x06 , struct kfd_ioctl_get_process_apertures_args ) # macro
# AMDKFD_IOC_UPDATE_QUEUE = _IOW('K',nr,type) ( 0x07 , struct kfd_ioctl_update_queue_args ) # macro
# AMDKFD_IOC_CREATE_EVENT = _IOWR('K',nr,type) ( 0x08 , struct kfd_ioctl_create_event_args ) # macro
# AMDKFD_IOC_DESTROY_EVENT = _IOW('K',nr,type) ( 0x09 , struct kfd_ioctl_destroy_event_args ) # macro
# AMDKFD_IOC_SET_EVENT = _IOW('K',nr,type) ( 0x0A , struct kfd_ioctl_set_event_args ) # macro
# AMDKFD_IOC_RESET_EVENT = _IOW('K',nr,type) ( 0x0B , struct kfd_ioctl_reset_event_args ) # macro
# AMDKFD_IOC_WAIT_EVENTS = _IOWR('K',nr,type) ( 0x0C , struct kfd_ioctl_wait_events_args ) # macro
# AMDKFD_IOC_DBG_REGISTER = _IOW('K',nr,type) ( 0x0D , struct kfd_ioctl_dbg_register_args ) # macro
# AMDKFD_IOC_DBG_UNREGISTER = _IOW('K',nr,type) ( 0x0E , struct kfd_ioctl_dbg_unregister_args ) # macro
# AMDKFD_IOC_DBG_ADDRESS_WATCH = _IOW('K',nr,type) ( 0x0F , struct kfd_ioctl_dbg_address_watch_args ) # macro
# AMDKFD_IOC_DBG_WAVE_CONTROL = _IOW('K',nr,type) ( 0x10 , struct kfd_ioctl_dbg_wave_control_args ) # macro
# AMDKFD_IOC_SET_SCRATCH_BACKING_VA = _IOWR('K',nr,type) ( 0x11 , struct kfd_ioctl_set_scratch_backing_va_args ) # macro
# AMDKFD_IOC_GET_TILE_CONFIG = _IOWR('K',nr,type) ( 0x12 , struct kfd_ioctl_get_tile_config_args ) # macro
# AMDKFD_IOC_SET_TRAP_HANDLER = _IOW('K',nr,type) ( 0x13 , struct kfd_ioctl_set_trap_handler_args ) # macro
# AMDKFD_IOC_GET_PROCESS_APERTURES_NEW = _IOWR('K',nr,type) ( 0x14 , struct kfd_ioctl_get_process_apertures_new_args ) # macro
# AMDKFD_IOC_ACQUIRE_VM = _IOW('K',nr,type) ( 0x15 , struct kfd_ioctl_acquire_vm_args ) # macro
# AMDKFD_IOC_ALLOC_MEMORY_OF_GPU = _IOWR('K',nr,type) ( 0x16 , struct kfd_ioctl_alloc_memory_of_gpu_args ) # macro
# AMDKFD_IOC_FREE_MEMORY_OF_GPU = _IOW('K',nr,type) ( 0x17 , struct kfd_ioctl_free_memory_of_gpu_args ) # macro
# AMDKFD_IOC_MAP_MEMORY_TO_GPU = _IOWR('K',nr,type) ( 0x18 , struct kfd_ioctl_map_memory_to_gpu_args ) # macro
# AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU = _IOWR('K',nr,type) ( 0x19 , struct kfd_ioctl_unmap_memory_from_gpu_args ) # macro
# AMDKFD_IOC_SET_CU_MASK = _IOW('K',nr,type) ( 0x1A , struct kfd_ioctl_set_cu_mask_args ) # macro
# AMDKFD_IOC_GET_QUEUE_WAVE_STATE = _IOWR('K',nr,type) ( 0x1B , struct kfd_ioctl_get_queue_wave_state_args ) # macro
# AMDKFD_IOC_GET_DMABUF_INFO = _IOWR('K',nr,type) ( 0x1C , struct kfd_ioctl_get_dmabuf_info_args ) # macro
# AMDKFD_IOC_IMPORT_DMABUF = _IOWR('K',nr,type) ( 0x1D , struct kfd_ioctl_import_dmabuf_args ) # macro
# AMDKFD_IOC_ALLOC_QUEUE_GWS = _IOWR('K',nr,type) ( 0x1E , struct kfd_ioctl_alloc_queue_gws_args ) # macro
# AMDKFD_IOC_SMI_EVENTS = _IOWR('K',nr,type) ( 0x1F , struct kfd_ioctl_smi_events_args ) # macro
# AMDKFD_IOC_SVM = _IOWR('K',nr,type) ( 0x20 , struct kfd_ioctl_svm_args ) # macro
# AMDKFD_IOC_SET_XNACK_MODE = _IOWR('K',nr,type) ( 0x21 , struct kfd_ioctl_set_xnack_mode_args ) # macro
AMDKFD_COMMAND_START = 0x01 # macro
AMDKFD_COMMAND_END = 0x22 # macro
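# NOTE: the commented-out AMDKFD_IOC_* lines above still carry each ioctl's nr and struct
# name; ioctls_from_header() in ops_kfd.py regex-parses them to rebuild the ioctl table at
# runtime instead of reading /usr/include/linux/kfd_ioctl.h.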
class struct_kfd_ioctl_get_version_args(Structure):
pass
struct_kfd_ioctl_get_version_args._pack_ = 1 # source:False
struct_kfd_ioctl_get_version_args._fields_ = [
('major_version', ctypes.c_uint32),
('minor_version', ctypes.c_uint32),
]
class struct_kfd_ioctl_create_queue_args(Structure):
pass
struct_kfd_ioctl_create_queue_args._pack_ = 1 # source:False
struct_kfd_ioctl_create_queue_args._fields_ = [
('ring_base_address', ctypes.c_uint64),
('write_pointer_address', ctypes.c_uint64),
('read_pointer_address', ctypes.c_uint64),
('doorbell_offset', ctypes.c_uint64),
('ring_size', ctypes.c_uint32),
('gpu_id', ctypes.c_uint32),
('queue_type', ctypes.c_uint32),
('queue_percentage', ctypes.c_uint32),
('queue_priority', ctypes.c_uint32),
('queue_id', ctypes.c_uint32),
('eop_buffer_address', ctypes.c_uint64),
('eop_buffer_size', ctypes.c_uint64),
('ctx_save_restore_address', ctypes.c_uint64),
('ctx_save_restore_size', ctypes.c_uint32),
('ctl_stack_size', ctypes.c_uint32),
]
class struct_kfd_ioctl_destroy_queue_args(Structure):
pass
struct_kfd_ioctl_destroy_queue_args._pack_ = 1 # source:False
struct_kfd_ioctl_destroy_queue_args._fields_ = [
('queue_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_update_queue_args(Structure):
pass
struct_kfd_ioctl_update_queue_args._pack_ = 1 # source:False
struct_kfd_ioctl_update_queue_args._fields_ = [
('ring_base_address', ctypes.c_uint64),
('queue_id', ctypes.c_uint32),
('ring_size', ctypes.c_uint32),
('queue_percentage', ctypes.c_uint32),
('queue_priority', ctypes.c_uint32),
]
class struct_kfd_ioctl_set_cu_mask_args(Structure):
pass
struct_kfd_ioctl_set_cu_mask_args._pack_ = 1 # source:False
struct_kfd_ioctl_set_cu_mask_args._fields_ = [
('queue_id', ctypes.c_uint32),
('num_cu_mask', ctypes.c_uint32),
('cu_mask_ptr', ctypes.c_uint64),
]
class struct_kfd_ioctl_get_queue_wave_state_args(Structure):
pass
struct_kfd_ioctl_get_queue_wave_state_args._pack_ = 1 # source:False
struct_kfd_ioctl_get_queue_wave_state_args._fields_ = [
('ctl_stack_address', ctypes.c_uint64),
('ctl_stack_used_size', ctypes.c_uint32),
('save_area_used_size', ctypes.c_uint32),
('queue_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_set_memory_policy_args(Structure):
pass
struct_kfd_ioctl_set_memory_policy_args._pack_ = 1 # source:False
struct_kfd_ioctl_set_memory_policy_args._fields_ = [
('alternate_aperture_base', ctypes.c_uint64),
('alternate_aperture_size', ctypes.c_uint64),
('gpu_id', ctypes.c_uint32),
('default_policy', ctypes.c_uint32),
('alternate_policy', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_get_clock_counters_args(Structure):
pass
struct_kfd_ioctl_get_clock_counters_args._pack_ = 1 # source:False
struct_kfd_ioctl_get_clock_counters_args._fields_ = [
('gpu_clock_counter', ctypes.c_uint64),
('cpu_clock_counter', ctypes.c_uint64),
('system_clock_counter', ctypes.c_uint64),
('system_clock_freq', ctypes.c_uint64),
('gpu_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_process_device_apertures(Structure):
pass
struct_kfd_process_device_apertures._pack_ = 1 # source:False
struct_kfd_process_device_apertures._fields_ = [
('lds_base', ctypes.c_uint64),
('lds_limit', ctypes.c_uint64),
('scratch_base', ctypes.c_uint64),
('scratch_limit', ctypes.c_uint64),
('gpuvm_base', ctypes.c_uint64),
('gpuvm_limit', ctypes.c_uint64),
('gpu_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_get_process_apertures_args(Structure):
pass
struct_kfd_ioctl_get_process_apertures_args._pack_ = 1 # source:False
struct_kfd_ioctl_get_process_apertures_args._fields_ = [
('process_apertures', struct_kfd_process_device_apertures * 7),
('num_of_nodes', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_get_process_apertures_new_args(Structure):
pass
struct_kfd_ioctl_get_process_apertures_new_args._pack_ = 1 # source:False
struct_kfd_ioctl_get_process_apertures_new_args._fields_ = [
('kfd_process_device_apertures_ptr', ctypes.c_uint64),
('num_of_nodes', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_dbg_register_args(Structure):
pass
struct_kfd_ioctl_dbg_register_args._pack_ = 1 # source:False
struct_kfd_ioctl_dbg_register_args._fields_ = [
('gpu_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_dbg_unregister_args(Structure):
pass
struct_kfd_ioctl_dbg_unregister_args._pack_ = 1 # source:False
struct_kfd_ioctl_dbg_unregister_args._fields_ = [
('gpu_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_dbg_address_watch_args(Structure):
pass
struct_kfd_ioctl_dbg_address_watch_args._pack_ = 1 # source:False
struct_kfd_ioctl_dbg_address_watch_args._fields_ = [
('content_ptr', ctypes.c_uint64),
('gpu_id', ctypes.c_uint32),
('buf_size_in_bytes', ctypes.c_uint32),
]
class struct_kfd_ioctl_dbg_wave_control_args(Structure):
pass
struct_kfd_ioctl_dbg_wave_control_args._pack_ = 1 # source:False
struct_kfd_ioctl_dbg_wave_control_args._fields_ = [
('content_ptr', ctypes.c_uint64),
('gpu_id', ctypes.c_uint32),
('buf_size_in_bytes', ctypes.c_uint32),
]
class struct_kfd_ioctl_create_event_args(Structure):
pass
struct_kfd_ioctl_create_event_args._pack_ = 1 # source:False
struct_kfd_ioctl_create_event_args._fields_ = [
('event_page_offset', ctypes.c_uint64),
('event_trigger_data', ctypes.c_uint32),
('event_type', ctypes.c_uint32),
('auto_reset', ctypes.c_uint32),
('node_id', ctypes.c_uint32),
('event_id', ctypes.c_uint32),
('event_slot_index', ctypes.c_uint32),
]
class struct_kfd_ioctl_destroy_event_args(Structure):
pass
struct_kfd_ioctl_destroy_event_args._pack_ = 1 # source:False
struct_kfd_ioctl_destroy_event_args._fields_ = [
('event_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_set_event_args(Structure):
pass
struct_kfd_ioctl_set_event_args._pack_ = 1 # source:False
struct_kfd_ioctl_set_event_args._fields_ = [
('event_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_reset_event_args(Structure):
pass
struct_kfd_ioctl_reset_event_args._pack_ = 1 # source:False
struct_kfd_ioctl_reset_event_args._fields_ = [
('event_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_memory_exception_failure(Structure):
pass
struct_kfd_memory_exception_failure._pack_ = 1 # source:False
struct_kfd_memory_exception_failure._fields_ = [
('NotPresent', ctypes.c_uint32),
('ReadOnly', ctypes.c_uint32),
('NoExecute', ctypes.c_uint32),
('imprecise', ctypes.c_uint32),
]
class struct_kfd_hsa_memory_exception_data(Structure):
pass
struct_kfd_hsa_memory_exception_data._pack_ = 1 # source:False
struct_kfd_hsa_memory_exception_data._fields_ = [
('failure', struct_kfd_memory_exception_failure),
('va', ctypes.c_uint64),
('gpu_id', ctypes.c_uint32),
('ErrorType', ctypes.c_uint32),
]
class struct_kfd_hsa_hw_exception_data(Structure):
pass
struct_kfd_hsa_hw_exception_data._pack_ = 1 # source:False
struct_kfd_hsa_hw_exception_data._fields_ = [
('reset_type', ctypes.c_uint32),
('reset_cause', ctypes.c_uint32),
('memory_lost', ctypes.c_uint32),
('gpu_id', ctypes.c_uint32),
]
class struct_kfd_event_data(Structure):
pass
class union_kfd_event_data_0(Union):
pass
union_kfd_event_data_0._pack_ = 1 # source:False
union_kfd_event_data_0._fields_ = [
('memory_exception_data', struct_kfd_hsa_memory_exception_data),
('hw_exception_data', struct_kfd_hsa_hw_exception_data),
('PADDING_0', ctypes.c_ubyte * 16),
]
struct_kfd_event_data._pack_ = 1 # source:False
struct_kfd_event_data._anonymous_ = ('_0',)
struct_kfd_event_data._fields_ = [
('_0', union_kfd_event_data_0),
('kfd_event_data_ext', ctypes.c_uint64),
('event_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_wait_events_args(Structure):
pass
struct_kfd_ioctl_wait_events_args._pack_ = 1 # source:False
struct_kfd_ioctl_wait_events_args._fields_ = [
('events_ptr', ctypes.c_uint64),
('num_events', ctypes.c_uint32),
('wait_for_all', ctypes.c_uint32),
('timeout', ctypes.c_uint32),
('wait_result', ctypes.c_uint32),
]
class struct_kfd_ioctl_set_scratch_backing_va_args(Structure):
pass
struct_kfd_ioctl_set_scratch_backing_va_args._pack_ = 1 # source:False
struct_kfd_ioctl_set_scratch_backing_va_args._fields_ = [
('va_addr', ctypes.c_uint64),
('gpu_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_get_tile_config_args(Structure):
pass
struct_kfd_ioctl_get_tile_config_args._pack_ = 1 # source:False
struct_kfd_ioctl_get_tile_config_args._fields_ = [
('tile_config_ptr', ctypes.c_uint64),
('macro_tile_config_ptr', ctypes.c_uint64),
('num_tile_configs', ctypes.c_uint32),
('num_macro_tile_configs', ctypes.c_uint32),
('gpu_id', ctypes.c_uint32),
('gb_addr_config', ctypes.c_uint32),
('num_banks', ctypes.c_uint32),
('num_ranks', ctypes.c_uint32),
]
class struct_kfd_ioctl_set_trap_handler_args(Structure):
pass
struct_kfd_ioctl_set_trap_handler_args._pack_ = 1 # source:False
struct_kfd_ioctl_set_trap_handler_args._fields_ = [
('tba_addr', ctypes.c_uint64),
('tma_addr', ctypes.c_uint64),
('gpu_id', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_acquire_vm_args(Structure):
pass
struct_kfd_ioctl_acquire_vm_args._pack_ = 1 # source:False
struct_kfd_ioctl_acquire_vm_args._fields_ = [
('drm_fd', ctypes.c_uint32),
('gpu_id', ctypes.c_uint32),
]
class struct_kfd_ioctl_alloc_memory_of_gpu_args(Structure):
pass
struct_kfd_ioctl_alloc_memory_of_gpu_args._pack_ = 1 # source:False
struct_kfd_ioctl_alloc_memory_of_gpu_args._fields_ = [
('va_addr', ctypes.c_uint64),
('size', ctypes.c_uint64),
('handle', ctypes.c_uint64),
('mmap_offset', ctypes.c_uint64),
('gpu_id', ctypes.c_uint32),
('flags', ctypes.c_uint32),
]
class struct_kfd_ioctl_free_memory_of_gpu_args(Structure):
pass
struct_kfd_ioctl_free_memory_of_gpu_args._pack_ = 1 # source:False
struct_kfd_ioctl_free_memory_of_gpu_args._fields_ = [
('handle', ctypes.c_uint64),
]
class struct_kfd_ioctl_map_memory_to_gpu_args(Structure):
pass
struct_kfd_ioctl_map_memory_to_gpu_args._pack_ = 1 # source:False
struct_kfd_ioctl_map_memory_to_gpu_args._fields_ = [
('handle', ctypes.c_uint64),
('device_ids_array_ptr', ctypes.c_uint64),
('n_devices', ctypes.c_uint32),
('n_success', ctypes.c_uint32),
]
class struct_kfd_ioctl_unmap_memory_from_gpu_args(Structure):
pass
struct_kfd_ioctl_unmap_memory_from_gpu_args._pack_ = 1 # source:False
struct_kfd_ioctl_unmap_memory_from_gpu_args._fields_ = [
('handle', ctypes.c_uint64),
('device_ids_array_ptr', ctypes.c_uint64),
('n_devices', ctypes.c_uint32),
('n_success', ctypes.c_uint32),
]
class struct_kfd_ioctl_alloc_queue_gws_args(Structure):
pass
struct_kfd_ioctl_alloc_queue_gws_args._pack_ = 1 # source:False
struct_kfd_ioctl_alloc_queue_gws_args._fields_ = [
('queue_id', ctypes.c_uint32),
('num_gws', ctypes.c_uint32),
('first_gws', ctypes.c_uint32),
('pad', ctypes.c_uint32),
]
class struct_kfd_ioctl_get_dmabuf_info_args(Structure):
pass
struct_kfd_ioctl_get_dmabuf_info_args._pack_ = 1 # source:False
struct_kfd_ioctl_get_dmabuf_info_args._fields_ = [
('size', ctypes.c_uint64),
('metadata_ptr', ctypes.c_uint64),
('metadata_size', ctypes.c_uint32),
('gpu_id', ctypes.c_uint32),
('flags', ctypes.c_uint32),
('dmabuf_fd', ctypes.c_uint32),
]
class struct_kfd_ioctl_import_dmabuf_args(Structure):
pass
struct_kfd_ioctl_import_dmabuf_args._pack_ = 1 # source:False
struct_kfd_ioctl_import_dmabuf_args._fields_ = [
('va_addr', ctypes.c_uint64),
('handle', ctypes.c_uint64),
('gpu_id', ctypes.c_uint32),
('dmabuf_fd', ctypes.c_uint32),
]
# values for enumeration 'kfd_smi_event'
kfd_smi_event__enumvalues = {
0: 'KFD_SMI_EVENT_NONE',
1: 'KFD_SMI_EVENT_VMFAULT',
2: 'KFD_SMI_EVENT_THERMAL_THROTTLE',
3: 'KFD_SMI_EVENT_GPU_PRE_RESET',
4: 'KFD_SMI_EVENT_GPU_POST_RESET',
}
KFD_SMI_EVENT_NONE = 0
KFD_SMI_EVENT_VMFAULT = 1
KFD_SMI_EVENT_THERMAL_THROTTLE = 2
KFD_SMI_EVENT_GPU_PRE_RESET = 3
KFD_SMI_EVENT_GPU_POST_RESET = 4
kfd_smi_event = ctypes.c_uint32 # enum
class struct_kfd_ioctl_smi_events_args(Structure):
pass
struct_kfd_ioctl_smi_events_args._pack_ = 1 # source:False
struct_kfd_ioctl_smi_events_args._fields_ = [
('gpuid', ctypes.c_uint32),
('anon_fd', ctypes.c_uint32),
]
# values for enumeration 'kfd_mmio_remap'
kfd_mmio_remap__enumvalues = {
0: 'KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL',
4: 'KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL',
}
KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL = 0
KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL = 4
kfd_mmio_remap = ctypes.c_uint32 # enum
# values for enumeration 'kfd_ioctl_svm_op'
kfd_ioctl_svm_op__enumvalues = {
0: 'KFD_IOCTL_SVM_OP_SET_ATTR',
1: 'KFD_IOCTL_SVM_OP_GET_ATTR',
}
KFD_IOCTL_SVM_OP_SET_ATTR = 0
KFD_IOCTL_SVM_OP_GET_ATTR = 1
kfd_ioctl_svm_op = ctypes.c_uint32 # enum
# values for enumeration 'kfd_ioctl_svm_location'
kfd_ioctl_svm_location__enumvalues = {
0: 'KFD_IOCTL_SVM_LOCATION_SYSMEM',
4294967295: 'KFD_IOCTL_SVM_LOCATION_UNDEFINED',
}
KFD_IOCTL_SVM_LOCATION_SYSMEM = 0
KFD_IOCTL_SVM_LOCATION_UNDEFINED = 4294967295
kfd_ioctl_svm_location = ctypes.c_uint32 # enum
# values for enumeration 'kfd_ioctl_svm_attr_type'
kfd_ioctl_svm_attr_type__enumvalues = {
0: 'KFD_IOCTL_SVM_ATTR_PREFERRED_LOC',
1: 'KFD_IOCTL_SVM_ATTR_PREFETCH_LOC',
2: 'KFD_IOCTL_SVM_ATTR_ACCESS',
3: 'KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE',
4: 'KFD_IOCTL_SVM_ATTR_NO_ACCESS',
5: 'KFD_IOCTL_SVM_ATTR_SET_FLAGS',
6: 'KFD_IOCTL_SVM_ATTR_CLR_FLAGS',
7: 'KFD_IOCTL_SVM_ATTR_GRANULARITY',
}
KFD_IOCTL_SVM_ATTR_PREFERRED_LOC = 0
KFD_IOCTL_SVM_ATTR_PREFETCH_LOC = 1
KFD_IOCTL_SVM_ATTR_ACCESS = 2
KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE = 3
KFD_IOCTL_SVM_ATTR_NO_ACCESS = 4
KFD_IOCTL_SVM_ATTR_SET_FLAGS = 5
KFD_IOCTL_SVM_ATTR_CLR_FLAGS = 6
KFD_IOCTL_SVM_ATTR_GRANULARITY = 7
kfd_ioctl_svm_attr_type = ctypes.c_uint32 # enum
class struct_kfd_ioctl_svm_attribute(Structure):
pass
struct_kfd_ioctl_svm_attribute._pack_ = 1 # source:False
struct_kfd_ioctl_svm_attribute._fields_ = [
('type', ctypes.c_uint32),
('value', ctypes.c_uint32),
]
class struct_kfd_ioctl_svm_args(Structure):
pass
struct_kfd_ioctl_svm_args._pack_ = 1 # source:False
struct_kfd_ioctl_svm_args._fields_ = [
('start_addr', ctypes.c_uint64),
('size', ctypes.c_uint64),
('op', ctypes.c_uint32),
('nattr', ctypes.c_uint32),
('attrs', struct_kfd_ioctl_svm_attribute * 0),
]
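# NOTE: `attrs` is a flexible array member (length 0), so callers that pass SVM attributes
# must build a sized variant of this struct themselves, as ops_kfd.py does with
# init_c_struct_t for its two-attribute userptr mapping.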
class struct_kfd_ioctl_set_xnack_mode_args(Structure):
pass
struct_kfd_ioctl_set_xnack_mode_args._pack_ = 1 # source:False
struct_kfd_ioctl_set_xnack_mode_args._fields_ = [
('xnack_enabled', ctypes.c_int32),
]
__all__ = \
['AMDKFD_COMMAND_END', 'AMDKFD_COMMAND_START',
'AMDKFD_IOCTL_BASE', 'KFD_HW_EXCEPTION_ECC',
'KFD_HW_EXCEPTION_GPU_HANG', 'KFD_HW_EXCEPTION_PER_ENGINE_RESET',
'KFD_HW_EXCEPTION_WHOLE_GPU_RESET', 'KFD_IOCTL_H_INCLUDED',
'KFD_IOCTL_MAJOR_VERSION', 'KFD_IOCTL_MINOR_VERSION',
'KFD_IOCTL_SVM_ATTR_ACCESS', 'KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE',
'KFD_IOCTL_SVM_ATTR_CLR_FLAGS', 'KFD_IOCTL_SVM_ATTR_GRANULARITY',
'KFD_IOCTL_SVM_ATTR_NO_ACCESS',
'KFD_IOCTL_SVM_ATTR_PREFERRED_LOC',
'KFD_IOCTL_SVM_ATTR_PREFETCH_LOC', 'KFD_IOCTL_SVM_ATTR_SET_FLAGS',
'KFD_IOCTL_SVM_FLAG_COHERENT', 'KFD_IOCTL_SVM_FLAG_GPU_EXEC',
'KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY', 'KFD_IOCTL_SVM_FLAG_GPU_RO',
'KFD_IOCTL_SVM_FLAG_HIVE_LOCAL', 'KFD_IOCTL_SVM_FLAG_HOST_ACCESS',
'KFD_IOCTL_SVM_LOCATION_SYSMEM',
'KFD_IOCTL_SVM_LOCATION_UNDEFINED', 'KFD_IOCTL_SVM_OP_GET_ATTR',
'KFD_IOCTL_SVM_OP_SET_ATTR',
'KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM',
'KFD_IOC_ALLOC_MEM_FLAGS_COHERENT',
'KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL',
'KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE',
'KFD_IOC_ALLOC_MEM_FLAGS_GTT',
'KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP',
'KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE',
'KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC',
'KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED',
'KFD_IOC_ALLOC_MEM_FLAGS_USERPTR', 'KFD_IOC_ALLOC_MEM_FLAGS_VRAM',
'KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE',
'KFD_IOC_CACHE_POLICY_COHERENT',
'KFD_IOC_CACHE_POLICY_NONCOHERENT', 'KFD_IOC_EVENT_DEBUG_EVENT',
'KFD_IOC_EVENT_DEVICESTATECHANGE', 'KFD_IOC_EVENT_HW_EXCEPTION',
'KFD_IOC_EVENT_MEMORY', 'KFD_IOC_EVENT_NODECHANGE',
'KFD_IOC_EVENT_PROFILE_EVENT', 'KFD_IOC_EVENT_QUEUE_EVENT',
'KFD_IOC_EVENT_SIGNAL', 'KFD_IOC_EVENT_SYSTEM_EVENT',
'KFD_IOC_QUEUE_TYPE_COMPUTE', 'KFD_IOC_QUEUE_TYPE_COMPUTE_AQL',
'KFD_IOC_QUEUE_TYPE_SDMA', 'KFD_IOC_QUEUE_TYPE_SDMA_XGMI',
'KFD_IOC_WAIT_RESULT_COMPLETE', 'KFD_IOC_WAIT_RESULT_FAIL',
'KFD_IOC_WAIT_RESULT_TIMEOUT', 'KFD_MAX_QUEUE_PERCENTAGE',
'KFD_MAX_QUEUE_PRIORITY', 'KFD_MEM_ERR_GPU_HANG',
'KFD_MEM_ERR_NO_RAS', 'KFD_MEM_ERR_POISON_CONSUMED',
'KFD_MEM_ERR_SRAM_ECC', 'KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL',
'KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL', 'KFD_SIGNAL_EVENT_LIMIT',
'KFD_SMI_EVENT_GPU_POST_RESET', 'KFD_SMI_EVENT_GPU_PRE_RESET',
'KFD_SMI_EVENT_NONE', 'KFD_SMI_EVENT_THERMAL_THROTTLE',
'KFD_SMI_EVENT_VMFAULT', 'MAX_ALLOWED_AW_BUFF_SIZE',
'MAX_ALLOWED_NUM_POINTS', 'MAX_ALLOWED_WAC_BUFF_SIZE',
'NUM_OF_SUPPORTED_GPUS', 'kfd_ioctl_svm_attr_type',
'kfd_ioctl_svm_location', 'kfd_ioctl_svm_op', 'kfd_mmio_remap',
'kfd_smi_event', 'struct_kfd_event_data',
'struct_kfd_hsa_hw_exception_data',
'struct_kfd_hsa_memory_exception_data',
'struct_kfd_ioctl_acquire_vm_args',
'struct_kfd_ioctl_alloc_memory_of_gpu_args',
'struct_kfd_ioctl_alloc_queue_gws_args',
'struct_kfd_ioctl_create_event_args',
'struct_kfd_ioctl_create_queue_args',
'struct_kfd_ioctl_dbg_address_watch_args',
'struct_kfd_ioctl_dbg_register_args',
'struct_kfd_ioctl_dbg_unregister_args',
'struct_kfd_ioctl_dbg_wave_control_args',
'struct_kfd_ioctl_destroy_event_args',
'struct_kfd_ioctl_destroy_queue_args',
'struct_kfd_ioctl_free_memory_of_gpu_args',
'struct_kfd_ioctl_get_clock_counters_args',
'struct_kfd_ioctl_get_dmabuf_info_args',
'struct_kfd_ioctl_get_process_apertures_args',
'struct_kfd_ioctl_get_process_apertures_new_args',
'struct_kfd_ioctl_get_queue_wave_state_args',
'struct_kfd_ioctl_get_tile_config_args',
'struct_kfd_ioctl_get_version_args',
'struct_kfd_ioctl_import_dmabuf_args',
'struct_kfd_ioctl_map_memory_to_gpu_args',
'struct_kfd_ioctl_reset_event_args',
'struct_kfd_ioctl_set_cu_mask_args',
'struct_kfd_ioctl_set_event_args',
'struct_kfd_ioctl_set_memory_policy_args',
'struct_kfd_ioctl_set_scratch_backing_va_args',
'struct_kfd_ioctl_set_trap_handler_args',
'struct_kfd_ioctl_set_xnack_mode_args',
'struct_kfd_ioctl_smi_events_args', 'struct_kfd_ioctl_svm_args',
'struct_kfd_ioctl_svm_attribute',
'struct_kfd_ioctl_unmap_memory_from_gpu_args',
'struct_kfd_ioctl_update_queue_args',
'struct_kfd_ioctl_wait_events_args',
'struct_kfd_memory_exception_failure',
'struct_kfd_process_device_apertures', 'union_kfd_event_data_0']

tinygrad/runtime/ops_kfd.py (new file)

@@ -0,0 +1,333 @@
from __future__ import annotations
from typing import Tuple
import os, fcntl, ctypes, functools, re, pathlib, mmap, struct
from tinygrad.device import Compiled, LRUAllocator, Compiler, BufferOptions, CompilerOptions
from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up
from tinygrad.renderer.cstyle import HIPRenderer
from tinygrad.runtime.driver.hip_comgr import compile_hip
import tinygrad.runtime.autogen.kfd as kfd
import tinygrad.runtime.autogen.hsa as hsa
import tinygrad.runtime.autogen.amd_sdma as amd_sdma
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401
libc = ctypes.CDLL("libc.so.6")
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
libc.mmap.restype = ctypes.c_void_p
def node_sysfs_path(node_id, file): return f"/sys/devices/virtual/kfd/kfd/topology/nodes/{node_id}/{file}"
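# each kfd topology node exposes a gpu_id file and a key/value properties file; __init__
# below reads both to find the gpu's id, the drm render minor and the gfx target version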
def kfd_ioctl(idir, nr, user_struct, fd, made_struct=None, **kwargs):
made = made_struct or user_struct(**kwargs)
ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(made)<<16) | (ord('K')<<8) | nr, made)
if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
return made
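# the request number above is the standard linux _IOC encoding: direction in the top two
# bits, struct size in bits 16-29, type 'K' in bits 8-15, then nr. as a worked example
# (not used at runtime): AMDKFD_IOC_GET_VERSION is _IOR nr 0x01 with an 8-byte struct, so
# (2<<30) | (8<<16) | (ord('K')<<8) | 0x01 == 0x80084B01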
def ioctls_from_header():
#hdr = pathlib.Path("/usr/include/linux/kfd_ioctl.h").read_text().replace("\\\n", "")
#pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_(IOW?R?)\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
#matches = re.findall(pattern, hdr, re.MULTILINE)
# get this from python instead
hdrpy = (pathlib.Path(__file__).parent / "autogen" / "kfd.py").read_text()
pattern = r'# (AMDKFD_IOC_[A-Z0-9_]+)\s=\s_(IOW?R?).*\(( 0x[0-9a-fA-F]+) ,\s+struct\s([A-Za-z0-9_]+)\s+\)'
matches = re.findall(pattern, hdrpy, re.MULTILINE)
idirs = {"IOW": 1, "IOR": 2, "IOWR": 3}
fxns = {name.replace("AMDKFD_IOC_", "").lower():
functools.partial(kfd_ioctl, idirs[idir], int(nr, 0x10), getattr(kfd, "struct_"+sname))
for name, idir, nr, sname in matches}
return type("KIO", (object, ), fxns)
kio = ioctls_from_header()
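# kio now has one method per ioctl, taking the fd plus struct fields as kwargs, e.g.:
# ver = kio.get_version(KFDDevice.kfd)  # fills and returns struct_kfd_ioctl_get_version_args
# print(ver.major_version, ver.minor_version)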
def create_sdma_packets():
# TODO: clean this up, if we want to keep it
structs = {}
for name,pkt in [(name,s) for name,s in amd_sdma.__dict__.items() if name.startswith("struct_SDMA_PKT_") and name.endswith("_TAG")]:
names = set()
fields = []
for pkt_fields in pkt._fields_:
if not pkt_fields[0].endswith("_UNION"): fields.append(pkt_fields)
else:
assert pkt_fields[1]._fields_[0][0] == '_0'
for union_fields in pkt_fields[1]._fields_[0][1]._fields_:
fname = union_fields[0]
if fname in names: fname = pkt_fields[0]+fname
names.add(fname)
if fname.endswith("_63_32") and fields[-1][0].endswith("_31_0"):
fields[-1] = tuple([fname[:-6], ctypes.c_ulong, 64]) # merge together 64-bit fields
else:
fields.append(tuple([fname, *union_fields[1:]]))
new_name = name[16:-4].lower()
structs[new_name] = init_c_struct_t(tuple(fields))
assert ctypes.sizeof(structs[new_name]) == ctypes.sizeof(pkt), f"{ctypes.sizeof(structs[new_name])} != {ctypes.sizeof(pkt)}"
return type("SDMA_PKTS", (object, ), structs)
sdma_pkts = create_sdma_packets()
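# sdma_pkts flattens each generated SDMA_PKT_*_TAG (a struct of unions of bitfields) into
# one flat bitfield struct per packet, renaming colliding fields and fusing *_31_0/*_63_32
# pairs into single 64-bit fields; the sizeof assert guarantees the layout is unchanged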
class KFDCompiler(Compiler):
compiler_opts = CompilerOptions("KFD", has_tensor_cores=True, shared_max=65536)
def __init__(self, arch:str):
self.arch = arch
super().__init__(f"compile_hip_{self.arch}")
def render(self, name:str, uops) -> str: return HIPRenderer(name, uops)
def compile(self, src:str) -> bytes: return compile_hip(src, self.arch)
AQL_PACKET_SIZE = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t)
SDMA_MAX_COPY_SIZE = 0x400000
DISPATCH_KERNEL_SETUP = 3 << hsa.HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
DISPATCH_KERNEL_HEADER = 1 << hsa.HSA_PACKET_HEADER_BARRIER
DISPATCH_KERNEL_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE
DISPATCH_KERNEL_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE
DISPATCH_KERNEL_HEADER |= hsa.HSA_PACKET_TYPE_KERNEL_DISPATCH << hsa.HSA_PACKET_HEADER_TYPE
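# the header marks the packet as KERNEL_DISPATCH with the barrier bit set and system-scope
# acquire/release fences: each dispatch waits for the previous packet to complete and
# makes its writes visible to the host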
SHT_PROGBITS = 0x1
SHF_ALLOC = 0x2
class KFDProgram:
def __init__(self, device:KFDDevice, name:str, lib:bytes):
# TODO: this API needs the type signature of the function and global_size/local_size
self.device, self.name, self.lib = device, name, lib
_phoff, _shoff, _flags, _ehsize, _phentsize, _phnum, _shentsize, _shnum, _shstrndx = struct.unpack_from("<QQIHHHHHH", self.lib, 0x20)
sections = [struct.unpack_from("<IIQQQQIIQ", self.lib, _shoff + i * _shentsize) for i in range(_shnum)]
lib_gpu_size = round_up(max(sh[5]+sh[3] for sh in sections if sh[1] == SHT_PROGBITS), 0x1000)
self.lib_gpu = self.device._gpu_alloc(lib_gpu_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
lib_gpu_view = to_mv(self.lib_gpu.va_addr, lib_gpu_size)
for _, sh_type, sh_flags, sh_addr, sh_offset, sh_size, _, _, _ in sections:
if sh_type == SHT_PROGBITS and sh_flags & SHF_ALLOC: lib_gpu_view[sh_addr:sh_addr+sh_size] = self.lib[sh_offset:sh_offset+sh_size]
entry_point = min(sh[3] for sh in sections if sh[1] == SHT_PROGBITS and sh[2] & SHF_ALLOC)
self.handle = self.lib_gpu.va_addr + entry_point
self.group_segment_size = lib_gpu_view.cast("I")[entry_point//4]
self.private_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 1]
self.kernargs_segment_size = lib_gpu_view.cast("I")[entry_point//4 + 2]
assert self.private_segment_size <= self.device.max_private_segment_size, \
f"{self.private_segment_size=} > {self.device.max_private_segment_size=}"
# NOTE: no programs are ever freed
def __del__(self): kio.free_memory_of_gpu(KFDDevice.kfd, handle=self.lib_gpu.handle)
def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
if not hasattr(self, "args_struct_t"):
self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
[(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size:
raise RuntimeError(f"HSAProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
args_st = self.args_struct_t.from_address(self.device.kernargs.va_addr)
for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i].va_addr)
for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])
self.device.completion_signal.value = 1 # reset the signal before call
packet = hsa.hsa_kernel_dispatch_packet_t.from_address(self.device.aql_ring.va_addr +
(self.device.aql_doorbell_value*AQL_PACKET_SIZE) % self.device.aql_ring.size)
packet.workgroup_size_x, packet.workgroup_size_y, packet.workgroup_size_z = local_size
packet.reserved0 = 0
packet.grid_size_x, packet.grid_size_y, packet.grid_size_z = tuple(g*l for g,l in zip(global_size, local_size))
packet.kernel_object = self.handle
packet.kernarg_address = self.device.kernargs.va_addr
packet.group_segment_size = self.group_segment_size
packet.private_segment_size = self.private_segment_size # what is this and why doesn't it work? (see TestOps.test_dilated_conv_transpose2d)
packet.reserved2 = 0
packet.completion_signal = hsa.hsa_signal_t(ctypes.addressof(self.device.completion_signal))
packet.setup = DISPATCH_KERNEL_SETUP
packet.header = DISPATCH_KERNEL_HEADER
# one pending packet + ring doorbell
self.device.amd_aql_queue.write_dispatch_id = self.device.aql_doorbell_value + 1
self.device.aql_doorbell[0] = self.device.aql_doorbell_value
self.device.aql_doorbell_value += 1
evt_arr = (kfd.struct_kfd_event_data * 1)()
evt_arr[0].event_id = self.device.completion_signal.event_id
kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000)
assert (wp:=self.device.amd_aql_queue.write_dispatch_id) == (rp:=self.device.amd_aql_queue.read_dispatch_id), f"didn't run {wp} != {rp}"
if wait: return (self.device.completion_signal.end_ts-self.device.completion_signal.start_ts)/1e9
class KFDAllocator(LRUAllocator):
def __init__(self, device:KFDDevice):
self.device = device
super().__init__()
def _alloc(self, size:int, options:BufferOptions):
if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
else: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
def copyin(self, dest, src: memoryview):
# TODO: need to make the address visible to gpu and pass it directly to sdma.
self.device._map_userptr_to_gpu(ctypes.addressof(from_mv(src).contents), src.nbytes)
self.device.completion_signal.value = 1
self.device._submit_sdma(dest.va_addr, ctypes.addressof(from_mv(src).contents), src.nbytes, completion_signal=self.device.completion_signal)
evt_arr = (kfd.struct_kfd_event_data * 1)()
evt_arr[0].event_id = self.device.completion_signal.event_id
kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000)
def copyout(self, dest:memoryview, src):
self.device._map_userptr_to_gpu(ctypes.addressof(from_mv(dest).contents), dest.nbytes)
self.device.completion_signal.value = 1
self.device._submit_sdma(ctypes.addressof(from_mv(dest).contents), src.va_addr, dest.nbytes, completion_signal=self.device.completion_signal)
evt_arr = (kfd.struct_kfd_event_data * 1)()
evt_arr[0].event_id = self.device.completion_signal.event_id
kio.wait_events(KFDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000)
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
class KFDDevice(Compiled):
kfd:int = -1
def _map_userptr_to_gpu(self, addr, size):
self.map_uptr2gpu_struct.start_addr = addr&~0xfff
self.map_uptr2gpu_struct.size = round_up(size+addr-(addr&~0xfff), 0x1000)
kio.svm(self.kfd, made_struct=self.map_uptr2gpu_struct)
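# rounds the host range out to whole 4KiB pages, then issues the SVM SET_ATTR ioctl with
# the COHERENT + ACCESS_IN_PLACE attributes prepared in __init__ so the gpu can access
# the userptr pages in place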
def _gpu_alloc(self, size:int, flags:int, uncached=False, public=False, map_to_gpu=True):
flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED
if public: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC
if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR:
buf = addr = libc.mmap(0, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
else:
buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
assert addr != 0xffffffffffffffff
mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset)
assert buf != 0xffffffffffffffff
assert addr == buf == mem.va_addr
if map_to_gpu:
arr = (ctypes.c_int32 * 1)(self.gpu_id)
stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
assert stm.n_success == 1
return mem
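# two mmap strategies: USERPTR allocations are plain anonymous host pages registered with
# kfd, while VRAM/GTT allocations first reserve an inaccessible va range, let kfd pick an
# mmap_offset, then map the drm render node over the same range with MAP_FIXED so cpu and
# gpu agree on va_addr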
def __init__(self, device:str=""):
if KFDDevice.kfd == -1: KFDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
self.device_id = int(device.split(":")[1]) if ":" in device else 0
with open(node_sysfs_path(self.device_id+1, "gpu_id"), "r") as f: self.gpu_id = int(f.read())
with open(node_sysfs_path(self.device_id+1, "properties"), "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
self.arch = f"gfx{self.properties['gfx_target_version']//100}"
kio.acquire_vm(KFDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
self.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
self.sync_event = kio.create_event(KFDDevice.kfd, event_page_offset=self.event_page.handle, auto_reset=1)
self.eop_buffer = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.aql_ring = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
self.signals_page = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
self.gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
self.kernargs = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.ctx_save_restore_address = self._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.completion_signal = hsa.amd_signal_t.from_address(self.signals_page.va_addr)
self.completion_signal.value = 1
self.completion_signal.kind = hsa.AMD_SIGNAL_KIND_USER
self.completion_signal.event_mailbox_ptr = self.event_page.va_addr + self.sync_event.event_slot_index*8
self.completion_signal.event_id = self.sync_event.event_id
# AQL Queue
self.amd_aql_queue = hsa.amd_queue_t.from_address(self.gart.va_addr)
self.amd_aql_queue.write_dispatch_id = 0
self.amd_aql_queue.read_dispatch_id = 0
self.amd_aql_queue.read_dispatch_id_field_base_byte_offset = getattr(hsa.amd_queue_t, 'read_dispatch_id').offset
self.amd_aql_queue.queue_properties = hsa.AMD_QUEUE_PROPERTIES_IS_PTR64 | hsa.AMD_QUEUE_PROPERTIES_ENABLE_PROFILING
self.amd_aql_queue.max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
self.amd_aql_queue.max_wave_id = self.properties['max_waves_per_simd'] * self.properties['simd_per_cu'] - 1
# scratch setup
self.max_private_segment_size = 256
self.scratch_len = self.max_private_segment_size * (self.amd_aql_queue.max_cu_id + 1) * (self.amd_aql_queue.max_wave_id + 1)
self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
self.amd_aql_queue.scratch_backing_memory_location = self.scratch.va_addr
self.amd_aql_queue.scratch_backing_memory_byte_size = self.scratch_len
self.amd_aql_queue.scratch_wave64_lane_byte_size = self.max_private_segment_size * (self.amd_aql_queue.max_wave_id + 1) // 64
self.amd_aql_queue.scratch_resource_descriptor[0] = self.scratch.va_addr & 0xFFFFFFFF
self.amd_aql_queue.scratch_resource_descriptor[1] = ((self.scratch.va_addr >> 32) & 0xFFFF) | (1 << 30) # va_hi | SWIZZLE_ENABLE
self.amd_aql_queue.scratch_resource_descriptor[2] = self.scratch_len & 0xFFFFFFFF
self.amd_aql_queue.scratch_resource_descriptor[3] = 0x20814fac # FORMAT=BUF_FORMAT_32_UINT,OOB_SELECT=2,ADD_TID_ENABLE=1,TYPE=SQ_RSRC_BUF,SQ_SELs
wave_scratch = (((self.amd_aql_queue.max_wave_id + 1) * self.max_private_segment_size + 255) // 256)
self.amd_aql_queue.compute_tmpring_size = wave_scratch << 12 | (self.amd_aql_queue.max_cu_id + 1)
self.aql_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.aql_ring.va_addr, ring_size=self.aql_ring.size, gpu_id=self.gpu_id,
queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
eop_buffer_address=self.eop_buffer.va_addr, eop_buffer_size=self.eop_buffer.size,
ctx_save_restore_address=self.ctx_save_restore_address.va_addr, ctx_save_restore_size=self.ctx_save_restore_address.size,
ctl_stack_size = 0xa000,
write_pointer_address=self.gart.va_addr + getattr(hsa.amd_queue_t, 'write_dispatch_id').offset,
read_pointer_address=self.gart.va_addr + getattr(hsa.amd_queue_t, 'read_dispatch_id').offset)
self.doorbells_base = self.aql_queue.doorbell_offset & (~0xfff)
self.doorbells = libc.mmap(0, 8192, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, KFDDevice.kfd, self.doorbells_base)
self.aql_doorbell = to_mv(self.doorbells + self.aql_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
self.aql_doorbell_value = 0
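# the doorbell page is mmap'd from /dev/kfd at the page-aligned doorbell_offset; writing a
# packet index into this queue's 32-bit slot is what tells the firmware scheduler that new
# aql packets are ready (see KFDProgram.__call__ above)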
# SDMA Queue
self.sdma_ring = self._gpu_alloc(1 << 20, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, uncached=True)
self.sdma_queue = kio.create_queue(KFDDevice.kfd, ring_base_address=self.sdma_ring.va_addr, ring_size=self.sdma_ring.size, gpu_id=self.gpu_id,
queue_type=kfd.KFD_IOC_QUEUE_TYPE_SDMA, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
write_pointer_address=self.gart.va_addr + 0x100, read_pointer_address=self.gart.va_addr + 0x108)
self.sdma_read_pointer = to_mv(self.sdma_queue.read_pointer_address, 8).cast("Q")
self.sdma_write_pointer = to_mv(self.sdma_queue.write_pointer_address, 8).cast("Q")
self.sdma_doorbell = to_mv(self.doorbells + self.sdma_queue.doorbell_offset - self.doorbells_base, 4).cast("I")
self.sdma_doorbell_value = 0
# prebuilt packets
self.sdma_flush_hdp_pkt = sdma_pkts.hdp_flush(0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0)
self.sdma_cache_inv = sdma_pkts.gcr(op=amd_sdma.SDMA_OP_GCR, sub_op=amd_sdma.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
GCR_CONTROL_GL2_INV=1, GCR_CONTROL_GL1_INV=1, GCR_CONTROL_GLV_INV=1, GCR_CONTROL_GLK_INV=1,
GCR_CONTROL_GL2_RANGE=0)
self.sdma_cache_wb = sdma_pkts.gcr(op=amd_sdma.SDMA_OP_GCR, sub_op=amd_sdma.SDMA_SUBOP_USER_GCR, GCR_CONTROL_GL2_WB=1, GCR_CONTROL_GLK_WB=1,
GCR_CONTROL_GL2_RANGE=0)
# Helpers
map_uptr2gpu_struct_t = init_c_struct_t(tuple(kfd.struct_kfd_ioctl_svm_args._fields_[:-1]+[('attrs', kfd.struct_kfd_ioctl_svm_attribute*2)])) # type: ignore
self.map_uptr2gpu_struct = map_uptr2gpu_struct_t(nattr=2, op=0x0)
self.map_uptr2gpu_struct.attrs[0].type = kfd.KFD_IOCTL_SVM_ATTR_SET_FLAGS
self.map_uptr2gpu_struct.attrs[0].value = kfd.KFD_IOCTL_SVM_FLAG_COHERENT
self.map_uptr2gpu_struct.attrs[1].type = kfd.KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE
self.map_uptr2gpu_struct.attrs[1].value = self.gpu_id
super().__init__(device, KFDAllocator(self), KFDCompiler(self.arch), functools.partial(KFDProgram, self))
def _submit_sdma(self, dest, src, copy_size, wait_signals=None, completion_signal=None):
def blit_sdma_command(cmd):
ctypes.memmove(self.sdma_ring.va_addr + (self.sdma_doorbell_value % self.sdma_ring.size), ctypes.addressof(cmd), sz:=ctypes.sizeof(cmd))
self.sdma_doorbell_value += sz
if wait_signals is not None:
# NOTE: we only check that the low 32 bits are zero; we don't use signal values above 32 bits
for sig in wait_signals:
poll_addr = ctypes.addressof(sig) + getattr(hsa.amd_signal_t, 'value').offset
blit_sdma_command(sdma_pkts.poll_regmem(op=amd_sdma.SDMA_OP_POLL_REGMEM, mem_poll=1, func=0x3, addr=poll_addr,
value=0, mask=0xffffffff, interval=0x04, retry_count=0xfff))
if completion_signal is not None:
blit_sdma_command(sdma_pkts.timestamp(op=amd_sdma.SDMA_OP_TIMESTAMP, sub_op=amd_sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL,
addr=ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'start_ts').offset))
blit_sdma_command(self.sdma_flush_hdp_pkt)
blit_sdma_command(self.sdma_cache_inv)
copied = 0
copies_commands = (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
for _ in range(copies_commands):
step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
blit_sdma_command(sdma_pkts.copy_linear(op=amd_sdma.SDMA_OP_COPY, sub_op=amd_sdma.SDMA_SUBOP_COPY_LINEAR,
count=step_copy_size-1, src_addr=src+copied, dst_addr=dest+copied))
copied += step_copy_size
blit_sdma_command(self.sdma_cache_wb)
if completion_signal is not None:
blit_sdma_command(sdma_pkts.timestamp(op=amd_sdma.SDMA_OP_TIMESTAMP, sub_op=amd_sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL,
addr=ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'end_ts').offset))
if completion_signal is not None:
signal_addr = ctypes.addressof(completion_signal) + getattr(hsa.amd_signal_t, 'value').offset
blit_sdma_command(sdma_pkts.atomic(op=amd_sdma.SDMA_OP_ATOMIC, operation=amd_sdma.SDMA_ATOMIC_ADD64, addr=signal_addr, src_data=(1<<64)-1))
if completion_signal.event_mailbox_ptr != 0:
blit_sdma_command(sdma_pkts.fence(op=amd_sdma.SDMA_OP_FENCE, mtype=3, addr=completion_signal.event_mailbox_ptr,
data=completion_signal.event_id))
blit_sdma_command(sdma_pkts.trap(op=amd_sdma.SDMA_OP_TRAP, int_ctx=completion_signal.event_id))
self.sdma_write_pointer[0] = self.sdma_doorbell_value
self.sdma_doorbell[0] = self.sdma_doorbell_value
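# a full copy submission is: optional poll_regmem waits on signal values, a start_ts
# timestamp, hdp flush + cache invalidate, the copy split into SDMA_MAX_COPY_SIZE chunks,
# a cache writeback, an end_ts timestamp, an atomic add of -1 to the signal value, then
# fence+trap to fire the kfd event the host is waiting on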