Refactor ops_amd.py (MI300X prereq) (#9428)

This commit is contained in:
uuuvn
2025-03-28 22:17:20 +05:00
committed by GitHub
parent 3e1168ff5e
commit dd9aae02c3
24 changed files with 937619 additions and 110 deletions

View File

@@ -308,11 +308,27 @@ generate_am() {
sed -i "s\(int64_t)\ \g" $BASE/am/am.py
sed -i "s\AMDGPU_PTE_MTYPE_VG10(2)\AMDGPU_PTE_MTYPE_VG10(0, 2)\g" $BASE/am/am.py # incorrect parsing (TODO: remove when clang2py is gone).
clang2py -k cdefstum \
extra/hip_gpu_driver/kfd_pm4_headers_ai.h \
extra/hip_gpu_driver/nvd.h \
-o $BASE/am/pm4_nv.py
fixup $BASE/am/pm4_nv.py
clang2py -k cdefstum \
extra/amdpci/headers/navi10_enum.h \
-o $BASE/am/navi10.py
fixup $BASE/am/navi10.py
clang2py -k cdefstum \
extra/amdpci/headers/soc21_enum.h \
-o $BASE/am/soc21.py
fixup $BASE/am/soc21.py
clang2py -k cdefstum \
extra/amdpci/headers/soc24_enum.h \
-o $BASE/am/soc24.py
fixup $BASE/am/soc24.py
clang2py -k cdefstum \
extra/amdpci/headers/mp_13_0_0_offset.h \
extra/amdpci/headers/mp_13_0_0_sh_mask.h \
@@ -325,12 +341,38 @@ generate_am() {
-o $BASE/am/mp_11_0.py
fixup $BASE/am/mp_11_0.py
clang2py -k cdefstum \
extra/amdpci/headers/gc_10_3_0_offset.h \
extra/amdpci/headers/gc_10_3_0_sh_mask.h \
-o $BASE/am/gc_10_3_0.py
fixup $BASE/am/gc_10_3_0.py
clang2py -k cdefstum \
extra/amdpci/headers/gc_11_0_0_offset.h \
extra/amdpci/headers/gc_11_0_0_sh_mask.h \
-o $BASE/am/gc_11_0_0.py
fixup $BASE/am/gc_11_0_0.py
clang2py -k cdefstum \
extra/amdpci/headers/gc_12_0_0_offset.h \
extra/amdpci/headers/gc_12_0_0_sh_mask.h \
-o $BASE/am/gc_12_0_0.py
fixup $BASE/am/gc_12_0_0.py
clang2py -k cdefstum \
extra/hip_gpu_driver/sdma_registers.h \
extra/hip_gpu_driver/navi10_sdma_pkt_open.h \
--clang-args="-I/opt/rocm/include -x c++" \
-o $BASE/am/sdma_5_0_0.py
fixup $BASE/am/sdma_5_0_0.py
clang2py -k cdefstum \
extra/hip_gpu_driver/sdma_registers.h \
extra/hip_gpu_driver/sdma_v6_0_0_pkt_open.h \
--clang-args="-I/opt/rocm/include -x c++" \
-o $BASE/am/sdma_6_0_0.py
fixup $BASE/am/sdma_6_0_0.py
clang2py -k cdefstum \
extra/amdpci/headers/mmhub_3_0_0_offset.h \
extra/amdpci/headers/mmhub_3_0_0_sh_mask.h \
@@ -343,12 +385,24 @@ generate_am() {
-o $BASE/am/mmhub_3_0_2.py
fixup $BASE/am/mmhub_3_0_2.py
clang2py -k cdefstum \
extra/amdpci/headers/nbio_2_3_offset.h \
extra/amdpci/headers/nbio_2_3_sh_mask.h \
-o $BASE/am/nbio_2_3_0.py
fixup $BASE/am/nbio_2_3_0.py
clang2py -k cdefstum \
extra/amdpci/headers/nbio_4_3_0_offset.h \
extra/amdpci/headers/nbio_4_3_0_sh_mask.h \
-o $BASE/am/nbio_4_3_0.py
fixup $BASE/am/nbio_4_3_0.py
clang2py -k cdefstum \
extra/amdpci/headers/nbif_6_3_1_offset.h \
extra/amdpci/headers/nbif_6_3_1_sh_mask.h \
-o $BASE/am/nbif_6_3_1.py
fixup $BASE/am/nbif_6_3_1.py
clang2py -k cdefstum \
extra/amdpci/headers/osssys_6_0_0_offset.h \
extra/amdpci/headers/osssys_6_0_0_sh_mask.h \

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,6 @@
import pathlib, re, ctypes, mmap, collections, functools, copy, os
import tinygrad.runtime.autogen.kfd as kfd
import tinygrad.runtime.autogen.am.am as am
from tinygrad.helpers import from_mv
from test.mockgpu.driver import VirtDriver, VirtFileDesc, TextFileDesc, DirFileDesc, VirtFile
from test.mockgpu.amd.amdgpu import AMDGPU, gpu_props
@@ -82,6 +83,23 @@ class AMDDriver(VirtDriver):
VirtFile(f'/sys/devices/virtual/kfd/kfd/topology/nodes/{gpu_id}/gpu_id', functools.partial(TextFileDesc, text=f"{gpu_id}")),
VirtFile(f'/sys/devices/virtual/kfd/kfd/topology/nodes/{gpu_id}/properties',
functools.partial(TextFileDesc, text=gpu_props.format(drm_render_minor=gpu_id))),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0',
functools.partial(DirFileDesc, child_names=[str(am.GC_HWID), str(am.SDMA0_HWID), str(am.NBIF_HWID)])),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.GC_HWID}/0/major', functools.partial(TextFileDesc, text='11')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.GC_HWID}/0/minor', functools.partial(TextFileDesc, text='0')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.GC_HWID}/0/revision', functools.partial(TextFileDesc, text='0')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.GC_HWID}/0/base_addr',
functools.partial(TextFileDesc, text='0x00001260\n0x0000A000\n0x0001C000\n0x02402C00')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.SDMA0_HWID}/0/major', functools.partial(TextFileDesc, text='6')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.SDMA0_HWID}/0/minor', functools.partial(TextFileDesc, text='0')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.SDMA0_HWID}/0/revision', functools.partial(TextFileDesc, text='0')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.SDMA0_HWID}/0/base_addr',
functools.partial(TextFileDesc, text='0x00001260\n0x0000A000\n0x0001C000\n0x02402C00')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.NBIF_HWID}/0/major', functools.partial(TextFileDesc, text='4')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.NBIF_HWID}/0/minor', functools.partial(TextFileDesc, text='3')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.NBIF_HWID}/0/revision', functools.partial(TextFileDesc, text='0')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.NBIF_HWID}/0/base_addr',
functools.partial(TextFileDesc, text='0x00000000\n0x00000014\n0x00000D20\n0x00010400\n0x0241B000\n0x04040000')),
VirtFile(f'/dev/dri/renderD{gpu_id}', functools.partial(DRMFileDesc, driver=self, gpu=f"{self.gpus[gpu_id]}")),
]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,962 @@
# mypy: ignore-errors
# -*- coding: utf-8 -*-
#
# TARGET arch is: []
# WORD_SIZE is: 8
# POINTER_SIZE is: 8
# LONGDOUBLE_SIZE is: 16
#
import ctypes
class AsDictMixin:
    """Mixin that converts a ctypes Structure/Union instance into a plain dict.

    clang2py-generated helper; arrays become lists, pointers are dereferenced
    (null pointers become None), and nested structures recurse via their own
    ``as_dict``. Fields named ``PADDING_*`` are omitted.
    """
    @classmethod
    def as_dict(cls, self):
        """Return *self* as a (possibly nested) dict of field name -> value."""
        result = {}
        if not isinstance(self, AsDictMixin):
            # not a structure, assume it's already a python object
            return self
        if not hasattr(cls, "_fields_"):
            # no declared fields -> nothing to convert
            return result
        # sys.version_info >= (3, 5)
        # for (field, *_) in cls._fields_: # noqa
        for field_tuple in cls._fields_: # noqa
            field = field_tuple[0]
            if field.startswith('PADDING_'):
                continue
            value = getattr(self, field)
            type_ = type(value)
            if hasattr(value, "_length_") and hasattr(value, "_type_"):
                # array
                if not hasattr(type_, "as_dict"):
                    value = [v for v in value]
                else:
                    type_ = type_._type_
                    value = [type_.as_dict(v) for v in value]
            elif hasattr(value, "contents") and hasattr(value, "_type_"):
                # pointer
                try:
                    if not hasattr(type_, "as_dict"):
                        value = value.contents
                    else:
                        type_ = type_._type_
                        value = type_.as_dict(value.contents)
                except ValueError:
                    # nullptr
                    value = None
            elif isinstance(value, AsDictMixin):
                # other structure
                value = type_.as_dict(value)
            result[field] = value
        return result
class Structure(ctypes.Structure, AsDictMixin):
    """Base for clang2py-generated structures.

    Adds keyword-friendly construction that skips ``PADDING_*`` fields and a
    ``bind`` helper that wraps Python callables into ctypes callback fields.
    """
    def __init__(self, *args, **kwds):
        # We don't want to use positional arguments fill PADDING_* fields
        args = dict(zip(self.__class__._field_names_(), args))
        args.update(kwds)
        super(Structure, self).__init__(**args)

    @classmethod
    def _field_names_(cls):
        """Yield declared field names, excluding PADDING fields."""
        if hasattr(cls, '_fields_'):
            return (f[0] for f in cls._fields_ if not f[0].startswith('PADDING'))
        else:
            return ()

    @classmethod
    def get_type(cls, field):
        """Return the ctypes type declared for *field*, or None if unknown."""
        for f in cls._fields_:
            if f[0] == field:
                return f[1]
        return None

    @classmethod
    def bind(cls, bound_fields):
        """Build an instance with callback fields bound to Python callables.

        *bound_fields* maps field name -> callable (or None for a no-op).
        Raises ValueError if it names a field that does not exist.
        """
        fields = {}
        for name, type_ in cls._fields_:
            if hasattr(type_, "restype"):
                # field is a ctypes function-pointer type
                if name in bound_fields:
                    if bound_fields[name] is None:
                        fields[name] = type_()
                    else:
                        # use a closure to capture the callback from the loop scope
                        fields[name] = (
                            type_((lambda callback: lambda *args: callback(*args))(
                                bound_fields[name]))
                        )
                    del bound_fields[name]
                else:
                    # default callback implementation (does nothing)
                    try:
                        default_ = type_(0).restype().value
                    except TypeError:
                        default_ = None
                    fields[name] = type_((
                        lambda default_: lambda *args: default_)(default_))
            else:
                # not a callback function, use default initialization
                if name in bound_fields:
                    fields[name] = bound_fields[name]
                    del bound_fields[name]
                else:
                    fields[name] = type_()
        if len(bound_fields) != 0:
            raise ValueError(
                "Cannot bind the following unknown callback(s) {}.{}".format(
                    cls.__name__, bound_fields.keys()
                ))
        return cls(**fields)
class Union(ctypes.Union, AsDictMixin):
    """ctypes.Union with the as_dict conversion mixin applied."""
    pass
F32_MES_PM4_PACKETS_H = True # macro
uint32_t = True # macro
int32_t = True # macro
PM4_MES_HEADER_DEFINED = True # macro
PM4_MEC_RELEASE_MEM_DEFINED = True # macro
PM4_MEC_WRITE_DATA_DEFINED = True # macro
class union_PM4_MES_TYPE_3_HEADER(Union):
pass
class struct_PM4_MES_TYPE_3_HEADER_0(Structure):
pass
struct_PM4_MES_TYPE_3_HEADER_0._pack_ = 1 # source:False
struct_PM4_MES_TYPE_3_HEADER_0._fields_ = [
('reserved1', ctypes.c_uint32, 8),
('opcode', ctypes.c_uint32, 8),
('count', ctypes.c_uint32, 14),
('type', ctypes.c_uint32, 2),
]
union_PM4_MES_TYPE_3_HEADER._pack_ = 1 # source:False
union_PM4_MES_TYPE_3_HEADER._anonymous_ = ('_0',)
union_PM4_MES_TYPE_3_HEADER._fields_ = [
('_0', struct_PM4_MES_TYPE_3_HEADER_0),
('u32All', ctypes.c_uint32),
]
# values for enumeration 'c_uint32'
c_uint32__enumvalues = {
5: 'event_index__mec_release_mem__end_of_pipe',
6: 'event_index__mec_release_mem__shader_done',
}
event_index__mec_release_mem__end_of_pipe = 5
event_index__mec_release_mem__shader_done = 6
c_uint32 = ctypes.c_uint32 # enum
# values for enumeration 'c_uint32'
c_uint32__enumvalues = {
0: 'cache_policy__mec_release_mem__lru',
1: 'cache_policy__mec_release_mem__stream',
}
cache_policy__mec_release_mem__lru = 0
cache_policy__mec_release_mem__stream = 1
c_uint32 = ctypes.c_uint32 # enum
# values for enumeration 'c_uint32'
c_uint32__enumvalues = {
0: 'pq_exe_status__mec_release_mem__default',
1: 'pq_exe_status__mec_release_mem__phase_update',
}
pq_exe_status__mec_release_mem__default = 0
pq_exe_status__mec_release_mem__phase_update = 1
c_uint32 = ctypes.c_uint32 # enum
# values for enumeration 'c_uint32'
c_uint32__enumvalues = {
0: 'dst_sel__mec_release_mem__memory_controller',
1: 'dst_sel__mec_release_mem__tc_l2',
2: 'dst_sel__mec_release_mem__queue_write_pointer_register',
3: 'dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit',
}
dst_sel__mec_release_mem__memory_controller = 0
dst_sel__mec_release_mem__tc_l2 = 1
dst_sel__mec_release_mem__queue_write_pointer_register = 2
dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit = 3
c_uint32 = ctypes.c_uint32 # enum
# values for enumeration 'c_uint32'
c_uint32__enumvalues = {
0: 'int_sel__mec_release_mem__none',
1: 'int_sel__mec_release_mem__send_interrupt_only',
2: 'int_sel__mec_release_mem__send_interrupt_after_write_confirm',
3: 'int_sel__mec_release_mem__send_data_after_write_confirm',
4: 'int_sel__mec_release_mem__unconditionally_send_int_ctxid',
5: 'int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare',
6: 'int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare',
}
int_sel__mec_release_mem__none = 0
int_sel__mec_release_mem__send_interrupt_only = 1
int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2
int_sel__mec_release_mem__send_data_after_write_confirm = 3
int_sel__mec_release_mem__unconditionally_send_int_ctxid = 4
int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare = 5
int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare = 6
c_uint32 = ctypes.c_uint32 # enum
# values for enumeration 'c_uint32'
c_uint32__enumvalues = {
0: 'data_sel__mec_release_mem__none',
1: 'data_sel__mec_release_mem__send_32_bit_low',
2: 'data_sel__mec_release_mem__send_64_bit_data',
3: 'data_sel__mec_release_mem__send_gpu_clock_counter',
4: 'data_sel__mec_release_mem__send_cp_perfcounter_hi_lo',
5: 'data_sel__mec_release_mem__store_gds_data_to_memory',
}
data_sel__mec_release_mem__none = 0
data_sel__mec_release_mem__send_32_bit_low = 1
data_sel__mec_release_mem__send_64_bit_data = 2
data_sel__mec_release_mem__send_gpu_clock_counter = 3
data_sel__mec_release_mem__send_cp_perfcounter_hi_lo = 4
data_sel__mec_release_mem__store_gds_data_to_memory = 5
c_uint32 = ctypes.c_uint32 # enum
class struct_pm4_mec_release_mem(Structure):
pass
class union_pm4_mec_release_mem_0(Union):
pass
union_pm4_mec_release_mem_0._pack_ = 1 # source:False
union_pm4_mec_release_mem_0._fields_ = [
('header', union_PM4_MES_TYPE_3_HEADER),
('ordinal1', ctypes.c_uint32),
]
class union_pm4_mec_release_mem_1(Union):
pass
class struct_pm4_mec_release_mem_1_bitfields2(Structure):
pass
struct_pm4_mec_release_mem_1_bitfields2._pack_ = 1 # source:False
struct_pm4_mec_release_mem_1_bitfields2._fields_ = [
('event_type', ctypes.c_uint32, 6),
('reserved1', ctypes.c_uint32, 2),
('event_index', c_uint32, 4),
('tcl1_vol_action_ena', ctypes.c_uint32, 1),
('tc_vol_action_ena', ctypes.c_uint32, 1),
('reserved2', ctypes.c_uint32, 1),
('tc_wb_action_ena', ctypes.c_uint32, 1),
('tcl1_action_ena', ctypes.c_uint32, 1),
('tc_action_ena', ctypes.c_uint32, 1),
('reserved3', ctypes.c_uint32, 1),
('tc_nc_action_ena', ctypes.c_uint32, 1),
('tc_wc_action_ena', ctypes.c_uint32, 1),
('tc_md_action_ena', ctypes.c_uint32, 1),
('reserved4', ctypes.c_uint32, 3),
('cache_policy', c_uint32, 2),
('reserved5', ctypes.c_uint32, 2),
('pq_exe_status', c_uint32, 1),
('reserved6', ctypes.c_uint32, 2),
]
union_pm4_mec_release_mem_1._pack_ = 1 # source:False
union_pm4_mec_release_mem_1._fields_ = [
('bitfields2', struct_pm4_mec_release_mem_1_bitfields2),
('ordinal2', ctypes.c_uint32),
]
class union_pm4_mec_release_mem_2(Union):
pass
class struct_pm4_mec_release_mem_2_bitfields3(Structure):
pass
struct_pm4_mec_release_mem_2_bitfields3._pack_ = 1 # source:False
struct_pm4_mec_release_mem_2_bitfields3._fields_ = [
('reserved7', ctypes.c_uint32, 16),
('dst_sel', c_uint32, 2),
('reserved8', ctypes.c_uint32, 6),
('int_sel', c_uint32, 3),
('reserved9', ctypes.c_uint32, 2),
('data_sel', c_uint32, 3),
]
union_pm4_mec_release_mem_2._pack_ = 1 # source:False
union_pm4_mec_release_mem_2._fields_ = [
('bitfields3', struct_pm4_mec_release_mem_2_bitfields3),
('ordinal3', ctypes.c_uint32),
]
class union_pm4_mec_release_mem_3(Union):
pass
class struct_pm4_mec_release_mem_3_bitfields4(Structure):
pass
struct_pm4_mec_release_mem_3_bitfields4._pack_ = 1 # source:False
struct_pm4_mec_release_mem_3_bitfields4._fields_ = [
('reserved10', ctypes.c_uint32, 2),
('address_lo_32b', ctypes.c_uint32, 30),
]
class struct_pm4_mec_release_mem_3_bitfields4b(Structure):
pass
struct_pm4_mec_release_mem_3_bitfields4b._pack_ = 1 # source:False
struct_pm4_mec_release_mem_3_bitfields4b._fields_ = [
('reserved11', ctypes.c_uint32, 3),
('address_lo_64b', ctypes.c_uint32, 29),
]
union_pm4_mec_release_mem_3._pack_ = 1 # source:False
union_pm4_mec_release_mem_3._fields_ = [
('bitfields4', struct_pm4_mec_release_mem_3_bitfields4),
('bitfields4b', struct_pm4_mec_release_mem_3_bitfields4b),
('reserved12', ctypes.c_uint32),
('ordinal4', ctypes.c_uint32),
]
class union_pm4_mec_release_mem_4(Union):
pass
union_pm4_mec_release_mem_4._pack_ = 1 # source:False
union_pm4_mec_release_mem_4._fields_ = [
('address_hi', ctypes.c_uint32),
('reserved13', ctypes.c_uint32),
('ordinal5', ctypes.c_uint32),
]
class union_pm4_mec_release_mem_5(Union):
pass
class struct_pm4_mec_release_mem_5_bitfields6c(Structure):
pass
struct_pm4_mec_release_mem_5_bitfields6c._pack_ = 1 # source:False
struct_pm4_mec_release_mem_5_bitfields6c._fields_ = [
('dw_offset', ctypes.c_uint32, 16),
('num_dwords', ctypes.c_uint32, 16),
]
union_pm4_mec_release_mem_5._pack_ = 1 # source:False
union_pm4_mec_release_mem_5._fields_ = [
('data_lo', ctypes.c_uint32),
('cmp_data_lo', ctypes.c_uint32),
('bitfields6c', struct_pm4_mec_release_mem_5_bitfields6c),
('reserved14', ctypes.c_uint32),
('ordinal6', ctypes.c_uint32),
]
class union_pm4_mec_release_mem_6(Union):
pass
union_pm4_mec_release_mem_6._pack_ = 1 # source:False
union_pm4_mec_release_mem_6._fields_ = [
('data_hi', ctypes.c_uint32),
('cmp_data_hi', ctypes.c_uint32),
('reserved15', ctypes.c_uint32),
('reserved16', ctypes.c_uint32),
('ordinal7', ctypes.c_uint32),
]
struct_pm4_mec_release_mem._pack_ = 1 # source:False
struct_pm4_mec_release_mem._anonymous_ = ('_0', '_1', '_2', '_3', '_4', '_5', '_6',)
struct_pm4_mec_release_mem._fields_ = [
('_0', union_pm4_mec_release_mem_0),
('_1', union_pm4_mec_release_mem_1),
('_2', union_pm4_mec_release_mem_2),
('_3', union_pm4_mec_release_mem_3),
('_4', union_pm4_mec_release_mem_4),
('_5', union_pm4_mec_release_mem_5),
('_6', union_pm4_mec_release_mem_6),
('int_ctxid', ctypes.c_uint32),
]
# values for enumeration 'WRITE_DATA_dst_sel_enum'
WRITE_DATA_dst_sel_enum__enumvalues = {
0: 'dst_sel___write_data__mem_mapped_register',
2: 'dst_sel___write_data__tc_l2',
3: 'dst_sel___write_data__gds',
5: 'dst_sel___write_data__memory',
6: 'dst_sel___write_data__memory_mapped_adc_persistent_state',
}
dst_sel___write_data__mem_mapped_register = 0
dst_sel___write_data__tc_l2 = 2
dst_sel___write_data__gds = 3
dst_sel___write_data__memory = 5
dst_sel___write_data__memory_mapped_adc_persistent_state = 6
WRITE_DATA_dst_sel_enum = ctypes.c_uint32 # enum
# values for enumeration 'WRITE_DATA_addr_incr_enum'
WRITE_DATA_addr_incr_enum__enumvalues = {
0: 'addr_incr___write_data__increment_address',
1: 'addr_incr___write_data__do_not_increment_address',
}
addr_incr___write_data__increment_address = 0
addr_incr___write_data__do_not_increment_address = 1
WRITE_DATA_addr_incr_enum = ctypes.c_uint32 # enum
# values for enumeration 'WRITE_DATA_wr_confirm_enum'
WRITE_DATA_wr_confirm_enum__enumvalues = {
0: 'wr_confirm___write_data__do_not_wait_for_write_confirmation',
1: 'wr_confirm___write_data__wait_for_write_confirmation',
}
wr_confirm___write_data__do_not_wait_for_write_confirmation = 0
wr_confirm___write_data__wait_for_write_confirmation = 1
WRITE_DATA_wr_confirm_enum = ctypes.c_uint32 # enum
# values for enumeration 'WRITE_DATA_cache_policy_enum'
WRITE_DATA_cache_policy_enum__enumvalues = {
0: 'cache_policy___write_data__lru',
1: 'cache_policy___write_data__stream',
}
cache_policy___write_data__lru = 0
cache_policy___write_data__stream = 1
WRITE_DATA_cache_policy_enum = ctypes.c_uint32 # enum
class struct_pm4_mec_write_data_mmio(Structure):
pass
class union_pm4_mec_write_data_mmio_0(Union):
pass
union_pm4_mec_write_data_mmio_0._pack_ = 1 # source:False
union_pm4_mec_write_data_mmio_0._fields_ = [
('header', union_PM4_MES_TYPE_3_HEADER),
('ordinal1', ctypes.c_uint32),
]
class union_pm4_mec_write_data_mmio_1(Union):
pass
class struct_pm4_mec_write_data_mmio_1_bitfields2(Structure):
pass
struct_pm4_mec_write_data_mmio_1_bitfields2._pack_ = 1 # source:False
struct_pm4_mec_write_data_mmio_1_bitfields2._fields_ = [
('reserved1', ctypes.c_uint32, 8),
('dst_sel', ctypes.c_uint32, 4),
('reserved2', ctypes.c_uint32, 4),
('addr_incr', ctypes.c_uint32, 1),
('reserved3', ctypes.c_uint32, 2),
('resume_vf', ctypes.c_uint32, 1),
('wr_confirm', ctypes.c_uint32, 1),
('reserved4', ctypes.c_uint32, 4),
('cache_policy', ctypes.c_uint32, 2),
('reserved5', ctypes.c_uint32, 5),
]
union_pm4_mec_write_data_mmio_1._pack_ = 1 # source:False
union_pm4_mec_write_data_mmio_1._fields_ = [
('bitfields2', struct_pm4_mec_write_data_mmio_1_bitfields2),
('ordinal2', ctypes.c_uint32),
]
class union_pm4_mec_write_data_mmio_2(Union):
pass
class struct_pm4_mec_write_data_mmio_2_bitfields3(Structure):
pass
struct_pm4_mec_write_data_mmio_2_bitfields3._pack_ = 1 # source:False
struct_pm4_mec_write_data_mmio_2_bitfields3._fields_ = [
('dst_mmreg_addr', ctypes.c_uint32, 18),
('reserved6', ctypes.c_uint32, 14),
]
union_pm4_mec_write_data_mmio_2._pack_ = 1 # source:False
union_pm4_mec_write_data_mmio_2._fields_ = [
('bitfields3', struct_pm4_mec_write_data_mmio_2_bitfields3),
('ordinal3', ctypes.c_uint32),
]
struct_pm4_mec_write_data_mmio._pack_ = 1 # source:False
struct_pm4_mec_write_data_mmio._anonymous_ = ('_0', '_1', '_2',)
struct_pm4_mec_write_data_mmio._fields_ = [
('_0', union_pm4_mec_write_data_mmio_0),
('_1', union_pm4_mec_write_data_mmio_1),
('_2', union_pm4_mec_write_data_mmio_2),
('reserved7', ctypes.c_uint32),
('data', ctypes.c_uint32),
]
# values for enumeration 'c__Ea_CACHE_FLUSH_AND_INV_TS_EVENT'
c__Ea_CACHE_FLUSH_AND_INV_TS_EVENT__enumvalues = {
20: 'CACHE_FLUSH_AND_INV_TS_EVENT',
}
CACHE_FLUSH_AND_INV_TS_EVENT = 20
c__Ea_CACHE_FLUSH_AND_INV_TS_EVENT = ctypes.c_uint32 # enum
NVD_H = True # macro
PACKET_TYPE0 = 0 # macro
PACKET_TYPE1 = 1 # macro
PACKET_TYPE2 = 2 # macro
PACKET_TYPE3 = 3 # macro
# PM4 packet-header field helpers (translated C macros).
def CP_PACKET_GET_TYPE(h):  # macro: packet type, bits 31:30
    return (h >> 30) & 3

def CP_PACKET_GET_COUNT(h):  # macro: dword count field, bits 29:16
    return (h >> 16) & 0x3FFF

def CP_PACKET0_GET_REG(h):  # macro: type-0 register field, low 16 bits
    return h & 0xFFFF

def CP_PACKET3_GET_OPCODE(h):  # macro: type-3 opcode field, bits 15:8
    return (h >> 8) & 0xFF

def PACKET0(reg, n):  # macro: build a type-0 packet header
    return (0 << 30) | (reg & 0xFFFF) | ((n & 0x3FFF) << 16)
CP_PACKET2 = 0x80000000 # macro
PACKET2_PAD_SHIFT = 0 # macro
PACKET2_PAD_MASK = (0x3fffffff<<0) # macro
# def PACKET2(v): # macro
# return (0x80000000|REG_SET(PACKET2_PAD,(v)))
def PACKET3(op, n):  # macro: build a type-3 packet header (opcode + count)
    return (3 << 30) | ((op & 0xFF) << 8) | ((n & 0x3FFF) << 16)

def PACKET3_COMPUTE(op, n):  # macro: type-3 header with the compute-shader bit set
    return PACKET3(op, n) | (1 << 1)

PACKET3_NOP = 0x10  # macro
PACKET3_SET_BASE = 0x11  # macro
def PACKET3_BASE_INDEX(x):  # macro: base-index field at bit 0
    return x << 0
CE_PARTITION_BASE = 3 # macro
PACKET3_CLEAR_STATE = 0x12 # macro
PACKET3_INDEX_BUFFER_SIZE = 0x13 # macro
PACKET3_DISPATCH_DIRECT = 0x15 # macro
PACKET3_DISPATCH_INDIRECT = 0x16 # macro
PACKET3_INDIRECT_BUFFER_END = 0x17 # macro
PACKET3_INDIRECT_BUFFER_CNST_END = 0x19 # macro
PACKET3_ATOMIC_GDS = 0x1D # macro
PACKET3_ATOMIC_MEM = 0x1E # macro
PACKET3_OCCLUSION_QUERY = 0x1F # macro
PACKET3_SET_PREDICATION = 0x20 # macro
PACKET3_REG_RMW = 0x21 # macro
PACKET3_COND_EXEC = 0x22 # macro
PACKET3_PRED_EXEC = 0x23 # macro
PACKET3_DRAW_INDIRECT = 0x24 # macro
PACKET3_DRAW_INDEX_INDIRECT = 0x25 # macro
PACKET3_INDEX_BASE = 0x26 # macro
PACKET3_DRAW_INDEX_2 = 0x27 # macro
PACKET3_CONTEXT_CONTROL = 0x28 # macro
PACKET3_INDEX_TYPE = 0x2A # macro
PACKET3_DRAW_INDIRECT_MULTI = 0x2C # macro
PACKET3_DRAW_INDEX_AUTO = 0x2D # macro
PACKET3_NUM_INSTANCES = 0x2F # macro
PACKET3_DRAW_INDEX_MULTI_AUTO = 0x30 # macro
PACKET3_INDIRECT_BUFFER_PRIV = 0x32 # macro
PACKET3_INDIRECT_BUFFER_CNST = 0x33 # macro
PACKET3_COND_INDIRECT_BUFFER_CNST = 0x33 # macro
PACKET3_STRMOUT_BUFFER_UPDATE = 0x34 # macro
PACKET3_DRAW_INDEX_OFFSET_2 = 0x35 # macro
PACKET3_DRAW_PREAMBLE = 0x36 # macro
PACKET3_WRITE_DATA = 0x37 # macro
# WRITE_DATA control-word fields.
def WRITE_DATA_DST_SEL(x):  # macro: destination select at bit 8
    return x << 8

WR_ONE_ADDR = 1 << 16  # macro
WR_CONFIRM = 1 << 20  # macro

def WRITE_DATA_CACHE_POLICY(x):  # macro: cache policy at bit 25
    return x << 25

def WRITE_DATA_ENGINE_SEL(x):  # macro: engine select at bit 30
    return x << 30
PACKET3_DRAW_INDEX_INDIRECT_MULTI = 0x38 # macro
PACKET3_MEM_SEMAPHORE = 0x39 # macro
PACKET3_SEM_USE_MAILBOX = (0x1<<16) # macro
PACKET3_SEM_SEL_SIGNAL_TYPE = (0x1<<20) # macro
PACKET3_SEM_SEL_SIGNAL = (0x6<<29) # macro
PACKET3_SEM_SEL_WAIT = (0x7<<29) # macro
PACKET3_DRAW_INDEX_MULTI_INST = 0x3A # macro
PACKET3_COPY_DW = 0x3B # macro
PACKET3_WAIT_REG_MEM = 0x3C # macro
# WAIT_REG_MEM control-word fields.
def WAIT_REG_MEM_FUNCTION(x):  # macro: compare function at bit 0
    return x << 0

def WAIT_REG_MEM_MEM_SPACE(x):  # macro: memory-space select at bit 4
    return x << 4

def WAIT_REG_MEM_OPERATION(x):  # macro: operation select at bit 6
    return x << 6

def WAIT_REG_MEM_ENGINE(x):  # macro: engine select at bit 8
    return x << 8
PACKET3_INDIRECT_BUFFER = 0x3F  # macro
INDIRECT_BUFFER_VALID = 1 << 23  # macro

def INDIRECT_BUFFER_CACHE_POLICY(x):  # macro: cache policy at bit 28
    return x << 28

def INDIRECT_BUFFER_PRE_ENB(x):  # macro: preemption-enable at bit 21
    return x << 21

def INDIRECT_BUFFER_PRE_RESUME(x):  # macro: preemption-resume at bit 30
    return x << 30

PACKET3_COND_INDIRECT_BUFFER = 0x3F  # macro
PACKET3_COPY_DATA = 0x40 # macro
PACKET3_CP_DMA = 0x41 # macro
PACKET3_PFP_SYNC_ME = 0x42 # macro
PACKET3_SURFACE_SYNC = 0x43 # macro
PACKET3_ME_INITIALIZE = 0x44 # macro
PACKET3_COND_WRITE = 0x45 # macro
PACKET3_EVENT_WRITE = 0x46 # macro
def EVENT_TYPE(x):  # macro: event type at bit 0
    return x << 0

def EVENT_INDEX(x):  # macro: event index at bit 8
    return x << 8
PACKET3_EVENT_WRITE_EOP = 0x47 # macro
PACKET3_EVENT_WRITE_EOS = 0x48 # macro
PACKET3_RELEASE_MEM = 0x49  # macro

def PACKET3_RELEASE_MEM_EVENT_TYPE(x):  # macro: event type at bit 0
    return x << 0

def PACKET3_RELEASE_MEM_EVENT_INDEX(x):  # macro: event index at bit 8
    return x << 8

# RELEASE_MEM GCR control bits.
PACKET3_RELEASE_MEM_GCR_GLM_WB = 1 << 12  # macro
PACKET3_RELEASE_MEM_GCR_GLM_INV = 1 << 13  # macro
PACKET3_RELEASE_MEM_GCR_GLV_INV = 1 << 14  # macro
PACKET3_RELEASE_MEM_GCR_GL1_INV = 1 << 15  # macro
PACKET3_RELEASE_MEM_GCR_GL2_US = 1 << 16  # macro
PACKET3_RELEASE_MEM_GCR_GL2_RANGE = 1 << 17  # macro
PACKET3_RELEASE_MEM_GCR_GL2_DISCARD = 1 << 19  # macro
PACKET3_RELEASE_MEM_GCR_GL2_INV = 1 << 20  # macro
PACKET3_RELEASE_MEM_GCR_GL2_WB = 1 << 21  # macro
PACKET3_RELEASE_MEM_GCR_SEQ = 1 << 22  # macro

def PACKET3_RELEASE_MEM_CACHE_POLICY(x):  # macro: cache policy at bit 25
    return x << 25

PACKET3_RELEASE_MEM_EXECUTE = 1 << 28  # macro

def PACKET3_RELEASE_MEM_DATA_SEL(x):  # macro: data select at bit 29
    return x << 29

def PACKET3_RELEASE_MEM_INT_SEL(x):  # macro: interrupt select at bit 24
    return x << 24

def PACKET3_RELEASE_MEM_DST_SEL(x):  # macro: destination select at bit 16
    return x << 16
PACKET3_PREAMBLE_CNTL = 0x4A # macro
PACKET3_PREAMBLE_BEGIN_CLEAR_STATE = (2<<28) # macro
PACKET3_PREAMBLE_END_CLEAR_STATE = (3<<28) # macro
PACKET3_DMA_DATA = 0x50  # macro

def PACKET3_DMA_DATA_ENGINE(x):  # macro: engine select at bit 0
    return x << 0

def PACKET3_DMA_DATA_SRC_CACHE_POLICY(x):  # macro: source cache policy at bit 13
    return x << 13

def PACKET3_DMA_DATA_DST_SEL(x):  # macro: destination select at bit 20
    return x << 20

def PACKET3_DMA_DATA_DST_CACHE_POLICY(x):  # macro: destination cache policy at bit 25
    return x << 25

def PACKET3_DMA_DATA_SRC_SEL(x):  # macro: source select at bit 29
    return x << 29

PACKET3_DMA_DATA_CP_SYNC = 1 << 31  # macro
PACKET3_DMA_DATA_CMD_SAS = 1 << 26  # macro
PACKET3_DMA_DATA_CMD_DAS = 1 << 27  # macro
PACKET3_DMA_DATA_CMD_SAIC = 1 << 28  # macro
PACKET3_DMA_DATA_CMD_DAIC = 1 << 29  # macro
PACKET3_DMA_DATA_CMD_RAW_WAIT = 1 << 30  # macro
PACKET3_CONTEXT_REG_RMW = 0x51 # macro
PACKET3_GFX_CNTX_UPDATE = 0x52 # macro
PACKET3_BLK_CNTX_UPDATE = 0x53 # macro
PACKET3_INCR_UPDT_STATE = 0x55 # macro
PACKET3_ACQUIRE_MEM = 0x58  # macro

# ACQUIRE_MEM GCR_CNTL fields; each helper places its value at a fixed bit offset.
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(x):  # macro: bit 0
    return x << 0

def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_RANGE(x):  # macro: bit 2
    return x << 2

def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(x):  # macro: bit 4
    return x << 4

def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(x):  # macro: bit 5
    return x << 5

def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(x):  # macro: bit 6
    return x << 6

def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(x):  # macro: bit 7
    return x << 7

def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(x):  # macro: bit 8
    return x << 8

def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(x):  # macro: bit 9
    return x << 9

def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_US(x):  # macro: bit 10
    return x << 10

def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_RANGE(x):  # macro: bit 11
    return x << 11

def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_DISCARD(x):  # macro: bit 13
    return x << 13

def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(x):  # macro: bit 14
    return x << 14

def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(x):  # macro: bit 15
    return x << 15

def PACKET3_ACQUIRE_MEM_GCR_CNTL_SEQ(x):  # macro: bit 16
    return x << 16

PACKET3_ACQUIRE_MEM_GCR_RANGE_IS_PA = 1 << 18  # macro
PACKET3_REWIND = 0x59 # macro
PACKET3_INTERRUPT = 0x5A # macro
PACKET3_GEN_PDEPTE = 0x5B # macro
PACKET3_INDIRECT_BUFFER_PASID = 0x5C # macro
PACKET3_PRIME_UTCL2 = 0x5D # macro
PACKET3_LOAD_UCONFIG_REG = 0x5E # macro
PACKET3_LOAD_SH_REG = 0x5F # macro
PACKET3_LOAD_CONFIG_REG = 0x60 # macro
PACKET3_LOAD_CONTEXT_REG = 0x61 # macro
PACKET3_LOAD_COMPUTE_STATE = 0x62 # macro
PACKET3_LOAD_SH_REG_INDEX = 0x63 # macro
PACKET3_SET_CONFIG_REG = 0x68 # macro
PACKET3_SET_CONFIG_REG_START = 0x00002000 # macro
PACKET3_SET_CONFIG_REG_END = 0x00002c00 # macro
PACKET3_SET_CONTEXT_REG = 0x69 # macro
PACKET3_SET_CONTEXT_REG_START = 0x0000a000 # macro
PACKET3_SET_CONTEXT_REG_END = 0x0000a400 # macro
PACKET3_SET_CONTEXT_REG_INDEX = 0x6A # macro
PACKET3_SET_VGPR_REG_DI_MULTI = 0x71 # macro
PACKET3_SET_SH_REG_DI = 0x72 # macro
PACKET3_SET_CONTEXT_REG_INDIRECT = 0x73 # macro
PACKET3_SET_SH_REG_DI_MULTI = 0x74 # macro
PACKET3_GFX_PIPE_LOCK = 0x75 # macro
PACKET3_SET_SH_REG = 0x76 # macro
PACKET3_SET_SH_REG_START = 0x00002c00 # macro
PACKET3_SET_SH_REG_END = 0x00003000 # macro
PACKET3_SET_SH_REG_OFFSET = 0x77 # macro
PACKET3_SET_QUEUE_REG = 0x78 # macro
PACKET3_SET_UCONFIG_REG = 0x79 # macro
PACKET3_SET_UCONFIG_REG_START = 0x0000c000 # macro
PACKET3_SET_UCONFIG_REG_END = 0x0000c400 # macro
PACKET3_SET_UCONFIG_REG_INDEX = 0x7A # macro
PACKET3_FORWARD_HEADER = 0x7C # macro
PACKET3_SCRATCH_RAM_WRITE = 0x7D # macro
PACKET3_SCRATCH_RAM_READ = 0x7E # macro
PACKET3_LOAD_CONST_RAM = 0x80 # macro
PACKET3_WRITE_CONST_RAM = 0x81 # macro
PACKET3_DUMP_CONST_RAM = 0x83 # macro
PACKET3_INCREMENT_CE_COUNTER = 0x84 # macro
PACKET3_INCREMENT_DE_COUNTER = 0x85 # macro
PACKET3_WAIT_ON_CE_COUNTER = 0x86 # macro
PACKET3_WAIT_ON_DE_COUNTER_DIFF = 0x88 # macro
PACKET3_SWITCH_BUFFER = 0x8B # macro
PACKET3_DISPATCH_DRAW_PREAMBLE = 0x8C # macro
PACKET3_DISPATCH_DRAW_PREAMBLE_ACE = 0x8C # macro
PACKET3_DISPATCH_DRAW = 0x8D # macro
PACKET3_DISPATCH_DRAW_ACE = 0x8D # macro
PACKET3_GET_LOD_STATS = 0x8E # macro
PACKET3_DRAW_MULTI_PREAMBLE = 0x8F # macro
PACKET3_FRAME_CONTROL = 0x90 # macro
FRAME_TMZ = 1 << 0  # macro

def FRAME_CMD(x):  # macro: frame-control command at bit 28
    return x << 28
PACKET3_INDEX_ATTRIBUTES_INDIRECT = 0x91 # macro
PACKET3_WAIT_REG_MEM64 = 0x93 # macro
PACKET3_COND_PREEMPT = 0x94 # macro
PACKET3_HDP_FLUSH = 0x95 # macro
PACKET3_COPY_DATA_RB = 0x96 # macro
PACKET3_INVALIDATE_TLBS = 0x98  # macro

def PACKET3_INVALIDATE_TLBS_DST_SEL(x):  # macro: destination select at bit 0
    return x << 0

def PACKET3_INVALIDATE_TLBS_ALL_HUB(x):  # macro: all-hub flag at bit 4
    return x << 4

def PACKET3_INVALIDATE_TLBS_PASID(x):  # macro: PASID field at bit 5
    return x << 5
PACKET3_AQL_PACKET = 0x99 # macro
PACKET3_DMA_DATA_FILL_MULTI = 0x9A # macro
PACKET3_SET_SH_REG_INDEX = 0x9B # macro
PACKET3_DRAW_INDIRECT_COUNT_MULTI = 0x9C # macro
PACKET3_DRAW_INDEX_INDIRECT_COUNT_MULTI = 0x9D # macro
PACKET3_DUMP_CONST_RAM_OFFSET = 0x9E # macro
PACKET3_LOAD_CONTEXT_REG_INDEX = 0x9F # macro
PACKET3_SET_RESOURCES = 0xA0  # macro

def PACKET3_SET_RESOURCES_VMID_MASK(x):  # macro: VMID mask at bit 0
    return x << 0

def PACKET3_SET_RESOURCES_UNMAP_LATENTY(x):  # macro: unmap latency at bit 16
    return x << 16

def PACKET3_SET_RESOURCES_QUEUE_TYPE(x):  # macro: queue type at bit 29
    return x << 29
PACKET3_MAP_PROCESS = 0xA1  # macro
PACKET3_MAP_QUEUES = 0xA2  # macro

# MAP_QUEUES control-word fields.
def PACKET3_MAP_QUEUES_QUEUE_SEL(x):  # macro: bit 4
    return x << 4

def PACKET3_MAP_QUEUES_VMID(x):  # macro: bit 8
    return x << 8

def PACKET3_MAP_QUEUES_QUEUE(x):  # macro: bit 13
    return x << 13

def PACKET3_MAP_QUEUES_PIPE(x):  # macro: bit 16
    return x << 16

def PACKET3_MAP_QUEUES_ME(x):  # macro: bit 18
    return x << 18

def PACKET3_MAP_QUEUES_QUEUE_TYPE(x):  # macro: bit 21
    return x << 21

def PACKET3_MAP_QUEUES_ALLOC_FORMAT(x):  # macro: bit 24
    return x << 24

def PACKET3_MAP_QUEUES_ENGINE_SEL(x):  # macro: bit 26
    return x << 26

def PACKET3_MAP_QUEUES_NUM_QUEUES(x):  # macro: bit 29
    return x << 29

def PACKET3_MAP_QUEUES_CHECK_DISABLE(x):  # macro: bit 1
    return x << 1

def PACKET3_MAP_QUEUES_DOORBELL_OFFSET(x):  # macro: bit 2
    return x << 2
PACKET3_UNMAP_QUEUES = 0xA3  # macro

# UNMAP_QUEUES control-word fields.
def PACKET3_UNMAP_QUEUES_ACTION(x):  # macro: bit 0
    return x << 0

def PACKET3_UNMAP_QUEUES_QUEUE_SEL(x):  # macro: bit 4
    return x << 4

def PACKET3_UNMAP_QUEUES_ENGINE_SEL(x):  # macro: bit 26
    return x << 26

def PACKET3_UNMAP_QUEUES_NUM_QUEUES(x):  # macro: bit 29
    return x << 29

def PACKET3_UNMAP_QUEUES_PASID(x):  # macro: bit 0
    return x << 0

def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(x):  # macro: bit 2
    return x << 2

def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET1(x):  # macro: bit 2
    return x << 2

def PACKET3_UNMAP_QUEUES_RB_WPTR(x):  # macro: bit 0
    return x << 0

def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET2(x):  # macro: bit 2
    return x << 2

def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET3(x):  # macro: bit 2
    return x << 2
# QUERY_STATUS / RUN_LIST / MAP_PROCESS_VM / SET_Q_PREEMPTION_MODE opcodes
# and field-shift helpers.
PACKET3_QUERY_STATUS = 0xA4 # macro
def PACKET3_QUERY_STATUS_CONTEXT_ID(x): return x << 0 # macro
def PACKET3_QUERY_STATUS_INTERRUPT_SEL(x): return x << 28 # macro
def PACKET3_QUERY_STATUS_COMMAND(x): return x << 30 # macro
def PACKET3_QUERY_STATUS_PASID(x): return x << 0 # macro
def PACKET3_QUERY_STATUS_DOORBELL_OFFSET(x): return x << 2 # macro
def PACKET3_QUERY_STATUS_ENG_SEL(x): return x << 25 # macro
PACKET3_RUN_LIST = 0xA5 # macro
PACKET3_MAP_PROCESS_VM = 0xA6 # macro
PACKET3_SET_Q_PREEMPTION_MODE = 0xF0 # macro
def PACKET3_SET_Q_PREEMPTION_MODE_IB_VMID(x): return x << 0 # macro
PACKET3_SET_Q_PREEMPTION_MODE_INIT_SHADOW_MEM = 1 << 0 # macro
# Autogenerated public-export list (clang2py output); names and order mirror
# the generator, including intentional duplicates such as repeated 'c_uint32'
# entries — duplicates in __all__ are harmless to `import *`.
__all__ = \
['CACHE_FLUSH_AND_INV_TS_EVENT', 'CE_PARTITION_BASE',
'CP_PACKET2', 'F32_MES_PM4_PACKETS_H', 'FRAME_TMZ',
'INDIRECT_BUFFER_VALID', 'NVD_H', 'PACKET2_PAD_MASK',
'PACKET2_PAD_SHIFT', 'PACKET3_ACQUIRE_MEM',
'PACKET3_ACQUIRE_MEM_GCR_RANGE_IS_PA', 'PACKET3_AQL_PACKET',
'PACKET3_ATOMIC_GDS', 'PACKET3_ATOMIC_MEM',
'PACKET3_BLK_CNTX_UPDATE', 'PACKET3_CLEAR_STATE',
'PACKET3_COND_EXEC', 'PACKET3_COND_INDIRECT_BUFFER',
'PACKET3_COND_INDIRECT_BUFFER_CNST', 'PACKET3_COND_PREEMPT',
'PACKET3_COND_WRITE', 'PACKET3_CONTEXT_CONTROL',
'PACKET3_CONTEXT_REG_RMW', 'PACKET3_COPY_DATA',
'PACKET3_COPY_DATA_RB', 'PACKET3_COPY_DW', 'PACKET3_CP_DMA',
'PACKET3_DISPATCH_DIRECT', 'PACKET3_DISPATCH_DRAW',
'PACKET3_DISPATCH_DRAW_ACE', 'PACKET3_DISPATCH_DRAW_PREAMBLE',
'PACKET3_DISPATCH_DRAW_PREAMBLE_ACE', 'PACKET3_DISPATCH_INDIRECT',
'PACKET3_DMA_DATA', 'PACKET3_DMA_DATA_CMD_DAIC',
'PACKET3_DMA_DATA_CMD_DAS', 'PACKET3_DMA_DATA_CMD_RAW_WAIT',
'PACKET3_DMA_DATA_CMD_SAIC', 'PACKET3_DMA_DATA_CMD_SAS',
'PACKET3_DMA_DATA_CP_SYNC', 'PACKET3_DMA_DATA_FILL_MULTI',
'PACKET3_DRAW_INDEX_2', 'PACKET3_DRAW_INDEX_AUTO',
'PACKET3_DRAW_INDEX_INDIRECT',
'PACKET3_DRAW_INDEX_INDIRECT_COUNT_MULTI',
'PACKET3_DRAW_INDEX_INDIRECT_MULTI',
'PACKET3_DRAW_INDEX_MULTI_AUTO', 'PACKET3_DRAW_INDEX_MULTI_INST',
'PACKET3_DRAW_INDEX_OFFSET_2', 'PACKET3_DRAW_INDIRECT',
'PACKET3_DRAW_INDIRECT_COUNT_MULTI',
'PACKET3_DRAW_INDIRECT_MULTI', 'PACKET3_DRAW_MULTI_PREAMBLE',
'PACKET3_DRAW_PREAMBLE', 'PACKET3_DUMP_CONST_RAM',
'PACKET3_DUMP_CONST_RAM_OFFSET', 'PACKET3_EVENT_WRITE',
'PACKET3_EVENT_WRITE_EOP', 'PACKET3_EVENT_WRITE_EOS',
'PACKET3_FORWARD_HEADER', 'PACKET3_FRAME_CONTROL',
'PACKET3_GEN_PDEPTE', 'PACKET3_GET_LOD_STATS',
'PACKET3_GFX_CNTX_UPDATE', 'PACKET3_GFX_PIPE_LOCK',
'PACKET3_HDP_FLUSH', 'PACKET3_INCREMENT_CE_COUNTER',
'PACKET3_INCREMENT_DE_COUNTER', 'PACKET3_INCR_UPDT_STATE',
'PACKET3_INDEX_ATTRIBUTES_INDIRECT', 'PACKET3_INDEX_BASE',
'PACKET3_INDEX_BUFFER_SIZE', 'PACKET3_INDEX_TYPE',
'PACKET3_INDIRECT_BUFFER', 'PACKET3_INDIRECT_BUFFER_CNST',
'PACKET3_INDIRECT_BUFFER_CNST_END', 'PACKET3_INDIRECT_BUFFER_END',
'PACKET3_INDIRECT_BUFFER_PASID', 'PACKET3_INDIRECT_BUFFER_PRIV',
'PACKET3_INTERRUPT', 'PACKET3_INVALIDATE_TLBS',
'PACKET3_LOAD_COMPUTE_STATE', 'PACKET3_LOAD_CONFIG_REG',
'PACKET3_LOAD_CONST_RAM', 'PACKET3_LOAD_CONTEXT_REG',
'PACKET3_LOAD_CONTEXT_REG_INDEX', 'PACKET3_LOAD_SH_REG',
'PACKET3_LOAD_SH_REG_INDEX', 'PACKET3_LOAD_UCONFIG_REG',
'PACKET3_MAP_PROCESS', 'PACKET3_MAP_PROCESS_VM',
'PACKET3_MAP_QUEUES', 'PACKET3_MEM_SEMAPHORE',
'PACKET3_ME_INITIALIZE', 'PACKET3_NOP', 'PACKET3_NUM_INSTANCES',
'PACKET3_OCCLUSION_QUERY', 'PACKET3_PFP_SYNC_ME',
'PACKET3_PREAMBLE_BEGIN_CLEAR_STATE', 'PACKET3_PREAMBLE_CNTL',
'PACKET3_PREAMBLE_END_CLEAR_STATE', 'PACKET3_PRED_EXEC',
'PACKET3_PRIME_UTCL2', 'PACKET3_QUERY_STATUS', 'PACKET3_REG_RMW',
'PACKET3_RELEASE_MEM', 'PACKET3_RELEASE_MEM_EXECUTE',
'PACKET3_RELEASE_MEM_GCR_GL1_INV',
'PACKET3_RELEASE_MEM_GCR_GL2_DISCARD',
'PACKET3_RELEASE_MEM_GCR_GL2_INV',
'PACKET3_RELEASE_MEM_GCR_GL2_RANGE',
'PACKET3_RELEASE_MEM_GCR_GL2_US',
'PACKET3_RELEASE_MEM_GCR_GL2_WB',
'PACKET3_RELEASE_MEM_GCR_GLM_INV',
'PACKET3_RELEASE_MEM_GCR_GLM_WB',
'PACKET3_RELEASE_MEM_GCR_GLV_INV', 'PACKET3_RELEASE_MEM_GCR_SEQ',
'PACKET3_REWIND', 'PACKET3_RUN_LIST', 'PACKET3_SCRATCH_RAM_READ',
'PACKET3_SCRATCH_RAM_WRITE', 'PACKET3_SEM_SEL_SIGNAL',
'PACKET3_SEM_SEL_SIGNAL_TYPE', 'PACKET3_SEM_SEL_WAIT',
'PACKET3_SEM_USE_MAILBOX', 'PACKET3_SET_BASE',
'PACKET3_SET_CONFIG_REG', 'PACKET3_SET_CONFIG_REG_END',
'PACKET3_SET_CONFIG_REG_START', 'PACKET3_SET_CONTEXT_REG',
'PACKET3_SET_CONTEXT_REG_END', 'PACKET3_SET_CONTEXT_REG_INDEX',
'PACKET3_SET_CONTEXT_REG_INDIRECT',
'PACKET3_SET_CONTEXT_REG_START', 'PACKET3_SET_PREDICATION',
'PACKET3_SET_QUEUE_REG', 'PACKET3_SET_Q_PREEMPTION_MODE',
'PACKET3_SET_Q_PREEMPTION_MODE_INIT_SHADOW_MEM',
'PACKET3_SET_RESOURCES', 'PACKET3_SET_SH_REG',
'PACKET3_SET_SH_REG_DI', 'PACKET3_SET_SH_REG_DI_MULTI',
'PACKET3_SET_SH_REG_END', 'PACKET3_SET_SH_REG_INDEX',
'PACKET3_SET_SH_REG_OFFSET', 'PACKET3_SET_SH_REG_START',
'PACKET3_SET_UCONFIG_REG', 'PACKET3_SET_UCONFIG_REG_END',
'PACKET3_SET_UCONFIG_REG_INDEX', 'PACKET3_SET_UCONFIG_REG_START',
'PACKET3_SET_VGPR_REG_DI_MULTI', 'PACKET3_STRMOUT_BUFFER_UPDATE',
'PACKET3_SURFACE_SYNC', 'PACKET3_SWITCH_BUFFER',
'PACKET3_UNMAP_QUEUES', 'PACKET3_WAIT_ON_CE_COUNTER',
'PACKET3_WAIT_ON_DE_COUNTER_DIFF', 'PACKET3_WAIT_REG_MEM',
'PACKET3_WAIT_REG_MEM64', 'PACKET3_WRITE_CONST_RAM',
'PACKET3_WRITE_DATA', 'PACKET_TYPE0', 'PACKET_TYPE1',
'PACKET_TYPE2', 'PACKET_TYPE3', 'PM4_MEC_RELEASE_MEM_DEFINED',
'PM4_MEC_WRITE_DATA_DEFINED', 'PM4_MES_HEADER_DEFINED',
'WRITE_DATA_addr_incr_enum', 'WRITE_DATA_cache_policy_enum',
'WRITE_DATA_dst_sel_enum', 'WRITE_DATA_wr_confirm_enum',
'WR_CONFIRM', 'WR_ONE_ADDR',
'addr_incr___write_data__do_not_increment_address',
'addr_incr___write_data__increment_address',
'c__Ea_CACHE_FLUSH_AND_INV_TS_EVENT', 'c_uint32', 'c_uint32',
'c_uint32', 'c_uint32', 'c_uint32', 'c_uint32',
'cache_policy___write_data__lru',
'cache_policy___write_data__stream',
'cache_policy__mec_release_mem__lru',
'cache_policy__mec_release_mem__stream',
'data_sel__mec_release_mem__none',
'data_sel__mec_release_mem__send_32_bit_low',
'data_sel__mec_release_mem__send_64_bit_data',
'data_sel__mec_release_mem__send_cp_perfcounter_hi_lo',
'data_sel__mec_release_mem__send_gpu_clock_counter',
'data_sel__mec_release_mem__store_gds_data_to_memory',
'dst_sel___write_data__gds',
'dst_sel___write_data__mem_mapped_register',
'dst_sel___write_data__memory',
'dst_sel___write_data__memory_mapped_adc_persistent_state',
'dst_sel___write_data__tc_l2',
'dst_sel__mec_release_mem__memory_controller',
'dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit',
'dst_sel__mec_release_mem__queue_write_pointer_register',
'dst_sel__mec_release_mem__tc_l2',
'event_index__mec_release_mem__end_of_pipe',
'event_index__mec_release_mem__shader_done', 'int32_t',
'int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare',
'int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare',
'int_sel__mec_release_mem__none',
'int_sel__mec_release_mem__send_data_after_write_confirm',
'int_sel__mec_release_mem__send_interrupt_after_write_confirm',
'int_sel__mec_release_mem__send_interrupt_only',
'int_sel__mec_release_mem__unconditionally_send_int_ctxid',
'pq_exe_status__mec_release_mem__default',
'pq_exe_status__mec_release_mem__phase_update',
'struct_PM4_MES_TYPE_3_HEADER_0', 'struct_pm4_mec_release_mem',
'struct_pm4_mec_release_mem_1_bitfields2',
'struct_pm4_mec_release_mem_2_bitfields3',
'struct_pm4_mec_release_mem_3_bitfields4',
'struct_pm4_mec_release_mem_3_bitfields4b',
'struct_pm4_mec_release_mem_5_bitfields6c',
'struct_pm4_mec_write_data_mmio',
'struct_pm4_mec_write_data_mmio_1_bitfields2',
'struct_pm4_mec_write_data_mmio_2_bitfields3', 'uint32_t',
'union_PM4_MES_TYPE_3_HEADER', 'union_pm4_mec_release_mem_0',
'union_pm4_mec_release_mem_1', 'union_pm4_mec_release_mem_2',
'union_pm4_mec_release_mem_3', 'union_pm4_mec_release_mem_4',
'union_pm4_mec_release_mem_5', 'union_pm4_mec_release_mem_6',
'union_pm4_mec_write_data_mmio_0',
'union_pm4_mec_write_data_mmio_1',
'union_pm4_mec_write_data_mmio_2',
'wr_confirm___write_data__do_not_wait_for_write_confirmation',
'wr_confirm___write_data__wait_for_write_confirmation']

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,6 +1,6 @@
from __future__ import annotations
from typing import Any, cast, ClassVar
import os, ctypes, ctypes.util, struct, hashlib, functools, mmap, errno, array, contextlib, sys, select
import os, ctypes, ctypes.util, struct, hashlib, functools, importlib, mmap, errno, array, contextlib, sys, select
assert sys.platform != 'win32'
from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface
@@ -9,31 +9,19 @@ from tinygrad.device import Compiled, ProfileEvent, BufferSpec, CPUProgram, PROF
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, DEBUG, OSX
from tinygrad.renderer.cstyle import AMDRenderer
from tinygrad.renderer.llvmir import AMDLLVMRenderer
from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, pci, vfio, sqtt
from tinygrad.runtime.autogen.am import am, gc_11_0_0
from tinygrad.runtime.autogen import kfd, hsa, libc, pci, vfio, sqtt
from tinygrad.runtime.autogen.am import am
from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler
from tinygrad.runtime.support.elf import elf_loader
from tinygrad.runtime.support.am.amdev import AMDev, AMMapping
from tinygrad.runtime.support.amd import AMDRegBase, collect_registers, import_module
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint: disable=unused-import
regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0106, 0x0107
EVENT_INDEX_PARTIAL_FLUSH = 4 # based on a comment in nvd.h
WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
WAIT_REG_MEM_FUNCTION_NEQ = 4 # !=
WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=
COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)
def gfxreg(reg): return reg + amd_gpu.GC_BASE__INST0_SEG0 - amd_gpu.PACKET3_SET_SH_REG_START
def ucfgreg(reg, pkt3_set:bool=True): return reg + amd_gpu.GC_BASE__INST0_SEG1 - (amd_gpu.PACKET3_SET_UCONFIG_REG_START if pkt3_set else 0)
def nbioreg(reg): return reg + amd_gpu.NBIO_BASE__INST0_SEG2
# This can potentially be shared with AMRegister._parse_kwargs. NOTE: This is hardcoded to gfx11, bitfields might be different in other gfxvers.
# Currently not a problem because this is only used by sqtt and sqtt is only supported on 7900xtx
def encode_bitfields(regname: str, **kwargs) -> int:
return functools.reduce(lambda x,y: x|y, [v << getattr(gc_11_0_0, f'{regname}__{k.upper()}__SHIFT') for k,v in kwargs.items()], 0)
class AMDSignal(HCQSignal):
def __init__(self, base_addr:int|None=None, **kwargs):
super().__init__(base_addr, **kwargs, timestamp_divider=100, dev_t=AMDDevice)
@@ -43,61 +31,71 @@ class AMDSignal(HCQSignal):
if time_spent_waiting_ms > 2000 and self.timeline_for_device is not None: self.timeline_for_device.dev_iface.sleep(200)
class AMDComputeQueue(HWQueue):
def __init__(self, dev:AMDDevice):
self.soc, self.pm4, self.gc, self.nbio = dev.soc, dev.pm4, dev.gc, dev.nbio
super().__init__()
def __del__(self):
if self.binded_device is not None:
self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferSpec(cpu_access=True, nolru=True, uncached=True))
def pkt3(self, cmd, *vals): self.q(amd_gpu.PACKET3(cmd, len(vals) - 1), *vals)
def pkt3(self, cmd, *vals): self.q(self.pm4.PACKET3(cmd, len(vals) - 1), *vals)
def gfxreg(self, reg:AMDReg): return reg.addr - self.pm4.PACKET3_SET_SH_REG_START
def ucfgreg(self, reg:AMDReg): return reg.addr - self.pm4.PACKET3_SET_UCONFIG_REG_START
def sqtt_userdata(self, data, *extra_dwords):
data_ints = [x[0] for x in struct.iter_unpack('<I', bytes(data))] + list(extra_dwords)
for i in range(0, len(data_ints), 2):
self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_USERDATA_2), *data_ints[i:i+2])
self.pkt3(self.pm4.PACKET3_SET_UCONFIG_REG, self.ucfgreg(self.gc.regSQ_THREAD_TRACE_USERDATA_2), *data_ints[i:i+2])
def wait_reg_mem(self, value, mask=0xffffffff, mem=None, reg_req=None, reg_done=None):
wrm_info_dw = amd_gpu.WAIT_REG_MEM_MEM_SPACE(int(mem is not None)) | amd_gpu.WAIT_REG_MEM_OPERATION(int(mem is None)) \
| amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0)
wrm_info_dw = self.pm4.WAIT_REG_MEM_MEM_SPACE(int(mem is not None)) | self.pm4.WAIT_REG_MEM_OPERATION(int(mem is None)) \
| self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | self.pm4.WAIT_REG_MEM_ENGINE(0)
self.pkt3(amd_gpu.PACKET3_WAIT_REG_MEM, wrm_info_dw, *(data64_le(mem) if mem is not None else (reg_req, reg_done)), value, mask, 4)
self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, wrm_info_dw, *(data64_le(mem) if mem is not None else (reg_req, reg_done)), value, mask, 4)
def acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
cache_flags_dw = amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) \
| amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) \
| amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) \
| amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) \
| amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)
cache_flags_dw = self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) \
| self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) \
| self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) \
| self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) \
| self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)
self.pkt3(amd_gpu.PACKET3_ACQUIRE_MEM, 0, *data64_le(sz), *data64_le(addr), 0, cache_flags_dw)
self.pkt3(self.pm4.PACKET3_ACQUIRE_MEM, 0, *data64_le(sz), *data64_le(addr), 0, cache_flags_dw)
def release_mem(self, address, value, data_sel, int_sel, ctxid=0, cache_flush=False):
cache_flags_dw = 0 if not cache_flush else (amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV \
| amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB \
| amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ)
cache_flags_dw = 0 if not cache_flush else (self.pm4.PACKET3_RELEASE_MEM_GCR_GLV_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GL1_INV \
| self.pm4.PACKET3_RELEASE_MEM_GCR_GL2_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GLM_WB \
| self.pm4.PACKET3_RELEASE_MEM_GCR_GLM_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GL2_WB | self.pm4.PACKET3_RELEASE_MEM_GCR_SEQ)
event_dw = amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(amd_gpu.CACHE_FLUSH_AND_INV_TS_EVENT) \
| amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(amd_gpu.event_index__mec_release_mem__end_of_pipe)
event_dw = self.pm4.PACKET3_RELEASE_MEM_EVENT_TYPE(self.pm4.CACHE_FLUSH_AND_INV_TS_EVENT) \
| self.pm4.PACKET3_RELEASE_MEM_EVENT_INDEX(self.pm4.event_index__mec_release_mem__end_of_pipe)
memsel_dw = amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0)
memsel_dw = self.pm4.PACKET3_RELEASE_MEM_DATA_SEL(data_sel) | self.pm4.PACKET3_RELEASE_MEM_INT_SEL(int_sel) \
| self.pm4.PACKET3_RELEASE_MEM_DST_SEL(0)
self.pkt3(amd_gpu.PACKET3_RELEASE_MEM, event_dw | cache_flags_dw, memsel_dw, *data64_le(address), *data64_le(value), ctxid)
self.pkt3(self.pm4.PACKET3_RELEASE_MEM, event_dw | cache_flags_dw, memsel_dw, *data64_le(address), *data64_le(value), ctxid)
def memory_barrier(self):
self.wait_reg_mem(reg_req=nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ), reg_done=nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), value=0xffffffff)
self.wait_reg_mem(reg_req=self.nbio.regBIF_BX_PF0_GPU_HDP_FLUSH_REQ.addr, reg_done=self.nbio.regBIF_BX_PF0_GPU_HDP_FLUSH_DONE.addr,
value=0xffffffff)
self.acquire_mem()
return self
def spi_config(self, tracing:bool):
spi_config_cntl = encode_bitfields('SPI_CONFIG_CNTL', ps_pkr_priority_cntl=3, exp_priority_order=3, gpr_write_priority=0x2c688,
enable_sqg_bop_events=int(tracing), enable_sqg_top_events=int(tracing))
self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSPI_CONFIG_CNTL), spi_config_cntl)
spi_config_cntl = self.gc.regSPI_CONFIG_CNTL.encode(ps_pkr_priority_cntl=3, exp_priority_order=3, gpr_write_priority=0x2c688,
enable_sqg_bop_events=int(tracing), enable_sqg_top_events=int(tracing))
self.pkt3(self.pm4.PACKET3_SET_UCONFIG_REG, self.ucfgreg(self.gc.regSPI_CONFIG_CNTL), spi_config_cntl)
def sqtt_config(self, tracing:bool):
sq_thread_trace_ctrl = encode_bitfields('SQ_THREAD_TRACE_CTRL', draw_event_en=1, spi_stall_en=1, sq_stall_en=1, reg_at_hwm=2, hiwater=1,
rt_freq=amd_gpu.SQ_TT_RT_FREQ_4096_CLK, util_timer=amd_gpu.SQ_TT_UTIL_TIMER_250_CLK, mode=int(tracing))
self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_CTRL), sq_thread_trace_ctrl)
sq_thread_trace_ctrl = self.gc.regSQ_THREAD_TRACE_CTRL.encode(draw_event_en=1, spi_stall_en=1, sq_stall_en=1, reg_at_hwm=2, hiwater=1,
rt_freq=self.soc.SQ_TT_RT_FREQ_4096_CLK,
util_timer=self.soc.SQ_TT_UTIL_TIMER_250_CLK, mode=int(tracing))
self.pkt3(self.pm4.PACKET3_SET_UCONFIG_REG, self.ucfgreg(self.gc.regSQ_THREAD_TRACE_CTRL), sq_thread_trace_ctrl)
def grbm_gfx_index(self, **kwargs):
self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regGRBM_GFX_INDEX), encode_bitfields('GRBM_GFX_INDEX', **kwargs))
self.pkt3(self.pm4.PACKET3_SET_UCONFIG_REG, self.ucfgreg(self.gc.regGRBM_GFX_INDEX), self.gc.regGRBM_GFX_INDEX.encode(**kwargs))
# Magic values from mesa/src/amd/vulkan/radv_sqtt.c:radv_emit_spi_config_cntl and src/amd/common/ac_sqtt.c:ac_sqtt_emit_start
def start_trace(self, buf0s:list[HCQBuffer], se_mask:int):
@@ -107,31 +105,31 @@ class AMDComputeQueue(HWQueue):
for se in range(len(buf0s)):
self.grbm_gfx_index(se_index=se, instance_broadcast_writes=1) # select se, broadcast to all instances in that se
buf0_lo, buf0_hi = data64_le(buf0s[se].va_addr>>12)
self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_BUF0_SIZE),
encode_bitfields('SQ_THREAD_TRACE_BUF0_SIZE', base_hi=buf0_hi, size=buf0s[se].size>>12))
self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_BUF0_BASE), buf0_lo)
self.pkt3(self.pm4.PACKET3_SET_UCONFIG_REG, self.ucfgreg(self.gc.regSQ_THREAD_TRACE_BUF0_SIZE),
self.gc.regSQ_THREAD_TRACE_BUF0_SIZE.encode(base_hi=buf0_hi, size=buf0s[se].size>>12))
self.pkt3(self.pm4.PACKET3_SET_UCONFIG_REG, self.ucfgreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE), buf0_lo)
# NOTE: SQTT can only trace instructions on one simd per se, this selects first simd in first wgp in first sa.
# For RGP to display instruction trace it has to see it on first SE. Howerver ACE/MEC/whatever does the dispatching starting with second se,
# and on amdgpu/non-AM it also does weird things with dispatch order inside se: around 7 times out of 10 it starts from the last cu, but
# sometimes not, especially if the kernel has more than one wavefront which means that kernels with small global size might get unlucky and
# be dispatched on something else and not be seen in instruction tracing tab. You can force the wavefronts of a kernel to be dispatched on the
# CUs you want to by disabling other CUs via bits in regCOMPUTE_STATIC_THREAD_MGMT_SE<x> and trace even kernels that only have one wavefront.
self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_MASK),
encode_bitfields('SQ_THREAD_TRACE_MASK', wtype_include=amd_gpu.SQ_TT_WTYPE_INCLUDE_CS_BIT, simd_sel=0, wgp_sel=0, sa_sel=0))
REG_INCLUDE = amd_gpu.SQ_TT_TOKEN_MASK_SQDEC_BIT | amd_gpu.SQ_TT_TOKEN_MASK_SHDEC_BIT | amd_gpu.SQ_TT_TOKEN_MASK_GFXUDEC_BIT | \
amd_gpu.SQ_TT_TOKEN_MASK_COMP_BIT | amd_gpu.SQ_TT_TOKEN_MASK_CONTEXT_BIT | amd_gpu.SQ_TT_TOKEN_MASK_CONTEXT_BIT
TOKEN_EXCLUDE = 1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_PERF_SHIFT
self.pkt3(self.pm4.PACKET3_SET_UCONFIG_REG, self.ucfgreg(self.gc.regSQ_THREAD_TRACE_MASK),
self.gc.regSQ_THREAD_TRACE_MASK.encode(wtype_include=self.soc.SQ_TT_WTYPE_INCLUDE_CS_BIT, simd_sel=0, wgp_sel=0, sa_sel=0))
REG_INCLUDE = self.soc.SQ_TT_TOKEN_MASK_SQDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_SHDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_GFXUDEC_BIT | \
self.soc.SQ_TT_TOKEN_MASK_COMP_BIT | self.soc.SQ_TT_TOKEN_MASK_CONTEXT_BIT | self.soc.SQ_TT_TOKEN_MASK_CONTEXT_BIT
TOKEN_EXCLUDE = 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_PERF_SHIFT
if not (se_mask >> se) & 0b1:
TOKEN_EXCLUDE |= 1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_VMEMEXEC_SHIFT | 1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_ALUEXEC_SHIFT | \
1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_VALUINST_SHIFT | 1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_IMMEDIATE_SHIFT | \
1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_INST_SHIFT
self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_TOKEN_MASK),
encode_bitfields('SQ_THREAD_TRACE_TOKEN_MASK', reg_include=REG_INCLUDE, token_exclude=TOKEN_EXCLUDE, bop_events_token_include=1))
TOKEN_EXCLUDE |= 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VMEMEXEC_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_ALUEXEC_SHIFT | \
1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VALUINST_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_IMMEDIATE_SHIFT | \
1 << self.soc.SQ_TT_TOKEN_EXCLUDE_INST_SHIFT
self.pkt3(self.pm4.PACKET3_SET_UCONFIG_REG, self.ucfgreg(self.gc.regSQ_THREAD_TRACE_TOKEN_MASK),
self.gc.regSQ_THREAD_TRACE_TOKEN_MASK.encode(reg_include=REG_INCLUDE, token_exclude=TOKEN_EXCLUDE, bop_events_token_include=1))
# Enable SQTT
self.sqtt_config(tracing=True)
# Restore global broadcasting
self.grbm_gfx_index(se_broadcast_writes=1, sa_broadcast_writes=1, instance_broadcast_writes=1)
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_THREAD_TRACE_ENABLE), 1)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_THREAD_TRACE_ENABLE), 1)
self.memory_barrier()
return self
@@ -139,24 +137,24 @@ class AMDComputeQueue(HWQueue):
def stop_trace(self, ses: int, wptrs: HCQBuffer):
self.memory_barrier()
# Start shutting everything down
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_THREAD_TRACE_ENABLE), 0)
self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.THREAD_TRACE_FINISH) | amd_gpu.EVENT_INDEX(0))
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_THREAD_TRACE_ENABLE), 0)
self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.THREAD_TRACE_FINISH) | self.pm4.EVENT_INDEX(0))
# For each SE wait for finish to complete and copy regSQ_THREAD_TRACE_WPTR to know where in the buffer trace data ends
for se in range(ses):
self.grbm_gfx_index(se_index=se, instance_broadcast_writes=1) # select se, broadcast to all instances in that se
# Wait for FINISH_PENDING==0
self.pkt3(amd_gpu.PACKET3_WAIT_REG_MEM, amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ),
ucfgreg(amd_gpu.regSQ_THREAD_TRACE_STATUS, False), 0, 0, gc_11_0_0.SQ_THREAD_TRACE_STATUS__FINISH_PENDING_MASK, 4)
self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ),
self.gc.regSQ_THREAD_TRACE_STATUS.addr, 0, 0, self.gc.SQ_THREAD_TRACE_STATUS__FINISH_PENDING_MASK, 4)
# Wait for FINISH_DONE!=0
self.pkt3(amd_gpu.PACKET3_WAIT_REG_MEM, amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_NEQ),
ucfgreg(amd_gpu.regSQ_THREAD_TRACE_STATUS, False), 0, 0, gc_11_0_0.SQ_THREAD_TRACE_STATUS__FINISH_DONE_MASK, 4)
self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_NEQ),
self.gc.regSQ_THREAD_TRACE_STATUS.addr, 0, 0, self.gc.SQ_THREAD_TRACE_STATUS__FINISH_DONE_MASK, 4)
# Disable SQTT
self.sqtt_config(tracing=False)
# Wait for BUSY==0
self.pkt3(amd_gpu.PACKET3_WAIT_REG_MEM, amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ),
ucfgreg(amd_gpu.regSQ_THREAD_TRACE_STATUS, False), 0, 0, gc_11_0_0.SQ_THREAD_TRACE_STATUS__BUSY_MASK, 4)
# Copy WPTR to memory (src_sel = perf, dst_sel = tc_l2, wr_confirm = True), ucfgreg with False adds GC_BASE__INST0_SEG1 but not pkt3 reg offset
self.pkt3(amd_gpu.PACKET3_COPY_DATA, 1 << 20 | 2 << 8 | 4, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_WPTR, False), 0, *data64_le(wptrs.va_addr+(se*4)))
self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ),
self.gc.regSQ_THREAD_TRACE_STATUS.addr, 0, 0, self.gc.SQ_THREAD_TRACE_STATUS__BUSY_MASK, 4)
# Copy WPTR to memory (src_sel = perf, dst_sel = tc_l2, wr_confirm = True)
self.pkt3(self.pm4.PACKET3_COPY_DATA, 1 << 20 | 2 << 8 | 4, self.gc.regSQ_THREAD_TRACE_WPTR.addr, 0, *data64_le(wptrs.va_addr+(se*4)))
# Restore global broadcasting
self.grbm_gfx_index(se_broadcast_writes=1, sa_broadcast_writes=1, instance_broadcast_writes=1)
self.spi_config(tracing=False)
@@ -198,25 +196,26 @@ class AMDComputeQueue(HWQueue):
), *global_size)
prg.dev.cmd_id += 1
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8))
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2)
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC3), 0)
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_TMPRING_SIZE), prg.dev.tmpring_size)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8))
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_PGM_RSRC3), 0)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_TMPRING_SIZE), prg.dev.tmpring_size)
if prg.dev.has_scratch_base_registers:
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO), *data64_le(prg.dev.scratch.va_addr >> 8))
if prg.dev.target < 110000: self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.mmCP_COHER_START_DELAY), 0x20)
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_RESTART_X), 0, 0, 0)
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE0), 0xFFFFFFFF, 0xFFFFFFFF)
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE2), 0xFFFFFFFF, 0xFFFFFFFF)
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4), 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0), *user_regs)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO), *data64_le(prg.dev.scratch.va_addr >> 8))
if prg.dev.target < 110000: self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.mmCP_COHER_START_DELAY), 0x20)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_RESTART_X), 0, 0, 0)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE0), 0xFFFFFFFF, 0xFFFFFFFF)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE2), 0xFFFFFFFF, 0xFFFFFFFF)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE4), 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_USER_DATA_0), *user_regs)
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0)
self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0)
self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_RESOURCE_LIMITS), 0)
self.pkt3(amd_gpu.PACKET3_DISPATCH_DIRECT, *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN)
if prg.dev.sqtt_enabled: self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.THREAD_TRACE_MARKER) | amd_gpu.EVENT_INDEX(0))
self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.CS_PARTIAL_FLUSH) | amd_gpu.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH))
DISPATCH_INITIATOR = self.gc.regCOMPUTE_DISPATCH_INITIATOR.encode(cs_w32_en=1, force_start_at_000=1, compute_shader_en=1)
self.pkt3(self.pm4.PACKET3_DISPATCH_DIRECT, *global_size, DISPATCH_INITIATOR)
if prg.dev.sqtt_enabled: self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.THREAD_TRACE_MARKER) | self.pm4.EVENT_INDEX(0))
self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.CS_PARTIAL_FLUSH) | self.pm4.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH))
return self
def wait(self, signal:AMDSignal, value:sint=0):
@@ -224,17 +223,17 @@ class AMDComputeQueue(HWQueue):
return self
def timestamp(self, signal:AMDSignal):
self.release_mem(signal.timestamp_addr, 0, amd_gpu.data_sel__mec_release_mem__send_gpu_clock_counter, amd_gpu.int_sel__mec_release_mem__none)
self.release_mem(signal.timestamp_addr, 0, self.pm4.data_sel__mec_release_mem__send_gpu_clock_counter, self.pm4.int_sel__mec_release_mem__none)
return self
def signal(self, signal:AMDSignal, value:sint=0):
# NOTE: this needs an EOP buffer on the queue or it will NULL pointer
self.release_mem(signal.value_addr, value, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True)
self.release_mem(signal.value_addr, value, self.pm4.data_sel__mec_release_mem__send_32_bit_low,
self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True)
if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
self.release_mem(dev.queue_event_mailbox_ptr, dev.queue_event.event_id, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=dev.queue_event.event_id)
self.release_mem(dev.queue_event_mailbox_ptr, dev.queue_event.event_id, self.pm4.data_sel__mec_release_mem__send_32_bit_low,
self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=dev.queue_event.event_id)
return self
def bind(self, dev:AMDDevice):
@@ -243,8 +242,8 @@ class AMDComputeQueue(HWQueue):
hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
for i, value in enumerate(self._q): hw_view[i] = value
self.indirect_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), *data64_le(self.hw_page.va_addr),
len(self._q) | amd_gpu.INDIRECT_BUFFER_VALID]
self.indirect_cmd = [self.pm4.PACKET3(self.pm4.PACKET3_INDIRECT_BUFFER, 2), *data64_le(self.hw_page.va_addr),
len(self._q) | self.pm4.INDIRECT_BUFFER_VALID]
self._q = hw_view
return self
@@ -257,8 +256,8 @@ class AMDComputeQueue(HWQueue):
dev.compute_queue.signal_doorbell(dev)
class AMDCopyQueue(HWQueue):
def __init__(self, max_copy_size=0x40000000):
self.internal_cmd_sizes, self.max_copy_size = [], max_copy_size
def __init__(self, dev, max_copy_size=0x40000000):
self.sdma, self.internal_cmd_sizes, self.max_copy_size = dev.sdma, [], max_copy_size
super().__init__()
def q(self, *arr):
@@ -271,30 +270,30 @@ class AMDCopyQueue(HWQueue):
for _ in range(copy_commands):
step_copy_size = min(copy_size - copied, self.max_copy_size)
self.q(amd_gpu.SDMA_OP_COPY | amd_gpu.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_COPY_LINEAR),
amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied))
self.q(self.sdma.SDMA_OP_COPY | self.sdma.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(self.sdma.SDMA_SUBOP_COPY_LINEAR),
self.sdma.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied))
copied += step_copy_size
return self
def signal(self, signal:AMDSignal, value:sint=0):
self.q(amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.value_addr), value)
self.q(self.sdma.SDMA_OP_FENCE | self.sdma.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.value_addr), value)
if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
self.q(amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id)
self.q(amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(dev.queue_event.event_id))
elif AMDDevice.driverless: self.q(amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(0))
self.q(self.sdma.SDMA_OP_FENCE | self.sdma.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id)
self.q(self.sdma.SDMA_OP_TRAP, self.sdma.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(dev.queue_event.event_id))
elif AMDDevice.driverless: self.q(self.sdma.SDMA_OP_TRAP, self.sdma.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(0))
return self
def wait(self, signal:AMDSignal, value:sint=0):
self.q(amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal.value_addr), value, 0xffffffff,
amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff))
self.q(self.sdma.SDMA_OP_POLL_REGMEM | self.sdma.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
self.sdma.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal.value_addr), value, 0xffffffff,
self.sdma.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | self.sdma.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff))
return self
def timestamp(self, signal:AMDSignal):
self.q(amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
self.q(self.sdma.SDMA_OP_TIMESTAMP | self.sdma.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(self.sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
*data64_le(signal.timestamp_addr))
return self
@@ -306,7 +305,8 @@ class AMDCopyQueue(HWQueue):
hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
for i in range(qsz): hw_view[i] = self._q[i] if i < len(self._q) else 0
self.indirect_cmd = [amd_gpu.SDMA_OP_INDIRECT | amd_gpu.SDMA_PKT_INDIRECT_HEADER_VMID(0), *data64_le(self.hw_page.va_addr), qsz, *data64_le(0)]
self.indirect_cmd = [self.sdma.SDMA_OP_INDIRECT | self.sdma.SDMA_PKT_INDIRECT_HEADER_VMID(0), *data64_le(self.hw_page.va_addr), qsz,
*data64_le(0)]
self._q, self.cmd_sizes = hw_view, [len(self.indirect_cmd)]
def _submit(self, dev:AMDDevice):
@@ -415,6 +415,25 @@ class AMDQueueDesc:
if dev.driverless and getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1): dev.dev_iface.adev.gmc.flush_hdp()
self.doorbell[0] = self.put_value
@dataclass(frozen=True)
class AMDReg(AMDRegBase):
ip: AMDIP
@property
def addr(self): return self.ip.bases[self.segment] + self.offset
@dataclass(frozen=True)
class AMDIP:
name: str
version: tuple[int, ...]
bases: tuple[int, ...]
@functools.cached_property
def module(self): return import_module(self.name, self.version)
@functools.cached_property
def regs(self): return collect_registers(self.module, cls=functools.partial(AMDReg, ip=self))
def __getattr__(self, name:str):
if name in self.regs: return self.regs[name]
return getattr(self.module, name)
class KFDIface:
kfd:HWInterface|None = None
event_page:HCQBuffer|None = None
@@ -441,6 +460,12 @@ class KFDIface:
self.gpu_id = int(HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/gpu_id").read())
self.props = {l.split()[0]: int(l.split()[1]) for l in HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/properties").read().splitlines()}
ip_base = f"/sys/class/drm/renderD{self.props['drm_render_minor']}/device/ip_discovery/die/0"
id2ip = {am.GC_HWID: am.GC_HWIP, am.SDMA0_HWID: am.SDMA0_HWIP, am.NBIF_HWID: am.NBIF_HWIP}
self.ip_versions = {id2ip[int(hwid)]:tuple(int(HWInterface(f'{ip_base}/{hwid}/0/{part}').read()) for part in ['major', 'minor', 'revision'])
for hwid in HWInterface(ip_base).listdir() if hwid.isnumeric() and int(hwid) in id2ip}
self.ip_offsets = {id2ip[int(hwid)]:tuple(int(x, 16) for x in HWInterface(f'{ip_base}/{hwid}/0/base_addr').read().splitlines())
for hwid in HWInterface(ip_base).listdir() if hwid.isnumeric() and int(hwid) in id2ip}
self.drm_fd = HWInterface(f"/dev/dri/renderD{self.props['drm_render_minor']}", os.O_RDWR)
kfd.AMDKFD_IOC_ACQUIRE_VM(KFDIface.kfd, drm_fd=self.drm_fd.fd, gpu_id=self.gpu_id)
@@ -609,6 +634,8 @@ class PCIIface:
self.bar_info = {j:(int(start,16), int(end,16), int(flgs,16)) for j,(start,end,flgs) in enumerate(l.split() for l in bar_info)}
self.adev = AMDev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2).cast('Q'), self._map_pci_range(5).cast('I'))
self.ip_versions = self.adev.ip_ver
self.ip_offsets = {hwip: tuple(instances[0]) for hwip,instances in self.adev.regs_offset.items()}
self.doorbell_cpu_addr = mv_address(dbell)
pci_cmd = int.from_bytes(self.cfg_fd.read(2, binary=True, offset=pci.PCI_COMMAND), byteorder='little') | pci.PCI_COMMAND_MASTER
@@ -702,6 +729,12 @@ class AMDDevice(HCQCompiled):
if self.target//10000 == 10: ctl_stack_size = min(ctl_stack_size, 0x7000)
debug_memory_size = round_up((self.max_cu_id + 1) * (self.max_wave_id + 1) * 32, 64)
self.soc = importlib.import_module(f"tinygrad.runtime.autogen.am.{({10: 'navi10', 11: 'soc21', 12: 'soc24'}[self.target//10000])}")
self.pm4 = importlib.import_module("tinygrad.runtime.autogen.am.pm4_nv")
self.sdma = import_module('sdma', self.dev_iface.ip_versions[am.SDMA0_HWIP])
self.gc = AMDIP('gc', self.dev_iface.ip_versions[am.GC_HWIP], self.dev_iface.ip_offsets[am.GC_HWIP])
self.nbio = AMDIP('nbio' if self.target < 120000 else 'nbif', self.dev_iface.ip_versions[am.NBIF_HWIP], self.dev_iface.ip_offsets[am.NBIF_HWIP])
self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x800000, ctx_save_restore_size=wg_data_size + ctl_stack_size,
eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)
@@ -709,7 +742,7 @@ class AMDDevice(HCQCompiled):
super().__init__(device, AMDAllocator(self), AMDLLVMRenderer() if getenv("AMD_LLVM", 0) else AMDRenderer(self.arch),
AMDLLVMCompiler(self.arch) if getenv("AMD_LLVM", 0) else HIPCompiler(self.arch), functools.partial(AMDProgram, self),
AMDSignal, AMDComputeQueue, AMDCopyQueue)
AMDSignal, functools.partial(AMDComputeQueue, self), functools.partial(AMDCopyQueue, self))
# Scratch setup
self.max_private_segment_size = 0
@@ -728,7 +761,7 @@ class AMDDevice(HCQCompiled):
self.sqtt_buffers = [self.allocator.alloc(SQTT_BUFFER_SIZE*1024*1024, BufferSpec(cpu_access=True, nolru=True)) for _ in range(SQTT_NUM)]
self.sqtt_itrace_se_mask = getenv("SQTT_ITRACE_SE_MASK", 2) # -1 enable all, 0 disable all, >0 bitmask for where to enable instruction tracing
self.cmd_id = 0
AMDComputeQueue().start_trace(self.sqtt_buffers, self.sqtt_itrace_se_mask).submit(self)
AMDComputeQueue(self).start_trace(self.sqtt_buffers, self.sqtt_itrace_se_mask).submit(self)
def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0):
ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True)
@@ -753,7 +786,7 @@ class AMDDevice(HCQCompiled):
self.max_private_segment_size = required
def invalidate_caches(self):
AMDComputeQueue().memory_barrier().signal(self.timeline_signal, self.next_timeline()).submit(self)
AMDComputeQueue(self).memory_barrier().signal(self.timeline_signal, self.next_timeline()).submit(self)
self.synchronize()
def on_device_hang(self): self.dev_iface.on_device_hang()
@@ -762,7 +795,7 @@ class AMDDevice(HCQCompiled):
if self.sqtt_enabled:
wptrs_buf = self.allocator.alloc(round_up(len(self.sqtt_buffers), 0x1000), BufferSpec(cpu_access=True, nolru=True))
wptrs = to_mv(wptrs_buf.va_addr, wptrs_buf.size)
AMDComputeQueue().stop_trace(len(self.sqtt_buffers), wptrs_buf).signal(self.timeline_signal, self.next_timeline()).submit(self)
AMDComputeQueue(self).stop_trace(len(self.sqtt_buffers), wptrs_buf).signal(self.timeline_signal, self.next_timeline()).submit(self)
self.synchronize()
if DEBUG>=2: print('Saving SQTT in profile...')
for i,buf0 in enumerate(self.sqtt_buffers):

View File

@@ -1,5 +1,5 @@
from __future__ import annotations
from typing import cast, Type, TypeVar, Generic, Any, ClassVar
from typing import cast, Callable, Type, TypeVar, Generic, Any, ClassVar
import contextlib, decimal, statistics, time, ctypes, array, os, fcntl
from tinygrad.helpers import PROFILE, from_mv, getenv, to_mv, round_up
from tinygrad.renderer import Renderer
@@ -255,7 +255,7 @@ class HCQSignal(Generic[DeviceType]):
if self.value < value: raise RuntimeError(f"Wait timeout: {timeout} ms! (the signal is not set to {value}, but {self.value})")
@contextlib.contextmanager
def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Type[HWQueue]|None=None, queue:HWQueue|None=None):
def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Callable[[], HWQueue]|None=None, queue:HWQueue|None=None):
st, en = (dev.signal_t(), dev.signal_t()) if enabled else (None, None)
if enabled and queue is not None: queue.timestamp(st)
@@ -341,7 +341,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
signal_pool: ClassVar[list[int]] = []
def __init__(self, device:str, allocator:HCQAllocatorBase, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[SignalType],
comp_queue_t:Type[HWQueue], copy_queue_t:Type[HWQueue]|None):
comp_queue_t:Callable[[], HWQueue], copy_queue_t:Callable[[], HWQueue]|None):
self.device_id:int = int(device.split(":")[1]) if ":" in device else 0
from tinygrad.runtime.graph.hcq import HCQGraph
@@ -384,7 +384,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
return cls.signal_pool.pop()
def _at_profile_finalize(self):
def _sync(d:HCQCompiled, q_t:Type[HWQueue]):
def _sync(d:HCQCompiled, q_t:Callable[[], HWQueue]):
q_t().timestamp(d.timeline_signal).signal(d.timeline_signal, d.next_timeline()).submit(d)
st = time.perf_counter_ns()
d.timeline_signal.wait(d.timeline_value - 1) # average of the two