Refactor ops_amd.py (MI300X prereq) (#9428)
@@ -308,11 +308,27 @@ generate_am() {
sed -i "s\(int64_t)\ \g" $BASE/am/am.py
sed -i "s\AMDGPU_PTE_MTYPE_VG10(2)\AMDGPU_PTE_MTYPE_VG10(0, 2)\g" $BASE/am/am.py # incorrect parsing (TODO: remove when clang2py is gone).

clang2py -k cdefstum \
extra/hip_gpu_driver/kfd_pm4_headers_ai.h \
extra/hip_gpu_driver/nvd.h \
-o $BASE/am/pm4_nv.py
fixup $BASE/am/pm4_nv.py

clang2py -k cdefstum \
extra/amdpci/headers/navi10_enum.h \
-o $BASE/am/navi10.py
fixup $BASE/am/navi10.py

clang2py -k cdefstum \
extra/amdpci/headers/soc21_enum.h \
-o $BASE/am/soc21.py
fixup $BASE/am/soc21.py

clang2py -k cdefstum \
extra/amdpci/headers/soc24_enum.h \
-o $BASE/am/soc24.py
fixup $BASE/am/soc24.py

clang2py -k cdefstum \
extra/amdpci/headers/mp_13_0_0_offset.h \
extra/amdpci/headers/mp_13_0_0_sh_mask.h \
@@ -325,12 +341,38 @@ generate_am() {
-o $BASE/am/mp_11_0.py
fixup $BASE/am/mp_11_0.py

clang2py -k cdefstum \
extra/amdpci/headers/gc_10_3_0_offset.h \
extra/amdpci/headers/gc_10_3_0_sh_mask.h \
-o $BASE/am/gc_10_3_0.py
fixup $BASE/am/gc_10_3_0.py

clang2py -k cdefstum \
extra/amdpci/headers/gc_11_0_0_offset.h \
extra/amdpci/headers/gc_11_0_0_sh_mask.h \
-o $BASE/am/gc_11_0_0.py
fixup $BASE/am/gc_11_0_0.py

clang2py -k cdefstum \
extra/amdpci/headers/gc_12_0_0_offset.h \
extra/amdpci/headers/gc_12_0_0_sh_mask.h \
-o $BASE/am/gc_12_0_0.py
fixup $BASE/am/gc_12_0_0.py

clang2py -k cdefstum \
extra/hip_gpu_driver/sdma_registers.h \
extra/hip_gpu_driver/navi10_sdma_pkt_open.h \
--clang-args="-I/opt/rocm/include -x c++" \
-o $BASE/am/sdma_5_0_0.py
fixup $BASE/am/sdma_5_0_0.py

clang2py -k cdefstum \
extra/hip_gpu_driver/sdma_registers.h \
extra/hip_gpu_driver/sdma_v6_0_0_pkt_open.h \
--clang-args="-I/opt/rocm/include -x c++" \
-o $BASE/am/sdma_6_0_0.py
fixup $BASE/am/sdma_6_0_0.py

clang2py -k cdefstum \
extra/amdpci/headers/mmhub_3_0_0_offset.h \
extra/amdpci/headers/mmhub_3_0_0_sh_mask.h \
@@ -343,12 +385,24 @@ generate_am() {
-o $BASE/am/mmhub_3_0_2.py
fixup $BASE/am/mmhub_3_0_2.py

clang2py -k cdefstum \
extra/amdpci/headers/nbio_2_3_offset.h \
extra/amdpci/headers/nbio_2_3_sh_mask.h \
-o $BASE/am/nbio_2_3_0.py
fixup $BASE/am/nbio_2_3_0.py

clang2py -k cdefstum \
extra/amdpci/headers/nbio_4_3_0_offset.h \
extra/amdpci/headers/nbio_4_3_0_sh_mask.h \
-o $BASE/am/nbio_4_3_0.py
fixup $BASE/am/nbio_4_3_0.py

clang2py -k cdefstum \
extra/amdpci/headers/nbif_6_3_1_offset.h \
extra/amdpci/headers/nbif_6_3_1_sh_mask.h \
-o $BASE/am/nbif_6_3_1.py
fixup $BASE/am/nbif_6_3_1.py

clang2py -k cdefstum \
extra/amdpci/headers/osssys_6_0_0_offset.h \
extra/amdpci/headers/osssys_6_0_0_sh_mask.h \
13609   extra/amdpci/headers/gc_10_3_0_offset.h  (new file, diff suppressed because it is too large)
49369   extra/amdpci/headers/gc_10_3_0_sh_mask.h  (new file, diff suppressed because it is too large)
11061   extra/amdpci/headers/gc_12_0_0_offset.h  (new file, diff suppressed because it is too large)
40550   extra/amdpci/headers/gc_12_0_0_sh_mask.h  (new file, diff suppressed because it is too large)
22764   extra/amdpci/headers/navi10_enum.h  (new file, diff suppressed because it is too large)
11287   extra/amdpci/headers/nbif_6_3_1_offset.h  (new file, diff suppressed because it is too large)
32806   extra/amdpci/headers/nbif_6_3_1_sh_mask.h  (new file, diff suppressed because it is too large)
14663   extra/amdpci/headers/nbio_2_3_offset.h  (new file, diff suppressed because it is too large)
120339  extra/amdpci/headers/nbio_2_3_sh_mask.h  (new file, diff suppressed because it is too large)
21073   extra/amdpci/headers/soc24_enum.h  (new file, diff suppressed because it is too large)
4886    extra/hip_gpu_driver/navi10_sdma_pkt_open.h  (new file, diff suppressed because it is too large)
@@ -1,5 +1,6 @@
import pathlib, re, ctypes, mmap, collections, functools, copy, os
import tinygrad.runtime.autogen.kfd as kfd
import tinygrad.runtime.autogen.am.am as am
from tinygrad.helpers import from_mv
from test.mockgpu.driver import VirtDriver, VirtFileDesc, TextFileDesc, DirFileDesc, VirtFile
from test.mockgpu.amd.amdgpu import AMDGPU, gpu_props
@@ -82,6 +83,23 @@ class AMDDriver(VirtDriver):
VirtFile(f'/sys/devices/virtual/kfd/kfd/topology/nodes/{gpu_id}/gpu_id', functools.partial(TextFileDesc, text=f"{gpu_id}")),
VirtFile(f'/sys/devices/virtual/kfd/kfd/topology/nodes/{gpu_id}/properties',
functools.partial(TextFileDesc, text=gpu_props.format(drm_render_minor=gpu_id))),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0',
functools.partial(DirFileDesc, child_names=[str(am.GC_HWID), str(am.SDMA0_HWID), str(am.NBIF_HWID)])),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.GC_HWID}/0/major', functools.partial(TextFileDesc, text='11')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.GC_HWID}/0/minor', functools.partial(TextFileDesc, text='0')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.GC_HWID}/0/revision', functools.partial(TextFileDesc, text='0')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.GC_HWID}/0/base_addr',
functools.partial(TextFileDesc, text='0x00001260\n0x0000A000\n0x0001C000\n0x02402C00')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.SDMA0_HWID}/0/major', functools.partial(TextFileDesc, text='6')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.SDMA0_HWID}/0/minor', functools.partial(TextFileDesc, text='0')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.SDMA0_HWID}/0/revision', functools.partial(TextFileDesc, text='0')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.SDMA0_HWID}/0/base_addr',
functools.partial(TextFileDesc, text='0x00001260\n0x0000A000\n0x0001C000\n0x02402C00')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.NBIF_HWID}/0/major', functools.partial(TextFileDesc, text='4')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.NBIF_HWID}/0/minor', functools.partial(TextFileDesc, text='3')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.NBIF_HWID}/0/revision', functools.partial(TextFileDesc, text='0')),
VirtFile(f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0/{am.NBIF_HWID}/0/base_addr',
functools.partial(TextFileDesc, text='0x00000000\n0x00000014\n0x00000D20\n0x00010400\n0x0241B000\n0x04040000')),
VirtFile(f'/dev/dri/renderD{gpu_id}', functools.partial(DRMFileDesc, driver=self, gpu=f"{self.gpus[gpu_id]}")),
]
104218  tinygrad/runtime/autogen/am/gc_10_3_0.py  (new file, diff suppressed because it is too large)
86482   tinygrad/runtime/autogen/am/gc_12_0_0.py  (new file, diff suppressed because it is too large)
40774   tinygrad/runtime/autogen/am/navi10.py  (new file, diff suppressed because it is too large)
75952   tinygrad/runtime/autogen/am/nbif_6_3_1.py  (new file, diff suppressed because it is too large)
234281  tinygrad/runtime/autogen/am/nbio_2_3_0.py  (new file, diff suppressed because it is too large)
962     tinygrad/runtime/autogen/am/pm4_nv.py  (new file)
@@ -0,0 +1,962 @@
|
||||
# mypy: ignore-errors
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# TARGET arch is: []
|
||||
# WORD_SIZE is: 8
|
||||
# POINTER_SIZE is: 8
|
||||
# LONGDOUBLE_SIZE is: 16
|
||||
#
|
||||
import ctypes
|
||||
|
||||
|
||||
class AsDictMixin:
|
||||
@classmethod
|
||||
def as_dict(cls, self):
|
||||
result = {}
|
||||
if not isinstance(self, AsDictMixin):
|
||||
# not a structure, assume it's already a python object
|
||||
return self
|
||||
if not hasattr(cls, "_fields_"):
|
||||
return result
|
||||
# sys.version_info >= (3, 5)
|
||||
# for (field, *_) in cls._fields_: # noqa
|
||||
for field_tuple in cls._fields_: # noqa
|
||||
field = field_tuple[0]
|
||||
if field.startswith('PADDING_'):
|
||||
continue
|
||||
value = getattr(self, field)
|
||||
type_ = type(value)
|
||||
if hasattr(value, "_length_") and hasattr(value, "_type_"):
|
||||
# array
|
||||
if not hasattr(type_, "as_dict"):
|
||||
value = [v for v in value]
|
||||
else:
|
||||
type_ = type_._type_
|
||||
value = [type_.as_dict(v) for v in value]
|
||||
elif hasattr(value, "contents") and hasattr(value, "_type_"):
|
||||
# pointer
|
||||
try:
|
||||
if not hasattr(type_, "as_dict"):
|
||||
value = value.contents
|
||||
else:
|
||||
type_ = type_._type_
|
||||
value = type_.as_dict(value.contents)
|
||||
except ValueError:
|
||||
# nullptr
|
||||
value = None
|
||||
elif isinstance(value, AsDictMixin):
|
||||
# other structure
|
||||
value = type_.as_dict(value)
|
||||
result[field] = value
|
||||
return result
|
||||
|
||||
|
||||
class Structure(ctypes.Structure, AsDictMixin):
|
||||
|
||||
def __init__(self, *args, **kwds):
|
||||
# We don't want to use positional arguments fill PADDING_* fields
|
||||
|
||||
args = dict(zip(self.__class__._field_names_(), args))
|
||||
args.update(kwds)
|
||||
super(Structure, self).__init__(**args)
|
||||
|
||||
@classmethod
|
||||
def _field_names_(cls):
|
||||
if hasattr(cls, '_fields_'):
|
||||
return (f[0] for f in cls._fields_ if not f[0].startswith('PADDING'))
|
||||
else:
|
||||
return ()
|
||||
|
||||
@classmethod
|
||||
def get_type(cls, field):
|
||||
for f in cls._fields_:
|
||||
if f[0] == field:
|
||||
return f[1]
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def bind(cls, bound_fields):
|
||||
fields = {}
|
||||
for name, type_ in cls._fields_:
|
||||
if hasattr(type_, "restype"):
|
||||
if name in bound_fields:
|
||||
if bound_fields[name] is None:
|
||||
fields[name] = type_()
|
||||
else:
|
||||
# use a closure to capture the callback from the loop scope
|
||||
fields[name] = (
|
||||
type_((lambda callback: lambda *args: callback(*args))(
|
||||
bound_fields[name]))
|
||||
)
|
||||
del bound_fields[name]
|
||||
else:
|
||||
# default callback implementation (does nothing)
|
||||
try:
|
||||
default_ = type_(0).restype().value
|
||||
except TypeError:
|
||||
default_ = None
|
||||
fields[name] = type_((
|
||||
lambda default_: lambda *args: default_)(default_))
|
||||
else:
|
||||
# not a callback function, use default initialization
|
||||
if name in bound_fields:
|
||||
fields[name] = bound_fields[name]
|
||||
del bound_fields[name]
|
||||
else:
|
||||
fields[name] = type_()
|
||||
if len(bound_fields) != 0:
|
||||
raise ValueError(
|
||||
"Cannot bind the following unknown callback(s) {}.{}".format(
|
||||
cls.__name__, bound_fields.keys()
|
||||
))
|
||||
return cls(**fields)
|
||||
|
||||
|
||||
class Union(ctypes.Union, AsDictMixin):
|
||||
pass
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
F32_MES_PM4_PACKETS_H = True # macro
|
||||
uint32_t = True # macro
|
||||
int32_t = True # macro
|
||||
PM4_MES_HEADER_DEFINED = True # macro
|
||||
PM4_MEC_RELEASE_MEM_DEFINED = True # macro
|
||||
PM4_MEC_WRITE_DATA_DEFINED = True # macro
|
||||
class union_PM4_MES_TYPE_3_HEADER(Union):
|
||||
pass
|
||||
|
||||
class struct_PM4_MES_TYPE_3_HEADER_0(Structure):
|
||||
pass
|
||||
|
||||
struct_PM4_MES_TYPE_3_HEADER_0._pack_ = 1 # source:False
|
||||
struct_PM4_MES_TYPE_3_HEADER_0._fields_ = [
|
||||
('reserved1', ctypes.c_uint32, 8),
|
||||
('opcode', ctypes.c_uint32, 8),
|
||||
('count', ctypes.c_uint32, 14),
|
||||
('type', ctypes.c_uint32, 2),
|
||||
]
|
||||
|
||||
union_PM4_MES_TYPE_3_HEADER._pack_ = 1 # source:False
|
||||
union_PM4_MES_TYPE_3_HEADER._anonymous_ = ('_0',)
|
||||
union_PM4_MES_TYPE_3_HEADER._fields_ = [
|
||||
('_0', struct_PM4_MES_TYPE_3_HEADER_0),
|
||||
('u32All', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
|
||||
# values for enumeration 'c_uint32'
|
||||
c_uint32__enumvalues = {
|
||||
5: 'event_index__mec_release_mem__end_of_pipe',
|
||||
6: 'event_index__mec_release_mem__shader_done',
|
||||
}
|
||||
event_index__mec_release_mem__end_of_pipe = 5
|
||||
event_index__mec_release_mem__shader_done = 6
|
||||
c_uint32 = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'c_uint32'
|
||||
c_uint32__enumvalues = {
|
||||
0: 'cache_policy__mec_release_mem__lru',
|
||||
1: 'cache_policy__mec_release_mem__stream',
|
||||
}
|
||||
cache_policy__mec_release_mem__lru = 0
|
||||
cache_policy__mec_release_mem__stream = 1
|
||||
c_uint32 = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'c_uint32'
|
||||
c_uint32__enumvalues = {
|
||||
0: 'pq_exe_status__mec_release_mem__default',
|
||||
1: 'pq_exe_status__mec_release_mem__phase_update',
|
||||
}
|
||||
pq_exe_status__mec_release_mem__default = 0
|
||||
pq_exe_status__mec_release_mem__phase_update = 1
|
||||
c_uint32 = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'c_uint32'
|
||||
c_uint32__enumvalues = {
|
||||
0: 'dst_sel__mec_release_mem__memory_controller',
|
||||
1: 'dst_sel__mec_release_mem__tc_l2',
|
||||
2: 'dst_sel__mec_release_mem__queue_write_pointer_register',
|
||||
3: 'dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit',
|
||||
}
|
||||
dst_sel__mec_release_mem__memory_controller = 0
|
||||
dst_sel__mec_release_mem__tc_l2 = 1
|
||||
dst_sel__mec_release_mem__queue_write_pointer_register = 2
|
||||
dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit = 3
|
||||
c_uint32 = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'c_uint32'
|
||||
c_uint32__enumvalues = {
|
||||
0: 'int_sel__mec_release_mem__none',
|
||||
1: 'int_sel__mec_release_mem__send_interrupt_only',
|
||||
2: 'int_sel__mec_release_mem__send_interrupt_after_write_confirm',
|
||||
3: 'int_sel__mec_release_mem__send_data_after_write_confirm',
|
||||
4: 'int_sel__mec_release_mem__unconditionally_send_int_ctxid',
|
||||
5: 'int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare',
|
||||
6: 'int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare',
|
||||
}
|
||||
int_sel__mec_release_mem__none = 0
|
||||
int_sel__mec_release_mem__send_interrupt_only = 1
|
||||
int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2
|
||||
int_sel__mec_release_mem__send_data_after_write_confirm = 3
|
||||
int_sel__mec_release_mem__unconditionally_send_int_ctxid = 4
|
||||
int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare = 5
|
||||
int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare = 6
|
||||
c_uint32 = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'c_uint32'
|
||||
c_uint32__enumvalues = {
|
||||
0: 'data_sel__mec_release_mem__none',
|
||||
1: 'data_sel__mec_release_mem__send_32_bit_low',
|
||||
2: 'data_sel__mec_release_mem__send_64_bit_data',
|
||||
3: 'data_sel__mec_release_mem__send_gpu_clock_counter',
|
||||
4: 'data_sel__mec_release_mem__send_cp_perfcounter_hi_lo',
|
||||
5: 'data_sel__mec_release_mem__store_gds_data_to_memory',
|
||||
}
|
||||
data_sel__mec_release_mem__none = 0
|
||||
data_sel__mec_release_mem__send_32_bit_low = 1
|
||||
data_sel__mec_release_mem__send_64_bit_data = 2
|
||||
data_sel__mec_release_mem__send_gpu_clock_counter = 3
|
||||
data_sel__mec_release_mem__send_cp_perfcounter_hi_lo = 4
|
||||
data_sel__mec_release_mem__store_gds_data_to_memory = 5
|
||||
c_uint32 = ctypes.c_uint32 # enum
|
||||
class struct_pm4_mec_release_mem(Structure):
|
||||
pass
|
||||
|
||||
class union_pm4_mec_release_mem_0(Union):
|
||||
pass
|
||||
|
||||
union_pm4_mec_release_mem_0._pack_ = 1 # source:False
|
||||
union_pm4_mec_release_mem_0._fields_ = [
|
||||
('header', union_PM4_MES_TYPE_3_HEADER),
|
||||
('ordinal1', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class union_pm4_mec_release_mem_1(Union):
|
||||
pass
|
||||
|
||||
class struct_pm4_mec_release_mem_1_bitfields2(Structure):
|
||||
pass
|
||||
|
||||
struct_pm4_mec_release_mem_1_bitfields2._pack_ = 1 # source:False
|
||||
struct_pm4_mec_release_mem_1_bitfields2._fields_ = [
|
||||
('event_type', ctypes.c_uint32, 6),
|
||||
('reserved1', ctypes.c_uint32, 2),
|
||||
('event_index', c_uint32, 4),
|
||||
('tcl1_vol_action_ena', ctypes.c_uint32, 1),
|
||||
('tc_vol_action_ena', ctypes.c_uint32, 1),
|
||||
('reserved2', ctypes.c_uint32, 1),
|
||||
('tc_wb_action_ena', ctypes.c_uint32, 1),
|
||||
('tcl1_action_ena', ctypes.c_uint32, 1),
|
||||
('tc_action_ena', ctypes.c_uint32, 1),
|
||||
('reserved3', ctypes.c_uint32, 1),
|
||||
('tc_nc_action_ena', ctypes.c_uint32, 1),
|
||||
('tc_wc_action_ena', ctypes.c_uint32, 1),
|
||||
('tc_md_action_ena', ctypes.c_uint32, 1),
|
||||
('reserved4', ctypes.c_uint32, 3),
|
||||
('cache_policy', c_uint32, 2),
|
||||
('reserved5', ctypes.c_uint32, 2),
|
||||
('pq_exe_status', c_uint32, 1),
|
||||
('reserved6', ctypes.c_uint32, 2),
|
||||
]
|
||||
|
||||
union_pm4_mec_release_mem_1._pack_ = 1 # source:False
|
||||
union_pm4_mec_release_mem_1._fields_ = [
|
||||
('bitfields2', struct_pm4_mec_release_mem_1_bitfields2),
|
||||
('ordinal2', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class union_pm4_mec_release_mem_2(Union):
|
||||
pass
|
||||
|
||||
class struct_pm4_mec_release_mem_2_bitfields3(Structure):
|
||||
pass
|
||||
|
||||
struct_pm4_mec_release_mem_2_bitfields3._pack_ = 1 # source:False
|
||||
struct_pm4_mec_release_mem_2_bitfields3._fields_ = [
|
||||
('reserved7', ctypes.c_uint32, 16),
|
||||
('dst_sel', c_uint32, 2),
|
||||
('reserved8', ctypes.c_uint32, 6),
|
||||
('int_sel', c_uint32, 3),
|
||||
('reserved9', ctypes.c_uint32, 2),
|
||||
('data_sel', c_uint32, 3),
|
||||
]
|
||||
|
||||
union_pm4_mec_release_mem_2._pack_ = 1 # source:False
|
||||
union_pm4_mec_release_mem_2._fields_ = [
|
||||
('bitfields3', struct_pm4_mec_release_mem_2_bitfields3),
|
||||
('ordinal3', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class union_pm4_mec_release_mem_3(Union):
|
||||
pass
|
||||
|
||||
class struct_pm4_mec_release_mem_3_bitfields4(Structure):
|
||||
pass
|
||||
|
||||
struct_pm4_mec_release_mem_3_bitfields4._pack_ = 1 # source:False
|
||||
struct_pm4_mec_release_mem_3_bitfields4._fields_ = [
|
||||
('reserved10', ctypes.c_uint32, 2),
|
||||
('address_lo_32b', ctypes.c_uint32, 30),
|
||||
]
|
||||
|
||||
class struct_pm4_mec_release_mem_3_bitfields4b(Structure):
|
||||
pass
|
||||
|
||||
struct_pm4_mec_release_mem_3_bitfields4b._pack_ = 1 # source:False
|
||||
struct_pm4_mec_release_mem_3_bitfields4b._fields_ = [
|
||||
('reserved11', ctypes.c_uint32, 3),
|
||||
('address_lo_64b', ctypes.c_uint32, 29),
|
||||
]
|
||||
|
||||
union_pm4_mec_release_mem_3._pack_ = 1 # source:False
|
||||
union_pm4_mec_release_mem_3._fields_ = [
|
||||
('bitfields4', struct_pm4_mec_release_mem_3_bitfields4),
|
||||
('bitfields4b', struct_pm4_mec_release_mem_3_bitfields4b),
|
||||
('reserved12', ctypes.c_uint32),
|
||||
('ordinal4', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class union_pm4_mec_release_mem_4(Union):
|
||||
pass
|
||||
|
||||
union_pm4_mec_release_mem_4._pack_ = 1 # source:False
|
||||
union_pm4_mec_release_mem_4._fields_ = [
|
||||
('address_hi', ctypes.c_uint32),
|
||||
('reserved13', ctypes.c_uint32),
|
||||
('ordinal5', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class union_pm4_mec_release_mem_5(Union):
|
||||
pass
|
||||
|
||||
class struct_pm4_mec_release_mem_5_bitfields6c(Structure):
|
||||
pass
|
||||
|
||||
struct_pm4_mec_release_mem_5_bitfields6c._pack_ = 1 # source:False
|
||||
struct_pm4_mec_release_mem_5_bitfields6c._fields_ = [
|
||||
('dw_offset', ctypes.c_uint32, 16),
|
||||
('num_dwords', ctypes.c_uint32, 16),
|
||||
]
|
||||
|
||||
union_pm4_mec_release_mem_5._pack_ = 1 # source:False
|
||||
union_pm4_mec_release_mem_5._fields_ = [
|
||||
('data_lo', ctypes.c_uint32),
|
||||
('cmp_data_lo', ctypes.c_uint32),
|
||||
('bitfields6c', struct_pm4_mec_release_mem_5_bitfields6c),
|
||||
('reserved14', ctypes.c_uint32),
|
||||
('ordinal6', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class union_pm4_mec_release_mem_6(Union):
|
||||
pass
|
||||
|
||||
union_pm4_mec_release_mem_6._pack_ = 1 # source:False
|
||||
union_pm4_mec_release_mem_6._fields_ = [
|
||||
('data_hi', ctypes.c_uint32),
|
||||
('cmp_data_hi', ctypes.c_uint32),
|
||||
('reserved15', ctypes.c_uint32),
|
||||
('reserved16', ctypes.c_uint32),
|
||||
('ordinal7', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
struct_pm4_mec_release_mem._pack_ = 1 # source:False
|
||||
struct_pm4_mec_release_mem._anonymous_ = ('_0', '_1', '_2', '_3', '_4', '_5', '_6',)
|
||||
struct_pm4_mec_release_mem._fields_ = [
|
||||
('_0', union_pm4_mec_release_mem_0),
|
||||
('_1', union_pm4_mec_release_mem_1),
|
||||
('_2', union_pm4_mec_release_mem_2),
|
||||
('_3', union_pm4_mec_release_mem_3),
|
||||
('_4', union_pm4_mec_release_mem_4),
|
||||
('_5', union_pm4_mec_release_mem_5),
|
||||
('_6', union_pm4_mec_release_mem_6),
|
||||
('int_ctxid', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
|
||||
# values for enumeration 'WRITE_DATA_dst_sel_enum'
|
||||
WRITE_DATA_dst_sel_enum__enumvalues = {
|
||||
0: 'dst_sel___write_data__mem_mapped_register',
|
||||
2: 'dst_sel___write_data__tc_l2',
|
||||
3: 'dst_sel___write_data__gds',
|
||||
5: 'dst_sel___write_data__memory',
|
||||
6: 'dst_sel___write_data__memory_mapped_adc_persistent_state',
|
||||
}
|
||||
dst_sel___write_data__mem_mapped_register = 0
|
||||
dst_sel___write_data__tc_l2 = 2
|
||||
dst_sel___write_data__gds = 3
|
||||
dst_sel___write_data__memory = 5
|
||||
dst_sel___write_data__memory_mapped_adc_persistent_state = 6
|
||||
WRITE_DATA_dst_sel_enum = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'WRITE_DATA_addr_incr_enum'
|
||||
WRITE_DATA_addr_incr_enum__enumvalues = {
|
||||
0: 'addr_incr___write_data__increment_address',
|
||||
1: 'addr_incr___write_data__do_not_increment_address',
|
||||
}
|
||||
addr_incr___write_data__increment_address = 0
|
||||
addr_incr___write_data__do_not_increment_address = 1
|
||||
WRITE_DATA_addr_incr_enum = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'WRITE_DATA_wr_confirm_enum'
|
||||
WRITE_DATA_wr_confirm_enum__enumvalues = {
|
||||
0: 'wr_confirm___write_data__do_not_wait_for_write_confirmation',
|
||||
1: 'wr_confirm___write_data__wait_for_write_confirmation',
|
||||
}
|
||||
wr_confirm___write_data__do_not_wait_for_write_confirmation = 0
|
||||
wr_confirm___write_data__wait_for_write_confirmation = 1
|
||||
WRITE_DATA_wr_confirm_enum = ctypes.c_uint32 # enum
|
||||
|
||||
# values for enumeration 'WRITE_DATA_cache_policy_enum'
|
||||
WRITE_DATA_cache_policy_enum__enumvalues = {
|
||||
0: 'cache_policy___write_data__lru',
|
||||
1: 'cache_policy___write_data__stream',
|
||||
}
|
||||
cache_policy___write_data__lru = 0
|
||||
cache_policy___write_data__stream = 1
|
||||
WRITE_DATA_cache_policy_enum = ctypes.c_uint32 # enum
|
||||
class struct_pm4_mec_write_data_mmio(Structure):
|
||||
pass
|
||||
|
||||
class union_pm4_mec_write_data_mmio_0(Union):
|
||||
pass
|
||||
|
||||
union_pm4_mec_write_data_mmio_0._pack_ = 1 # source:False
|
||||
union_pm4_mec_write_data_mmio_0._fields_ = [
|
||||
('header', union_PM4_MES_TYPE_3_HEADER),
|
||||
('ordinal1', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class union_pm4_mec_write_data_mmio_1(Union):
|
||||
pass
|
||||
|
||||
class struct_pm4_mec_write_data_mmio_1_bitfields2(Structure):
|
||||
pass
|
||||
|
||||
struct_pm4_mec_write_data_mmio_1_bitfields2._pack_ = 1 # source:False
|
||||
struct_pm4_mec_write_data_mmio_1_bitfields2._fields_ = [
|
||||
('reserved1', ctypes.c_uint32, 8),
|
||||
('dst_sel', ctypes.c_uint32, 4),
|
||||
('reserved2', ctypes.c_uint32, 4),
|
||||
('addr_incr', ctypes.c_uint32, 1),
|
||||
('reserved3', ctypes.c_uint32, 2),
|
||||
('resume_vf', ctypes.c_uint32, 1),
|
||||
('wr_confirm', ctypes.c_uint32, 1),
|
||||
('reserved4', ctypes.c_uint32, 4),
|
||||
('cache_policy', ctypes.c_uint32, 2),
|
||||
('reserved5', ctypes.c_uint32, 5),
|
||||
]
|
||||
|
||||
union_pm4_mec_write_data_mmio_1._pack_ = 1 # source:False
|
||||
union_pm4_mec_write_data_mmio_1._fields_ = [
|
||||
('bitfields2', struct_pm4_mec_write_data_mmio_1_bitfields2),
|
||||
('ordinal2', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
class union_pm4_mec_write_data_mmio_2(Union):
|
||||
pass
|
||||
|
||||
class struct_pm4_mec_write_data_mmio_2_bitfields3(Structure):
|
||||
pass
|
||||
|
||||
struct_pm4_mec_write_data_mmio_2_bitfields3._pack_ = 1 # source:False
|
||||
struct_pm4_mec_write_data_mmio_2_bitfields3._fields_ = [
|
||||
('dst_mmreg_addr', ctypes.c_uint32, 18),
|
||||
('reserved6', ctypes.c_uint32, 14),
|
||||
]
|
||||
|
||||
union_pm4_mec_write_data_mmio_2._pack_ = 1 # source:False
|
||||
union_pm4_mec_write_data_mmio_2._fields_ = [
|
||||
('bitfields3', struct_pm4_mec_write_data_mmio_2_bitfields3),
|
||||
('ordinal3', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
struct_pm4_mec_write_data_mmio._pack_ = 1 # source:False
|
||||
struct_pm4_mec_write_data_mmio._anonymous_ = ('_0', '_1', '_2',)
|
||||
struct_pm4_mec_write_data_mmio._fields_ = [
|
||||
('_0', union_pm4_mec_write_data_mmio_0),
|
||||
('_1', union_pm4_mec_write_data_mmio_1),
|
||||
('_2', union_pm4_mec_write_data_mmio_2),
|
||||
('reserved7', ctypes.c_uint32),
|
||||
('data', ctypes.c_uint32),
|
||||
]
|
||||
|
||||
|
||||
# values for enumeration 'c__Ea_CACHE_FLUSH_AND_INV_TS_EVENT'
|
||||
c__Ea_CACHE_FLUSH_AND_INV_TS_EVENT__enumvalues = {
|
||||
20: 'CACHE_FLUSH_AND_INV_TS_EVENT',
|
||||
}
|
||||
CACHE_FLUSH_AND_INV_TS_EVENT = 20
|
||||
c__Ea_CACHE_FLUSH_AND_INV_TS_EVENT = ctypes.c_uint32 # enum
|
||||
NVD_H = True # macro
|
||||
PACKET_TYPE0 = 0 # macro
|
||||
PACKET_TYPE1 = 1 # macro
|
||||
PACKET_TYPE2 = 2 # macro
|
||||
PACKET_TYPE3 = 3 # macro
|
||||
def CP_PACKET_GET_TYPE(h): # macro
|
||||
return (((h)>>30)&3)
|
||||
def CP_PACKET_GET_COUNT(h): # macro
|
||||
return (((h)>>16)&0x3FFF)
|
||||
def CP_PACKET0_GET_REG(h): # macro
|
||||
return ((h)&0xFFFF)
|
||||
def CP_PACKET3_GET_OPCODE(h): # macro
|
||||
return (((h)>>8)&0xFF)
|
||||
def PACKET0(reg, n): # macro
|
||||
return ((0<<30)|((reg)&0xFFFF)|((n)&0x3FFF)<<16)
|
||||
CP_PACKET2 = 0x80000000 # macro
|
||||
PACKET2_PAD_SHIFT = 0 # macro
|
||||
PACKET2_PAD_MASK = (0x3fffffff<<0) # macro
|
||||
# def PACKET2(v): # macro
|
||||
# return (0x80000000|REG_SET(PACKET2_PAD,(v)))
|
||||
def PACKET3(op, n): # macro
|
||||
return ((3<<30)|(((op)&0xFF)<<8)|((n)&0x3FFF)<<16)
|
||||
def PACKET3_COMPUTE(op, n): # macro
|
||||
return (PACKET3(op,n)|1<<1)
|
||||
PACKET3_NOP = 0x10 # macro
|
||||
PACKET3_SET_BASE = 0x11 # macro
|
||||
def PACKET3_BASE_INDEX(x): # macro
|
||||
return ((x)<<0)
|
||||
CE_PARTITION_BASE = 3 # macro
|
||||
PACKET3_CLEAR_STATE = 0x12 # macro
|
||||
PACKET3_INDEX_BUFFER_SIZE = 0x13 # macro
|
||||
PACKET3_DISPATCH_DIRECT = 0x15 # macro
|
||||
PACKET3_DISPATCH_INDIRECT = 0x16 # macro
|
||||
PACKET3_INDIRECT_BUFFER_END = 0x17 # macro
|
||||
PACKET3_INDIRECT_BUFFER_CNST_END = 0x19 # macro
|
||||
PACKET3_ATOMIC_GDS = 0x1D # macro
|
||||
PACKET3_ATOMIC_MEM = 0x1E # macro
|
||||
PACKET3_OCCLUSION_QUERY = 0x1F # macro
|
||||
PACKET3_SET_PREDICATION = 0x20 # macro
|
||||
PACKET3_REG_RMW = 0x21 # macro
|
||||
PACKET3_COND_EXEC = 0x22 # macro
|
||||
PACKET3_PRED_EXEC = 0x23 # macro
|
||||
PACKET3_DRAW_INDIRECT = 0x24 # macro
|
||||
PACKET3_DRAW_INDEX_INDIRECT = 0x25 # macro
|
||||
PACKET3_INDEX_BASE = 0x26 # macro
|
||||
PACKET3_DRAW_INDEX_2 = 0x27 # macro
|
||||
PACKET3_CONTEXT_CONTROL = 0x28 # macro
|
||||
PACKET3_INDEX_TYPE = 0x2A # macro
|
||||
PACKET3_DRAW_INDIRECT_MULTI = 0x2C # macro
|
||||
PACKET3_DRAW_INDEX_AUTO = 0x2D # macro
|
||||
PACKET3_NUM_INSTANCES = 0x2F # macro
|
||||
PACKET3_DRAW_INDEX_MULTI_AUTO = 0x30 # macro
|
||||
PACKET3_INDIRECT_BUFFER_PRIV = 0x32 # macro
|
||||
PACKET3_INDIRECT_BUFFER_CNST = 0x33 # macro
|
||||
PACKET3_COND_INDIRECT_BUFFER_CNST = 0x33 # macro
|
||||
PACKET3_STRMOUT_BUFFER_UPDATE = 0x34 # macro
|
||||
PACKET3_DRAW_INDEX_OFFSET_2 = 0x35 # macro
|
||||
PACKET3_DRAW_PREAMBLE = 0x36 # macro
|
||||
PACKET3_WRITE_DATA = 0x37 # macro
|
||||
def WRITE_DATA_DST_SEL(x): # macro
|
||||
return ((x)<<8)
|
||||
WR_ONE_ADDR = (1<<16) # macro
|
||||
WR_CONFIRM = (1<<20) # macro
|
||||
def WRITE_DATA_CACHE_POLICY(x): # macro
|
||||
return ((x)<<25)
|
||||
def WRITE_DATA_ENGINE_SEL(x): # macro
|
||||
return ((x)<<30)
|
||||
PACKET3_DRAW_INDEX_INDIRECT_MULTI = 0x38 # macro
|
||||
PACKET3_MEM_SEMAPHORE = 0x39 # macro
|
||||
PACKET3_SEM_USE_MAILBOX = (0x1<<16) # macro
|
||||
PACKET3_SEM_SEL_SIGNAL_TYPE = (0x1<<20) # macro
|
||||
PACKET3_SEM_SEL_SIGNAL = (0x6<<29) # macro
|
||||
PACKET3_SEM_SEL_WAIT = (0x7<<29) # macro
|
||||
PACKET3_DRAW_INDEX_MULTI_INST = 0x3A # macro
|
||||
PACKET3_COPY_DW = 0x3B # macro
|
||||
PACKET3_WAIT_REG_MEM = 0x3C # macro
|
||||
def WAIT_REG_MEM_FUNCTION(x): # macro
|
||||
return ((x)<<0)
|
||||
def WAIT_REG_MEM_MEM_SPACE(x): # macro
|
||||
return ((x)<<4)
|
||||
def WAIT_REG_MEM_OPERATION(x): # macro
|
||||
return ((x)<<6)
|
||||
def WAIT_REG_MEM_ENGINE(x): # macro
|
||||
return ((x)<<8)
|
||||
PACKET3_INDIRECT_BUFFER = 0x3F # macro
|
||||
INDIRECT_BUFFER_VALID = (1<<23) # macro
|
||||
def INDIRECT_BUFFER_CACHE_POLICY(x): # macro
|
||||
return ((x)<<28)
|
||||
def INDIRECT_BUFFER_PRE_ENB(x): # macro
|
||||
return ((x)<<21)
|
||||
def INDIRECT_BUFFER_PRE_RESUME(x): # macro
|
||||
return ((x)<<30)
|
||||
PACKET3_COND_INDIRECT_BUFFER = 0x3F # macro
|
||||
PACKET3_COPY_DATA = 0x40 # macro
|
||||
PACKET3_CP_DMA = 0x41 # macro
|
||||
PACKET3_PFP_SYNC_ME = 0x42 # macro
|
||||
PACKET3_SURFACE_SYNC = 0x43 # macro
|
||||
PACKET3_ME_INITIALIZE = 0x44 # macro
|
||||
PACKET3_COND_WRITE = 0x45 # macro
|
||||
PACKET3_EVENT_WRITE = 0x46 # macro
|
||||
def EVENT_TYPE(x): # macro
|
||||
return ((x)<<0)
|
||||
def EVENT_INDEX(x): # macro
|
||||
return ((x)<<8)
|
||||
PACKET3_EVENT_WRITE_EOP = 0x47 # macro
|
||||
PACKET3_EVENT_WRITE_EOS = 0x48 # macro
|
||||
PACKET3_RELEASE_MEM = 0x49 # macro
|
||||
def PACKET3_RELEASE_MEM_EVENT_TYPE(x): # macro
|
||||
return ((x)<<0)
|
||||
def PACKET3_RELEASE_MEM_EVENT_INDEX(x): # macro
|
||||
return ((x)<<8)
|
||||
PACKET3_RELEASE_MEM_GCR_GLM_WB = (1<<12) # macro
|
||||
PACKET3_RELEASE_MEM_GCR_GLM_INV = (1<<13) # macro
|
||||
PACKET3_RELEASE_MEM_GCR_GLV_INV = (1<<14) # macro
|
||||
PACKET3_RELEASE_MEM_GCR_GL1_INV = (1<<15) # macro
|
||||
PACKET3_RELEASE_MEM_GCR_GL2_US = (1<<16) # macro
|
||||
PACKET3_RELEASE_MEM_GCR_GL2_RANGE = (1<<17) # macro
|
||||
PACKET3_RELEASE_MEM_GCR_GL2_DISCARD = (1<<19) # macro
|
||||
PACKET3_RELEASE_MEM_GCR_GL2_INV = (1<<20) # macro
|
||||
PACKET3_RELEASE_MEM_GCR_GL2_WB = (1<<21) # macro
|
||||
PACKET3_RELEASE_MEM_GCR_SEQ = (1<<22) # macro
|
||||
def PACKET3_RELEASE_MEM_CACHE_POLICY(x): # macro
|
||||
return ((x)<<25)
|
||||
PACKET3_RELEASE_MEM_EXECUTE = (1<<28) # macro
|
||||
def PACKET3_RELEASE_MEM_DATA_SEL(x): # macro
|
||||
return ((x)<<29)
|
||||
def PACKET3_RELEASE_MEM_INT_SEL(x): # macro
|
||||
return ((x)<<24)
|
||||
def PACKET3_RELEASE_MEM_DST_SEL(x): # macro
|
||||
return ((x)<<16)
|
||||
PACKET3_PREAMBLE_CNTL = 0x4A # macro
|
||||
PACKET3_PREAMBLE_BEGIN_CLEAR_STATE = (2<<28) # macro
|
||||
PACKET3_PREAMBLE_END_CLEAR_STATE = (3<<28) # macro
|
||||
PACKET3_DMA_DATA = 0x50 # macro
|
||||
def PACKET3_DMA_DATA_ENGINE(x): # macro
|
||||
return ((x)<<0)
|
||||
def PACKET3_DMA_DATA_SRC_CACHE_POLICY(x): # macro
|
||||
return ((x)<<13)
|
||||
def PACKET3_DMA_DATA_DST_SEL(x): # macro
|
||||
return ((x)<<20)
|
||||
def PACKET3_DMA_DATA_DST_CACHE_POLICY(x): # macro
|
||||
return ((x)<<25)
|
||||
def PACKET3_DMA_DATA_SRC_SEL(x): # macro
|
||||
return ((x)<<29)
|
||||
PACKET3_DMA_DATA_CP_SYNC = (1<<31) # macro
|
||||
PACKET3_DMA_DATA_CMD_SAS = (1<<26) # macro
|
||||
PACKET3_DMA_DATA_CMD_DAS = (1<<27) # macro
|
||||
PACKET3_DMA_DATA_CMD_SAIC = (1<<28) # macro
|
||||
PACKET3_DMA_DATA_CMD_DAIC = (1<<29) # macro
|
||||
PACKET3_DMA_DATA_CMD_RAW_WAIT = (1<<30) # macro
|
||||
PACKET3_CONTEXT_REG_RMW = 0x51 # macro
|
||||
PACKET3_GFX_CNTX_UPDATE = 0x52 # macro
|
||||
PACKET3_BLK_CNTX_UPDATE = 0x53 # macro
|
||||
PACKET3_INCR_UPDT_STATE = 0x55 # macro
|
||||
PACKET3_ACQUIRE_MEM = 0x58 # macro
|
||||
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(x): # macro
|
||||
return ((x)<<0)
|
||||
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_RANGE(x): # macro
|
||||
return ((x)<<2)
|
||||
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(x): # macro
|
||||
return ((x)<<4)
|
||||
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(x): # macro
|
||||
return ((x)<<5)
|
||||
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(x): # macro
|
||||
return ((x)<<6)
|
||||
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(x): # macro
|
||||
return ((x)<<7)
|
||||
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(x): # macro
|
||||
return ((x)<<8)
|
||||
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(x): # macro
|
||||
return ((x)<<9)
|
||||
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_US(x): # macro
|
||||
return ((x)<<10)
|
||||
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_RANGE(x): # macro
|
||||
return ((x)<<11)
|
||||
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_DISCARD(x): # macro
|
||||
return ((x)<<13)
|
||||
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(x): # macro
|
||||
return ((x)<<14)
|
||||
def PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(x): # macro
|
||||
return ((x)<<15)
|
||||
def PACKET3_ACQUIRE_MEM_GCR_CNTL_SEQ(x): # macro
|
||||
return ((x)<<16)
|
||||
PACKET3_ACQUIRE_MEM_GCR_RANGE_IS_PA = (1<<18) # macro
|
||||
PACKET3_REWIND = 0x59 # macro
|
||||
PACKET3_INTERRUPT = 0x5A # macro
|
||||
PACKET3_GEN_PDEPTE = 0x5B # macro
|
||||
PACKET3_INDIRECT_BUFFER_PASID = 0x5C # macro
|
||||
PACKET3_PRIME_UTCL2 = 0x5D # macro
|
||||
PACKET3_LOAD_UCONFIG_REG = 0x5E # macro
|
||||
PACKET3_LOAD_SH_REG = 0x5F # macro
|
||||
PACKET3_LOAD_CONFIG_REG = 0x60 # macro
|
||||
PACKET3_LOAD_CONTEXT_REG = 0x61 # macro
|
||||
PACKET3_LOAD_COMPUTE_STATE = 0x62 # macro
|
||||
PACKET3_LOAD_SH_REG_INDEX = 0x63 # macro
|
||||
PACKET3_SET_CONFIG_REG = 0x68 # macro
|
||||
PACKET3_SET_CONFIG_REG_START = 0x00002000 # macro
|
||||
PACKET3_SET_CONFIG_REG_END = 0x00002c00 # macro
|
||||
PACKET3_SET_CONTEXT_REG = 0x69 # macro
|
||||
PACKET3_SET_CONTEXT_REG_START = 0x0000a000 # macro
|
||||
PACKET3_SET_CONTEXT_REG_END = 0x0000a400 # macro
|
||||
PACKET3_SET_CONTEXT_REG_INDEX = 0x6A # macro
|
||||
PACKET3_SET_VGPR_REG_DI_MULTI = 0x71 # macro
|
||||
PACKET3_SET_SH_REG_DI = 0x72 # macro
|
||||
PACKET3_SET_CONTEXT_REG_INDIRECT = 0x73 # macro
|
||||
PACKET3_SET_SH_REG_DI_MULTI = 0x74 # macro
|
||||
PACKET3_GFX_PIPE_LOCK = 0x75 # macro
|
||||
PACKET3_SET_SH_REG = 0x76 # macro
|
||||
PACKET3_SET_SH_REG_START = 0x00002c00 # macro
|
||||
PACKET3_SET_SH_REG_END = 0x00003000 # macro
|
||||
PACKET3_SET_SH_REG_OFFSET = 0x77 # macro
|
||||
PACKET3_SET_QUEUE_REG = 0x78 # macro
|
||||
PACKET3_SET_UCONFIG_REG = 0x79 # macro
|
||||
PACKET3_SET_UCONFIG_REG_START = 0x0000c000 # macro
|
||||
PACKET3_SET_UCONFIG_REG_END = 0x0000c400 # macro
|
||||
PACKET3_SET_UCONFIG_REG_INDEX = 0x7A # macro
|
||||
PACKET3_FORWARD_HEADER = 0x7C # macro
|
||||
PACKET3_SCRATCH_RAM_WRITE = 0x7D # macro
|
||||
PACKET3_SCRATCH_RAM_READ = 0x7E # macro
|
||||
PACKET3_LOAD_CONST_RAM = 0x80 # macro
|
||||
PACKET3_WRITE_CONST_RAM = 0x81 # macro
|
||||
PACKET3_DUMP_CONST_RAM = 0x83 # macro
|
||||
PACKET3_INCREMENT_CE_COUNTER = 0x84 # macro
|
||||
PACKET3_INCREMENT_DE_COUNTER = 0x85 # macro
|
||||
PACKET3_WAIT_ON_CE_COUNTER = 0x86 # macro
|
||||
PACKET3_WAIT_ON_DE_COUNTER_DIFF = 0x88 # macro
|
||||
PACKET3_SWITCH_BUFFER = 0x8B # macro
|
||||
PACKET3_DISPATCH_DRAW_PREAMBLE = 0x8C # macro
|
||||
PACKET3_DISPATCH_DRAW_PREAMBLE_ACE = 0x8C # macro
|
||||
PACKET3_DISPATCH_DRAW = 0x8D # macro
|
||||
PACKET3_DISPATCH_DRAW_ACE = 0x8D # macro
|
||||
PACKET3_GET_LOD_STATS = 0x8E # macro
|
||||
PACKET3_DRAW_MULTI_PREAMBLE = 0x8F # macro
|
||||
PACKET3_FRAME_CONTROL = 0x90 # macro
|
||||
FRAME_TMZ = (1<<0) # macro
|
||||
def FRAME_CMD(x): # macro
|
||||
return ((x)<<28)
|
||||
PACKET3_INDEX_ATTRIBUTES_INDIRECT = 0x91 # macro
|
||||
PACKET3_WAIT_REG_MEM64 = 0x93 # macro
|
||||
PACKET3_COND_PREEMPT = 0x94 # macro
|
||||
PACKET3_HDP_FLUSH = 0x95 # macro
|
||||
PACKET3_COPY_DATA_RB = 0x96 # macro
|
||||
PACKET3_INVALIDATE_TLBS = 0x98 # macro
|
||||
def PACKET3_INVALIDATE_TLBS_DST_SEL(x): # macro
|
||||
return ((x)<<0)
|
||||
def PACKET3_INVALIDATE_TLBS_ALL_HUB(x): # macro
|
||||
return ((x)<<4)
|
||||
def PACKET3_INVALIDATE_TLBS_PASID(x): # macro
|
||||
return ((x)<<5)
|
||||
PACKET3_AQL_PACKET = 0x99 # macro
|
||||
PACKET3_DMA_DATA_FILL_MULTI = 0x9A # macro
|
||||
PACKET3_SET_SH_REG_INDEX = 0x9B # macro
|
||||
PACKET3_DRAW_INDIRECT_COUNT_MULTI = 0x9C # macro
|
||||
PACKET3_DRAW_INDEX_INDIRECT_COUNT_MULTI = 0x9D # macro
|
||||
PACKET3_DUMP_CONST_RAM_OFFSET = 0x9E # macro
|
||||
PACKET3_LOAD_CONTEXT_REG_INDEX = 0x9F # macro
|
||||
PACKET3_SET_RESOURCES = 0xA0 # macro
|
||||
def PACKET3_SET_RESOURCES_VMID_MASK(x): # macro
|
||||
return ((x)<<0)
|
||||
def PACKET3_SET_RESOURCES_UNMAP_LATENTY(x): # macro
|
||||
return ((x)<<16)
|
||||
def PACKET3_SET_RESOURCES_QUEUE_TYPE(x): # macro
|
||||
return ((x)<<29)
|
||||
PACKET3_MAP_PROCESS = 0xA1 # macro
|
||||
PACKET3_MAP_QUEUES = 0xA2 # macro
|
||||
def PACKET3_MAP_QUEUES_QUEUE_SEL(x): # macro
|
||||
return ((x)<<4)
|
||||
def PACKET3_MAP_QUEUES_VMID(x): # macro
|
||||
return ((x)<<8)
|
||||
def PACKET3_MAP_QUEUES_QUEUE(x): # macro
|
||||
return ((x)<<13)
|
||||
def PACKET3_MAP_QUEUES_PIPE(x): # macro
|
||||
return ((x)<<16)
|
||||
def PACKET3_MAP_QUEUES_ME(x): # macro
|
||||
return ((x)<<18)
|
||||
def PACKET3_MAP_QUEUES_QUEUE_TYPE(x): # macro
|
||||
return ((x)<<21)
|
||||
def PACKET3_MAP_QUEUES_ALLOC_FORMAT(x): # macro
|
||||
return ((x)<<24)
|
||||
def PACKET3_MAP_QUEUES_ENGINE_SEL(x): # macro
|
||||
return ((x)<<26)
|
||||
def PACKET3_MAP_QUEUES_NUM_QUEUES(x): # macro
|
||||
return ((x)<<29)
|
||||
def PACKET3_MAP_QUEUES_CHECK_DISABLE(x): # macro
|
||||
return ((x)<<1)
|
||||
def PACKET3_MAP_QUEUES_DOORBELL_OFFSET(x): # macro
|
||||
return ((x)<<2)
|
||||
PACKET3_UNMAP_QUEUES = 0xA3 # macro
|
||||
def PACKET3_UNMAP_QUEUES_ACTION(x): # macro
|
||||
return ((x)<<0)
|
||||
def PACKET3_UNMAP_QUEUES_QUEUE_SEL(x): # macro
|
||||
return ((x)<<4)
|
||||
def PACKET3_UNMAP_QUEUES_ENGINE_SEL(x): # macro
|
||||
return ((x)<<26)
|
||||
def PACKET3_UNMAP_QUEUES_NUM_QUEUES(x): # macro
|
||||
return ((x)<<29)
|
||||
def PACKET3_UNMAP_QUEUES_PASID(x): # macro
|
||||
return ((x)<<0)
|
||||
def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(x): # macro
|
||||
return ((x)<<2)
|
||||
def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET1(x): # macro
|
||||
return ((x)<<2)
|
||||
def PACKET3_UNMAP_QUEUES_RB_WPTR(x): # macro
|
||||
return ((x)<<0)
|
||||
def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET2(x): # macro
|
||||
return ((x)<<2)
|
||||
def PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET3(x): # macro
|
||||
return ((x)<<2)
|
||||
PACKET3_QUERY_STATUS = 0xA4 # macro
|
||||
def PACKET3_QUERY_STATUS_CONTEXT_ID(x): # macro
|
||||
return ((x)<<0)
|
||||
def PACKET3_QUERY_STATUS_INTERRUPT_SEL(x): # macro
|
||||
return ((x)<<28)
|
||||
def PACKET3_QUERY_STATUS_COMMAND(x): # macro
|
||||
return ((x)<<30)
|
||||
def PACKET3_QUERY_STATUS_PASID(x): # macro
|
||||
return ((x)<<0)
|
||||
def PACKET3_QUERY_STATUS_DOORBELL_OFFSET(x): # macro
|
||||
return ((x)<<2)
|
||||
def PACKET3_QUERY_STATUS_ENG_SEL(x): # macro
|
||||
return ((x)<<25)
|
||||
PACKET3_RUN_LIST = 0xA5 # macro
|
||||
PACKET3_MAP_PROCESS_VM = 0xA6 # macro
|
||||
PACKET3_SET_Q_PREEMPTION_MODE = 0xF0 # macro
|
||||
def PACKET3_SET_Q_PREEMPTION_MODE_IB_VMID(x): # macro
|
||||
return ((x)<<0)
|
||||
PACKET3_SET_Q_PREEMPTION_MODE_INIT_SHADOW_MEM = (1<<0) # macro
|
||||
__all__ = \
|
||||
['CACHE_FLUSH_AND_INV_TS_EVENT', 'CE_PARTITION_BASE',
|
||||
'CP_PACKET2', 'F32_MES_PM4_PACKETS_H', 'FRAME_TMZ',
|
||||
'INDIRECT_BUFFER_VALID', 'NVD_H', 'PACKET2_PAD_MASK',
|
||||
'PACKET2_PAD_SHIFT', 'PACKET3_ACQUIRE_MEM',
|
||||
'PACKET3_ACQUIRE_MEM_GCR_RANGE_IS_PA', 'PACKET3_AQL_PACKET',
|
||||
'PACKET3_ATOMIC_GDS', 'PACKET3_ATOMIC_MEM',
|
||||
'PACKET3_BLK_CNTX_UPDATE', 'PACKET3_CLEAR_STATE',
|
||||
'PACKET3_COND_EXEC', 'PACKET3_COND_INDIRECT_BUFFER',
|
||||
'PACKET3_COND_INDIRECT_BUFFER_CNST', 'PACKET3_COND_PREEMPT',
|
||||
'PACKET3_COND_WRITE', 'PACKET3_CONTEXT_CONTROL',
|
||||
'PACKET3_CONTEXT_REG_RMW', 'PACKET3_COPY_DATA',
|
||||
'PACKET3_COPY_DATA_RB', 'PACKET3_COPY_DW', 'PACKET3_CP_DMA',
|
||||
'PACKET3_DISPATCH_DIRECT', 'PACKET3_DISPATCH_DRAW',
|
||||
'PACKET3_DISPATCH_DRAW_ACE', 'PACKET3_DISPATCH_DRAW_PREAMBLE',
|
||||
'PACKET3_DISPATCH_DRAW_PREAMBLE_ACE', 'PACKET3_DISPATCH_INDIRECT',
|
||||
'PACKET3_DMA_DATA', 'PACKET3_DMA_DATA_CMD_DAIC',
|
||||
'PACKET3_DMA_DATA_CMD_DAS', 'PACKET3_DMA_DATA_CMD_RAW_WAIT',
|
||||
'PACKET3_DMA_DATA_CMD_SAIC', 'PACKET3_DMA_DATA_CMD_SAS',
|
||||
'PACKET3_DMA_DATA_CP_SYNC', 'PACKET3_DMA_DATA_FILL_MULTI',
|
||||
'PACKET3_DRAW_INDEX_2', 'PACKET3_DRAW_INDEX_AUTO',
|
||||
'PACKET3_DRAW_INDEX_INDIRECT',
|
||||
'PACKET3_DRAW_INDEX_INDIRECT_COUNT_MULTI',
|
||||
'PACKET3_DRAW_INDEX_INDIRECT_MULTI',
|
||||
'PACKET3_DRAW_INDEX_MULTI_AUTO', 'PACKET3_DRAW_INDEX_MULTI_INST',
|
||||
'PACKET3_DRAW_INDEX_OFFSET_2', 'PACKET3_DRAW_INDIRECT',
|
||||
'PACKET3_DRAW_INDIRECT_COUNT_MULTI',
|
||||
'PACKET3_DRAW_INDIRECT_MULTI', 'PACKET3_DRAW_MULTI_PREAMBLE',
|
||||
'PACKET3_DRAW_PREAMBLE', 'PACKET3_DUMP_CONST_RAM',
|
||||
'PACKET3_DUMP_CONST_RAM_OFFSET', 'PACKET3_EVENT_WRITE',
|
||||
'PACKET3_EVENT_WRITE_EOP', 'PACKET3_EVENT_WRITE_EOS',
|
||||
'PACKET3_FORWARD_HEADER', 'PACKET3_FRAME_CONTROL',
|
||||
'PACKET3_GEN_PDEPTE', 'PACKET3_GET_LOD_STATS',
|
||||
'PACKET3_GFX_CNTX_UPDATE', 'PACKET3_GFX_PIPE_LOCK',
|
||||
'PACKET3_HDP_FLUSH', 'PACKET3_INCREMENT_CE_COUNTER',
|
||||
'PACKET3_INCREMENT_DE_COUNTER', 'PACKET3_INCR_UPDT_STATE',
|
||||
'PACKET3_INDEX_ATTRIBUTES_INDIRECT', 'PACKET3_INDEX_BASE',
|
||||
'PACKET3_INDEX_BUFFER_SIZE', 'PACKET3_INDEX_TYPE',
|
||||
'PACKET3_INDIRECT_BUFFER', 'PACKET3_INDIRECT_BUFFER_CNST',
|
||||
'PACKET3_INDIRECT_BUFFER_CNST_END', 'PACKET3_INDIRECT_BUFFER_END',
|
||||
'PACKET3_INDIRECT_BUFFER_PASID', 'PACKET3_INDIRECT_BUFFER_PRIV',
|
||||
'PACKET3_INTERRUPT', 'PACKET3_INVALIDATE_TLBS',
|
||||
'PACKET3_LOAD_COMPUTE_STATE', 'PACKET3_LOAD_CONFIG_REG',
|
||||
'PACKET3_LOAD_CONST_RAM', 'PACKET3_LOAD_CONTEXT_REG',
|
||||
'PACKET3_LOAD_CONTEXT_REG_INDEX', 'PACKET3_LOAD_SH_REG',
|
||||
'PACKET3_LOAD_SH_REG_INDEX', 'PACKET3_LOAD_UCONFIG_REG',
|
||||
'PACKET3_MAP_PROCESS', 'PACKET3_MAP_PROCESS_VM',
|
||||
'PACKET3_MAP_QUEUES', 'PACKET3_MEM_SEMAPHORE',
|
||||
'PACKET3_ME_INITIALIZE', 'PACKET3_NOP', 'PACKET3_NUM_INSTANCES',
|
||||
'PACKET3_OCCLUSION_QUERY', 'PACKET3_PFP_SYNC_ME',
|
||||
'PACKET3_PREAMBLE_BEGIN_CLEAR_STATE', 'PACKET3_PREAMBLE_CNTL',
|
||||
'PACKET3_PREAMBLE_END_CLEAR_STATE', 'PACKET3_PRED_EXEC',
|
||||
'PACKET3_PRIME_UTCL2', 'PACKET3_QUERY_STATUS', 'PACKET3_REG_RMW',
|
||||
'PACKET3_RELEASE_MEM', 'PACKET3_RELEASE_MEM_EXECUTE',
|
||||
'PACKET3_RELEASE_MEM_GCR_GL1_INV',
|
||||
'PACKET3_RELEASE_MEM_GCR_GL2_DISCARD',
|
||||
'PACKET3_RELEASE_MEM_GCR_GL2_INV',
|
||||
'PACKET3_RELEASE_MEM_GCR_GL2_RANGE',
|
||||
'PACKET3_RELEASE_MEM_GCR_GL2_US',
|
||||
'PACKET3_RELEASE_MEM_GCR_GL2_WB',
|
||||
'PACKET3_RELEASE_MEM_GCR_GLM_INV',
|
||||
'PACKET3_RELEASE_MEM_GCR_GLM_WB',
|
||||
'PACKET3_RELEASE_MEM_GCR_GLV_INV', 'PACKET3_RELEASE_MEM_GCR_SEQ',
|
||||
'PACKET3_REWIND', 'PACKET3_RUN_LIST', 'PACKET3_SCRATCH_RAM_READ',
|
||||
'PACKET3_SCRATCH_RAM_WRITE', 'PACKET3_SEM_SEL_SIGNAL',
|
||||
'PACKET3_SEM_SEL_SIGNAL_TYPE', 'PACKET3_SEM_SEL_WAIT',
|
||||
'PACKET3_SEM_USE_MAILBOX', 'PACKET3_SET_BASE',
|
||||
'PACKET3_SET_CONFIG_REG', 'PACKET3_SET_CONFIG_REG_END',
|
||||
'PACKET3_SET_CONFIG_REG_START', 'PACKET3_SET_CONTEXT_REG',
|
||||
'PACKET3_SET_CONTEXT_REG_END', 'PACKET3_SET_CONTEXT_REG_INDEX',
|
||||
'PACKET3_SET_CONTEXT_REG_INDIRECT',
|
||||
'PACKET3_SET_CONTEXT_REG_START', 'PACKET3_SET_PREDICATION',
|
||||
'PACKET3_SET_QUEUE_REG', 'PACKET3_SET_Q_PREEMPTION_MODE',
|
||||
'PACKET3_SET_Q_PREEMPTION_MODE_INIT_SHADOW_MEM',
|
||||
'PACKET3_SET_RESOURCES', 'PACKET3_SET_SH_REG',
|
||||
'PACKET3_SET_SH_REG_DI', 'PACKET3_SET_SH_REG_DI_MULTI',
|
||||
'PACKET3_SET_SH_REG_END', 'PACKET3_SET_SH_REG_INDEX',
|
||||
'PACKET3_SET_SH_REG_OFFSET', 'PACKET3_SET_SH_REG_START',
|
||||
'PACKET3_SET_UCONFIG_REG', 'PACKET3_SET_UCONFIG_REG_END',
|
||||
'PACKET3_SET_UCONFIG_REG_INDEX', 'PACKET3_SET_UCONFIG_REG_START',
|
||||
'PACKET3_SET_VGPR_REG_DI_MULTI', 'PACKET3_STRMOUT_BUFFER_UPDATE',
|
||||
'PACKET3_SURFACE_SYNC', 'PACKET3_SWITCH_BUFFER',
|
||||
'PACKET3_UNMAP_QUEUES', 'PACKET3_WAIT_ON_CE_COUNTER',
|
||||
'PACKET3_WAIT_ON_DE_COUNTER_DIFF', 'PACKET3_WAIT_REG_MEM',
|
||||
'PACKET3_WAIT_REG_MEM64', 'PACKET3_WRITE_CONST_RAM',
|
||||
'PACKET3_WRITE_DATA', 'PACKET_TYPE0', 'PACKET_TYPE1',
|
||||
'PACKET_TYPE2', 'PACKET_TYPE3', 'PM4_MEC_RELEASE_MEM_DEFINED',
|
||||
'PM4_MEC_WRITE_DATA_DEFINED', 'PM4_MES_HEADER_DEFINED',
|
||||
'WRITE_DATA_addr_incr_enum', 'WRITE_DATA_cache_policy_enum',
|
||||
'WRITE_DATA_dst_sel_enum', 'WRITE_DATA_wr_confirm_enum',
|
||||
'WR_CONFIRM', 'WR_ONE_ADDR',
|
||||
'addr_incr___write_data__do_not_increment_address',
|
||||
'addr_incr___write_data__increment_address',
|
||||
'c__Ea_CACHE_FLUSH_AND_INV_TS_EVENT', 'c_uint32', 'c_uint32',
|
||||
'c_uint32', 'c_uint32', 'c_uint32', 'c_uint32',
|
||||
'cache_policy___write_data__lru',
|
||||
'cache_policy___write_data__stream',
|
||||
'cache_policy__mec_release_mem__lru',
|
||||
'cache_policy__mec_release_mem__stream',
|
||||
'data_sel__mec_release_mem__none',
|
||||
'data_sel__mec_release_mem__send_32_bit_low',
|
||||
'data_sel__mec_release_mem__send_64_bit_data',
|
||||
'data_sel__mec_release_mem__send_cp_perfcounter_hi_lo',
|
||||
'data_sel__mec_release_mem__send_gpu_clock_counter',
|
||||
'data_sel__mec_release_mem__store_gds_data_to_memory',
|
||||
'dst_sel___write_data__gds',
|
||||
'dst_sel___write_data__mem_mapped_register',
|
||||
'dst_sel___write_data__memory',
|
||||
'dst_sel___write_data__memory_mapped_adc_persistent_state',
|
||||
'dst_sel___write_data__tc_l2',
|
||||
'dst_sel__mec_release_mem__memory_controller',
|
||||
'dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit',
|
||||
'dst_sel__mec_release_mem__queue_write_pointer_register',
|
||||
'dst_sel__mec_release_mem__tc_l2',
|
||||
'event_index__mec_release_mem__end_of_pipe',
|
||||
'event_index__mec_release_mem__shader_done', 'int32_t',
|
||||
'int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare',
|
||||
'int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare',
|
||||
'int_sel__mec_release_mem__none',
|
||||
'int_sel__mec_release_mem__send_data_after_write_confirm',
|
||||
'int_sel__mec_release_mem__send_interrupt_after_write_confirm',
|
||||
'int_sel__mec_release_mem__send_interrupt_only',
|
||||
'int_sel__mec_release_mem__unconditionally_send_int_ctxid',
|
||||
'pq_exe_status__mec_release_mem__default',
|
||||
'pq_exe_status__mec_release_mem__phase_update',
|
||||
'struct_PM4_MES_TYPE_3_HEADER_0', 'struct_pm4_mec_release_mem',
|
||||
'struct_pm4_mec_release_mem_1_bitfields2',
|
||||
'struct_pm4_mec_release_mem_2_bitfields3',
|
||||
'struct_pm4_mec_release_mem_3_bitfields4',
|
||||
'struct_pm4_mec_release_mem_3_bitfields4b',
|
||||
'struct_pm4_mec_release_mem_5_bitfields6c',
|
||||
'struct_pm4_mec_write_data_mmio',
|
||||
'struct_pm4_mec_write_data_mmio_1_bitfields2',
|
||||
'struct_pm4_mec_write_data_mmio_2_bitfields3', 'uint32_t',
|
||||
'union_PM4_MES_TYPE_3_HEADER', 'union_pm4_mec_release_mem_0',
|
||||
'union_pm4_mec_release_mem_1', 'union_pm4_mec_release_mem_2',
|
||||
'union_pm4_mec_release_mem_3', 'union_pm4_mec_release_mem_4',
|
||||
'union_pm4_mec_release_mem_5', 'union_pm4_mec_release_mem_6',
|
||||
'union_pm4_mec_write_data_mmio_0',
|
||||
'union_pm4_mec_write_data_mmio_1',
|
||||
'union_pm4_mec_write_data_mmio_2',
|
||||
'wr_confirm___write_data__do_not_wait_for_write_confirmation',
|
||||
'wr_confirm___write_data__wait_for_write_confirmation']
|
||||
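A minimal usage sketch of the generated pm4_nv bindings above (illustrative only, not part of the commit; the import path is assumed from the new file's location at tinygrad/runtime/autogen/am/pm4_nv.py, and every constant referenced is defined in the file shown above):

# Illustrative sketch: build the header and event dword of a PM4 RELEASE_MEM packet
# using the generated pm4_nv bindings (module path assumed from the diff above).
from tinygrad.runtime.autogen.am import pm4_nv

header = pm4_nv.PACKET3(pm4_nv.PACKET3_RELEASE_MEM, 6)   # type-3 header, opcode 0x49, count field 6
assert pm4_nv.CP_PACKET_GET_TYPE(header) == pm4_nv.PACKET_TYPE3
assert pm4_nv.CP_PACKET3_GET_OPCODE(header) == pm4_nv.PACKET3_RELEASE_MEM

# Mirrors what AMDComputeQueue.release_mem builds in the ops_amd.py hunk further below.
event_dw = pm4_nv.PACKET3_RELEASE_MEM_EVENT_TYPE(pm4_nv.CACHE_FLUSH_AND_INV_TS_EVENT) \
         | pm4_nv.PACKET3_RELEASE_MEM_EVENT_INDEX(pm4_nv.event_index__mec_release_mem__end_of_pipe)
print(hex(header), hex(event_dw))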
7103   tinygrad/runtime/autogen/am/sdma_5_0_0.py  (new file, diff suppressed because it is too large)
8085   tinygrad/runtime/autogen/am/sdma_6_0_0.py  (new file, diff suppressed because it is too large)
37140  tinygrad/runtime/autogen/am/soc24.py  (new file, diff suppressed because it is too large)
@@ -1,6 +1,6 @@
|
||||
from __future__ import annotations
|
||||
from typing import Any, cast, ClassVar
|
||||
import os, ctypes, ctypes.util, struct, hashlib, functools, mmap, errno, array, contextlib, sys, select
|
||||
import os, ctypes, ctypes.util, struct, hashlib, functools, importlib, mmap, errno, array, contextlib, sys, select
|
||||
assert sys.platform != 'win32'
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface
|
||||
@@ -9,31 +9,19 @@ from tinygrad.device import Compiled, ProfileEvent, BufferSpec, CPUProgram, PROF
|
||||
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, DEBUG, OSX
|
||||
from tinygrad.renderer.cstyle import AMDRenderer
|
||||
from tinygrad.renderer.llvmir import AMDLLVMRenderer
|
||||
from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, pci, vfio, sqtt
|
||||
from tinygrad.runtime.autogen.am import am, gc_11_0_0
|
||||
from tinygrad.runtime.autogen import kfd, hsa, libc, pci, vfio, sqtt
|
||||
from tinygrad.runtime.autogen.am import am
|
||||
from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler
|
||||
from tinygrad.runtime.support.elf import elf_loader
|
||||
from tinygrad.runtime.support.am.amdev import AMDev, AMMapping
|
||||
from tinygrad.runtime.support.amd import AMDRegBase, collect_registers, import_module
|
||||
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint: disable=unused-import
|
||||
|
||||
regBIF_BX_PF1_GPU_HDP_FLUSH_REQ, regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0106, 0x0107
|
||||
|
||||
EVENT_INDEX_PARTIAL_FLUSH = 4 # based on a comment in nvd.h
|
||||
WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
|
||||
WAIT_REG_MEM_FUNCTION_NEQ = 4 # !=
|
||||
WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=
|
||||
|
||||
COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)
|
||||
|
||||
def gfxreg(reg): return reg + amd_gpu.GC_BASE__INST0_SEG0 - amd_gpu.PACKET3_SET_SH_REG_START
|
||||
def ucfgreg(reg, pkt3_set:bool=True): return reg + amd_gpu.GC_BASE__INST0_SEG1 - (amd_gpu.PACKET3_SET_UCONFIG_REG_START if pkt3_set else 0)
|
||||
def nbioreg(reg): return reg + amd_gpu.NBIO_BASE__INST0_SEG2
|
||||
|
||||
# This can potentially be shared with AMRegister._parse_kwargs. NOTE: This is hardcoded to gfx11, bitfields might be different in other gfxvers.
|
||||
# Currently not a problem because this is only used by sqtt and sqtt is only supported on 7900xtx
|
||||
def encode_bitfields(regname: str, **kwargs) -> int:
|
||||
return functools.reduce(lambda x,y: x|y, [v << getattr(gc_11_0_0, f'{regname}__{k.upper()}__SHIFT') for k,v in kwargs.items()], 0)
|
||||
|
||||
class AMDSignal(HCQSignal):
|
||||
def __init__(self, base_addr:int|None=None, **kwargs):
|
||||
super().__init__(base_addr, **kwargs, timestamp_divider=100, dev_t=AMDDevice)
|
||||
@@ -43,61 +31,71 @@ class AMDSignal(HCQSignal):
|
||||
if time_spent_waiting_ms > 2000 and self.timeline_for_device is not None: self.timeline_for_device.dev_iface.sleep(200)
|
||||
|
||||
class AMDComputeQueue(HWQueue):
|
||||
def __init__(self, dev:AMDDevice):
|
||||
self.soc, self.pm4, self.gc, self.nbio = dev.soc, dev.pm4, dev.gc, dev.nbio
|
||||
super().__init__()
|
||||
|
||||
def __del__(self):
|
||||
if self.binded_device is not None:
|
||||
self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferSpec(cpu_access=True, nolru=True, uncached=True))
|
||||
|
||||
def pkt3(self, cmd, *vals): self.q(amd_gpu.PACKET3(cmd, len(vals) - 1), *vals)
|
||||
def pkt3(self, cmd, *vals): self.q(self.pm4.PACKET3(cmd, len(vals) - 1), *vals)
|
||||
|
||||
def gfxreg(self, reg:AMDReg): return reg.addr - self.pm4.PACKET3_SET_SH_REG_START
|
||||
def ucfgreg(self, reg:AMDReg): return reg.addr - self.pm4.PACKET3_SET_UCONFIG_REG_START
|
||||
|
||||
def sqtt_userdata(self, data, *extra_dwords):
|
||||
data_ints = [x[0] for x in struct.iter_unpack('<I', bytes(data))] + list(extra_dwords)
|
||||
for i in range(0, len(data_ints), 2):
|
||||
self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_USERDATA_2), *data_ints[i:i+2])
|
||||
self.pkt3(self.pm4.PACKET3_SET_UCONFIG_REG, self.ucfgreg(self.gc.regSQ_THREAD_TRACE_USERDATA_2), *data_ints[i:i+2])

  def wait_reg_mem(self, value, mask=0xffffffff, mem=None, reg_req=None, reg_done=None):
    wrm_info_dw = amd_gpu.WAIT_REG_MEM_MEM_SPACE(int(mem is not None)) | amd_gpu.WAIT_REG_MEM_OPERATION(int(mem is None)) \
                | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0)
    wrm_info_dw = self.pm4.WAIT_REG_MEM_MEM_SPACE(int(mem is not None)) | self.pm4.WAIT_REG_MEM_OPERATION(int(mem is None)) \
                | self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | self.pm4.WAIT_REG_MEM_ENGINE(0)

    self.pkt3(amd_gpu.PACKET3_WAIT_REG_MEM, wrm_info_dw, *(data64_le(mem) if mem is not None else (reg_req, reg_done)), value, mask, 4)
    self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, wrm_info_dw, *(data64_le(mem) if mem is not None else (reg_req, reg_done)), value, mask, 4)

  def acquire_mem(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
    cache_flags_dw = amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) \
                   | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) \
                   | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) \
                   | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) \
                   | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)
    cache_flags_dw = self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) \
                   | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) \
                   | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) \
                   | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) \
                   | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)

    self.pkt3(amd_gpu.PACKET3_ACQUIRE_MEM, 0, *data64_le(sz), *data64_le(addr), 0, cache_flags_dw)
    self.pkt3(self.pm4.PACKET3_ACQUIRE_MEM, 0, *data64_le(sz), *data64_le(addr), 0, cache_flags_dw)

  def release_mem(self, address, value, data_sel, int_sel, ctxid=0, cache_flush=False):
    cache_flags_dw = 0 if not cache_flush else (amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV \
                   | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB \
                   | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ)
    cache_flags_dw = 0 if not cache_flush else (self.pm4.PACKET3_RELEASE_MEM_GCR_GLV_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GL1_INV \
                   | self.pm4.PACKET3_RELEASE_MEM_GCR_GL2_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GLM_WB \
                   | self.pm4.PACKET3_RELEASE_MEM_GCR_GLM_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GL2_WB | self.pm4.PACKET3_RELEASE_MEM_GCR_SEQ)

    event_dw = amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(amd_gpu.CACHE_FLUSH_AND_INV_TS_EVENT) \
             | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(amd_gpu.event_index__mec_release_mem__end_of_pipe)
    event_dw = self.pm4.PACKET3_RELEASE_MEM_EVENT_TYPE(self.pm4.CACHE_FLUSH_AND_INV_TS_EVENT) \
             | self.pm4.PACKET3_RELEASE_MEM_EVENT_INDEX(self.pm4.event_index__mec_release_mem__end_of_pipe)

    memsel_dw = amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0)
    memsel_dw = self.pm4.PACKET3_RELEASE_MEM_DATA_SEL(data_sel) | self.pm4.PACKET3_RELEASE_MEM_INT_SEL(int_sel) \
              | self.pm4.PACKET3_RELEASE_MEM_DST_SEL(0)

    self.pkt3(amd_gpu.PACKET3_RELEASE_MEM, event_dw | cache_flags_dw, memsel_dw, *data64_le(address), *data64_le(value), ctxid)
    self.pkt3(self.pm4.PACKET3_RELEASE_MEM, event_dw | cache_flags_dw, memsel_dw, *data64_le(address), *data64_le(value), ctxid)

  def memory_barrier(self):
    self.wait_reg_mem(reg_req=nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ), reg_done=nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), value=0xffffffff)
    self.wait_reg_mem(reg_req=self.nbio.regBIF_BX_PF0_GPU_HDP_FLUSH_REQ.addr, reg_done=self.nbio.regBIF_BX_PF0_GPU_HDP_FLUSH_DONE.addr,
                      value=0xffffffff)
    self.acquire_mem()
    return self
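# Usage note: as elsewhere in this change, the builder is chained and submitted, e.g.
#   AMDComputeQueue(dev).memory_barrier().signal(sig, val).submit(dev)
# which flushes HDP through the BIF request/done handshake above, then invalidates GPU caches.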

  def spi_config(self, tracing:bool):
    spi_config_cntl = encode_bitfields('SPI_CONFIG_CNTL', ps_pkr_priority_cntl=3, exp_priority_order=3, gpr_write_priority=0x2c688,
                                       enable_sqg_bop_events=int(tracing), enable_sqg_top_events=int(tracing))
    self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSPI_CONFIG_CNTL), spi_config_cntl)
    spi_config_cntl = self.gc.regSPI_CONFIG_CNTL.encode(ps_pkr_priority_cntl=3, exp_priority_order=3, gpr_write_priority=0x2c688,
                                                        enable_sqg_bop_events=int(tracing), enable_sqg_top_events=int(tracing))
    self.pkt3(self.pm4.PACKET3_SET_UCONFIG_REG, self.ucfgreg(self.gc.regSPI_CONFIG_CNTL), spi_config_cntl)

  def sqtt_config(self, tracing:bool):
    sq_thread_trace_ctrl = encode_bitfields('SQ_THREAD_TRACE_CTRL', draw_event_en=1, spi_stall_en=1, sq_stall_en=1, reg_at_hwm=2, hiwater=1,
                                            rt_freq=amd_gpu.SQ_TT_RT_FREQ_4096_CLK, util_timer=amd_gpu.SQ_TT_UTIL_TIMER_250_CLK, mode=int(tracing))
    self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_CTRL), sq_thread_trace_ctrl)
    sq_thread_trace_ctrl = self.gc.regSQ_THREAD_TRACE_CTRL.encode(draw_event_en=1, spi_stall_en=1, sq_stall_en=1, reg_at_hwm=2, hiwater=1,
                                                                  rt_freq=self.soc.SQ_TT_RT_FREQ_4096_CLK,
                                                                  util_timer=self.soc.SQ_TT_UTIL_TIMER_250_CLK, mode=int(tracing))
    self.pkt3(self.pm4.PACKET3_SET_UCONFIG_REG, self.ucfgreg(self.gc.regSQ_THREAD_TRACE_CTRL), sq_thread_trace_ctrl)

  def grbm_gfx_index(self, **kwargs):
    self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regGRBM_GFX_INDEX), encode_bitfields('GRBM_GFX_INDEX', **kwargs))
    self.pkt3(self.pm4.PACKET3_SET_UCONFIG_REG, self.ucfgreg(self.gc.regGRBM_GFX_INDEX), self.gc.regGRBM_GFX_INDEX.encode(**kwargs))
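# --- illustrative sketch of the reg.encode(...) pattern used above; field layout is hypothetical ---
# Each named field presumably ORs its value into the dword via the SHIFT/MASK pairs that the
# *_sh_mask.h autogen modules provide.
def encode_sketch(fields:dict[str, tuple[int, int]], **kwargs) -> int:
  return sum((kwargs[name] << shift) & mask for name, (shift, mask) in fields.items() if name in kwargs)
grbm_gfx_index_fields = {'se_index': (16, 0x00ff0000), 'instance_broadcast_writes': (30, 0x40000000)}  # hypothetical values
assert encode_sketch(grbm_gfx_index_fields, se_index=1, instance_broadcast_writes=1) == 0x40010000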

  # Magic values from mesa/src/amd/vulkan/radv_sqtt.c:radv_emit_spi_config_cntl and src/amd/common/ac_sqtt.c:ac_sqtt_emit_start
  def start_trace(self, buf0s:list[HCQBuffer], se_mask:int):
@@ -107,31 +105,31 @@ class AMDComputeQueue(HWQueue):
    for se in range(len(buf0s)):
      self.grbm_gfx_index(se_index=se, instance_broadcast_writes=1) # select se, broadcast to all instances in that se
      buf0_lo, buf0_hi = data64_le(buf0s[se].va_addr>>12)
      self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_BUF0_SIZE),
                encode_bitfields('SQ_THREAD_TRACE_BUF0_SIZE', base_hi=buf0_hi, size=buf0s[se].size>>12))
      self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_BUF0_BASE), buf0_lo)
      self.pkt3(self.pm4.PACKET3_SET_UCONFIG_REG, self.ucfgreg(self.gc.regSQ_THREAD_TRACE_BUF0_SIZE),
                self.gc.regSQ_THREAD_TRACE_BUF0_SIZE.encode(base_hi=buf0_hi, size=buf0s[se].size>>12))
      self.pkt3(self.pm4.PACKET3_SET_UCONFIG_REG, self.ucfgreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE), buf0_lo)
      # NOTE: SQTT can only trace instructions on one simd per se, this selects the first simd in the first wgp in the first sa.
      # For RGP to display instruction trace it has to see it on the first SE. However ACE/MEC/whatever does the dispatching starting with the
      # second se, and on amdgpu/non-AM it also does weird things with dispatch order inside se: around 7 times out of 10 it starts from the
      # last cu, but sometimes not, especially if the kernel has more than one wavefront, which means that kernels with small global size might
      # get unlucky, be dispatched on something else and not be seen in the instruction tracing tab. You can force the wavefronts of a kernel to
      # be dispatched on the CUs you want by disabling other CUs via bits in regCOMPUTE_STATIC_THREAD_MGMT_SE<x> and trace even kernels that
      # only have one wavefront.
      self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_MASK),
                encode_bitfields('SQ_THREAD_TRACE_MASK', wtype_include=amd_gpu.SQ_TT_WTYPE_INCLUDE_CS_BIT, simd_sel=0, wgp_sel=0, sa_sel=0))
      REG_INCLUDE = amd_gpu.SQ_TT_TOKEN_MASK_SQDEC_BIT | amd_gpu.SQ_TT_TOKEN_MASK_SHDEC_BIT | amd_gpu.SQ_TT_TOKEN_MASK_GFXUDEC_BIT | \
                    amd_gpu.SQ_TT_TOKEN_MASK_COMP_BIT | amd_gpu.SQ_TT_TOKEN_MASK_CONTEXT_BIT | amd_gpu.SQ_TT_TOKEN_MASK_CONTEXT_BIT
      TOKEN_EXCLUDE = 1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_PERF_SHIFT
      self.pkt3(self.pm4.PACKET3_SET_UCONFIG_REG, self.ucfgreg(self.gc.regSQ_THREAD_TRACE_MASK),
                self.gc.regSQ_THREAD_TRACE_MASK.encode(wtype_include=self.soc.SQ_TT_WTYPE_INCLUDE_CS_BIT, simd_sel=0, wgp_sel=0, sa_sel=0))
      REG_INCLUDE = self.soc.SQ_TT_TOKEN_MASK_SQDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_SHDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_GFXUDEC_BIT | \
                    self.soc.SQ_TT_TOKEN_MASK_COMP_BIT | self.soc.SQ_TT_TOKEN_MASK_CONTEXT_BIT | self.soc.SQ_TT_TOKEN_MASK_CONTEXT_BIT
      TOKEN_EXCLUDE = 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_PERF_SHIFT
      if not (se_mask >> se) & 0b1:
        TOKEN_EXCLUDE |= 1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_VMEMEXEC_SHIFT | 1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_ALUEXEC_SHIFT | \
                         1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_VALUINST_SHIFT | 1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_IMMEDIATE_SHIFT | \
                         1 << amd_gpu.SQ_TT_TOKEN_EXCLUDE_INST_SHIFT
      self.pkt3(amd_gpu.PACKET3_SET_UCONFIG_REG, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_TOKEN_MASK),
                encode_bitfields('SQ_THREAD_TRACE_TOKEN_MASK', reg_include=REG_INCLUDE, token_exclude=TOKEN_EXCLUDE, bop_events_token_include=1))
        TOKEN_EXCLUDE |= 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VMEMEXEC_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_ALUEXEC_SHIFT | \
                         1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VALUINST_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_IMMEDIATE_SHIFT | \
                         1 << self.soc.SQ_TT_TOKEN_EXCLUDE_INST_SHIFT
      self.pkt3(self.pm4.PACKET3_SET_UCONFIG_REG, self.ucfgreg(self.gc.regSQ_THREAD_TRACE_TOKEN_MASK),
                self.gc.regSQ_THREAD_TRACE_TOKEN_MASK.encode(reg_include=REG_INCLUDE, token_exclude=TOKEN_EXCLUDE, bop_events_token_include=1))
      # Enable SQTT
      self.sqtt_config(tracing=True)
    # Restore global broadcasting
    self.grbm_gfx_index(se_broadcast_writes=1, sa_broadcast_writes=1, instance_broadcast_writes=1)
    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_THREAD_TRACE_ENABLE), 1)
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_THREAD_TRACE_ENABLE), 1)
    self.memory_barrier()
    return self
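# Hypothetical follow-on to the NOTE above (not part of this commit): to pin a small kernel's
# waves onto a known CU for instruction tracing, the 0xFFFFFFFF masks written to
# regCOMPUTE_STATIC_THREAD_MGMT_SE<x> in exec() could be narrowed, e.g. only CU0 of SE0:
#   q.pkt3(q.pm4.PACKET3_SET_SH_REG, q.gfxreg(q.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE0), 0x1, 0x0)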

@@ -139,24 +137,24 @@ class AMDComputeQueue(HWQueue):
  def stop_trace(self, ses: int, wptrs: HCQBuffer):
    self.memory_barrier()
    # Start shutting everything down
    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_THREAD_TRACE_ENABLE), 0)
    self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.THREAD_TRACE_FINISH) | amd_gpu.EVENT_INDEX(0))
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_THREAD_TRACE_ENABLE), 0)
    self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.THREAD_TRACE_FINISH) | self.pm4.EVENT_INDEX(0))
    # For each SE wait for finish to complete and copy regSQ_THREAD_TRACE_WPTR to know where in the buffer trace data ends
    for se in range(ses):
      self.grbm_gfx_index(se_index=se, instance_broadcast_writes=1) # select se, broadcast to all instances in that se
      # Wait for FINISH_PENDING==0
      self.pkt3(amd_gpu.PACKET3_WAIT_REG_MEM, amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ),
                ucfgreg(amd_gpu.regSQ_THREAD_TRACE_STATUS, False), 0, 0, gc_11_0_0.SQ_THREAD_TRACE_STATUS__FINISH_PENDING_MASK, 4)
      self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ),
                self.gc.regSQ_THREAD_TRACE_STATUS.addr, 0, 0, self.gc.SQ_THREAD_TRACE_STATUS__FINISH_PENDING_MASK, 4)
      # Wait for FINISH_DONE!=0
      self.pkt3(amd_gpu.PACKET3_WAIT_REG_MEM, amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_NEQ),
                ucfgreg(amd_gpu.regSQ_THREAD_TRACE_STATUS, False), 0, 0, gc_11_0_0.SQ_THREAD_TRACE_STATUS__FINISH_DONE_MASK, 4)
      self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_NEQ),
                self.gc.regSQ_THREAD_TRACE_STATUS.addr, 0, 0, self.gc.SQ_THREAD_TRACE_STATUS__FINISH_DONE_MASK, 4)
      # Disable SQTT
      self.sqtt_config(tracing=False)
      # Wait for BUSY==0
      self.pkt3(amd_gpu.PACKET3_WAIT_REG_MEM, amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ),
                ucfgreg(amd_gpu.regSQ_THREAD_TRACE_STATUS, False), 0, 0, gc_11_0_0.SQ_THREAD_TRACE_STATUS__BUSY_MASK, 4)
      # Copy WPTR to memory (src_sel = perf, dst_sel = tc_l2, wr_confirm = True), ucfgreg with False adds GC_BASE__INST0_SEG1 but not pkt3 reg offset
      self.pkt3(amd_gpu.PACKET3_COPY_DATA, 1 << 20 | 2 << 8 | 4, ucfgreg(amd_gpu.regSQ_THREAD_TRACE_WPTR, False), 0, *data64_le(wptrs.va_addr+(se*4)))
      self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ),
                self.gc.regSQ_THREAD_TRACE_STATUS.addr, 0, 0, self.gc.SQ_THREAD_TRACE_STATUS__BUSY_MASK, 4)
      # Copy WPTR to memory (src_sel = perf, dst_sel = tc_l2, wr_confirm = True)
      self.pkt3(self.pm4.PACKET3_COPY_DATA, 1 << 20 | 2 << 8 | 4, self.gc.regSQ_THREAD_TRACE_WPTR.addr, 0, *data64_le(wptrs.va_addr+(se*4)))
    # Restore global broadcasting
    self.grbm_gfx_index(se_broadcast_writes=1, sa_broadcast_writes=1, instance_broadcast_writes=1)
    self.spi_config(tracing=False)
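# --- hedged decomposition of the COPY_DATA control word above, taken from its comment
# (src_sel = perf, dst_sel = tc_l2, wr_confirm = True); field positions are assumed from
# the constant itself, not re-derived from the headers ---
COPY_DATA_SRC_SEL_PERF, COPY_DATA_DST_SEL_TC_L2, COPY_DATA_WR_CONFIRM = 4, 2 << 8, 1 << 20  # assumed encodings
assert (COPY_DATA_WR_CONFIRM | COPY_DATA_DST_SEL_TC_L2 | COPY_DATA_SRC_SEL_PERF) == (1 << 20 | 2 << 8 | 4)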

@@ -198,25 +196,26 @@ class AMDComputeQueue(HWQueue):
      ), *global_size)
      prg.dev.cmd_id += 1

    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8))
    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2)
    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC3), 0)
    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_TMPRING_SIZE), prg.dev.tmpring_size)
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8))
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2)
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_PGM_RSRC3), 0)
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_TMPRING_SIZE), prg.dev.tmpring_size)
    if prg.dev.has_scratch_base_registers:
      self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO), *data64_le(prg.dev.scratch.va_addr >> 8))
    if prg.dev.target < 110000: self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.mmCP_COHER_START_DELAY), 0x20)
    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_RESTART_X), 0, 0, 0)
    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE0), 0xFFFFFFFF, 0xFFFFFFFF)
    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE2), 0xFFFFFFFF, 0xFFFFFFFF)
    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4), 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0), *user_regs)
      self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO), *data64_le(prg.dev.scratch.va_addr >> 8))
    if prg.dev.target < 110000: self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.mmCP_COHER_START_DELAY), 0x20)
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_RESTART_X), 0, 0, 0)
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE0), 0xFFFFFFFF, 0xFFFFFFFF)
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE2), 0xFFFFFFFF, 0xFFFFFFFF)
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE4), 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_USER_DATA_0), *user_regs)

    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0)
    self.pkt3(amd_gpu.PACKET3_SET_SH_REG, gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0)
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0)
    self.pkt3(self.pm4.PACKET3_SET_SH_REG, self.gfxreg(self.gc.regCOMPUTE_RESOURCE_LIMITS), 0)

    self.pkt3(amd_gpu.PACKET3_DISPATCH_DIRECT, *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN)
    if prg.dev.sqtt_enabled: self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.THREAD_TRACE_MARKER) | amd_gpu.EVENT_INDEX(0))
    self.pkt3(amd_gpu.PACKET3_EVENT_WRITE, amd_gpu.EVENT_TYPE(amd_gpu.CS_PARTIAL_FLUSH) | amd_gpu.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH))
    DISPATCH_INITIATOR = self.gc.regCOMPUTE_DISPATCH_INITIATOR.encode(cs_w32_en=1, force_start_at_000=1, compute_shader_en=1)
    self.pkt3(self.pm4.PACKET3_DISPATCH_DIRECT, *global_size, DISPATCH_INITIATOR)
    if prg.dev.sqtt_enabled: self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.THREAD_TRACE_MARKER) | self.pm4.EVENT_INDEX(0))
    self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.CS_PARTIAL_FLUSH) | self.pm4.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH))
    return self

  def wait(self, signal:AMDSignal, value:sint=0):
@@ -224,17 +223,17 @@ class AMDComputeQueue(HWQueue):
    return self

  def timestamp(self, signal:AMDSignal):
    self.release_mem(signal.timestamp_addr, 0, amd_gpu.data_sel__mec_release_mem__send_gpu_clock_counter, amd_gpu.int_sel__mec_release_mem__none)
    self.release_mem(signal.timestamp_addr, 0, self.pm4.data_sel__mec_release_mem__send_gpu_clock_counter, self.pm4.int_sel__mec_release_mem__none)
    return self

  def signal(self, signal:AMDSignal, value:sint=0):
    # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
    self.release_mem(signal.value_addr, value, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
                     amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True)
    self.release_mem(signal.value_addr, value, self.pm4.data_sel__mec_release_mem__send_32_bit_low,
                     self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True)

    if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
      self.release_mem(dev.queue_event_mailbox_ptr, dev.queue_event.event_id, amd_gpu.data_sel__mec_release_mem__send_32_bit_low,
                       amd_gpu.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=dev.queue_event.event_id)
      self.release_mem(dev.queue_event_mailbox_ptr, dev.queue_event.event_id, self.pm4.data_sel__mec_release_mem__send_32_bit_low,
                       self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=dev.queue_event.event_id)
    return self

  def bind(self, dev:AMDDevice):
@@ -243,8 +242,8 @@ class AMDComputeQueue(HWQueue):
    hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
    for i, value in enumerate(self._q): hw_view[i] = value

    self.indirect_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), *data64_le(self.hw_page.va_addr),
                         len(self._q) | amd_gpu.INDIRECT_BUFFER_VALID]
    self.indirect_cmd = [self.pm4.PACKET3(self.pm4.PACKET3_INDIRECT_BUFFER, 2), *data64_le(self.hw_page.va_addr),
                         len(self._q) | self.pm4.INDIRECT_BUFFER_VALID]
    self._q = hw_view
    return self

@@ -257,8 +256,8 @@ class AMDComputeQueue(HWQueue):
    dev.compute_queue.signal_doorbell(dev)

class AMDCopyQueue(HWQueue):
  def __init__(self, max_copy_size=0x40000000):
    self.internal_cmd_sizes, self.max_copy_size = [], max_copy_size
  def __init__(self, dev, max_copy_size=0x40000000):
    self.sdma, self.internal_cmd_sizes, self.max_copy_size = dev.sdma, [], max_copy_size
    super().__init__()

  def q(self, *arr):
@@ -271,30 +270,30 @@ class AMDCopyQueue(HWQueue):
    for _ in range(copy_commands):
      step_copy_size = min(copy_size - copied, self.max_copy_size)

      self.q(amd_gpu.SDMA_OP_COPY | amd_gpu.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_COPY_LINEAR),
             amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied))
      self.q(self.sdma.SDMA_OP_COPY | self.sdma.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(self.sdma.SDMA_SUBOP_COPY_LINEAR),
             self.sdma.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied))

      copied += step_copy_size
    return self
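# --- worked example of the chunking above; the ceil-division is an assumption since the
# copy_commands computation itself is outside this hunk ---
copy_size, max_copy_size = 0xE0000000, 0x40000000              # 3.5 GiB payload, 1 GiB per SDMA command
copy_commands = (copy_size + max_copy_size - 1) // max_copy_size
assert copy_commands == 4                                      # three full 1 GiB copies plus a 0.5 GiB tail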

  def signal(self, signal:AMDSignal, value:sint=0):
    self.q(amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.value_addr), value)
    self.q(self.sdma.SDMA_OP_FENCE | self.sdma.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.value_addr), value)

    if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
      self.q(amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id)
      self.q(amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(dev.queue_event.event_id))
    elif AMDDevice.driverless: self.q(amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(0))
      self.q(self.sdma.SDMA_OP_FENCE | self.sdma.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id)
      self.q(self.sdma.SDMA_OP_TRAP, self.sdma.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(dev.queue_event.event_id))
    elif AMDDevice.driverless: self.q(self.sdma.SDMA_OP_TRAP, self.sdma.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(0))

    return self

  def wait(self, signal:AMDSignal, value:sint=0):
    self.q(amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
           amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal.value_addr), value, 0xffffffff,
           amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff))
    self.q(self.sdma.SDMA_OP_POLL_REGMEM | self.sdma.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
           self.sdma.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal.value_addr), value, 0xffffffff,
           self.sdma.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | self.sdma.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff))
    return self

  def timestamp(self, signal:AMDSignal):
    self.q(amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
    self.q(self.sdma.SDMA_OP_TIMESTAMP | self.sdma.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(self.sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
           *data64_le(signal.timestamp_addr))
    return self

@@ -306,7 +305,8 @@ class AMDCopyQueue(HWQueue):
    hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
    for i in range(qsz): hw_view[i] = self._q[i] if i < len(self._q) else 0

    self.indirect_cmd = [amd_gpu.SDMA_OP_INDIRECT | amd_gpu.SDMA_PKT_INDIRECT_HEADER_VMID(0), *data64_le(self.hw_page.va_addr), qsz, *data64_le(0)]
    self.indirect_cmd = [self.sdma.SDMA_OP_INDIRECT | self.sdma.SDMA_PKT_INDIRECT_HEADER_VMID(0), *data64_le(self.hw_page.va_addr), qsz,
                         *data64_le(0)]
    self._q, self.cmd_sizes = hw_view, [len(self.indirect_cmd)]

  def _submit(self, dev:AMDDevice):
@@ -415,6 +415,25 @@ class AMDQueueDesc:
    if dev.driverless and getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1): dev.dev_iface.adev.gmc.flush_hdp()
    self.doorbell[0] = self.put_value

@dataclass(frozen=True)
class AMDReg(AMDRegBase):
  ip: AMDIP
  @property
  def addr(self): return self.ip.bases[self.segment] + self.offset

@dataclass(frozen=True)
class AMDIP:
  name: str
  version: tuple[int, ...]
  bases: tuple[int, ...]
  @functools.cached_property
  def module(self): return import_module(self.name, self.version)
  @functools.cached_property
  def regs(self): return collect_registers(self.module, cls=functools.partial(AMDReg, ip=self))
  def __getattr__(self, name:str):
    if name in self.regs: return self.regs[name]
    return getattr(self.module, name)
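# --- standalone toy version of the two dataclasses above (module loading and
# collect_registers stubbed out; all values hypothetical) ---
from dataclasses import dataclass

@dataclass(frozen=True)
class ToyReg:
  offset:int
  segment:int
  bases:tuple[int, ...]
  @property
  def addr(self): return self.bases[self.segment] + self.offset

assert ToyReg(offset=0x1234, segment=1, bases=(0x0, 0xA000)).addr == 0xB234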

class KFDIface:
  kfd:HWInterface|None = None
  event_page:HCQBuffer|None = None
@@ -441,6 +460,12 @@ class KFDIface:

    self.gpu_id = int(HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/gpu_id").read())
    self.props = {l.split()[0]: int(l.split()[1]) for l in HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/properties").read().splitlines()}
    ip_base = f"/sys/class/drm/renderD{self.props['drm_render_minor']}/device/ip_discovery/die/0"
    id2ip = {am.GC_HWID: am.GC_HWIP, am.SDMA0_HWID: am.SDMA0_HWIP, am.NBIF_HWID: am.NBIF_HWIP}
    self.ip_versions = {id2ip[int(hwid)]:tuple(int(HWInterface(f'{ip_base}/{hwid}/0/{part}').read()) for part in ['major', 'minor', 'revision'])
                        for hwid in HWInterface(ip_base).listdir() if hwid.isnumeric() and int(hwid) in id2ip}
    self.ip_offsets = {id2ip[int(hwid)]:tuple(int(x, 16) for x in HWInterface(f'{ip_base}/{hwid}/0/base_addr').read().splitlines())
                       for hwid in HWInterface(ip_base).listdir() if hwid.isnumeric() and int(hwid) in id2ip}
    self.drm_fd = HWInterface(f"/dev/dri/renderD{self.props['drm_render_minor']}", os.O_RDWR)
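# Hypothetical shape of the ip_discovery tree parsed above (hwids and values made up):
#   .../ip_discovery/die/0/<GC hwid>/0/{major,minor,revision} -> 11, 0, 0
#   .../ip_discovery/die/0/<GC hwid>/0/base_addr              -> one hex base per segment
# so the dicts end up looking like ip_versions = {am.GC_HWIP: (11, 0, 0), ...} and
# ip_offsets = {am.GC_HWIP: (<base0>, <base1>, ...), ...}.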

    kfd.AMDKFD_IOC_ACQUIRE_VM(KFDIface.kfd, drm_fd=self.drm_fd.fd, gpu_id=self.gpu_id)
@@ -609,6 +634,8 @@ class PCIIface:
    self.bar_info = {j:(int(start,16), int(end,16), int(flgs,16)) for j,(start,end,flgs) in enumerate(l.split() for l in bar_info)}

    self.adev = AMDev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2).cast('Q'), self._map_pci_range(5).cast('I'))
    self.ip_versions = self.adev.ip_ver
    self.ip_offsets = {hwip: tuple(instances[0]) for hwip,instances in self.adev.regs_offset.items()}
    self.doorbell_cpu_addr = mv_address(dbell)

    pci_cmd = int.from_bytes(self.cfg_fd.read(2, binary=True, offset=pci.PCI_COMMAND), byteorder='little') | pci.PCI_COMMAND_MASTER
@@ -702,6 +729,12 @@ class AMDDevice(HCQCompiled):
    if self.target//10000 == 10: ctl_stack_size = min(ctl_stack_size, 0x7000)
    debug_memory_size = round_up((self.max_cu_id + 1) * (self.max_wave_id + 1) * 32, 64)

    self.soc = importlib.import_module(f"tinygrad.runtime.autogen.am.{({10: 'navi10', 11: 'soc21', 12: 'soc24'}[self.target//10000])}")
    self.pm4 = importlib.import_module("tinygrad.runtime.autogen.am.pm4_nv")
    self.sdma = import_module('sdma', self.dev_iface.ip_versions[am.SDMA0_HWIP])
    self.gc = AMDIP('gc', self.dev_iface.ip_versions[am.GC_HWIP], self.dev_iface.ip_offsets[am.GC_HWIP])
    self.nbio = AMDIP('nbio' if self.target < 120000 else 'nbif', self.dev_iface.ip_versions[am.NBIF_HWIP], self.dev_iface.ip_offsets[am.NBIF_HWIP])
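# --- sketch of the version-keyed module selection above; the import_module('sdma', ...)
# helper's exact lookup and fallback rules are not shown in this hunk, so this is an assumption ---
def pick_sdma_module_name(version:tuple[int, ...]) -> str:
  return f"tinygrad.runtime.autogen.am.sdma_{version[0]}_{version[1]}_{version[2]}"
assert pick_sdma_module_name((6, 0, 0)).endswith("sdma_6_0_0")  # module generated alongside the other am/ autogens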

    self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x800000, ctx_save_restore_size=wg_data_size + ctl_stack_size,
                                           eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)

@@ -709,7 +742,7 @@ class AMDDevice(HCQCompiled):

    super().__init__(device, AMDAllocator(self), AMDLLVMRenderer() if getenv("AMD_LLVM", 0) else AMDRenderer(self.arch),
                     AMDLLVMCompiler(self.arch) if getenv("AMD_LLVM", 0) else HIPCompiler(self.arch), functools.partial(AMDProgram, self),
                     AMDSignal, AMDComputeQueue, AMDCopyQueue)
                     AMDSignal, functools.partial(AMDComputeQueue, self), functools.partial(AMDCopyQueue, self))
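# --- why the queue types become Callable[[], HWQueue] further down in this diff: a
# functools.partial bound to the device is not a class, but it is still a zero-argument
# queue factory (toy example, names hypothetical) ---
import functools
class ToyQueue:
  def __init__(self, dev): self.dev = dev
make_queue = functools.partial(ToyQueue, "AMD:0")
assert make_queue().dev == "AMD:0"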

    # Scratch setup
    self.max_private_segment_size = 0
@@ -728,7 +761,7 @@ class AMDDevice(HCQCompiled):
      self.sqtt_buffers = [self.allocator.alloc(SQTT_BUFFER_SIZE*1024*1024, BufferSpec(cpu_access=True, nolru=True)) for _ in range(SQTT_NUM)]
      self.sqtt_itrace_se_mask = getenv("SQTT_ITRACE_SE_MASK", 2) # -1 enable all, 0 disable all, >0 bitmask for where to enable instruction tracing
      self.cmd_id = 0
      AMDComputeQueue().start_trace(self.sqtt_buffers, self.sqtt_itrace_se_mask).submit(self)
      AMDComputeQueue(self).start_trace(self.sqtt_buffers, self.sqtt_itrace_se_mask).submit(self)

  def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0):
    ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True)
@@ -753,7 +786,7 @@ class AMDDevice(HCQCompiled):
    self.max_private_segment_size = required

  def invalidate_caches(self):
    AMDComputeQueue().memory_barrier().signal(self.timeline_signal, self.next_timeline()).submit(self)
    AMDComputeQueue(self).memory_barrier().signal(self.timeline_signal, self.next_timeline()).submit(self)
    self.synchronize()

  def on_device_hang(self): self.dev_iface.on_device_hang()
@@ -762,7 +795,7 @@ class AMDDevice(HCQCompiled):
    if self.sqtt_enabled:
      wptrs_buf = self.allocator.alloc(round_up(len(self.sqtt_buffers), 0x1000), BufferSpec(cpu_access=True, nolru=True))
      wptrs = to_mv(wptrs_buf.va_addr, wptrs_buf.size)
      AMDComputeQueue().stop_trace(len(self.sqtt_buffers), wptrs_buf).signal(self.timeline_signal, self.next_timeline()).submit(self)
      AMDComputeQueue(self).stop_trace(len(self.sqtt_buffers), wptrs_buf).signal(self.timeline_signal, self.next_timeline()).submit(self)
      self.synchronize()
      if DEBUG>=2: print('Saving SQTT in profile...')
      for i,buf0 in enumerate(self.sqtt_buffers):

@@ -1,5 +1,5 @@
from __future__ import annotations
from typing import cast, Type, TypeVar, Generic, Any, ClassVar
from typing import cast, Callable, Type, TypeVar, Generic, Any, ClassVar
import contextlib, decimal, statistics, time, ctypes, array, os, fcntl
from tinygrad.helpers import PROFILE, from_mv, getenv, to_mv, round_up
from tinygrad.renderer import Renderer
@@ -255,7 +255,7 @@ class HCQSignal(Generic[DeviceType]):
    if self.value < value: raise RuntimeError(f"Wait timeout: {timeout} ms! (the signal is not set to {value}, but {self.value})")

@contextlib.contextmanager
def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Type[HWQueue]|None=None, queue:HWQueue|None=None):
def hcq_profile(dev:HCQCompiled, enabled, desc, queue_type:Callable[[], HWQueue]|None=None, queue:HWQueue|None=None):
  st, en = (dev.signal_t(), dev.signal_t()) if enabled else (None, None)

  if enabled and queue is not None: queue.timestamp(st)
@@ -341,7 +341,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
  signal_pool: ClassVar[list[int]] = []

  def __init__(self, device:str, allocator:HCQAllocatorBase, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[SignalType],
               comp_queue_t:Type[HWQueue], copy_queue_t:Type[HWQueue]|None):
               comp_queue_t:Callable[[], HWQueue], copy_queue_t:Callable[[], HWQueue]|None):
    self.device_id:int = int(device.split(":")[1]) if ":" in device else 0

    from tinygrad.runtime.graph.hcq import HCQGraph
@@ -384,7 +384,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
    return cls.signal_pool.pop()

  def _at_profile_finalize(self):
    def _sync(d:HCQCompiled, q_t:Type[HWQueue]):
    def _sync(d:HCQCompiled, q_t:Callable[[], HWQueue]):
      q_t().timestamp(d.timeline_signal).signal(d.timeline_signal, d.next_timeline()).submit(d)
      st = time.perf_counter_ns()
      d.timeline_signal.wait(d.timeline_value - 1) # average of the two