rename to extra.assembly.amd (#13879)

This commit is contained in:
George Hotz
2025-12-29 14:10:55 -05:00
committed by GitHub
parent 37f0fa11b6
commit 81cf9ea0ab
40 changed files with 5167 additions and 795 deletions

View File

@@ -655,7 +655,7 @@ jobs:
uses: ./.github/actions/process-replay
testrdna3:
name: RDNA3 IDE
name: AMD ASM IDE
runs-on: ubuntu-24.04
timeout-minutes: 10
steps:
@@ -674,19 +674,27 @@ jobs:
sudo apt-get update
sudo apt-get install llvm-21 llvm-21-tools cloc
- name: RDNA3 Line Count
run: cloc --by-file extra/assembly/rdna3/*.py
run: cloc --by-file extra/assembly/amd/*.py
- name: Run RDNA3 emulator tests
run: python -m pytest -n=auto extra/assembly/rdna3/ --durations 20
run: python -m pytest -n=auto extra/assembly/amd/ --durations 20
- name: Install pdfplumber
run: pip install pdfplumber
- name: Verify RDNA3 autogen is up to date
run: |
python -m extra.assembly.rdna3.lib
git diff --exit-code extra/assembly/rdna3/autogen/__init__.py
python -m extra.assembly.amd.lib --arch rdna3
git diff --exit-code extra/assembly/amd/autogen/rdna3/__init__.py
- name: Verify CDNA4 autogen is up to date
run: |
python -m extra.assembly.amd.lib --arch cdna4
git diff --exit-code extra/assembly/amd/autogen/cdna4/__init__.py
- name: Verify RDNA3 pcode autogen is up to date
run: |
python -m extra.assembly.rdna3.pcode
git diff --exit-code extra/assembly/rdna3/autogen/gen_pcode.py
python -m extra.assembly.amd.pcode --arch rdna3
git diff --exit-code extra/assembly/amd/autogen/rdna3/gen_pcode.py
- name: Verify CDNA4 pcode autogen is up to date
run: |
python -m extra.assembly.amd.pcode --arch cdna4
git diff --exit-code extra/assembly/amd/autogen/cdna4/gen_pcode.py
testnvidia:
strategy:

View File

@@ -76,10 +76,12 @@ VIZ=1 python -c "from tinygrad import Tensor; Tensor.ones(10).sum().realize()"
## Auto-generated Files (DO NOT EDIT)
The following files are auto-generated and should never be edited manually:
- `extra/assembly/rdna3/autogen/gen_pcode.py` - Generated by `python -m extra.assembly.rdna3.pcode`
- `extra/assembly/rdna3/autogen/__init__.py` - Generated from AMD ISA definitions
- `extra/assembly/amd/autogen/rdna3/__init__.py` - Generated by `python -m extra.assembly.amd.lib --arch rdna3`
- `extra/assembly/amd/autogen/rdna3/gen_pcode.py` - Generated by `python -m extra.assembly.amd.pcode --arch rdna3`
- `extra/assembly/amd/autogen/cdna4/__init__.py` - Generated by `python -m extra.assembly.amd.lib --arch cdna4`
- `extra/assembly/amd/autogen/cdna4/gen_pcode.py` - Generated by `python -m extra.assembly.amd.pcode --arch cdna4`
To add missing instruction implementations, add them to `extra/assembly/rdna3/emu.py` instead.
To add missing instruction implementations, add them to `extra/assembly/amd/emu.py` instead.
## Style Notes

View File

@@ -1,7 +1,7 @@
# RDNA3 assembler and disassembler
from __future__ import annotations
import re
from extra.assembly.rdna3.lib import Inst, RawImm, Reg, SGPR, VGPR, TTMP, s, v, ttmp, _RegFactory, FLOAT_ENC, SRC_FIELDS, unwrap
from extra.assembly.amd.lib import Inst, RawImm, Reg, SGPR, VGPR, TTMP, s, v, ttmp, _RegFactory, FLOAT_ENC, SRC_FIELDS, unwrap
# Decoding helpers
SPECIAL_GPRS = {106: "vcc_lo", 107: "vcc_hi", 124: "null", 125: "m0", 126: "exec_lo", 127: "exec_hi", 253: "scc"}
@@ -91,7 +91,7 @@ def disasm(inst: Inst) -> str:
# VOP3 and VOP3SD share encoding - check opcode to determine which
is_vop3sd = cls_name == 'VOP3' and op_val in VOP3SD_OPCODES
try:
from extra.assembly.rdna3 import autogen
from extra.assembly.amd.autogen import rdna3 as autogen
if is_vop3sd:
op_name = autogen.VOP3SDOp(op_val).name.lower()
else:
@@ -347,7 +347,7 @@ def disasm(inst: Inst) -> str:
# VOPD: dual-issue instructions
if cls_name == 'VOPD':
from extra.assembly.rdna3 import autogen
from extra.assembly.amd.autogen import rdna3 as autogen
opx, opy, vdstx, vdsty_enc = [unwrap(inst._values.get(f, 0)) for f in ('opx', 'opy', 'vdstx', 'vdsty')]
srcx0, vsrcx1, srcy0, vsrcy1 = [unwrap(inst._values.get(f, 0)) for f in ('srcx0', 'vsrcx1', 'srcy0', 'vsrcy1')]
vdsty = (vdsty_enc << 1) | ((vdstx & 1) ^ 1) # Decode vdsty
@@ -505,7 +505,7 @@ SOPK_IMM_FIRST = {'s_setreg_b32'}
SOPK_UNSUPPORTED = {'s_setreg_imm32_b32'}
def asm(text: str) -> Inst:
from extra.assembly.rdna3 import autogen
from extra.assembly.amd.autogen import rdna3 as autogen
text = text.strip()
clamp = 'clamp' in text.lower()
if clamp: text = re.sub(r'\s+clamp\s*$', '', text, flags=re.I)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,7 +1,7 @@
# autogenerated from AMD RDNA3.5 ISA PDF by lib.py - do not edit
from enum import IntEnum
from typing import Annotated
from extra.assembly.rdna3.lib import bits, BitField, Inst32, Inst64, SGPR, VGPR, TTMP as TTMP, s as s, v as v, ttmp as ttmp, SSrc, Src, SImm, Imm, VDSTYEnc, SGPRField, VGPRField
from extra.assembly.amd.lib import bits, BitField, Inst32, Inst64, SGPR, VGPR, TTMP as TTMP, s as s, v as v, ttmp as ttmp, SSrc, Src, SImm, Imm, VDSTYEnc, SGPRField, VGPRField
import functools
class SrcEnum(IntEnum):

View File

@@ -1,9 +1,9 @@
# autogenerated by pcode.py - do not edit
# to regenerate: python -m extra.assembly.rdna3.pcode
# to regenerate: python -m extra.assembly.amd.pcode --arch rdna3
# ruff: noqa: E501,F405,F403
# mypy: ignore-errors
from extra.assembly.rdna3.autogen import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp
from extra.assembly.rdna3.pcode import *
from extra.assembly.amd.autogen.rdna3 import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp
from extra.assembly.amd.pcode import *
def _SOP1Op_S_MOV_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
# D0.b32 = S0.b32

View File

@@ -2,10 +2,10 @@
# mypy: ignore-errors
from __future__ import annotations
import ctypes, os
from extra.assembly.rdna3.lib import Inst, RawImm
from extra.assembly.rdna3.pcode import _f32, _i32, _sext, _f16, _i16, _f64, _i64
from extra.assembly.rdna3.autogen.gen_pcode import get_compiled_functions
from extra.assembly.rdna3.autogen import (
from extra.assembly.amd.lib import Inst, RawImm
from extra.assembly.amd.pcode import _f32, _i32, _sext, _f16, _i16, _f64, _i64
from extra.assembly.amd.autogen.rdna3.gen_pcode import get_compiled_functions
from extra.assembly.amd.autogen.rdna3 import (
SOP1, SOP2, SOPC, SOPK, SOPP, SMEM, VOP1, VOP2, VOP3, VOP3SD, VOP3P, VOPC, DS, FLAT, VOPD, SrcEnum,
SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, SMEMOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp, DSOp, FLATOp, GLOBALOp, VOPDOp
)

View File

@@ -217,7 +217,7 @@ class Inst:
# op may be an enum (from __init__) or an int (from from_int)
op_name = op.name if hasattr(op, 'name') else None
if op_name is None and self.__class__.__name__ == 'VOP3':
from extra.assembly.rdna3.autogen import VOP3Op
from extra.assembly.amd.autogen.rdna3 import VOP3Op
try: op_name = VOP3Op(op).name
except ValueError: pass
if op_name is None: return False
@@ -277,7 +277,7 @@ class Inst:
def __hash__(self): return hash((self.__class__.__name__, tuple(sorted((k, repr(v)) for k, v in self._values.items())), self._literal))
def disasm(self) -> str:
from extra.assembly.rdna3.asm import disasm
from extra.assembly.amd.asm import disasm
return disasm(self)
class Inst32(Inst): pass
@@ -459,7 +459,7 @@ def generate(output_path: str | None = None, arch: str = "rdna3") -> dict:
def field_key(f): return order.index(f[0].lower()) if f[0].lower() in order else 1000
lines = [f"# autogenerated from AMD {doc_name} ISA PDF by lib.py - do not edit", "from enum import IntEnum",
"from typing import Annotated",
"from extra.assembly.rdna3.lib import bits, BitField, Inst32, Inst64, SGPR, VGPR, TTMP as TTMP, s as s, v as v, ttmp as ttmp, SSrc, Src, SImm, Imm, VDSTYEnc, SGPRField, VGPRField",
"from extra.assembly.amd.lib import bits, BitField, Inst32, Inst64, SGPR, VGPR, TTMP as TTMP, s as s, v as v, ttmp as ttmp, SSrc, Src, SImm, Imm, VDSTYEnc, SGPRField, VGPRField",
"import functools", ""]
lines += enum_lines("SrcEnum", src_enum) + sum([enum_lines(n, ops) for n, ops in sorted(enums.items())], [])
# Format-specific field defaults (verified against LLVM test vectors)
@@ -521,5 +521,5 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Generate instruction definitions from AMD ISA PDF")
parser.add_argument("--arch", choices=list(PDF_URLS.keys()), default="rdna3", help="Target architecture (default: rdna3)")
args = parser.parse_args()
result = generate("extra/assembly/rdna3/autogen/__init__.py", arch=args.arch)
result = generate(f"extra/assembly/amd/autogen/{args.arch}/__init__.py", arch=args.arch)
print(f"generated SrcEnum ({len(result['src_enum'])}) + {len(result['enums'])} opcode enums + {len(result['formats'])} format classes")

View File

@@ -702,7 +702,7 @@ class ExecContext:
# PDF EXTRACTION AND CODE GENERATION
# ═══════════════════════════════════════════════════════════════════════════════
from extra.assembly.rdna3.lib import PDF_URLS
from extra.assembly.amd.lib import PDF_URLS
INST_PATTERN = re.compile(r'^([SV]_[A-Z0-9_]+)\s+(\d+)\s*$', re.M)
# Patterns that can't be handled by the DSL (require special handling in emu.py)
@@ -740,7 +740,7 @@ def parse_pseudocode_from_pdf(arch: str = "rdna3") -> dict:
"""Parse pseudocode from PDF for all ops. Returns {enum_cls: {op: pseudocode}}."""
import pdfplumber
from tinygrad.helpers import fetch
from extra.assembly.rdna3.autogen import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp
from extra.assembly.amd.autogen.rdna3 import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp
OP_ENUMS = [SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp]
defined_ops = {}
@@ -783,10 +783,10 @@ def parse_pseudocode_from_pdf(arch: str = "rdna3") -> dict:
return instructions
def generate_gen_pcode(output_path: str = "extra/assembly/rdna3/autogen/gen_pcode.py", arch: str = "rdna3"):
def generate_gen_pcode(output_path: str = "extra/assembly/amd/autogen/rdna3/gen_pcode.py", arch: str = "rdna3"):
"""Generate gen_pcode.py - compiled pseudocode functions for the emulator."""
from pathlib import Path
from extra.assembly.rdna3.autogen import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp
from extra.assembly.amd.autogen.rdna3 import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp
OP_ENUMS = [SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp]
@@ -803,12 +803,12 @@ def generate_gen_pcode(output_path: str = "extra/assembly/rdna3/autogen/gen_pcod
print(f"Total: {total_found}/{total_ops} ({100*total_found//total_ops}%)")
print("\nCompiling to pseudocode functions...")
lines = ['''# autogenerated by pcode.py - do not edit
# to regenerate: python -m extra.assembly.rdna3.pcode
lines = [f'''# autogenerated by pcode.py - do not edit
# to regenerate: python -m extra.assembly.amd.pcode --arch {arch}
# ruff: noqa: E501,F405,F403
# mypy: ignore-errors
from extra.assembly.rdna3.autogen import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp
from extra.assembly.rdna3.pcode import *
from extra.assembly.amd.autogen.{arch} import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp
from extra.assembly.amd.pcode import *
''']
compiled_count, skipped_count = 0, 0
@@ -989,4 +989,4 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Generate pseudocode functions from AMD ISA PDF")
parser.add_argument("--arch", choices=list(PDF_URLS.keys()), default="rdna3", help="Target architecture (default: rdna3)")
args = parser.parse_args()
generate_gen_pcode(arch=args.arch)
generate_gen_pcode(output_path=f"extra/assembly/amd/autogen/{args.arch}/gen_pcode.py", arch=args.arch)

View File

@@ -7,7 +7,7 @@ from typing import Callable
# Set AMD=1 before importing tinygrad
os.environ["AMD"] = "1"
from extra.assembly.rdna3.emu import run_asm as python_run_asm, set_valid_mem_ranges, decode_program, step_wave, WaveState, WAVE_SIZE
from extra.assembly.amd.emu import run_asm as python_run_asm, set_valid_mem_ranges, decode_program, step_wave, WaveState, WAVE_SIZE
REMU_PATH = Path(__file__).parents[3] / "remu/target/release/libremu.so"
if not REMU_PATH.exists():

View File

@@ -3,8 +3,8 @@
# Currently many of these tests fail - they document desired behavior
import unittest
from extra.assembly.rdna3.autogen import *
from extra.assembly.rdna3.lib import Inst, RawImm, SGPR, VGPR
from extra.assembly.amd.autogen.rdna3 import *
from extra.assembly.amd.lib import Inst, RawImm, SGPR, VGPR
class TestRegisterSliceSyntax(unittest.TestCase):
"""

View File

@@ -9,8 +9,8 @@ os.environ["AMD"] = "1"
os.environ["MOCKGPU"] = "1"
os.environ["PYTHON_REMU"] = "1"
from extra.assembly.rdna3.emu import WaveState, decode_program, step_wave, WAVE_SIZE
from extra.assembly.rdna3.test.helpers import KernelInfo
from extra.assembly.amd.emu import WaveState, decode_program, step_wave, WAVE_SIZE, set_valid_mem_ranges
from extra.assembly.amd.test.helpers import KernelInfo
REMU_PATH = Path(__file__).parents[3] / "remu/target/release/libremu.so"
@@ -223,7 +223,6 @@ def run_single_kernel(kernel: bytes, n_lanes: int, args_ptr: int, global_size: t
def compare_emulators_multi_kernel(kernels: list[KernelInfo], buf_pool: dict[int, int], max_steps: int = 1000,
debug: bool = False, trace_len: int = 10, buf_data: dict[int, bytes] | None = None) -> tuple[bool, str]:
"""Run all kernels through both emulators with shared buffer pool."""
from extra.assembly.rdna3.emu import set_valid_mem_ranges, decode_program
if buf_data is None: buf_data = {}
# Allocate shared buffer pool with padding for over-reads (GPU loads up to 16 bytes at once)
@@ -267,8 +266,6 @@ def compare_emulators_multi_kernel(kernels: list[KernelInfo], buf_pool: dict[int
def compare_emulators_with_memory(kernel: bytes, n_lanes: int, buf_sizes: list, max_steps: int = 1000, debug: bool = False,
global_size: tuple[int, int, int] = (1, 1, 1), trace_len: int = 10) -> tuple[bool, str]:
"""Run both emulators with memory set up for tinygrad kernels, executing all workgroups. Legacy wrapper."""
from extra.assembly.rdna3.emu import set_valid_mem_ranges, decode_program
# Allocate buffers
buffers = []
for size in buf_sizes:

View File

@@ -6,10 +6,10 @@ Set USE_HW=1 to run on both emulator and real hardware, comparing results.
"""
import ctypes, unittest, os, struct
from extra.assembly.rdna3.autogen import *
from extra.assembly.rdna3.lib import RawImm
from extra.assembly.rdna3.emu import WaveState, run_asm, set_valid_mem_ranges
from extra.assembly.rdna3.pcode import _i32, _f32
from extra.assembly.amd.autogen.rdna3 import *
from extra.assembly.amd.lib import RawImm
from extra.assembly.amd.emu import WaveState, run_asm, set_valid_mem_ranges
from extra.assembly.amd.pcode import _i32, _f32
VCC = SrcEnum.VCC_LO # For VOP3SD sdst field
USE_HW = os.environ.get("USE_HW", "0") == "1"
@@ -1776,7 +1776,7 @@ class TestF16Conversions(unittest.TestCase):
def test_v_cvt_f16_f32_basic(self):
"""V_CVT_F16_F32 converts f32 to f16 in low 16 bits."""
from extra.assembly.rdna3.pcode import _f16
from extra.assembly.amd.pcode import _f16
instructions = [
v_mov_b32_e32(v[0], 1.0), # f32 1.0 = 0x3f800000
v_cvt_f16_f32_e32(v[1], v[0]),
@@ -1789,7 +1789,7 @@ class TestF16Conversions(unittest.TestCase):
def test_v_cvt_f16_f32_negative(self):
"""V_CVT_F16_F32 converts negative f32 to f16."""
from extra.assembly.rdna3.pcode import _f16
from extra.assembly.amd.pcode import _f16
instructions = [
v_mov_b32_e32(v[0], -2.0), # f32 -2.0 = 0xc0000000
v_cvt_f16_f32_e32(v[1], v[0]),
@@ -1802,7 +1802,7 @@ class TestF16Conversions(unittest.TestCase):
def test_v_cvt_f16_f32_small(self):
"""V_CVT_F16_F32 converts small f32 value."""
from extra.assembly.rdna3.pcode import _f16, f32_to_f16
from extra.assembly.amd.pcode import _f16, f32_to_f16
instructions = [
v_mov_b32_e32(v[0], 0.5),
v_cvt_f16_f32_e32(v[1], v[0]),
@@ -1862,7 +1862,7 @@ class TestF16Conversions(unittest.TestCase):
which would produce wrong results when the significant bits of the f32 value are
in the upper bits (as they are for most f32 values > 1.0 or < -1.0).
"""
from extra.assembly.rdna3.pcode import _f16
from extra.assembly.amd.pcode import _f16
# Use f32 value 1.5 = 0x3fc00000. If only low 16 bits (0x0000) are read, result is wrong.
# Correct f16 result: 0x3e00 (1.5 in half precision)
instructions = [
@@ -1886,7 +1886,7 @@ class TestF16Conversions(unittest.TestCase):
is in the name), causing it to read only low 16 bits of the f32 input.
This resulted in WMMA receiving zero inputs and producing zero outputs.
"""
from extra.assembly.rdna3.pcode import _f16
from extra.assembly.amd.pcode import _f16
# Simulate loading two f32 values and converting/packing for WMMA
# f32 1.5 = 0x3fc00000, f32 2.5 = 0x40200000
# After CVT: f16 1.5 = 0x3e00, f16 2.5 = 0x4100
@@ -1914,7 +1914,7 @@ class TestF16Conversions(unittest.TestCase):
def test_v_pack_b32_f16_basic(self):
"""V_PACK_B32_F16 packs two f16 values into one 32-bit register."""
from extra.assembly.rdna3.pcode import _f16
from extra.assembly.amd.pcode import _f16
instructions = [
# First convert two f32 values to f16
v_mov_b32_e32(v[0], 1.0), # Will become f16 0x3c00
@@ -1934,7 +1934,7 @@ class TestF16Conversions(unittest.TestCase):
def test_v_pack_b32_f16_both_positive(self):
"""V_PACK_B32_F16 packs two positive f16 values."""
from extra.assembly.rdna3.pcode import _f16
from extra.assembly.amd.pcode import _f16
instructions = [
v_mov_b32_e32(v[0], 0.5), # f16 0x3800
v_mov_b32_e32(v[2], 2.0), # f16 0x4000
@@ -2186,7 +2186,7 @@ class TestVOP3P(unittest.TestCase):
def test_v_pk_add_f16_basic(self):
"""V_PK_ADD_F16 adds two packed f16 values."""
from extra.assembly.rdna3.pcode import _f16
from extra.assembly.amd.pcode import _f16
# v0 = packed (1.0, 2.0), v1 = packed (3.0, 4.0)
# Result should be packed (4.0, 6.0)
instructions = [
@@ -2209,7 +2209,7 @@ class TestVOP3P(unittest.TestCase):
Inline constants for VOP3P are f16 values in the low 16 bits only.
The opsel_hi bits (default=0b11) select lo half for hi result, so both halves use the constant.
"""
from extra.assembly.rdna3.pcode import _f16
from extra.assembly.amd.pcode import _f16
# v0 = packed (1.0, 1.0), add POS_ONE
# With default opsel_hi=0b11: both lo and hi results use lo half of src1 (the constant)
# But opsel_hi=1 means src1 hi comes from lo half - wait, let me check the actual encoding
@@ -2230,7 +2230,7 @@ class TestVOP3P(unittest.TestCase):
def test_v_pk_mul_f16_basic(self):
"""V_PK_MUL_F16 multiplies two packed f16 values."""
from extra.assembly.rdna3.pcode import _f16
from extra.assembly.amd.pcode import _f16
# v0 = packed (2.0, 3.0), v1 = packed (4.0, 5.0)
# Result should be packed (8.0, 15.0)
instructions = [
@@ -2251,7 +2251,7 @@ class TestVOP3P(unittest.TestCase):
"""V_PK_MUL_F16 with inline constant POS_TWO (2.0).
Inline constant has value only in low 16 bits, hi is 0.
"""
from extra.assembly.rdna3.pcode import _f16
from extra.assembly.amd.pcode import _f16
# v0 = packed (3.0, 4.0), multiply by POS_TWO
# lo = 3.0 * 2.0 = 6.0, hi = 4.0 * 0.0 = 0.0 (inline const hi is 0)
instructions = [
@@ -2268,7 +2268,7 @@ class TestVOP3P(unittest.TestCase):
def test_v_pk_fma_f16_basic(self):
"""V_PK_FMA_F16: D = A * B + C for packed f16."""
from extra.assembly.rdna3.pcode import _f16
from extra.assembly.amd.pcode import _f16
# A = packed (2.0, 3.0), B = packed (4.0, 5.0), C = packed (1.0, 1.0)
# Result should be packed (2*4+1=9.0, 3*5+1=16.0)
instructions = [

View File

@@ -1,8 +1,8 @@
#!/usr/bin/env python3
"""Test MUBUF, MTBUF, MIMG, EXP, DS formats against LLVM."""
import unittest
from extra.assembly.rdna3.autogen import *
from extra.assembly.rdna3.lib import encode_src
from extra.assembly.amd.autogen.rdna3 import *
from extra.assembly.amd.lib import encode_src
class TestMUBUF(unittest.TestCase):
"""Test MUBUF (buffer) instructions."""
@@ -308,7 +308,7 @@ class TestVOP3Literal(unittest.TestCase):
def test_vop3_with_literal(self):
# v_add3_u32 v5, vcc_hi, 0xaf123456, v255
# GFX11: encoding: [0x05,0x00,0x55,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf]
from extra.assembly.rdna3.lib import RawImm
from extra.assembly.amd.lib import RawImm
inst = VOP3(VOP3Op.V_ADD3_U32, vdst=v[5], src0=RawImm(107), src1=0xaf123456, src2=v[255])
expected = bytes([0x05,0x00,0x55,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf])
self.assertEqual(inst.to_bytes(), expected)
@@ -316,14 +316,14 @@ class TestVOP3Literal(unittest.TestCase):
def test_vop3_literal_null_operand(self):
# v_add3_u32 v5, null, exec_lo, 0xaf123456
# GFX11: encoding: [0x05,0x00,0x55,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf]
from extra.assembly.rdna3.lib import RawImm
from extra.assembly.amd.lib import RawImm
inst = VOP3(VOP3Op.V_ADD3_U32, vdst=v[5], src0=NULL, src1=RawImm(126), src2=0xaf123456)
expected = bytes([0x05,0x00,0x55,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf])
self.assertEqual(inst.to_bytes(), expected)
def test_vop3p_with_literal(self):
# Test VOP3P literal encoding (also uses Inst64)
from extra.assembly.rdna3.lib import RawImm
from extra.assembly.amd.lib import RawImm
inst = VOP3P(VOP3POp.V_PK_ADD_F16, vdst=v[5], src0=RawImm(240), src1=0x12345678, src2=v[0])
self.assertEqual(len(inst.to_bytes()), 12) # 8 bytes + 4 byte literal

View File

@@ -2,10 +2,10 @@
# the Inst constructor should be looking at the types of the fields to correctly set the value
import unittest, struct
from extra.assembly.rdna3.autogen import *
from extra.assembly.rdna3.lib import Inst
from extra.assembly.rdna3.asm import asm
from extra.assembly.rdna3.test.test_roundtrip import compile_asm
from extra.assembly.amd.autogen.rdna3 import *
from extra.assembly.amd.lib import Inst
from extra.assembly.amd.asm import asm
from extra.assembly.amd.test.test_roundtrip import compile_asm
class TestIntegration(unittest.TestCase):
inst: Inst

View File

@@ -1,9 +1,9 @@
#!/usr/bin/env python3
"""Integration test: round-trip RDNA3 assembly through AMD toolchain."""
import unittest, re, io, sys, subprocess
from extra.assembly.rdna3.autogen import *
from extra.assembly.rdna3.asm import waitcnt, asm
from extra.assembly.rdna3.test.helpers import get_llvm_mc
from extra.assembly.amd.autogen.rdna3 import *
from extra.assembly.amd.asm import waitcnt, asm
from extra.assembly.amd.test.helpers import get_llvm_mc
def disassemble(lib: bytes, arch: str = "gfx1100") -> str:
"""Disassemble ELF binary using tinygrad's compiler, return raw output."""

View File

@@ -2,9 +2,9 @@
"""Test RDNA3 assembler/disassembler against LLVM test vectors."""
import unittest, re, subprocess
from tinygrad.helpers import fetch
from extra.assembly.rdna3.autogen import *
from extra.assembly.rdna3.asm import asm
from extra.assembly.rdna3.test.helpers import get_llvm_mc
from extra.assembly.amd.autogen.rdna3 import *
from extra.assembly.amd.asm import asm
from extra.assembly.amd.test.helpers import get_llvm_mc
LLVM_BASE = "https://raw.githubusercontent.com/llvm/llvm-project/main/llvm/test/MC/AMDGPU"

View File

@@ -1,8 +1,8 @@
#!/usr/bin/env python3
"""Tests for the RDNA3 pseudocode DSL."""
import unittest
from extra.assembly.rdna3.pcode import Reg, TypedView, SliceProxy, ExecContext, compile_pseudocode, _expr, MASK32, MASK64, _f32, _i32, _f16, _i16, f32_to_f16, _isnan
from extra.assembly.rdna3.autogen.gen_pcode import _VOP3SDOp_V_DIV_SCALE_F32, _VOPCOp_V_CMP_CLASS_F32
from extra.assembly.amd.pcode import Reg, TypedView, SliceProxy, ExecContext, compile_pseudocode, _expr, MASK32, MASK64, _f32, _i32, _f16, _i16, f32_to_f16, _isnan
from extra.assembly.amd.autogen.rdna3.gen_pcode import _VOP3SDOp_V_DIV_SCALE_F32, _VOPCOp_V_CMP_CLASS_F32
class TestReg(unittest.TestCase):
def test_u32_read(self):

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3
"""Test that PDF parser correctly extracts format fields."""
import unittest, os
from extra.assembly.rdna3.autogen import (
from extra.assembly.amd.autogen.rdna3 import (
SOP1, SOP2, SOPK, SOPP, VOP1, VOP2, VOP3SD, VOPC, FLAT, VOPD,
SOP1Op, SOP2Op, VOP1Op, VOP3Op
)
@@ -41,7 +41,7 @@ class TestPDFParserGenerate(unittest.TestCase):
def test_pdf_parser(self):
"""Single test that validates all PDF parser outputs."""
from extra.assembly.rdna3.lib import generate
from extra.assembly.amd.lib import generate
result = generate()
# test_all_formats_present

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3
import unittest, subprocess
from extra.assembly.rdna3.autogen import *
from extra.assembly.rdna3.test.helpers import get_llvm_mc
from extra.assembly.amd.autogen.rdna3 import *
from extra.assembly.amd.test.helpers import get_llvm_mc
def llvm_assemble(asm: str) -> bytes:
"""Assemble using llvm-mc and return bytes."""

View File

@@ -1,10 +1,10 @@
#!/usr/bin/env python3
"""Roundtrip tests: generate tinygrad kernels, decode instructions, re-encode, verify match."""
import unittest, io, sys, re, subprocess, os
from extra.assembly.rdna3.autogen import *
from extra.assembly.rdna3.lib import Inst
from extra.assembly.rdna3.asm import asm
from extra.assembly.rdna3.test.helpers import get_llvm_mc, get_llvm_objdump
from extra.assembly.amd.autogen.rdna3 import *
from extra.assembly.amd.lib import Inst
from extra.assembly.amd.asm import asm
from extra.assembly.amd.test.helpers import get_llvm_mc, get_llvm_objdump
# Instruction format detection based on encoding bits
def detect_format(data: bytes) -> type[Inst] | None:
@@ -140,7 +140,7 @@ class TestTinygradKernelRoundtrip(unittest.TestCase):
2. asm(disasm()) matches LLVM output
3. our disasm() matches LLVM's disassembly string exactly
"""
from extra.assembly.rdna3.test.test_compare_emulators import get_kernels_from_tinygrad
from extra.assembly.amd.test.test_compare_emulators import get_kernels_from_tinygrad
from tinygrad.runtime.support.compiler_amd import HIPCompiler
kernels, _, _ = get_kernels_from_tinygrad(op_fn)

View File

@@ -1,4 +0,0 @@
*.deb
build
src
sniffer/sniff.so

View File

@@ -1,20 +0,0 @@
Built ROCT-Thunk-Interface (hsakmt)
hsakmt-roct-dev_5.4.4.99999-local_amd64.deb
note: installs to /opt/rocm
Built ROCm-Device-Libs
Works with ROCM_PATH=/home/tiny/build/ROCm-Device-Libs/build/dist
rocm-device-libs_1.0.0.99999-local_amd64.deb
Built ROCm-CompilerSupport (amd_comgr)
no deb, sudo make install to /usr/local
Built ROCR-Runtime
hsa-rocr_1.8.0-local_amd64.deb
hsa-rocr-dev_1.8.0-local_amd64.deb
Built ROCm-OpenCL-Runtime
rocm-ocl-icd_2.0.0-local_amd64.deb
ISSUE: these depend on "comgr"
rocm-opencl_2.0.0-local_amd64.deb
rocm-opencl-dev_2.0.0-local_amd64.deb
Did sudo make install

View File

@@ -1,41 +0,0 @@
# run two "rocm-bandwidth-test" in a loop
# amdgpu-6.0.5-1581431.20.04
# fixed in kernel 6.2.14
[ 72.153646] RIP: 0010:pm_send_runlist+0x4a/0x630 [amdgpu]
[ 72.153815] Code: 30 65 48 8b 04 25 28 00 00 00 48 89 45 d0 31 c0 80 fb 01 0f 87 aa 9d 49 00 83 e3 01 0f 85 1c 05 00 00 49 8b 3f b8 01 00 00 00 <48> 8b 97 30 01 00 00 44 8b b7 6c 01 00 00 8b 9f 70 01 00 00 8b 8a
[ 72.153900] RSP: 0018:ffffb48445c03c30 EFLAGS: 00010246
[ 72.153928] RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000000
[ 72.153962] RDX: 000000000000007b RSI: ffff9395e1562558 RDI: 0000000000000000
[ 72.153996] RBP: ffffb48445c03cb8 R08: 0000000000000000 R09: 0000000000000001
[ 72.154030] R10: ffff9395c900d840 R11: 0000000000000000 R12: 0000000000000000
[ 72.154065] R13: ffff9395c9e00400 R14: 0000000000000001 R15: ffff9395e15624e0
[ 72.154099] FS: 00007f345c6463c0(0000) GS:ffff93a4aee80000(0000) knlGS:0000000000000000
[ 72.154137] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 72.154165] CR2: 0000000000000130 CR3: 0000000112840000 CR4: 0000000000750ee0
[ 72.154201] PKRU: 55555554
[ 72.154215] Call Trace:
[ 72.154230] <TASK>
[ 72.154244] map_queues_cpsch+0x75/0xc0 [amdgpu]
[ 72.154365] debug_map_and_unlock+0x51/0x90 [amdgpu]
[ 72.154480] debug_refresh_runlist+0x1f/0x30 [amdgpu]
[ 72.154591] kfd_dbg_runtime_disable+0x13c/0x240 [amdgpu]
[ 72.154705] kfd_ioctl_dbg_set_debug_trap+0x69d/0x8b0 [amdgpu]
[ 72.154820] kfd_ioctl+0x24a/0x5b0 [amdgpu]
[ 72.154925] ? kfd_ioctl_create_queue+0x770/0x770 [amdgpu]
[ 72.155035] ? syscall_exit_to_user_mode+0x27/0x50
[ 72.155061] ? exit_to_user_mode_prepare+0x3d/0x1c0
[ 72.155088] __x64_sys_ioctl+0x95/0xd0
[ 72.155109] do_syscall_64+0x5c/0xc0
[ 72.155128] ? syscall_exit_to_user_mode+0x27/0x50
[ 72.155151] ? do_syscall_64+0x69/0xc0
[ 72.155172] entry_SYSCALL_64_after_hwframe+0x61/0xcb
[ 72.155198] RIP: 0033:0x7f345c7f63ab
[ 72.155218] Code: 0f 1e fa 48 8b 05 e5 7a 0d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d b5 7a 0d 00 f7 d8 64 89 01 48
[ 72.155301] RSP: 002b:00007ffc97cc89f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[ 72.155339] RAX: ffffffffffffffda RBX: 00007ffc97cc8a30 RCX: 00007f345c7f63ab
[ 72.155375] RDX: 00007ffc97cc8a30 RSI: 00000000c0284b82 RDI: 0000000000000003
[ 72.155411] RBP: 00000000c0284b82 R08: 0000000000000000 R09: 0000000000000000
[ 72.155447] R10: 00007f345cd4ddb0 R11: 0000000000000246 R12: 00007ffc97cc8a30
[ 72.155481] R13: 0000000000000003 R14: 00007ffc97cc8d20 R15: 0000000000000000
[ 72.155517] </TASK>

View File

@@ -1,41 +0,0 @@
# run two tinygrad matrix example in a loop
# amdgpu-6.0.5-1581431.20.04
# NOT fixed in kernel 6.2.14
[ 553.016624] gmc_v11_0_process_interrupt: 30 callbacks suppressed
[ 553.016631] amdgpu 0000:0b:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:24 vmid:9 pasid:32770, for process python3 pid 10001 thread python3 pid 10001)
[ 553.016790] amdgpu 0000:0b:00.0: amdgpu: in page starting at address 0x00007f0000000000 from client 10
[ 553.016892] amdgpu 0000:0b:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00901A30
[ 553.016974] amdgpu 0000:0b:00.0: amdgpu: Faulty UTCL2 client ID: SDMA0 (0xd)
[ 553.017051] amdgpu 0000:0b:00.0: amdgpu: MORE_FAULTS: 0x0
[ 553.017111] amdgpu 0000:0b:00.0: amdgpu: WALKER_ERROR: 0x0
[ 553.017173] amdgpu 0000:0b:00.0: amdgpu: PERMISSION_FAULTS: 0x3
[ 553.017238] amdgpu 0000:0b:00.0: amdgpu: MAPPING_ERROR: 0x0
[ 553.017300] amdgpu 0000:0b:00.0: amdgpu: RW: 0x0
[ 553.123921] [drm:mes_v11_0_submit_pkt_and_poll_completion.constprop.0 [amdgpu]] *ERROR* MES failed to response msg=2
[ 553.124153] amdgpu: failed to add hardware queue to MES, doorbell=0x1a16
[ 553.124195] amdgpu: MES might be in unrecoverable state, issue a GPU reset
[ 553.124237] amdgpu: Failed to restore queue 2
[ 553.124266] amdgpu: Failed to restore process queues
[ 553.124270] amdgpu: Failed to evict queue 3
[ 553.124297] amdgpu: amdgpu_amdkfd_restore_userptr_worker: Failed to resume KFD
# alternative crash in kernel 6.2.14
[ 151.097948] gmc_v11_0_process_interrupt: 30 callbacks suppressed
[ 151.097953] amdgpu 0000:0b:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:24 vmid:8 pasid:32771, for process python3 pid 7525 thread python3 pid 7525)
[ 151.097993] amdgpu 0000:0b:00.0: amdgpu: in page starting at address 0x00007f0000000000 from client 10
[ 151.098008] amdgpu 0000:0b:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00801A30
[ 151.098020] amdgpu 0000:0b:00.0: amdgpu: Faulty UTCL2 client ID: SDMA0 (0xd)
[ 151.098032] amdgpu 0000:0b:00.0: amdgpu: MORE_FAULTS: 0x0
[ 151.098042] amdgpu 0000:0b:00.0: amdgpu: WALKER_ERROR: 0x0
[ 151.098052] amdgpu 0000:0b:00.0: amdgpu: PERMISSION_FAULTS: 0x3
[ 151.098062] amdgpu 0000:0b:00.0: amdgpu: MAPPING_ERROR: 0x0
[ 151.098071] amdgpu 0000:0b:00.0: amdgpu: RW: 0x0
[ 151.209517] [drm:mes_v11_0_submit_pkt_and_poll_completion.constprop.0 [amdgpu]] *ERROR* MES failed to response msg=2
[ 151.209724] amdgpu: failed to add hardware queue to MES, doorbell=0x1002
[ 151.209734] amdgpu: MES might be in unrecoverable state, issue a GPU reset
[ 151.209743] amdgpu: Failed to restore queue 1
[ 151.209751] amdgpu: Failed to restore process queues
[ 151.209759] amdgpu: amdgpu_amdkfd_restore_userptr_worker: Failed to resume KFD
[ 151.209858] amdgpu 0000:0b:00.0: amdgpu: GPU reset begin!

View File

@@ -1,20 +0,0 @@
# two tinygrad + two bandwidth test
# RDNA2, driver 6.0.5
# recovered from this!
[ 136.971209] gmc_v10_0_process_interrupt: 39 callbacks suppressed
[ 136.971218] amdgpu 0000:0b:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:24 vmid:11 pasid:32773, for process rocm-bandwidth- pid 20281 thread rocm-bandwidth- pid 20281)
[ 136.971228] amdgpu 0000:0b:00.0: amdgpu: in page starting at address 0x00007f5c2b800000 from client 0x1b (UTCL2)
[ 136.971232] amdgpu 0000:0b:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00B01A31
[ 136.971233] amdgpu 0000:0b:00.0: amdgpu: Faulty UTCL2 client ID: SDMA0 (0xd)
[ 136.971235] amdgpu 0000:0b:00.0: amdgpu: MORE_FAULTS: 0x1
[ 136.971236] amdgpu 0000:0b:00.0: amdgpu: WALKER_ERROR: 0x0
[ 136.971236] amdgpu 0000:0b:00.0: amdgpu: PERMISSION_FAULTS: 0x3
[ 136.971237] amdgpu 0000:0b:00.0: amdgpu: MAPPING_ERROR: 0x0
[ 136.971238] amdgpu 0000:0b:00.0: amdgpu: RW: 0x0
...
[ 136.993979] amdgpu 0000:0b:00.0: amdgpu: IH ring buffer overflow (0x000BE5A0, 0x0003C480, 0x0003E5C0)
[ 138.209072] amdgpu 0000:0b:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x001a address=0x7c00004000 flags=0x0000]
[ 138.209078] amdgpu 0000:0b:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x001a address=0x7c00004d80 flags=0x0000]
[ 138.209081] amdgpu 0000:0b:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x001a address=0x7c00005000 flags=0x0000]
[ 138.209084] amdgpu 0000:0b:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x001a address=0x7c00005d80 flags=0x0000]

View File

@@ -1,33 +0,0 @@
# ROCK-Kernel-Driver 0b579de9622f5c93021dcb7927d13926313740a2
# non fatal "crash"
[ 127.418045] ------------[ cut here ]------------
[ 127.418046] User pages unexpectedly invalid
[ 127.418056] WARNING: CPU: 16 PID: 260 at drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:3000 amdgpu_amdkfd_restore_userptr_worker+0x4d9/0x500 [amdgpu]
[ 127.418235] Modules linked in: rfcomm cmac algif_hash algif_skcipher af_alg bnep nls_iso8859_1 iwlmvm mac80211 intel_rapl_msr intel_rapl_common edac_mce_amd snd_hda_codec_realtek snd_hda_codec_generic snd_hda_codec_hdmi kvm_amd binfmt_misc snd_hda_intel snd_intel_dspcfg kvm libarc4 snd_intel_sdw_acpi snd_hda_codec btusb iwlwifi btrtl snd_hda_core btbcm btintel irqbypass btmtk snd_hwdep crct10dif_pclmul snd_pcm polyval_clmulni bluetooth snd_seq_midi snd_seq_midi_event snd_rawmidi snd_seq polyval_generic cfg80211 ghash_clmulni_intel eeepc_wmi snd_seq_device snd_timer aesni_intel asus_wmi ecdh_generic snd platform_profile crypto_simd ledtrig_audio cryptd ecc ccp soundcore sparse_keymap rapl k10temp wmi_bmof mac_hid sch_fq_codel msr parport_pc ppdev lp parport ramoops pstore_blk efi_pstore reed_solomon pstore_zone ip_tables x_tables autofs4 amdgpu hid_generic usbhid hid i2c_algo_bit drm_ttm_helper ttm video iommu_v2 drm_buddy gpu_sched drm_display_helper drm_kms_helper syscopyarea
[ 127.418276] sysfillrect sysimgblt fb_sys_fops drm nvme nvme_core cec r8169 ahci crc32_pclmul rc_core i2c_piix4 xhci_pci libahci nvme_common xhci_pci_renesas realtek wmi
[ 127.418284] CPU: 16 PID: 260 Comm: kworker/16:1 Tainted: G W 6.0.0 #4
[ 127.418286] Hardware name: System manufacturer System Product Name/TUF GAMING X570-PLUS (WI-FI), BIOS 3603 03/20/2021
[ 127.418287] Workqueue: events amdgpu_amdkfd_restore_userptr_worker [amdgpu]
[ 127.418455] RIP: 0010:amdgpu_amdkfd_restore_userptr_worker+0x4d9/0x500 [amdgpu]
[ 127.418601] Code: ff e8 2b 8a 96 d1 e9 66 fe ff ff 48 c7 c7 40 4f f5 c0 e8 56 7b 8a d1 0f 0b e9 2e ff ff ff 48 c7 c7 d8 d0 ed c0 e8 43 7b 8a d1 <0f> 0b e9 0a fe ff ff 4c 89 ef e8 f8 89 96 d1 e9 cb fd ff ff e8 ce
[ 127.418603] RSP: 0018:ffffb36740a83dc8 EFLAGS: 00010282
[ 127.418604] RAX: 0000000000000000 RBX: ffff9d159ee9df30 RCX: 0000000000000027
[ 127.418605] RDX: 0000000000000027 RSI: ffffb36740a83c88 RDI: ffff9d242a220568
[ 127.418606] RBP: ffffb36740a83e58 R08: ffff9d242a220560 R09: 0000000000000001
[ 127.418607] R10: 0000000000000001 R11: 0000000000000020 R12: ffff9d159ee9df98
[ 127.418607] R13: ffff9d159ee9df70 R14: ffff9d159ee9dee0 R15: ffff9d159ee9dee0
[ 127.418608] FS: 0000000000000000(0000) GS:ffff9d242a200000(0000) knlGS:0000000000000000
[ 127.418609] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 127.418610] CR2: 00007fd5d4715000 CR3: 0000000120ffe000 CR4: 0000000000750ee0
[ 127.418611] PKRU: 55555554
[ 127.418611] Call Trace:
[ 127.418612] <TASK>
[ 127.418613] process_one_work+0x21f/0x3f0
[ 127.418615] worker_thread+0x4a/0x3c0
[ 127.418617] ? process_one_work+0x3f0/0x3f0
[ 127.418618] kthread+0xf0/0x120
[ 127.418619] ? kthread_complete_and_exit+0x20/0x20
[ 127.418620] ret_from_fork+0x22/0x30
[ 127.418622] </TASK>
[ 127.418623] ---[ end trace 0000000000000000 ]---

View File

@@ -1,80 +0,0 @@
import numpy as np
import pathlib
from hexdump import hexdump
from tinygrad.helpers import colored
from extra.helpers import enable_early_exec
early_exec = enable_early_exec()
from tinygrad.runtime.ops_cl import CLProgram, CLBuffer, ROCM_LLVM_PATH
ENABLE_NON_ASM = False
WMMA = True
DUAL_ALU = True
F32 = True
if ENABLE_NON_ASM:
buf = CLBuffer.fromCPU(np.zeros(10, np.float32))
prg_empty = CLProgram("code", "__kernel void code(__global float *a) { a[0] = 1; }")
asm_real = prg_empty.binary()
with open("/tmp/cc.elf", "wb") as f:
f.write(asm_real)
prg_empty([1], [1], buf, wait=True)
print(buf.toCPU())
print(colored("creating CLBuffer", "green"))
buf = CLBuffer.fromCPU(np.zeros(10, np.float32))
code = open(pathlib.Path(__file__).parent / "prog.s", "r").read()
gen = []
FLOPS = 0
MAX_REG = 251
for j in range(1):
if WMMA:
KY, KX = 4, 4
for y in range(KY):
for x in range(KX):
c = (y*KX+x)*8
a = (KY*KX*8) + y*8
b = (KY*KX*8) + (KY*8) + x*8
gen.append(f"v_wmma_f32_16x16x16_f16 v[{c}:{c+7}], v[{a}:{a+7}], v[{b}:{b+7}], v[{c}:{c+7}]")
FLOPS += 16*8*2
else:
for i in range(0, MAX_REG, 6):
if DUAL_ALU:
if F32:
gen.append(f"v_dual_fmac_f32 v{i+0}, v{i+1}, v{i+2} :: v_dual_fmac_f32 v{i+3}, v{i+4}, v{i+5}")
FLOPS += 4
else:
gen.append(f"v_dual_dot2acc_f32_f16 v{i+0}, v{i+1}, v{i+2} :: v_dual_dot2acc_f32_f16 v{i+3}, v{i+4}, v{i+5}")
FLOPS += 8
else:
assert F32
gen.append(f"v_fmac_f32 v{i+0}, v{i+1}, v{i+2}")
gen.append(f"v_fmac_f32 v{i+3}, v{i+4}, v{i+5}")
code = code.replace("// FLOPS", '\n'.join(gen))
print(code)
# fix: COMGR failed to get code object ISA name. set triple to 'amdgcn-amd-amdhsa'
object = early_exec(([ROCM_LLVM_PATH / "llvm-mc", '--arch=amdgcn', '--mcpu=gfx1100', '--triple=amdgcn-amd-amdhsa', '--filetype=obj', '-'], code.encode("utf-8")))
asm = early_exec(([ROCM_LLVM_PATH / "ld.lld", "/dev/stdin", "-o", "/dev/stdout", "--pie"], object))
with open("/tmp/cc2.o", "wb") as f:
f.write(object)
with open("/tmp/cc2.elf", "wb") as f:
f.write(asm)
print(colored("creating CLProgram", "green"))
prg = CLProgram("code", asm)
print(colored("running program", "green"))
G = 512
FLOPS *= 100000*G*G # loop * global_size
for i in range(3):
tm = prg(buf, global_size=[G//256, G, 1], local_size=[256, 1, 1], wait=True)
print(f"ran in {tm*1e3:.2f} ms, {FLOPS/(tm*1e9):.2f} GFLOPS")
print(colored("transferring buffer", "green"))
print(buf.toCPU())

View File

@@ -1,80 +0,0 @@
.global _start
_start:
.rodata
.align 0x10
.global code.kd
.type code.kd,STT_OBJECT
# amd_kernel_code_t (must be at 0x440 for kernel_code_entry_byte_offset to be right)
code.kd:
# amd_kernel_..., amd_machine_...
.long 0,0,0,0
# kernel_code_entry_byte_offset, kernel_code_prefetch_byte_offset
.long 0x00000bc0,0x00000000,0x00000000,0x00000000
# kernel_code_prefetch_byte_size, max_scratch_backing_memory_byte_size
.long 0,0,0,0
# compute_pgm_rsrc1, compute_pgm_rsrc2, kernel_code_properties, workitem_private_segment_byte_size
.long 0x60af0000,0x0000009e,0x00000408,0x00000000
# compute_pgm_rsrc1 |= AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32 | AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64
# compute_pgm_rsrc1 |= AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP | AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE
# compute_pgm_rsrc2 |= AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT = 0xF
# compute_pgm_rsrc2 |= AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X
# kernel_code_properties |= AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR = 1
# kernel_code_properties |= AMD_KERNEL_CODE_PROPERTIES_RESERVED1 = 1
.text
.global code
.type code,STT_FUNC
code:
# https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state
# s[0:1] contains the kernarg_address
# TODO: can we use s[2:3] if this was really a wave since we only alloced 2 SGPRs?
s_load_b64 s[2:3], s[0:1], null
s_mov_b32 s8, 0
loop:
s_addk_i32 s8, 1
s_cmp_eq_u32 s8, 100000
// FLOPS
s_cbranch_scc0 loop
# wait for the s_load_b64
s_waitcnt lgkmcnt(0)
v_dual_mov_b32 v0, 4 :: v_dual_mov_b32 v1, 2.0
global_store_b32 v0, v1, s[2:3]
# Deallocate all VGPRs for this wave. Use only when next instruction is S_ENDPGM.
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
s_endpgm
s_code_end
.amdgpu_metadata
amdhsa.kernels:
- .args:
- .address_space: global
.name: a
.offset: 0
.size: 8
.type_name: 'float*'
.value_kind: global_buffer
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 8
.language: OpenCL C
.language_version:
- 1
- 2
.max_flat_workgroup_size: 256
.name: code
.private_segment_fixed_size: 0
.sgpr_count: 2
.sgpr_spill_count: 0
.symbol: code.kd
.uses_dynamic_stack: false
.vgpr_count: 256
.vgpr_spill_count: 0
.wavefront_size: 32
amdhsa.target: amdgcn-amd-amdhsa--gfx1100
amdhsa.version:
- 1
- 2
.end_amdgpu_metadata

View File

@@ -1,11 +0,0 @@
#!/bin/bash
mkdir -p src
cd src
git clone https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface.git -b rocm-5.5.0
git clone https://github.com/RadeonOpenCompute/ROCm-Device-Libs.git -b rocm-5.5.0
git clone https://github.com/RadeonOpenCompute/llvm-project.git -b rocm-5.5.0 --depth 1
git clone https://github.com/RadeonOpenCompute/ROCR-Runtime.git -b rocm-5.5.0
git clone https://github.com/ROCm-Developer-Tools/ROCclr.git -b rocm-5.5.0
git clone https://github.com/RadeonOpenCompute/ROCm-CompilerSupport.git -b rocm-5.5.0
git clone https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime.git -b rocm-5.5.0
cd ../

View File

@@ -1,69 +0,0 @@
#!/bin/bash
mkdir -p build/debs
cd build
# ROCT-Thunk-Interface (hsakmt)
if [ ! -f debs/hsakmt-roct-dev_5.5.0.99999-local_amd64.deb ]
then
mkdir -p ROCT-Thunk-Interface
cd ROCT-Thunk-Interface
cmake ../../src/ROCT-Thunk-Interface
make -j32 package
cp hsakmt-roct-dev_5.5.0.99999-local_amd64.deb ../debs
cd ../
fi
# build custom LLVM
if [ ! -f llvm-project/bin/clang ]
then
mkdir -p llvm-project
cd llvm-project
cmake -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="llvm;clang;lld" -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" ../../src/llvm-project/llvm
make -j32
cd ..
fi
# use custom LLVM
export PATH="$PWD/llvm-project/bin:$PATH"
# ROCm-Device-Libs
if [ ! -f debs/rocm-device-libs_1.0.0.99999-local_amd64.deb ]
then
mkdir -p ROCm-Device-Libs
cd ROCm-Device-Libs
cmake ../../src/ROCm-Device-Libs
make -j32 package
cp rocm-device-libs_1.0.0.99999-local_amd64.deb ../debs
cd ../
fi
# ROCR-Runtime
if [ ! -f debs/hsa-rocr_1.8.0-local_amd64.deb ]
then
mkdir -p ROCR-Runtime
cd ROCR-Runtime
cmake ../../src/ROCR-Runtime/src
make -j32 package
cp hsa-rocr_1.8.0-local_amd64.deb ../debs
cp hsa-rocr-dev_1.8.0-local_amd64.deb ../debs
cd ../
fi
# ROCm-OpenCL-Runtime (needs ROCclr)
if [ ! -f debs/rocm-opencl_2.0.0-local_amd64.deb ]
then
mkdir -p ROCm-OpenCL-Runtime
cd ROCm-OpenCL-Runtime
cmake ../../src/ROCm-OpenCL-Runtime
make -j32 package
cp rocm-opencl_2.0.0-local_amd64.deb ../debs
cp rocm-opencl-dev_2.0.0-local_amd64.deb ../debs
cp rocm-ocl-icd_2.0.0-local_amd64.deb ../debs
fi
# ROCm-CompilerSupport (broken)
#mkdir -p ROCm-CompilerSupport
#cd ROCm-CompilerSupport
#cmake ../../src/ROCm-CompilerSupport/lib/comgr
#make -j32

View File

@@ -1,14 +0,0 @@
#!/bin/bash
rm amdgpu-install_5.5.50500-1_all.deb
wget https://repo.radeon.com/amdgpu-install/5.5/ubuntu/$(lsb_release -cs)/amdgpu-install_5.5.50500-1_all.deb
sudo dpkg -i amdgpu-install_5.5.50500-1_all.deb
sudo apt-get update
# kernel driver
sudo apt-get install amdgpu-dkms
# for opencl
sudo apt-get install rocm-opencl-runtime
# for HIP
sudo apt-get install hip-runtime-amd rocm-device-libs hip-dev

View File

@@ -1,11 +0,0 @@
#!/bin/bash -e
clang sniff.cc -Werror -shared -fPIC -I../src/ -I../src/ROCT-Thunk-Interface/include -I../src/ROCm-Device-Libs/ockl/inc -o sniff.so -lstdc++
#AMD_LOG_LEVEL=4 HSAKMT_DEBUG_LEVEL=7 LD_PRELOAD=$PWD/sniff.so /home/tiny/build/HIP-Examples/HIP-Examples-Applications/HelloWorld/HelloWorld
#AMD_LOG_LEVEL=4 LD_PRELOAD=$PWD/sniff.so $HOME/build/HIP-Examples/HIP-Examples-Applications/HelloWorld/HelloWorld
#AMD_LOG_LEVEL=5 LD_PRELOAD=$PWD/sniff.so python3 ../rdna3/asm.py
DEBUG=5 LD_PRELOAD=$PWD/sniff.so python3 ../rdna3/asm.py
#AMD_LOG_LEVEL=5 HSAKMT_DEBUG_LEVEL=7 DEBUG=5 LD_PRELOAD=$PWD/sniff.so strace -F python3 ../rdna3/asm.py
#LD_PRELOAD=$PWD/sniff.so python3 ../rdna3/asm.py
#AMD_LOG_LEVEL=4 LD_PRELOAD=$PWD/sniff.so FORWARD_ONLY=1 DEBUG=2 python3 ../../../test/test_ops.py TestOps.test_add
#AMD_LOG_LEVEL=4 HSAKMT_DEBUG_LEVEL=7 LD_PRELOAD=$PWD/sniff.so rocm-bandwidth-test -s 0 -d 1 -m 1
#AMD_LOG_LEVEL=4 HSAKMT_DEBUG_LEVEL=7 LD_PRELOAD=$PWD/sniff.so rocm-bandwidth-test -s 1 -d 2 -m 1

View File

@@ -1,282 +0,0 @@
// template copied from https://github.com/geohot/cuda_ioctl_sniffer/blob/master/sniff.cc
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dlfcn.h>
#include <signal.h>
#include <ucontext.h>
#include <sys/mman.h>
// includes from the ROCm sources
#include <linux/kfd_ioctl.h>
#include <hsa.h>
#include <amd_hsa_kernel_code.h>
#include <ROCR-Runtime/src/core/inc/sdma_registers.h>
using namespace rocr::AMD;
#include <string>
#include <map>
std::map<int, std::string> files;
std::map<uint64_t, uint64_t> ring_base_addresses;
#define D(args...) fprintf(stderr, args)
uint64_t doorbell_offset = -1;
std::map<uint64_t, int> queue_types;
void hexdump(void *d, int l) {
for (int i = 0; i < l; i++) {
if (i%0x10 == 0 && i != 0) printf("\n");
if (i%0x10 == 8) printf(" ");
if (i%0x10 == 0) printf("%8X: ", i);
printf("%2.2X ", ((uint8_t*)d)[i]);
}
printf("\n");
}
extern "C" {
// https://defuse.ca/online-x86-assembler.htm#disassembly2
static void handler(int sig, siginfo_t *si, void *unused) {
ucontext_t *u = (ucontext_t *)unused;
uint8_t *rip = (uint8_t*)u->uc_mcontext.gregs[REG_RIP];
int store_size = 0;
uint64_t value;
if (rip[0] == 0x48 && rip[1] == 0x89 && rip[2] == 0x30) {
// 0: 48 89 30 mov QWORD PTR [rax],rsi
store_size = 8;
value = u->uc_mcontext.gregs[REG_RSI];
u->uc_mcontext.gregs[REG_RIP] += 3;
} else if (rip[0] == 0x4c && rip[1] == 0x89 && rip[2] == 0x28) {
// 0: 4c 89 28 mov QWORD PTR [rax],r13
store_size = 8;
value = u->uc_mcontext.gregs[REG_R13];
u->uc_mcontext.gregs[REG_RIP] += 3;
} else {
D("segfault %02X %02X %02X %02X %02X %02X %02X %02X rip: %p addr: %p\n", rip[0], rip[1], rip[2], rip[3], rip[4], rip[5], rip[6], rip[7], rip, si->si_addr);
D("rax: %llx rcx: %llx rdx: %llx rsi: %llx rbx: %llx\n", u->uc_mcontext.gregs[REG_RAX], u->uc_mcontext.gregs[REG_RCX], u->uc_mcontext.gregs[REG_RDX], u->uc_mcontext.gregs[REG_RSI], u->uc_mcontext.gregs[REG_RBX]);
exit(-1);
}
uint64_t ring_base_address = ring_base_addresses[((uint64_t)si->si_addr)&0xFFF];
int queue_type = queue_types[((uint64_t)si->si_addr)&0xFFF];
D("%16p: \u001b[31mDING DONG\u001b[0m (queue_type %d) store(%d): 0x%8lx -> %p ring_base_address:0x%lx\n", rip, queue_type, store_size, value, si->si_addr, ring_base_address);
if (queue_type == KFD_IOC_QUEUE_TYPE_SDMA) {
uint8_t *sdma_ptr = (uint8_t*)(ring_base_address);
while (sdma_ptr < ((uint8_t*)(ring_base_address)+value)) {
D("0x%3lx: ", sdma_ptr-(uint8_t*)(ring_base_address));
if (sdma_ptr[0] == SDMA_OP_TIMESTAMP) {
D("SDMA_PKT_TIMESTAMP\n");
sdma_ptr += sizeof(SDMA_PKT_TIMESTAMP);
} else if (sdma_ptr[0] == SDMA_OP_GCR) {
D("SDMA_PKT_GCR\n");
sdma_ptr += sizeof(SDMA_PKT_GCR);
} else if (sdma_ptr[0] == SDMA_OP_ATOMIC) {
D("SDMA_PKT_ATOMIC\n");
sdma_ptr += sizeof(SDMA_PKT_ATOMIC);
} else if (sdma_ptr[0] == SDMA_OP_FENCE) {
D("SDMA_PKT_FENCE\n");
sdma_ptr += sizeof(SDMA_PKT_FENCE);
} else if (sdma_ptr[0] == SDMA_OP_TRAP) {
D("SDMA_PKT_TRAP\n");
sdma_ptr += sizeof(SDMA_PKT_TRAP);
} else if (sdma_ptr[0] == SDMA_OP_COPY && sdma_ptr[1] == SDMA_SUBOP_COPY_LINEAR) {
SDMA_PKT_COPY_LINEAR *pkt = (SDMA_PKT_COPY_LINEAR *)sdma_ptr;
D("SDMA_PKT_COPY_LINEAR: count:0x%x src:0x%lx dst:0x%lx\n", pkt->COUNT_UNION.count+1,
(uint64_t)pkt->SRC_ADDR_LO_UNION.src_addr_31_0 | ((uint64_t)pkt->SRC_ADDR_HI_UNION.src_addr_63_32 << 32),
(uint64_t)pkt->DST_ADDR_LO_UNION.dst_addr_31_0 | ((uint64_t)pkt->DST_ADDR_HI_UNION.dst_addr_63_32 << 32)
);
sdma_ptr += sizeof(SDMA_PKT_COPY_LINEAR);
} else {
D("unhandled packet type %d %d, exiting\n", sdma_ptr[0], sdma_ptr[1]);
break;
}
}
//hexdump((void*)(ring_base_address), 0x100);
} else if (queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) {
hsa_kernel_dispatch_packet_t *pkt = (hsa_kernel_dispatch_packet_t *)(ring_base_address+value*0x40);
if ((pkt->header&0xFF) == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
D("HSA_PACKET_TYPE_KERNEL_DISPATCH -- setup:%d workgroup[%d, %d, %d] grid[%d, %d, %d] kernel_object:0x%lx kernarg_address:%p\n", pkt->setup, pkt->workgroup_size_x, pkt->workgroup_size_y, pkt->workgroup_size_z, pkt->grid_size_x, pkt->grid_size_y, pkt->grid_size_z, pkt->kernel_object, pkt->kernarg_address);
amd_kernel_code_t *code = (amd_kernel_code_t *)pkt->kernel_object;
D("kernel_code_entry_byte_offset:%lx\n", code->kernel_code_entry_byte_offset);
uint32_t *kernel_code = (uint32_t*)(pkt->kernel_object + code->kernel_code_entry_byte_offset);
int code_len = 0;
while (kernel_code[code_len] != 0xbf9f0000 && kernel_code[code_len] != 0) code_len++;
hexdump(kernel_code, code_len*4);
/*FILE *f = fopen("/tmp/kernel_code", "wb");
fwrite(kernel_code, 4, code_len, f);
fclose(f);
system("python -c 'print(\" \".join([(\"0x%02X\"%x) for x in open(\"/tmp/kernel_code\", \"rb\").read()]))' | ../build/llvm-project/bin/llvm-mc --disassemble --arch=amdgcn --mcpu=gfx1100 --show-encoding");*/
D("kernargs (kernarg_segment_byte_size:0x%lx)\n", code->kernarg_segment_byte_size);
// get length
int i;
for (i = 0; i < 0x400; i+=0x10) {
if (memcmp((void*)((uint64_t)pkt->kernarg_address+i), "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", 0x10) == 0) break;
}
hexdump((void*)pkt->kernarg_address, i+0x10);
} else if ((pkt->header&0xFF) == HSA_PACKET_TYPE_BARRIER_AND) {
hsa_barrier_and_packet_t *pkt_and = (hsa_barrier_and_packet_t *)(ring_base_address+value*0x40);
D("HSA_PACKET_TYPE_BARRIER_AND completion_signal:0x%lx\n", pkt_and->completion_signal.handle);
//hexdump((void*)(ring_base_address+value*0x40), 0x40);
} else if ((pkt->header&0xFF) == HSA_PACKET_TYPE_VENDOR_SPECIFIC) {
D("HSA_PACKET_TYPE_VENDOR_SPECIFIC\n");
hexdump((void*)(ring_base_address+value*0x40), 0x40);
} else {
hexdump((void*)(ring_base_address+value*0x40), 0x40);
}
}
mprotect((void *)((uint64_t)si->si_addr & ~0xFFF), 0x2000, PROT_READ | PROT_WRITE);
if (store_size == 8) {
*(volatile uint64_t*)(si->si_addr) = value;
} else if (store_size == 4) {
*(volatile uint32_t*)(si->si_addr) = value;
} else if (store_size == 2) {
*(volatile uint16_t*)(si->si_addr) = value;
} else {
D("store size not supported\n");
exit(-1);
}
mprotect((void *)((uint64_t)si->si_addr & ~0xFFF), 0x2000, PROT_NONE);
}
void register_sigsegv_handler() {
struct sigaction sa = {0};
sa.sa_flags = SA_SIGINFO;
sigemptyset(&sa.sa_mask);
sa.sa_sigaction = handler;
if (sigaction(SIGSEGV, &sa, NULL) == -1) {
D("ERROR: failed to register sigsegv handler");
exit(-1);
}
// NOTE: python (or ocl runtime?) blocks the SIGSEGV signal
sigset_t x;
sigemptyset(&x);
sigaddset(&x, SIGSEGV);
sigprocmask(SIG_UNBLOCK, &x, NULL);
}
int (*my_open)(const char *pathname, int flags, mode_t mode);
#undef open
int open(const char *pathname, int flags, mode_t mode) {
if (my_open == NULL) my_open = reinterpret_cast<decltype(my_open)>(dlsym(RTLD_NEXT, "open"));
int ret = my_open(pathname, flags, mode);
//D("open %s (0o%o) = %d\n", pathname, flags, ret);
files[ret] = pathname;
return ret;
}
int (*my_open64)(const char *pathname, int flags, mode_t mode);
#undef open
int open64(const char *pathname, int flags, mode_t mode) {
if (my_open64 == NULL) my_open64 = reinterpret_cast<decltype(my_open64)>(dlsym(RTLD_NEXT, "open64"));
int ret = my_open64(pathname, flags, mode);
//D("open %s (0o%o) = %d\n", pathname, flags, ret);
files[ret] = pathname;
return ret;
}
void *(*my_mmap)(void *addr, size_t length, int prot, int flags, int fd, off_t offset);
#undef mmap
void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) {
if (my_mmap == NULL) my_mmap = reinterpret_cast<decltype(my_mmap)>(dlsym(RTLD_NEXT, "mmap"));
void *ret = my_mmap(addr, length, prot, flags, fd, offset);
if (doorbell_offset != -1 && offset == doorbell_offset) {
D("HIDDEN DOORBELL %p, handled by %p\n", addr, handler);
register_sigsegv_handler();
mprotect(addr, length, PROT_NONE);
}
if (fd != -1) D("mmapped %p (target %p) with flags 0x%x length 0x%zx fd %d %s offset 0x%lx\n", ret, addr, flags, length, fd, files[fd].c_str(), offset);
return ret;
}
void *(*my_mmap64)(void *addr, size_t length, int prot, int flags, int fd, off_t offset);
#undef mmap64
void *mmap64(void *addr, size_t length, int prot, int flags, int fd, off_t offset) { return mmap(addr, length, prot, flags, fd, offset); }
int ioctl_num = 1;
int (*my_ioctl)(int filedes, unsigned long request, void *argp) = NULL;
#undef ioctl
int ioctl(int filedes, unsigned long request, void *argp) {
if (my_ioctl == NULL) my_ioctl = reinterpret_cast<decltype(my_ioctl)>(dlsym(RTLD_NEXT, "ioctl"));
int ret = 0;
ret = my_ioctl(filedes, request, argp);
if (!files.count(filedes)) return ret;
uint8_t type = (request >> 8) & 0xFF;
uint8_t nr = (request >> 0) & 0xFF;
uint16_t size = (request >> 16) & 0xFFF;
D("%3d: %d = %3d(%20s) 0x%3x ", ioctl_num, ret, filedes, files[filedes].c_str(), size);
if (request == AMDKFD_IOC_SET_EVENT) {
kfd_ioctl_set_event_args *args = (kfd_ioctl_set_event_args *)argp;
D("AMDKFD_IOC_SET_EVENT event_id:%d", args->event_id);
} else if (request == AMDKFD_IOC_ALLOC_MEMORY_OF_GPU) {
kfd_ioctl_alloc_memory_of_gpu_args *args = (kfd_ioctl_alloc_memory_of_gpu_args *)argp;
D("AMDKFD_IOC_ALLOC_MEMORY_OF_GPU va_addr:0x%llx size:0x%llx handle:%llX gpu_id:0x%x", args->va_addr, args->size, args->handle, args->gpu_id);
} else if (request == AMDKFD_IOC_MAP_MEMORY_TO_GPU) {
kfd_ioctl_map_memory_to_gpu_args *args = (kfd_ioctl_map_memory_to_gpu_args *)argp;
D("AMDKFD_IOC_MAP_MEMORY_TO_GPU handle:%llX", args->handle);
} else if (request == AMDKFD_IOC_CREATE_EVENT) {
kfd_ioctl_create_event_args *args = (kfd_ioctl_create_event_args *)argp;
D("AMDKFD_IOC_CREATE_EVENT event_page_offset:0x%llx event_type:%d event_id:%d", args->event_page_offset, args->event_type, args->event_id);
} else if (request == AMDKFD_IOC_WAIT_EVENTS) {
D("AMDKFD_IOC_WAIT_EVENTS");
} else if (request == AMDKFD_IOC_SET_XNACK_MODE) {
D("AMDKFD_IOC_SET_XNACK_MODE");
} else if (request == AMDKFD_IOC_SVM || (type == 0x4b && nr == 0x20)) {
// NOTE: this one is variable length
kfd_ioctl_svm_args *args = (kfd_ioctl_svm_args *)argp;
D("AMDKFD_IOC_SVM start_addr:0x%llx size:0x%llx op:%d", args->start_addr, args->size, args->op);
} else if (request == AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU) {
kfd_ioctl_unmap_memory_from_gpu_args *args = (kfd_ioctl_unmap_memory_from_gpu_args *)argp;
D("AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU handle:%llX", args->handle);
} else if (request == AMDKFD_IOC_FREE_MEMORY_OF_GPU) {
D("AMDKFD_IOC_FREE_MEMORY_OF_GPU");
} else if (request == AMDKFD_IOC_SET_SCRATCH_BACKING_VA) {
D("AMDKFD_IOC_SET_SCRATCH_BACKING_VA");
} else if (request == AMDKFD_IOC_GET_TILE_CONFIG) {
D("AMDKFD_IOC_GET_TILE_CONFIG");
} else if (request == AMDKFD_IOC_SET_TRAP_HANDLER) {
D("AMDKFD_IOC_SET_TRAP_HANDLER");
} else if (request == AMDKFD_IOC_GET_VERSION) {
kfd_ioctl_get_version_args *args = (kfd_ioctl_get_version_args *)argp;
D("AMDKFD_IOC_GET_VERSION major_version:%d minor_version:%d", args->major_version, args->minor_version);
} else if (request == AMDKFD_IOC_GET_PROCESS_APERTURES_NEW) {
D("AMDKFD_IOC_GET_PROCESS_APERTURES_NEW");
} else if (request == AMDKFD_IOC_ACQUIRE_VM) {
D("AMDKFD_IOC_ACQUIRE_VM");
} else if (request == AMDKFD_IOC_SET_MEMORY_POLICY) {
D("AMDKFD_IOC_SET_MEMORY_POLICY");
} else if (request == AMDKFD_IOC_GET_CLOCK_COUNTERS) {
D("AMDKFD_IOC_GET_CLOCK_COUNTERS");
} else if (request == AMDKFD_IOC_CREATE_QUEUE) {
kfd_ioctl_create_queue_args *args = (kfd_ioctl_create_queue_args *)argp;
D("AMDKFD_IOC_CREATE_QUEUE\n");
D("queue_type:%d ring_base_address:0x%llx\n", args->queue_type, args->ring_base_address);
D("eop_buffer_address:0x%llx ctx_save_restore_address:0x%llx\n", args->eop_buffer_address, args->ctx_save_restore_address);
D("ring_size:0x%x queue_priority:%d\n", args->ring_size, args->queue_priority);
D("RETURNS write_pointer_address:0x%llx read_pointer_address:0x%llx doorbell_offset:0x%llx queue_id:%d\n", args->write_pointer_address, args->read_pointer_address, args->doorbell_offset, args->queue_id);
//D("RETURNS *write_pointer_address:0x%llx *read_pointer_address:0x%llx\n", *(uint64_t*)args->write_pointer_address, *(uint64_t*)args->read_pointer_address);
ring_base_addresses[args->doorbell_offset&0xFFF] = args->ring_base_address;
queue_types[args->doorbell_offset&0xFFF] = args->queue_type;
doorbell_offset = args->doorbell_offset&~0xFFF;
} else {
D("type:0x%x nr:0x%x size:0x%x", type, nr, size);
}
D("\n");
ioctl_num++;
return ret;
}
}

View File

@@ -11,8 +11,8 @@ from tinygrad.runtime.support.compiler_amd import amdgpu_disassemble
from tinygrad.renderer import ProgramSpec
from tinygrad.engine.realize import CompiledRunner
from extra.assembly.rdna3.autogen import *
from extra.assembly.rdna3.asm import waitcnt
from extra.assembly.amd.autogen.rdna3 import *
from extra.assembly.amd.asm import waitcnt
from test.testextra.test_cfg_viz import template
def get_output(asm:list, n_threads:int=1, vdst:VGPR=v[1]):

View File

@@ -21,7 +21,7 @@ class PythonRemu:
rsrc2: int = 0x19c # Default: USER_SGPR_COUNT=14, enable X and Y workgroup IDs
def run_asm(self, lib: int, lib_sz: int, gx: int, gy: int, gz: int, lx: int, ly: int, lz: int, args_ptr: int) -> int:
from extra.assembly.rdna3.emu import run_asm, set_valid_mem_ranges
from extra.assembly.amd.emu import run_asm, set_valid_mem_ranges
# Pad ranges to handle GPU loads that may read past small buffers (e.g. s_load_b128 on 12-byte buffer)
set_valid_mem_ranges({(start, size + 4096) for start, size in self.valid_mem_ranges})
return run_asm(lib, lib_sz, gx, gy, gz, lx, ly, lz, args_ptr, self.rsrc2)

View File

@@ -10,7 +10,7 @@ from tinygrad.renderer import ProgramSpec
from tinygrad.helpers import TracingKey, getenv
from tinygrad.engine.realize import ExecItem, CompiledRunner
from extra.assembly.rdna3.autogen import *
from extra.assembly.amd.autogen.rdna3 import *
# TODO: use the RDNA3 renderer when it's in master
template = """.text