mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-07 22:23:55 -05:00
rename to extra.assembly.amd (#13879)
This commit is contained in:
22
.github/workflows/test.yml
vendored
22
.github/workflows/test.yml
vendored
@@ -655,7 +655,7 @@ jobs:
|
||||
uses: ./.github/actions/process-replay
|
||||
|
||||
testrdna3:
|
||||
name: RDNA3 IDE
|
||||
name: AMD ASM IDE
|
||||
runs-on: ubuntu-24.04
|
||||
timeout-minutes: 10
|
||||
steps:
|
||||
@@ -674,19 +674,27 @@ jobs:
|
||||
sudo apt-get update
|
||||
sudo apt-get install llvm-21 llvm-21-tools cloc
|
||||
- name: RDNA3 Line Count
|
||||
run: cloc --by-file extra/assembly/rdna3/*.py
|
||||
run: cloc --by-file extra/assembly/amd/*.py
|
||||
- name: Run RDNA3 emulator tests
|
||||
run: python -m pytest -n=auto extra/assembly/rdna3/ --durations 20
|
||||
run: python -m pytest -n=auto extra/assembly/amd/ --durations 20
|
||||
- name: Install pdfplumber
|
||||
run: pip install pdfplumber
|
||||
- name: Verify RDNA3 autogen is up to date
|
||||
run: |
|
||||
python -m extra.assembly.rdna3.lib
|
||||
git diff --exit-code extra/assembly/rdna3/autogen/__init__.py
|
||||
python -m extra.assembly.amd.lib --arch rdna3
|
||||
git diff --exit-code extra/assembly/amd/autogen/rdna3/__init__.py
|
||||
- name: Verify CDNA4 autogen is up to date
|
||||
run: |
|
||||
python -m extra.assembly.amd.lib --arch cdna4
|
||||
git diff --exit-code extra/assembly/amd/autogen/cdna4/__init__.py
|
||||
- name: Verify RDNA3 pcode autogen is up to date
|
||||
run: |
|
||||
python -m extra.assembly.rdna3.pcode
|
||||
git diff --exit-code extra/assembly/rdna3/autogen/gen_pcode.py
|
||||
python -m extra.assembly.amd.pcode --arch rdna3
|
||||
git diff --exit-code extra/assembly/amd/autogen/rdna3/gen_pcode.py
|
||||
- name: Verify CDNA4 pcode autogen is up to date
|
||||
run: |
|
||||
python -m extra.assembly.amd.pcode --arch cdna4
|
||||
git diff --exit-code extra/assembly/amd/autogen/cdna4/gen_pcode.py
|
||||
|
||||
testnvidia:
|
||||
strategy:
|
||||
|
||||
@@ -76,10 +76,12 @@ VIZ=1 python -c "from tinygrad import Tensor; Tensor.ones(10).sum().realize()"
|
||||
## Auto-generated Files (DO NOT EDIT)
|
||||
|
||||
The following files are auto-generated and should never be edited manually:
|
||||
- `extra/assembly/rdna3/autogen/gen_pcode.py` - Generated by `python -m extra.assembly.rdna3.pcode`
|
||||
- `extra/assembly/rdna3/autogen/__init__.py` - Generated from AMD ISA definitions
|
||||
- `extra/assembly/amd/autogen/rdna3/__init__.py` - Generated by `python -m extra.assembly.amd.lib --arch rdna3`
|
||||
- `extra/assembly/amd/autogen/rdna3/gen_pcode.py` - Generated by `python -m extra.assembly.amd.pcode --arch rdna3`
|
||||
- `extra/assembly/amd/autogen/cdna4/__init__.py` - Generated by `python -m extra.assembly.amd.lib --arch cdna4`
|
||||
- `extra/assembly/amd/autogen/cdna4/gen_pcode.py` - Generated by `python -m extra.assembly.amd.pcode --arch cdna4`
|
||||
|
||||
To add missing instruction implementations, add them to `extra/assembly/rdna3/emu.py` instead.
|
||||
To add missing instruction implementations, add them to `extra/assembly/amd/emu.py` instead.
|
||||
|
||||
## Style Notes
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# RDNA3 assembler and disassembler
|
||||
from __future__ import annotations
|
||||
import re
|
||||
from extra.assembly.rdna3.lib import Inst, RawImm, Reg, SGPR, VGPR, TTMP, s, v, ttmp, _RegFactory, FLOAT_ENC, SRC_FIELDS, unwrap
|
||||
from extra.assembly.amd.lib import Inst, RawImm, Reg, SGPR, VGPR, TTMP, s, v, ttmp, _RegFactory, FLOAT_ENC, SRC_FIELDS, unwrap
|
||||
|
||||
# Decoding helpers
|
||||
SPECIAL_GPRS = {106: "vcc_lo", 107: "vcc_hi", 124: "null", 125: "m0", 126: "exec_lo", 127: "exec_hi", 253: "scc"}
|
||||
@@ -91,7 +91,7 @@ def disasm(inst: Inst) -> str:
|
||||
# VOP3 and VOP3SD share encoding - check opcode to determine which
|
||||
is_vop3sd = cls_name == 'VOP3' and op_val in VOP3SD_OPCODES
|
||||
try:
|
||||
from extra.assembly.rdna3 import autogen
|
||||
from extra.assembly.amd.autogen import rdna3 as autogen
|
||||
if is_vop3sd:
|
||||
op_name = autogen.VOP3SDOp(op_val).name.lower()
|
||||
else:
|
||||
@@ -347,7 +347,7 @@ def disasm(inst: Inst) -> str:
|
||||
|
||||
# VOPD: dual-issue instructions
|
||||
if cls_name == 'VOPD':
|
||||
from extra.assembly.rdna3 import autogen
|
||||
from extra.assembly.amd.autogen import rdna3 as autogen
|
||||
opx, opy, vdstx, vdsty_enc = [unwrap(inst._values.get(f, 0)) for f in ('opx', 'opy', 'vdstx', 'vdsty')]
|
||||
srcx0, vsrcx1, srcy0, vsrcy1 = [unwrap(inst._values.get(f, 0)) for f in ('srcx0', 'vsrcx1', 'srcy0', 'vsrcy1')]
|
||||
vdsty = (vdsty_enc << 1) | ((vdstx & 1) ^ 1) # Decode vdsty
|
||||
@@ -505,7 +505,7 @@ SOPK_IMM_FIRST = {'s_setreg_b32'}
|
||||
SOPK_UNSUPPORTED = {'s_setreg_imm32_b32'}
|
||||
|
||||
def asm(text: str) -> Inst:
|
||||
from extra.assembly.rdna3 import autogen
|
||||
from extra.assembly.amd.autogen import rdna3 as autogen
|
||||
text = text.strip()
|
||||
clamp = 'clamp' in text.lower()
|
||||
if clamp: text = re.sub(r'\s+clamp\s*$', '', text, flags=re.I)
|
||||
3441
extra/assembly/amd/autogen/cdna4/__init__.py
Normal file
3441
extra/assembly/amd/autogen/cdna4/__init__.py
Normal file
File diff suppressed because it is too large
Load Diff
1630
extra/assembly/amd/autogen/cdna4/gen_pcode.py
Normal file
1630
extra/assembly/amd/autogen/cdna4/gen_pcode.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,7 +1,7 @@
|
||||
# autogenerated from AMD RDNA3.5 ISA PDF by lib.py - do not edit
|
||||
from enum import IntEnum
|
||||
from typing import Annotated
|
||||
from extra.assembly.rdna3.lib import bits, BitField, Inst32, Inst64, SGPR, VGPR, TTMP as TTMP, s as s, v as v, ttmp as ttmp, SSrc, Src, SImm, Imm, VDSTYEnc, SGPRField, VGPRField
|
||||
from extra.assembly.amd.lib import bits, BitField, Inst32, Inst64, SGPR, VGPR, TTMP as TTMP, s as s, v as v, ttmp as ttmp, SSrc, Src, SImm, Imm, VDSTYEnc, SGPRField, VGPRField
|
||||
import functools
|
||||
|
||||
class SrcEnum(IntEnum):
|
||||
@@ -1,9 +1,9 @@
|
||||
# autogenerated by pcode.py - do not edit
|
||||
# to regenerate: python -m extra.assembly.rdna3.pcode
|
||||
# to regenerate: python -m extra.assembly.amd.pcode --arch rdna3
|
||||
# ruff: noqa: E501,F405,F403
|
||||
# mypy: ignore-errors
|
||||
from extra.assembly.rdna3.autogen import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp
|
||||
from extra.assembly.rdna3.pcode import *
|
||||
from extra.assembly.amd.autogen.rdna3 import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp
|
||||
from extra.assembly.amd.pcode import *
|
||||
|
||||
def _SOP1Op_S_MOV_B32(s0, s1, s2, d0, scc, vcc, lane, exec_mask, literal, VGPR, _vars, src0_idx=0, vdst_idx=0):
|
||||
# D0.b32 = S0.b32
|
||||
@@ -2,10 +2,10 @@
|
||||
# mypy: ignore-errors
|
||||
from __future__ import annotations
|
||||
import ctypes, os
|
||||
from extra.assembly.rdna3.lib import Inst, RawImm
|
||||
from extra.assembly.rdna3.pcode import _f32, _i32, _sext, _f16, _i16, _f64, _i64
|
||||
from extra.assembly.rdna3.autogen.gen_pcode import get_compiled_functions
|
||||
from extra.assembly.rdna3.autogen import (
|
||||
from extra.assembly.amd.lib import Inst, RawImm
|
||||
from extra.assembly.amd.pcode import _f32, _i32, _sext, _f16, _i16, _f64, _i64
|
||||
from extra.assembly.amd.autogen.rdna3.gen_pcode import get_compiled_functions
|
||||
from extra.assembly.amd.autogen.rdna3 import (
|
||||
SOP1, SOP2, SOPC, SOPK, SOPP, SMEM, VOP1, VOP2, VOP3, VOP3SD, VOP3P, VOPC, DS, FLAT, VOPD, SrcEnum,
|
||||
SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, SMEMOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp, DSOp, FLATOp, GLOBALOp, VOPDOp
|
||||
)
|
||||
@@ -217,7 +217,7 @@ class Inst:
|
||||
# op may be an enum (from __init__) or an int (from from_int)
|
||||
op_name = op.name if hasattr(op, 'name') else None
|
||||
if op_name is None and self.__class__.__name__ == 'VOP3':
|
||||
from extra.assembly.rdna3.autogen import VOP3Op
|
||||
from extra.assembly.amd.autogen.rdna3 import VOP3Op
|
||||
try: op_name = VOP3Op(op).name
|
||||
except ValueError: pass
|
||||
if op_name is None: return False
|
||||
@@ -277,7 +277,7 @@ class Inst:
|
||||
def __hash__(self): return hash((self.__class__.__name__, tuple(sorted((k, repr(v)) for k, v in self._values.items())), self._literal))
|
||||
|
||||
def disasm(self) -> str:
|
||||
from extra.assembly.rdna3.asm import disasm
|
||||
from extra.assembly.amd.asm import disasm
|
||||
return disasm(self)
|
||||
|
||||
class Inst32(Inst): pass
|
||||
@@ -459,7 +459,7 @@ def generate(output_path: str | None = None, arch: str = "rdna3") -> dict:
|
||||
def field_key(f): return order.index(f[0].lower()) if f[0].lower() in order else 1000
|
||||
lines = [f"# autogenerated from AMD {doc_name} ISA PDF by lib.py - do not edit", "from enum import IntEnum",
|
||||
"from typing import Annotated",
|
||||
"from extra.assembly.rdna3.lib import bits, BitField, Inst32, Inst64, SGPR, VGPR, TTMP as TTMP, s as s, v as v, ttmp as ttmp, SSrc, Src, SImm, Imm, VDSTYEnc, SGPRField, VGPRField",
|
||||
"from extra.assembly.amd.lib import bits, BitField, Inst32, Inst64, SGPR, VGPR, TTMP as TTMP, s as s, v as v, ttmp as ttmp, SSrc, Src, SImm, Imm, VDSTYEnc, SGPRField, VGPRField",
|
||||
"import functools", ""]
|
||||
lines += enum_lines("SrcEnum", src_enum) + sum([enum_lines(n, ops) for n, ops in sorted(enums.items())], [])
|
||||
# Format-specific field defaults (verified against LLVM test vectors)
|
||||
@@ -521,5 +521,5 @@ if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Generate instruction definitions from AMD ISA PDF")
|
||||
parser.add_argument("--arch", choices=list(PDF_URLS.keys()), default="rdna3", help="Target architecture (default: rdna3)")
|
||||
args = parser.parse_args()
|
||||
result = generate("extra/assembly/rdna3/autogen/__init__.py", arch=args.arch)
|
||||
result = generate(f"extra/assembly/amd/autogen/{args.arch}/__init__.py", arch=args.arch)
|
||||
print(f"generated SrcEnum ({len(result['src_enum'])}) + {len(result['enums'])} opcode enums + {len(result['formats'])} format classes")
|
||||
@@ -702,7 +702,7 @@ class ExecContext:
|
||||
# PDF EXTRACTION AND CODE GENERATION
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
from extra.assembly.rdna3.lib import PDF_URLS
|
||||
from extra.assembly.amd.lib import PDF_URLS
|
||||
INST_PATTERN = re.compile(r'^([SV]_[A-Z0-9_]+)\s+(\d+)\s*$', re.M)
|
||||
|
||||
# Patterns that can't be handled by the DSL (require special handling in emu.py)
|
||||
@@ -740,7 +740,7 @@ def parse_pseudocode_from_pdf(arch: str = "rdna3") -> dict:
|
||||
"""Parse pseudocode from PDF for all ops. Returns {enum_cls: {op: pseudocode}}."""
|
||||
import pdfplumber
|
||||
from tinygrad.helpers import fetch
|
||||
from extra.assembly.rdna3.autogen import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp
|
||||
from extra.assembly.amd.autogen.rdna3 import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp
|
||||
|
||||
OP_ENUMS = [SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp]
|
||||
defined_ops = {}
|
||||
@@ -783,10 +783,10 @@ def parse_pseudocode_from_pdf(arch: str = "rdna3") -> dict:
|
||||
|
||||
return instructions
|
||||
|
||||
def generate_gen_pcode(output_path: str = "extra/assembly/rdna3/autogen/gen_pcode.py", arch: str = "rdna3"):
|
||||
def generate_gen_pcode(output_path: str = "extra/assembly/amd/autogen/rdna3/gen_pcode.py", arch: str = "rdna3"):
|
||||
"""Generate gen_pcode.py - compiled pseudocode functions for the emulator."""
|
||||
from pathlib import Path
|
||||
from extra.assembly.rdna3.autogen import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp
|
||||
from extra.assembly.amd.autogen.rdna3 import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp
|
||||
|
||||
OP_ENUMS = [SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp]
|
||||
|
||||
@@ -803,12 +803,12 @@ def generate_gen_pcode(output_path: str = "extra/assembly/rdna3/autogen/gen_pcod
|
||||
print(f"Total: {total_found}/{total_ops} ({100*total_found//total_ops}%)")
|
||||
|
||||
print("\nCompiling to pseudocode functions...")
|
||||
lines = ['''# autogenerated by pcode.py - do not edit
|
||||
# to regenerate: python -m extra.assembly.rdna3.pcode
|
||||
lines = [f'''# autogenerated by pcode.py - do not edit
|
||||
# to regenerate: python -m extra.assembly.amd.pcode --arch {arch}
|
||||
# ruff: noqa: E501,F405,F403
|
||||
# mypy: ignore-errors
|
||||
from extra.assembly.rdna3.autogen import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp
|
||||
from extra.assembly.rdna3.pcode import *
|
||||
from extra.assembly.amd.autogen.{arch} import SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3SDOp, VOP3POp, VOPCOp
|
||||
from extra.assembly.amd.pcode import *
|
||||
''']
|
||||
|
||||
compiled_count, skipped_count = 0, 0
|
||||
@@ -989,4 +989,4 @@ if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Generate pseudocode functions from AMD ISA PDF")
|
||||
parser.add_argument("--arch", choices=list(PDF_URLS.keys()), default="rdna3", help="Target architecture (default: rdna3)")
|
||||
args = parser.parse_args()
|
||||
generate_gen_pcode(arch=args.arch)
|
||||
generate_gen_pcode(output_path=f"extra/assembly/amd/autogen/{args.arch}/gen_pcode.py", arch=args.arch)
|
||||
@@ -7,7 +7,7 @@ from typing import Callable
|
||||
# Set AMD=1 before importing tinygrad
|
||||
os.environ["AMD"] = "1"
|
||||
|
||||
from extra.assembly.rdna3.emu import run_asm as python_run_asm, set_valid_mem_ranges, decode_program, step_wave, WaveState, WAVE_SIZE
|
||||
from extra.assembly.amd.emu import run_asm as python_run_asm, set_valid_mem_ranges, decode_program, step_wave, WaveState, WAVE_SIZE
|
||||
|
||||
REMU_PATH = Path(__file__).parents[3] / "remu/target/release/libremu.so"
|
||||
if not REMU_PATH.exists():
|
||||
@@ -3,8 +3,8 @@
|
||||
# Currently many of these tests fail - they document desired behavior
|
||||
|
||||
import unittest
|
||||
from extra.assembly.rdna3.autogen import *
|
||||
from extra.assembly.rdna3.lib import Inst, RawImm, SGPR, VGPR
|
||||
from extra.assembly.amd.autogen.rdna3 import *
|
||||
from extra.assembly.amd.lib import Inst, RawImm, SGPR, VGPR
|
||||
|
||||
class TestRegisterSliceSyntax(unittest.TestCase):
|
||||
"""
|
||||
@@ -9,8 +9,8 @@ os.environ["AMD"] = "1"
|
||||
os.environ["MOCKGPU"] = "1"
|
||||
os.environ["PYTHON_REMU"] = "1"
|
||||
|
||||
from extra.assembly.rdna3.emu import WaveState, decode_program, step_wave, WAVE_SIZE
|
||||
from extra.assembly.rdna3.test.helpers import KernelInfo
|
||||
from extra.assembly.amd.emu import WaveState, decode_program, step_wave, WAVE_SIZE, set_valid_mem_ranges
|
||||
from extra.assembly.amd.test.helpers import KernelInfo
|
||||
|
||||
REMU_PATH = Path(__file__).parents[3] / "remu/target/release/libremu.so"
|
||||
|
||||
@@ -223,7 +223,6 @@ def run_single_kernel(kernel: bytes, n_lanes: int, args_ptr: int, global_size: t
|
||||
def compare_emulators_multi_kernel(kernels: list[KernelInfo], buf_pool: dict[int, int], max_steps: int = 1000,
|
||||
debug: bool = False, trace_len: int = 10, buf_data: dict[int, bytes] | None = None) -> tuple[bool, str]:
|
||||
"""Run all kernels through both emulators with shared buffer pool."""
|
||||
from extra.assembly.rdna3.emu import set_valid_mem_ranges, decode_program
|
||||
if buf_data is None: buf_data = {}
|
||||
|
||||
# Allocate shared buffer pool with padding for over-reads (GPU loads up to 16 bytes at once)
|
||||
@@ -267,8 +266,6 @@ def compare_emulators_multi_kernel(kernels: list[KernelInfo], buf_pool: dict[int
|
||||
def compare_emulators_with_memory(kernel: bytes, n_lanes: int, buf_sizes: list, max_steps: int = 1000, debug: bool = False,
|
||||
global_size: tuple[int, int, int] = (1, 1, 1), trace_len: int = 10) -> tuple[bool, str]:
|
||||
"""Run both emulators with memory set up for tinygrad kernels, executing all workgroups. Legacy wrapper."""
|
||||
from extra.assembly.rdna3.emu import set_valid_mem_ranges, decode_program
|
||||
|
||||
# Allocate buffers
|
||||
buffers = []
|
||||
for size in buf_sizes:
|
||||
@@ -6,10 +6,10 @@ Set USE_HW=1 to run on both emulator and real hardware, comparing results.
|
||||
"""
|
||||
|
||||
import ctypes, unittest, os, struct
|
||||
from extra.assembly.rdna3.autogen import *
|
||||
from extra.assembly.rdna3.lib import RawImm
|
||||
from extra.assembly.rdna3.emu import WaveState, run_asm, set_valid_mem_ranges
|
||||
from extra.assembly.rdna3.pcode import _i32, _f32
|
||||
from extra.assembly.amd.autogen.rdna3 import *
|
||||
from extra.assembly.amd.lib import RawImm
|
||||
from extra.assembly.amd.emu import WaveState, run_asm, set_valid_mem_ranges
|
||||
from extra.assembly.amd.pcode import _i32, _f32
|
||||
|
||||
VCC = SrcEnum.VCC_LO # For VOP3SD sdst field
|
||||
USE_HW = os.environ.get("USE_HW", "0") == "1"
|
||||
@@ -1776,7 +1776,7 @@ class TestF16Conversions(unittest.TestCase):
|
||||
|
||||
def test_v_cvt_f16_f32_basic(self):
|
||||
"""V_CVT_F16_F32 converts f32 to f16 in low 16 bits."""
|
||||
from extra.assembly.rdna3.pcode import _f16
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], 1.0), # f32 1.0 = 0x3f800000
|
||||
v_cvt_f16_f32_e32(v[1], v[0]),
|
||||
@@ -1789,7 +1789,7 @@ class TestF16Conversions(unittest.TestCase):
|
||||
|
||||
def test_v_cvt_f16_f32_negative(self):
|
||||
"""V_CVT_F16_F32 converts negative f32 to f16."""
|
||||
from extra.assembly.rdna3.pcode import _f16
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], -2.0), # f32 -2.0 = 0xc0000000
|
||||
v_cvt_f16_f32_e32(v[1], v[0]),
|
||||
@@ -1802,7 +1802,7 @@ class TestF16Conversions(unittest.TestCase):
|
||||
|
||||
def test_v_cvt_f16_f32_small(self):
|
||||
"""V_CVT_F16_F32 converts small f32 value."""
|
||||
from extra.assembly.rdna3.pcode import _f16, f32_to_f16
|
||||
from extra.assembly.amd.pcode import _f16, f32_to_f16
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], 0.5),
|
||||
v_cvt_f16_f32_e32(v[1], v[0]),
|
||||
@@ -1862,7 +1862,7 @@ class TestF16Conversions(unittest.TestCase):
|
||||
which would produce wrong results when the significant bits of the f32 value are
|
||||
in the upper bits (as they are for most f32 values > 1.0 or < -1.0).
|
||||
"""
|
||||
from extra.assembly.rdna3.pcode import _f16
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
# Use f32 value 1.5 = 0x3fc00000. If only low 16 bits (0x0000) are read, result is wrong.
|
||||
# Correct f16 result: 0x3e00 (1.5 in half precision)
|
||||
instructions = [
|
||||
@@ -1886,7 +1886,7 @@ class TestF16Conversions(unittest.TestCase):
|
||||
is in the name), causing it to read only low 16 bits of the f32 input.
|
||||
This resulted in WMMA receiving zero inputs and producing zero outputs.
|
||||
"""
|
||||
from extra.assembly.rdna3.pcode import _f16
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
# Simulate loading two f32 values and converting/packing for WMMA
|
||||
# f32 1.5 = 0x3fc00000, f32 2.5 = 0x40200000
|
||||
# After CVT: f16 1.5 = 0x3e00, f16 2.5 = 0x4100
|
||||
@@ -1914,7 +1914,7 @@ class TestF16Conversions(unittest.TestCase):
|
||||
|
||||
def test_v_pack_b32_f16_basic(self):
|
||||
"""V_PACK_B32_F16 packs two f16 values into one 32-bit register."""
|
||||
from extra.assembly.rdna3.pcode import _f16
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
instructions = [
|
||||
# First convert two f32 values to f16
|
||||
v_mov_b32_e32(v[0], 1.0), # Will become f16 0x3c00
|
||||
@@ -1934,7 +1934,7 @@ class TestF16Conversions(unittest.TestCase):
|
||||
|
||||
def test_v_pack_b32_f16_both_positive(self):
|
||||
"""V_PACK_B32_F16 packs two positive f16 values."""
|
||||
from extra.assembly.rdna3.pcode import _f16
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], 0.5), # f16 0x3800
|
||||
v_mov_b32_e32(v[2], 2.0), # f16 0x4000
|
||||
@@ -2186,7 +2186,7 @@ class TestVOP3P(unittest.TestCase):
|
||||
|
||||
def test_v_pk_add_f16_basic(self):
|
||||
"""V_PK_ADD_F16 adds two packed f16 values."""
|
||||
from extra.assembly.rdna3.pcode import _f16
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
# v0 = packed (1.0, 2.0), v1 = packed (3.0, 4.0)
|
||||
# Result should be packed (4.0, 6.0)
|
||||
instructions = [
|
||||
@@ -2209,7 +2209,7 @@ class TestVOP3P(unittest.TestCase):
|
||||
Inline constants for VOP3P are f16 values in the low 16 bits only.
|
||||
The opsel_hi bits (default=0b11) select lo half for hi result, so both halves use the constant.
|
||||
"""
|
||||
from extra.assembly.rdna3.pcode import _f16
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
# v0 = packed (1.0, 1.0), add POS_ONE
|
||||
# With default opsel_hi=0b11: both lo and hi results use lo half of src1 (the constant)
|
||||
# But opsel_hi=1 means src1 hi comes from lo half - wait, let me check the actual encoding
|
||||
@@ -2230,7 +2230,7 @@ class TestVOP3P(unittest.TestCase):
|
||||
|
||||
def test_v_pk_mul_f16_basic(self):
|
||||
"""V_PK_MUL_F16 multiplies two packed f16 values."""
|
||||
from extra.assembly.rdna3.pcode import _f16
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
# v0 = packed (2.0, 3.0), v1 = packed (4.0, 5.0)
|
||||
# Result should be packed (8.0, 15.0)
|
||||
instructions = [
|
||||
@@ -2251,7 +2251,7 @@ class TestVOP3P(unittest.TestCase):
|
||||
"""V_PK_MUL_F16 with inline constant POS_TWO (2.0).
|
||||
Inline constant has value only in low 16 bits, hi is 0.
|
||||
"""
|
||||
from extra.assembly.rdna3.pcode import _f16
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
# v0 = packed (3.0, 4.0), multiply by POS_TWO
|
||||
# lo = 3.0 * 2.0 = 6.0, hi = 4.0 * 0.0 = 0.0 (inline const hi is 0)
|
||||
instructions = [
|
||||
@@ -2268,7 +2268,7 @@ class TestVOP3P(unittest.TestCase):
|
||||
|
||||
def test_v_pk_fma_f16_basic(self):
|
||||
"""V_PK_FMA_F16: D = A * B + C for packed f16."""
|
||||
from extra.assembly.rdna3.pcode import _f16
|
||||
from extra.assembly.amd.pcode import _f16
|
||||
# A = packed (2.0, 3.0), B = packed (4.0, 5.0), C = packed (1.0, 1.0)
|
||||
# Result should be packed (2*4+1=9.0, 3*5+1=16.0)
|
||||
instructions = [
|
||||
@@ -1,8 +1,8 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Test MUBUF, MTBUF, MIMG, EXP, DS formats against LLVM."""
|
||||
import unittest
|
||||
from extra.assembly.rdna3.autogen import *
|
||||
from extra.assembly.rdna3.lib import encode_src
|
||||
from extra.assembly.amd.autogen.rdna3 import *
|
||||
from extra.assembly.amd.lib import encode_src
|
||||
|
||||
class TestMUBUF(unittest.TestCase):
|
||||
"""Test MUBUF (buffer) instructions."""
|
||||
@@ -308,7 +308,7 @@ class TestVOP3Literal(unittest.TestCase):
|
||||
def test_vop3_with_literal(self):
|
||||
# v_add3_u32 v5, vcc_hi, 0xaf123456, v255
|
||||
# GFX11: encoding: [0x05,0x00,0x55,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf]
|
||||
from extra.assembly.rdna3.lib import RawImm
|
||||
from extra.assembly.amd.lib import RawImm
|
||||
inst = VOP3(VOP3Op.V_ADD3_U32, vdst=v[5], src0=RawImm(107), src1=0xaf123456, src2=v[255])
|
||||
expected = bytes([0x05,0x00,0x55,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf])
|
||||
self.assertEqual(inst.to_bytes(), expected)
|
||||
@@ -316,14 +316,14 @@ class TestVOP3Literal(unittest.TestCase):
|
||||
def test_vop3_literal_null_operand(self):
|
||||
# v_add3_u32 v5, null, exec_lo, 0xaf123456
|
||||
# GFX11: encoding: [0x05,0x00,0x55,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf]
|
||||
from extra.assembly.rdna3.lib import RawImm
|
||||
from extra.assembly.amd.lib import RawImm
|
||||
inst = VOP3(VOP3Op.V_ADD3_U32, vdst=v[5], src0=NULL, src1=RawImm(126), src2=0xaf123456)
|
||||
expected = bytes([0x05,0x00,0x55,0xd6,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf])
|
||||
self.assertEqual(inst.to_bytes(), expected)
|
||||
|
||||
def test_vop3p_with_literal(self):
|
||||
# Test VOP3P literal encoding (also uses Inst64)
|
||||
from extra.assembly.rdna3.lib import RawImm
|
||||
from extra.assembly.amd.lib import RawImm
|
||||
inst = VOP3P(VOP3POp.V_PK_ADD_F16, vdst=v[5], src0=RawImm(240), src1=0x12345678, src2=v[0])
|
||||
self.assertEqual(len(inst.to_bytes()), 12) # 8 bytes + 4 byte literal
|
||||
|
||||
@@ -2,10 +2,10 @@
|
||||
# the Inst constructor should be looking at the types of the fields to correctly set the value
|
||||
|
||||
import unittest, struct
|
||||
from extra.assembly.rdna3.autogen import *
|
||||
from extra.assembly.rdna3.lib import Inst
|
||||
from extra.assembly.rdna3.asm import asm
|
||||
from extra.assembly.rdna3.test.test_roundtrip import compile_asm
|
||||
from extra.assembly.amd.autogen.rdna3 import *
|
||||
from extra.assembly.amd.lib import Inst
|
||||
from extra.assembly.amd.asm import asm
|
||||
from extra.assembly.amd.test.test_roundtrip import compile_asm
|
||||
|
||||
class TestIntegration(unittest.TestCase):
|
||||
inst: Inst
|
||||
@@ -1,9 +1,9 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Integration test: round-trip RDNA3 assembly through AMD toolchain."""
|
||||
import unittest, re, io, sys, subprocess
|
||||
from extra.assembly.rdna3.autogen import *
|
||||
from extra.assembly.rdna3.asm import waitcnt, asm
|
||||
from extra.assembly.rdna3.test.helpers import get_llvm_mc
|
||||
from extra.assembly.amd.autogen.rdna3 import *
|
||||
from extra.assembly.amd.asm import waitcnt, asm
|
||||
from extra.assembly.amd.test.helpers import get_llvm_mc
|
||||
|
||||
def disassemble(lib: bytes, arch: str = "gfx1100") -> str:
|
||||
"""Disassemble ELF binary using tinygrad's compiler, return raw output."""
|
||||
@@ -2,9 +2,9 @@
|
||||
"""Test RDNA3 assembler/disassembler against LLVM test vectors."""
|
||||
import unittest, re, subprocess
|
||||
from tinygrad.helpers import fetch
|
||||
from extra.assembly.rdna3.autogen import *
|
||||
from extra.assembly.rdna3.asm import asm
|
||||
from extra.assembly.rdna3.test.helpers import get_llvm_mc
|
||||
from extra.assembly.amd.autogen.rdna3 import *
|
||||
from extra.assembly.amd.asm import asm
|
||||
from extra.assembly.amd.test.helpers import get_llvm_mc
|
||||
|
||||
LLVM_BASE = "https://raw.githubusercontent.com/llvm/llvm-project/main/llvm/test/MC/AMDGPU"
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Tests for the RDNA3 pseudocode DSL."""
|
||||
import unittest
|
||||
from extra.assembly.rdna3.pcode import Reg, TypedView, SliceProxy, ExecContext, compile_pseudocode, _expr, MASK32, MASK64, _f32, _i32, _f16, _i16, f32_to_f16, _isnan
|
||||
from extra.assembly.rdna3.autogen.gen_pcode import _VOP3SDOp_V_DIV_SCALE_F32, _VOPCOp_V_CMP_CLASS_F32
|
||||
from extra.assembly.amd.pcode import Reg, TypedView, SliceProxy, ExecContext, compile_pseudocode, _expr, MASK32, MASK64, _f32, _i32, _f16, _i16, f32_to_f16, _isnan
|
||||
from extra.assembly.amd.autogen.rdna3.gen_pcode import _VOP3SDOp_V_DIV_SCALE_F32, _VOPCOp_V_CMP_CLASS_F32
|
||||
|
||||
class TestReg(unittest.TestCase):
|
||||
def test_u32_read(self):
|
||||
@@ -1,7 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Test that PDF parser correctly extracts format fields."""
|
||||
import unittest, os
|
||||
from extra.assembly.rdna3.autogen import (
|
||||
from extra.assembly.amd.autogen.rdna3 import (
|
||||
SOP1, SOP2, SOPK, SOPP, VOP1, VOP2, VOP3SD, VOPC, FLAT, VOPD,
|
||||
SOP1Op, SOP2Op, VOP1Op, VOP3Op
|
||||
)
|
||||
@@ -41,7 +41,7 @@ class TestPDFParserGenerate(unittest.TestCase):
|
||||
|
||||
def test_pdf_parser(self):
|
||||
"""Single test that validates all PDF parser outputs."""
|
||||
from extra.assembly.rdna3.lib import generate
|
||||
from extra.assembly.amd.lib import generate
|
||||
result = generate()
|
||||
|
||||
# test_all_formats_present
|
||||
@@ -1,7 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
import unittest, subprocess
|
||||
from extra.assembly.rdna3.autogen import *
|
||||
from extra.assembly.rdna3.test.helpers import get_llvm_mc
|
||||
from extra.assembly.amd.autogen.rdna3 import *
|
||||
from extra.assembly.amd.test.helpers import get_llvm_mc
|
||||
|
||||
def llvm_assemble(asm: str) -> bytes:
|
||||
"""Assemble using llvm-mc and return bytes."""
|
||||
@@ -1,10 +1,10 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Roundtrip tests: generate tinygrad kernels, decode instructions, re-encode, verify match."""
|
||||
import unittest, io, sys, re, subprocess, os
|
||||
from extra.assembly.rdna3.autogen import *
|
||||
from extra.assembly.rdna3.lib import Inst
|
||||
from extra.assembly.rdna3.asm import asm
|
||||
from extra.assembly.rdna3.test.helpers import get_llvm_mc, get_llvm_objdump
|
||||
from extra.assembly.amd.autogen.rdna3 import *
|
||||
from extra.assembly.amd.lib import Inst
|
||||
from extra.assembly.amd.asm import asm
|
||||
from extra.assembly.amd.test.helpers import get_llvm_mc, get_llvm_objdump
|
||||
|
||||
# Instruction format detection based on encoding bits
|
||||
def detect_format(data: bytes) -> type[Inst] | None:
|
||||
@@ -140,7 +140,7 @@ class TestTinygradKernelRoundtrip(unittest.TestCase):
|
||||
2. asm(disasm()) matches LLVM output
|
||||
3. our disasm() matches LLVM's disassembly string exactly
|
||||
"""
|
||||
from extra.assembly.rdna3.test.test_compare_emulators import get_kernels_from_tinygrad
|
||||
from extra.assembly.amd.test.test_compare_emulators import get_kernels_from_tinygrad
|
||||
from tinygrad.runtime.support.compiler_amd import HIPCompiler
|
||||
|
||||
kernels, _, _ = get_kernels_from_tinygrad(op_fn)
|
||||
4
extra/assembly/rocm/.gitignore
vendored
4
extra/assembly/rocm/.gitignore
vendored
@@ -1,4 +0,0 @@
|
||||
*.deb
|
||||
build
|
||||
src
|
||||
sniffer/sniff.so
|
||||
@@ -1,20 +0,0 @@
|
||||
Built ROCT-Thunk-Interface (hsakmt)
|
||||
hsakmt-roct-dev_5.4.4.99999-local_amd64.deb
|
||||
note: installs to /opt/rocm
|
||||
Built ROCm-Device-Libs
|
||||
Works with ROCM_PATH=/home/tiny/build/ROCm-Device-Libs/build/dist
|
||||
rocm-device-libs_1.0.0.99999-local_amd64.deb
|
||||
Built ROCm-CompilerSupport (amd_comgr)
|
||||
no deb, sudo make install to /usr/local
|
||||
Built ROCR-Runtime
|
||||
hsa-rocr_1.8.0-local_amd64.deb
|
||||
hsa-rocr-dev_1.8.0-local_amd64.deb
|
||||
Built ROCm-OpenCL-Runtime
|
||||
rocm-ocl-icd_2.0.0-local_amd64.deb
|
||||
ISSUE: these depend on "comgr"
|
||||
rocm-opencl_2.0.0-local_amd64.deb
|
||||
rocm-opencl-dev_2.0.0-local_amd64.deb
|
||||
Did sudo make install
|
||||
|
||||
|
||||
|
||||
@@ -1,41 +0,0 @@
|
||||
# run two "rocm-bandwidth-test" in a loop
|
||||
# amdgpu-6.0.5-1581431.20.04
|
||||
# fixed in kernel 6.2.14
|
||||
|
||||
[ 72.153646] RIP: 0010:pm_send_runlist+0x4a/0x630 [amdgpu]
|
||||
[ 72.153815] Code: 30 65 48 8b 04 25 28 00 00 00 48 89 45 d0 31 c0 80 fb 01 0f 87 aa 9d 49 00 83 e3 01 0f 85 1c 05 00 00 49 8b 3f b8 01 00 00 00 <48> 8b 97 30 01 00 00 44 8b b7 6c 01 00 00 8b 9f 70 01 00 00 8b 8a
|
||||
[ 72.153900] RSP: 0018:ffffb48445c03c30 EFLAGS: 00010246
|
||||
[ 72.153928] RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000000
|
||||
[ 72.153962] RDX: 000000000000007b RSI: ffff9395e1562558 RDI: 0000000000000000
|
||||
[ 72.153996] RBP: ffffb48445c03cb8 R08: 0000000000000000 R09: 0000000000000001
|
||||
[ 72.154030] R10: ffff9395c900d840 R11: 0000000000000000 R12: 0000000000000000
|
||||
[ 72.154065] R13: ffff9395c9e00400 R14: 0000000000000001 R15: ffff9395e15624e0
|
||||
[ 72.154099] FS: 00007f345c6463c0(0000) GS:ffff93a4aee80000(0000) knlGS:0000000000000000
|
||||
[ 72.154137] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
|
||||
[ 72.154165] CR2: 0000000000000130 CR3: 0000000112840000 CR4: 0000000000750ee0
|
||||
[ 72.154201] PKRU: 55555554
|
||||
[ 72.154215] Call Trace:
|
||||
[ 72.154230] <TASK>
|
||||
[ 72.154244] map_queues_cpsch+0x75/0xc0 [amdgpu]
|
||||
[ 72.154365] debug_map_and_unlock+0x51/0x90 [amdgpu]
|
||||
[ 72.154480] debug_refresh_runlist+0x1f/0x30 [amdgpu]
|
||||
[ 72.154591] kfd_dbg_runtime_disable+0x13c/0x240 [amdgpu]
|
||||
[ 72.154705] kfd_ioctl_dbg_set_debug_trap+0x69d/0x8b0 [amdgpu]
|
||||
[ 72.154820] kfd_ioctl+0x24a/0x5b0 [amdgpu]
|
||||
[ 72.154925] ? kfd_ioctl_create_queue+0x770/0x770 [amdgpu]
|
||||
[ 72.155035] ? syscall_exit_to_user_mode+0x27/0x50
|
||||
[ 72.155061] ? exit_to_user_mode_prepare+0x3d/0x1c0
|
||||
[ 72.155088] __x64_sys_ioctl+0x95/0xd0
|
||||
[ 72.155109] do_syscall_64+0x5c/0xc0
|
||||
[ 72.155128] ? syscall_exit_to_user_mode+0x27/0x50
|
||||
[ 72.155151] ? do_syscall_64+0x69/0xc0
|
||||
[ 72.155172] entry_SYSCALL_64_after_hwframe+0x61/0xcb
|
||||
[ 72.155198] RIP: 0033:0x7f345c7f63ab
|
||||
[ 72.155218] Code: 0f 1e fa 48 8b 05 e5 7a 0d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d b5 7a 0d 00 f7 d8 64 89 01 48
|
||||
[ 72.155301] RSP: 002b:00007ffc97cc89f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
|
||||
[ 72.155339] RAX: ffffffffffffffda RBX: 00007ffc97cc8a30 RCX: 00007f345c7f63ab
|
||||
[ 72.155375] RDX: 00007ffc97cc8a30 RSI: 00000000c0284b82 RDI: 0000000000000003
|
||||
[ 72.155411] RBP: 00000000c0284b82 R08: 0000000000000000 R09: 0000000000000000
|
||||
[ 72.155447] R10: 00007f345cd4ddb0 R11: 0000000000000246 R12: 00007ffc97cc8a30
|
||||
[ 72.155481] R13: 0000000000000003 R14: 00007ffc97cc8d20 R15: 0000000000000000
|
||||
[ 72.155517] </TASK>
|
||||
@@ -1,41 +0,0 @@
|
||||
# run two tinygrad matrix example in a loop
|
||||
# amdgpu-6.0.5-1581431.20.04
|
||||
# NOT fixed in kernel 6.2.14
|
||||
|
||||
[ 553.016624] gmc_v11_0_process_interrupt: 30 callbacks suppressed
|
||||
[ 553.016631] amdgpu 0000:0b:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:24 vmid:9 pasid:32770, for process python3 pid 10001 thread python3 pid 10001)
|
||||
[ 553.016790] amdgpu 0000:0b:00.0: amdgpu: in page starting at address 0x00007f0000000000 from client 10
|
||||
[ 553.016892] amdgpu 0000:0b:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00901A30
|
||||
[ 553.016974] amdgpu 0000:0b:00.0: amdgpu: Faulty UTCL2 client ID: SDMA0 (0xd)
|
||||
[ 553.017051] amdgpu 0000:0b:00.0: amdgpu: MORE_FAULTS: 0x0
|
||||
[ 553.017111] amdgpu 0000:0b:00.0: amdgpu: WALKER_ERROR: 0x0
|
||||
[ 553.017173] amdgpu 0000:0b:00.0: amdgpu: PERMISSION_FAULTS: 0x3
|
||||
[ 553.017238] amdgpu 0000:0b:00.0: amdgpu: MAPPING_ERROR: 0x0
|
||||
[ 553.017300] amdgpu 0000:0b:00.0: amdgpu: RW: 0x0
|
||||
[ 553.123921] [drm:mes_v11_0_submit_pkt_and_poll_completion.constprop.0 [amdgpu]] *ERROR* MES failed to response msg=2
|
||||
[ 553.124153] amdgpu: failed to add hardware queue to MES, doorbell=0x1a16
|
||||
[ 553.124195] amdgpu: MES might be in unrecoverable state, issue a GPU reset
|
||||
[ 553.124237] amdgpu: Failed to restore queue 2
|
||||
[ 553.124266] amdgpu: Failed to restore process queues
|
||||
[ 553.124270] amdgpu: Failed to evict queue 3
|
||||
[ 553.124297] amdgpu: amdgpu_amdkfd_restore_userptr_worker: Failed to resume KFD
|
||||
|
||||
# alternative crash in kernel 6.2.14
|
||||
|
||||
[ 151.097948] gmc_v11_0_process_interrupt: 30 callbacks suppressed
|
||||
[ 151.097953] amdgpu 0000:0b:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:24 vmid:8 pasid:32771, for process python3 pid 7525 thread python3 pid 7525)
|
||||
[ 151.097993] amdgpu 0000:0b:00.0: amdgpu: in page starting at address 0x00007f0000000000 from client 10
|
||||
[ 151.098008] amdgpu 0000:0b:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00801A30
|
||||
[ 151.098020] amdgpu 0000:0b:00.0: amdgpu: Faulty UTCL2 client ID: SDMA0 (0xd)
|
||||
[ 151.098032] amdgpu 0000:0b:00.0: amdgpu: MORE_FAULTS: 0x0
|
||||
[ 151.098042] amdgpu 0000:0b:00.0: amdgpu: WALKER_ERROR: 0x0
|
||||
[ 151.098052] amdgpu 0000:0b:00.0: amdgpu: PERMISSION_FAULTS: 0x3
|
||||
[ 151.098062] amdgpu 0000:0b:00.0: amdgpu: MAPPING_ERROR: 0x0
|
||||
[ 151.098071] amdgpu 0000:0b:00.0: amdgpu: RW: 0x0
|
||||
[ 151.209517] [drm:mes_v11_0_submit_pkt_and_poll_completion.constprop.0 [amdgpu]] *ERROR* MES failed to response msg=2
|
||||
[ 151.209724] amdgpu: failed to add hardware queue to MES, doorbell=0x1002
|
||||
[ 151.209734] amdgpu: MES might be in unrecoverable state, issue a GPU reset
|
||||
[ 151.209743] amdgpu: Failed to restore queue 1
|
||||
[ 151.209751] amdgpu: Failed to restore process queues
|
||||
[ 151.209759] amdgpu: amdgpu_amdkfd_restore_userptr_worker: Failed to resume KFD
|
||||
[ 151.209858] amdgpu 0000:0b:00.0: amdgpu: GPU reset begin!
|
||||
@@ -1,20 +0,0 @@
|
||||
# two tinygrad + two bandwidth test
|
||||
# RDNA2, driver 6.0.5
|
||||
# recovered from this!
|
||||
|
||||
[ 136.971209] gmc_v10_0_process_interrupt: 39 callbacks suppressed
|
||||
[ 136.971218] amdgpu 0000:0b:00.0: amdgpu: [gfxhub] page fault (src_id:0 ring:24 vmid:11 pasid:32773, for process rocm-bandwidth- pid 20281 thread rocm-bandwidth- pid 20281)
|
||||
[ 136.971228] amdgpu 0000:0b:00.0: amdgpu: in page starting at address 0x00007f5c2b800000 from client 0x1b (UTCL2)
|
||||
[ 136.971232] amdgpu 0000:0b:00.0: amdgpu: GCVM_L2_PROTECTION_FAULT_STATUS:0x00B01A31
|
||||
[ 136.971233] amdgpu 0000:0b:00.0: amdgpu: Faulty UTCL2 client ID: SDMA0 (0xd)
|
||||
[ 136.971235] amdgpu 0000:0b:00.0: amdgpu: MORE_FAULTS: 0x1
|
||||
[ 136.971236] amdgpu 0000:0b:00.0: amdgpu: WALKER_ERROR: 0x0
|
||||
[ 136.971236] amdgpu 0000:0b:00.0: amdgpu: PERMISSION_FAULTS: 0x3
|
||||
[ 136.971237] amdgpu 0000:0b:00.0: amdgpu: MAPPING_ERROR: 0x0
|
||||
[ 136.971238] amdgpu 0000:0b:00.0: amdgpu: RW: 0x0
|
||||
...
|
||||
[ 136.993979] amdgpu 0000:0b:00.0: amdgpu: IH ring buffer overflow (0x000BE5A0, 0x0003C480, 0x0003E5C0)
|
||||
[ 138.209072] amdgpu 0000:0b:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x001a address=0x7c00004000 flags=0x0000]
|
||||
[ 138.209078] amdgpu 0000:0b:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x001a address=0x7c00004d80 flags=0x0000]
|
||||
[ 138.209081] amdgpu 0000:0b:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x001a address=0x7c00005000 flags=0x0000]
|
||||
[ 138.209084] amdgpu 0000:0b:00.0: AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x001a address=0x7c00005d80 flags=0x0000]
|
||||
@@ -1,33 +0,0 @@
|
||||
# ROCK-Kernel-Driver 0b579de9622f5c93021dcb7927d13926313740a2
|
||||
# non fatal "crash"
|
||||
|
||||
[ 127.418045] ------------[ cut here ]------------
|
||||
[ 127.418046] User pages unexpectedly invalid
|
||||
[ 127.418056] WARNING: CPU: 16 PID: 260 at drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:3000 amdgpu_amdkfd_restore_userptr_worker+0x4d9/0x500 [amdgpu]
|
||||
[ 127.418235] Modules linked in: rfcomm cmac algif_hash algif_skcipher af_alg bnep nls_iso8859_1 iwlmvm mac80211 intel_rapl_msr intel_rapl_common edac_mce_amd snd_hda_codec_realtek snd_hda_codec_generic snd_hda_codec_hdmi kvm_amd binfmt_misc snd_hda_intel snd_intel_dspcfg kvm libarc4 snd_intel_sdw_acpi snd_hda_codec btusb iwlwifi btrtl snd_hda_core btbcm btintel irqbypass btmtk snd_hwdep crct10dif_pclmul snd_pcm polyval_clmulni bluetooth snd_seq_midi snd_seq_midi_event snd_rawmidi snd_seq polyval_generic cfg80211 ghash_clmulni_intel eeepc_wmi snd_seq_device snd_timer aesni_intel asus_wmi ecdh_generic snd platform_profile crypto_simd ledtrig_audio cryptd ecc ccp soundcore sparse_keymap rapl k10temp wmi_bmof mac_hid sch_fq_codel msr parport_pc ppdev lp parport ramoops pstore_blk efi_pstore reed_solomon pstore_zone ip_tables x_tables autofs4 amdgpu hid_generic usbhid hid i2c_algo_bit drm_ttm_helper ttm video iommu_v2 drm_buddy gpu_sched drm_display_helper drm_kms_helper syscopyarea
|
||||
[ 127.418276] sysfillrect sysimgblt fb_sys_fops drm nvme nvme_core cec r8169 ahci crc32_pclmul rc_core i2c_piix4 xhci_pci libahci nvme_common xhci_pci_renesas realtek wmi
|
||||
[ 127.418284] CPU: 16 PID: 260 Comm: kworker/16:1 Tainted: G W 6.0.0 #4
|
||||
[ 127.418286] Hardware name: System manufacturer System Product Name/TUF GAMING X570-PLUS (WI-FI), BIOS 3603 03/20/2021
|
||||
[ 127.418287] Workqueue: events amdgpu_amdkfd_restore_userptr_worker [amdgpu]
|
||||
[ 127.418455] RIP: 0010:amdgpu_amdkfd_restore_userptr_worker+0x4d9/0x500 [amdgpu]
|
||||
[ 127.418601] Code: ff e8 2b 8a 96 d1 e9 66 fe ff ff 48 c7 c7 40 4f f5 c0 e8 56 7b 8a d1 0f 0b e9 2e ff ff ff 48 c7 c7 d8 d0 ed c0 e8 43 7b 8a d1 <0f> 0b e9 0a fe ff ff 4c 89 ef e8 f8 89 96 d1 e9 cb fd ff ff e8 ce
|
||||
[ 127.418603] RSP: 0018:ffffb36740a83dc8 EFLAGS: 00010282
|
||||
[ 127.418604] RAX: 0000000000000000 RBX: ffff9d159ee9df30 RCX: 0000000000000027
|
||||
[ 127.418605] RDX: 0000000000000027 RSI: ffffb36740a83c88 RDI: ffff9d242a220568
|
||||
[ 127.418606] RBP: ffffb36740a83e58 R08: ffff9d242a220560 R09: 0000000000000001
|
||||
[ 127.418607] R10: 0000000000000001 R11: 0000000000000020 R12: ffff9d159ee9df98
|
||||
[ 127.418607] R13: ffff9d159ee9df70 R14: ffff9d159ee9dee0 R15: ffff9d159ee9dee0
|
||||
[ 127.418608] FS: 0000000000000000(0000) GS:ffff9d242a200000(0000) knlGS:0000000000000000
|
||||
[ 127.418609] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
|
||||
[ 127.418610] CR2: 00007fd5d4715000 CR3: 0000000120ffe000 CR4: 0000000000750ee0
|
||||
[ 127.418611] PKRU: 55555554
|
||||
[ 127.418611] Call Trace:
|
||||
[ 127.418612] <TASK>
|
||||
[ 127.418613] process_one_work+0x21f/0x3f0
|
||||
[ 127.418615] worker_thread+0x4a/0x3c0
|
||||
[ 127.418617] ? process_one_work+0x3f0/0x3f0
|
||||
[ 127.418618] kthread+0xf0/0x120
|
||||
[ 127.418619] ? kthread_complete_and_exit+0x20/0x20
|
||||
[ 127.418620] ret_from_fork+0x22/0x30
|
||||
[ 127.418622] </TASK>
|
||||
[ 127.418623] ---[ end trace 0000000000000000 ]---
|
||||
@@ -1,80 +0,0 @@
|
||||
import numpy as np
|
||||
import pathlib
|
||||
from hexdump import hexdump
|
||||
from tinygrad.helpers import colored
|
||||
from extra.helpers import enable_early_exec
|
||||
early_exec = enable_early_exec()
|
||||
|
||||
from tinygrad.runtime.ops_cl import CLProgram, CLBuffer, ROCM_LLVM_PATH
|
||||
|
||||
ENABLE_NON_ASM = False
|
||||
|
||||
WMMA = True
|
||||
DUAL_ALU = True
|
||||
F32 = True
|
||||
|
||||
if ENABLE_NON_ASM:
|
||||
buf = CLBuffer.fromCPU(np.zeros(10, np.float32))
|
||||
prg_empty = CLProgram("code", "__kernel void code(__global float *a) { a[0] = 1; }")
|
||||
asm_real = prg_empty.binary()
|
||||
with open("/tmp/cc.elf", "wb") as f:
|
||||
f.write(asm_real)
|
||||
prg_empty([1], [1], buf, wait=True)
|
||||
print(buf.toCPU())
|
||||
|
||||
print(colored("creating CLBuffer", "green"))
|
||||
buf = CLBuffer.fromCPU(np.zeros(10, np.float32))
|
||||
code = open(pathlib.Path(__file__).parent / "prog.s", "r").read()
|
||||
|
||||
gen = []
|
||||
FLOPS = 0
|
||||
MAX_REG = 251
|
||||
for j in range(1):
|
||||
if WMMA:
|
||||
KY, KX = 4, 4
|
||||
for y in range(KY):
|
||||
for x in range(KX):
|
||||
c = (y*KX+x)*8
|
||||
a = (KY*KX*8) + y*8
|
||||
b = (KY*KX*8) + (KY*8) + x*8
|
||||
gen.append(f"v_wmma_f32_16x16x16_f16 v[{c}:{c+7}], v[{a}:{a+7}], v[{b}:{b+7}], v[{c}:{c+7}]")
|
||||
FLOPS += 16*8*2
|
||||
else:
|
||||
for i in range(0, MAX_REG, 6):
|
||||
if DUAL_ALU:
|
||||
if F32:
|
||||
gen.append(f"v_dual_fmac_f32 v{i+0}, v{i+1}, v{i+2} :: v_dual_fmac_f32 v{i+3}, v{i+4}, v{i+5}")
|
||||
FLOPS += 4
|
||||
else:
|
||||
gen.append(f"v_dual_dot2acc_f32_f16 v{i+0}, v{i+1}, v{i+2} :: v_dual_dot2acc_f32_f16 v{i+3}, v{i+4}, v{i+5}")
|
||||
FLOPS += 8
|
||||
else:
|
||||
assert F32
|
||||
gen.append(f"v_fmac_f32 v{i+0}, v{i+1}, v{i+2}")
|
||||
gen.append(f"v_fmac_f32 v{i+3}, v{i+4}, v{i+5}")
|
||||
code = code.replace("// FLOPS", '\n'.join(gen))
|
||||
print(code)
|
||||
|
||||
|
||||
# fix: COMGR failed to get code object ISA name. set triple to 'amdgcn-amd-amdhsa'
|
||||
|
||||
object = early_exec(([ROCM_LLVM_PATH / "llvm-mc", '--arch=amdgcn', '--mcpu=gfx1100', '--triple=amdgcn-amd-amdhsa', '--filetype=obj', '-'], code.encode("utf-8")))
|
||||
asm = early_exec(([ROCM_LLVM_PATH / "ld.lld", "/dev/stdin", "-o", "/dev/stdout", "--pie"], object))
|
||||
|
||||
with open("/tmp/cc2.o", "wb") as f:
|
||||
f.write(object)
|
||||
with open("/tmp/cc2.elf", "wb") as f:
|
||||
f.write(asm)
|
||||
|
||||
print(colored("creating CLProgram", "green"))
|
||||
prg = CLProgram("code", asm)
|
||||
|
||||
print(colored("running program", "green"))
|
||||
G = 512
|
||||
FLOPS *= 100000*G*G # loop * global_size
|
||||
for i in range(3):
|
||||
tm = prg(buf, global_size=[G//256, G, 1], local_size=[256, 1, 1], wait=True)
|
||||
print(f"ran in {tm*1e3:.2f} ms, {FLOPS/(tm*1e9):.2f} GFLOPS")
|
||||
|
||||
print(colored("transferring buffer", "green"))
|
||||
print(buf.toCPU())
|
||||
@@ -1,80 +0,0 @@
|
||||
.global _start
|
||||
_start:
|
||||
.rodata
|
||||
.align 0x10
|
||||
.global code.kd
|
||||
.type code.kd,STT_OBJECT
|
||||
# amd_kernel_code_t (must be at 0x440 for kernel_code_entry_byte_offset to be right)
|
||||
code.kd:
|
||||
# amd_kernel_..., amd_machine_...
|
||||
.long 0,0,0,0
|
||||
# kernel_code_entry_byte_offset, kernel_code_prefetch_byte_offset
|
||||
.long 0x00000bc0,0x00000000,0x00000000,0x00000000
|
||||
# kernel_code_prefetch_byte_size, max_scratch_backing_memory_byte_size
|
||||
.long 0,0,0,0
|
||||
# compute_pgm_rsrc1, compute_pgm_rsrc2, kernel_code_properties, workitem_private_segment_byte_size
|
||||
.long 0x60af0000,0x0000009e,0x00000408,0x00000000
|
||||
# compute_pgm_rsrc1 |= AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32 | AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64
|
||||
# compute_pgm_rsrc1 |= AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP | AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE
|
||||
# compute_pgm_rsrc2 |= AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT = 0xF
|
||||
# compute_pgm_rsrc2 |= AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X
|
||||
# kernel_code_properties |= AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR = 1
|
||||
# kernel_code_properties |= AMD_KERNEL_CODE_PROPERTIES_RESERVED1 = 1
|
||||
.text
|
||||
.global code
|
||||
.type code,STT_FUNC
|
||||
code:
|
||||
# https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state
|
||||
# s[0:1] contains the kernarg_address
|
||||
# TODO: can we use s[2:3] if this was really a wave since we only alloced 2 SGPRs?
|
||||
s_load_b64 s[2:3], s[0:1], null
|
||||
|
||||
s_mov_b32 s8, 0
|
||||
loop:
|
||||
s_addk_i32 s8, 1
|
||||
s_cmp_eq_u32 s8, 100000
|
||||
// FLOPS
|
||||
s_cbranch_scc0 loop
|
||||
|
||||
# wait for the s_load_b64
|
||||
s_waitcnt lgkmcnt(0)
|
||||
|
||||
v_dual_mov_b32 v0, 4 :: v_dual_mov_b32 v1, 2.0
|
||||
global_store_b32 v0, v1, s[2:3]
|
||||
|
||||
# Deallocate all VGPRs for this wave. Use only when next instruction is S_ENDPGM.
|
||||
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||||
s_endpgm
|
||||
s_code_end
|
||||
|
||||
.amdgpu_metadata
|
||||
amdhsa.kernels:
|
||||
- .args:
|
||||
- .address_space: global
|
||||
.name: a
|
||||
.offset: 0
|
||||
.size: 8
|
||||
.type_name: 'float*'
|
||||
.value_kind: global_buffer
|
||||
.group_segment_fixed_size: 0
|
||||
.kernarg_segment_align: 8
|
||||
.kernarg_segment_size: 8
|
||||
.language: OpenCL C
|
||||
.language_version:
|
||||
- 1
|
||||
- 2
|
||||
.max_flat_workgroup_size: 256
|
||||
.name: code
|
||||
.private_segment_fixed_size: 0
|
||||
.sgpr_count: 2
|
||||
.sgpr_spill_count: 0
|
||||
.symbol: code.kd
|
||||
.uses_dynamic_stack: false
|
||||
.vgpr_count: 256
|
||||
.vgpr_spill_count: 0
|
||||
.wavefront_size: 32
|
||||
amdhsa.target: amdgcn-amd-amdhsa--gfx1100
|
||||
amdhsa.version:
|
||||
- 1
|
||||
- 2
|
||||
.end_amdgpu_metadata
|
||||
@@ -1,11 +0,0 @@
|
||||
#!/bin/bash
|
||||
mkdir -p src
|
||||
cd src
|
||||
git clone https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface.git -b rocm-5.5.0
|
||||
git clone https://github.com/RadeonOpenCompute/ROCm-Device-Libs.git -b rocm-5.5.0
|
||||
git clone https://github.com/RadeonOpenCompute/llvm-project.git -b rocm-5.5.0 --depth 1
|
||||
git clone https://github.com/RadeonOpenCompute/ROCR-Runtime.git -b rocm-5.5.0
|
||||
git clone https://github.com/ROCm-Developer-Tools/ROCclr.git -b rocm-5.5.0
|
||||
git clone https://github.com/RadeonOpenCompute/ROCm-CompilerSupport.git -b rocm-5.5.0
|
||||
git clone https://github.com/RadeonOpenCompute/ROCm-OpenCL-Runtime.git -b rocm-5.5.0
|
||||
cd ../
|
||||
@@ -1,69 +0,0 @@
|
||||
#!/bin/bash
|
||||
mkdir -p build/debs
|
||||
cd build
|
||||
|
||||
# ROCT-Thunk-Interface (hsakmt)
|
||||
if [ ! -f debs/hsakmt-roct-dev_5.5.0.99999-local_amd64.deb ]
|
||||
then
|
||||
mkdir -p ROCT-Thunk-Interface
|
||||
cd ROCT-Thunk-Interface
|
||||
cmake ../../src/ROCT-Thunk-Interface
|
||||
make -j32 package
|
||||
cp hsakmt-roct-dev_5.5.0.99999-local_amd64.deb ../debs
|
||||
cd ../
|
||||
fi
|
||||
|
||||
|
||||
# build custom LLVM
|
||||
if [ ! -f llvm-project/bin/clang ]
|
||||
then
|
||||
mkdir -p llvm-project
|
||||
cd llvm-project
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="llvm;clang;lld" -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" ../../src/llvm-project/llvm
|
||||
make -j32
|
||||
cd ..
|
||||
fi
|
||||
|
||||
# use custom LLVM
|
||||
export PATH="$PWD/llvm-project/bin:$PATH"
|
||||
|
||||
# ROCm-Device-Libs
|
||||
if [ ! -f debs/rocm-device-libs_1.0.0.99999-local_amd64.deb ]
|
||||
then
|
||||
mkdir -p ROCm-Device-Libs
|
||||
cd ROCm-Device-Libs
|
||||
cmake ../../src/ROCm-Device-Libs
|
||||
make -j32 package
|
||||
cp rocm-device-libs_1.0.0.99999-local_amd64.deb ../debs
|
||||
cd ../
|
||||
fi
|
||||
|
||||
# ROCR-Runtime
|
||||
if [ ! -f debs/hsa-rocr_1.8.0-local_amd64.deb ]
|
||||
then
|
||||
mkdir -p ROCR-Runtime
|
||||
cd ROCR-Runtime
|
||||
cmake ../../src/ROCR-Runtime/src
|
||||
make -j32 package
|
||||
cp hsa-rocr_1.8.0-local_amd64.deb ../debs
|
||||
cp hsa-rocr-dev_1.8.0-local_amd64.deb ../debs
|
||||
cd ../
|
||||
fi
|
||||
|
||||
# ROCm-OpenCL-Runtime (needs ROCclr)
|
||||
if [ ! -f debs/rocm-opencl_2.0.0-local_amd64.deb ]
|
||||
then
|
||||
mkdir -p ROCm-OpenCL-Runtime
|
||||
cd ROCm-OpenCL-Runtime
|
||||
cmake ../../src/ROCm-OpenCL-Runtime
|
||||
make -j32 package
|
||||
cp rocm-opencl_2.0.0-local_amd64.deb ../debs
|
||||
cp rocm-opencl-dev_2.0.0-local_amd64.deb ../debs
|
||||
cp rocm-ocl-icd_2.0.0-local_amd64.deb ../debs
|
||||
fi
|
||||
|
||||
# ROCm-CompilerSupport (broken)
|
||||
#mkdir -p ROCm-CompilerSupport
|
||||
#cd ROCm-CompilerSupport
|
||||
#cmake ../../src/ROCm-CompilerSupport/lib/comgr
|
||||
#make -j32
|
||||
@@ -1,14 +0,0 @@
|
||||
#!/bin/bash
|
||||
rm amdgpu-install_5.5.50500-1_all.deb
|
||||
wget https://repo.radeon.com/amdgpu-install/5.5/ubuntu/$(lsb_release -cs)/amdgpu-install_5.5.50500-1_all.deb
|
||||
sudo dpkg -i amdgpu-install_5.5.50500-1_all.deb
|
||||
sudo apt-get update
|
||||
|
||||
# kernel driver
|
||||
sudo apt-get install amdgpu-dkms
|
||||
|
||||
# for opencl
|
||||
sudo apt-get install rocm-opencl-runtime
|
||||
|
||||
# for HIP
|
||||
sudo apt-get install hip-runtime-amd rocm-device-libs hip-dev
|
||||
@@ -1,11 +0,0 @@
|
||||
#!/bin/bash -e
|
||||
clang sniff.cc -Werror -shared -fPIC -I../src/ -I../src/ROCT-Thunk-Interface/include -I../src/ROCm-Device-Libs/ockl/inc -o sniff.so -lstdc++
|
||||
#AMD_LOG_LEVEL=4 HSAKMT_DEBUG_LEVEL=7 LD_PRELOAD=$PWD/sniff.so /home/tiny/build/HIP-Examples/HIP-Examples-Applications/HelloWorld/HelloWorld
|
||||
#AMD_LOG_LEVEL=4 LD_PRELOAD=$PWD/sniff.so $HOME/build/HIP-Examples/HIP-Examples-Applications/HelloWorld/HelloWorld
|
||||
#AMD_LOG_LEVEL=5 LD_PRELOAD=$PWD/sniff.so python3 ../rdna3/asm.py
|
||||
DEBUG=5 LD_PRELOAD=$PWD/sniff.so python3 ../rdna3/asm.py
|
||||
#AMD_LOG_LEVEL=5 HSAKMT_DEBUG_LEVEL=7 DEBUG=5 LD_PRELOAD=$PWD/sniff.so strace -F python3 ../rdna3/asm.py
|
||||
#LD_PRELOAD=$PWD/sniff.so python3 ../rdna3/asm.py
|
||||
#AMD_LOG_LEVEL=4 LD_PRELOAD=$PWD/sniff.so FORWARD_ONLY=1 DEBUG=2 python3 ../../../test/test_ops.py TestOps.test_add
|
||||
#AMD_LOG_LEVEL=4 HSAKMT_DEBUG_LEVEL=7 LD_PRELOAD=$PWD/sniff.so rocm-bandwidth-test -s 0 -d 1 -m 1
|
||||
#AMD_LOG_LEVEL=4 HSAKMT_DEBUG_LEVEL=7 LD_PRELOAD=$PWD/sniff.so rocm-bandwidth-test -s 1 -d 2 -m 1
|
||||
@@ -1,282 +0,0 @@
|
||||
// template copied from https://github.com/geohot/cuda_ioctl_sniffer/blob/master/sniff.cc
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <dlfcn.h>
|
||||
#include <signal.h>
|
||||
#include <ucontext.h>
|
||||
|
||||
#include <sys/mman.h>
|
||||
|
||||
// includes from the ROCm sources
|
||||
#include <linux/kfd_ioctl.h>
|
||||
#include <hsa.h>
|
||||
#include <amd_hsa_kernel_code.h>
|
||||
#include <ROCR-Runtime/src/core/inc/sdma_registers.h>
|
||||
using namespace rocr::AMD;
|
||||
|
||||
#include <string>
|
||||
#include <map>
|
||||
std::map<int, std::string> files;
|
||||
std::map<uint64_t, uint64_t> ring_base_addresses;
|
||||
|
||||
#define D(args...) fprintf(stderr, args)
|
||||
|
||||
uint64_t doorbell_offset = -1;
|
||||
std::map<uint64_t, int> queue_types;
|
||||
|
||||
void hexdump(void *d, int l) {
|
||||
for (int i = 0; i < l; i++) {
|
||||
if (i%0x10 == 0 && i != 0) printf("\n");
|
||||
if (i%0x10 == 8) printf(" ");
|
||||
if (i%0x10 == 0) printf("%8X: ", i);
|
||||
printf("%2.2X ", ((uint8_t*)d)[i]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
|
||||
// https://defuse.ca/online-x86-assembler.htm#disassembly2
|
||||
static void handler(int sig, siginfo_t *si, void *unused) {
|
||||
ucontext_t *u = (ucontext_t *)unused;
|
||||
uint8_t *rip = (uint8_t*)u->uc_mcontext.gregs[REG_RIP];
|
||||
|
||||
int store_size = 0;
|
||||
uint64_t value;
|
||||
if (rip[0] == 0x48 && rip[1] == 0x89 && rip[2] == 0x30) {
|
||||
// 0: 48 89 30 mov QWORD PTR [rax],rsi
|
||||
store_size = 8;
|
||||
value = u->uc_mcontext.gregs[REG_RSI];
|
||||
u->uc_mcontext.gregs[REG_RIP] += 3;
|
||||
} else if (rip[0] == 0x4c && rip[1] == 0x89 && rip[2] == 0x28) {
|
||||
// 0: 4c 89 28 mov QWORD PTR [rax],r13
|
||||
store_size = 8;
|
||||
value = u->uc_mcontext.gregs[REG_R13];
|
||||
u->uc_mcontext.gregs[REG_RIP] += 3;
|
||||
} else {
|
||||
D("segfault %02X %02X %02X %02X %02X %02X %02X %02X rip: %p addr: %p\n", rip[0], rip[1], rip[2], rip[3], rip[4], rip[5], rip[6], rip[7], rip, si->si_addr);
|
||||
D("rax: %llx rcx: %llx rdx: %llx rsi: %llx rbx: %llx\n", u->uc_mcontext.gregs[REG_RAX], u->uc_mcontext.gregs[REG_RCX], u->uc_mcontext.gregs[REG_RDX], u->uc_mcontext.gregs[REG_RSI], u->uc_mcontext.gregs[REG_RBX]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
uint64_t ring_base_address = ring_base_addresses[((uint64_t)si->si_addr)&0xFFF];
|
||||
int queue_type = queue_types[((uint64_t)si->si_addr)&0xFFF];
|
||||
D("%16p: \u001b[31mDING DONG\u001b[0m (queue_type %d) store(%d): 0x%8lx -> %p ring_base_address:0x%lx\n", rip, queue_type, store_size, value, si->si_addr, ring_base_address);
|
||||
|
||||
if (queue_type == KFD_IOC_QUEUE_TYPE_SDMA) {
|
||||
uint8_t *sdma_ptr = (uint8_t*)(ring_base_address);
|
||||
while (sdma_ptr < ((uint8_t*)(ring_base_address)+value)) {
|
||||
D("0x%3lx: ", sdma_ptr-(uint8_t*)(ring_base_address));
|
||||
if (sdma_ptr[0] == SDMA_OP_TIMESTAMP) {
|
||||
D("SDMA_PKT_TIMESTAMP\n");
|
||||
sdma_ptr += sizeof(SDMA_PKT_TIMESTAMP);
|
||||
} else if (sdma_ptr[0] == SDMA_OP_GCR) {
|
||||
D("SDMA_PKT_GCR\n");
|
||||
sdma_ptr += sizeof(SDMA_PKT_GCR);
|
||||
} else if (sdma_ptr[0] == SDMA_OP_ATOMIC) {
|
||||
D("SDMA_PKT_ATOMIC\n");
|
||||
sdma_ptr += sizeof(SDMA_PKT_ATOMIC);
|
||||
} else if (sdma_ptr[0] == SDMA_OP_FENCE) {
|
||||
D("SDMA_PKT_FENCE\n");
|
||||
sdma_ptr += sizeof(SDMA_PKT_FENCE);
|
||||
} else if (sdma_ptr[0] == SDMA_OP_TRAP) {
|
||||
D("SDMA_PKT_TRAP\n");
|
||||
sdma_ptr += sizeof(SDMA_PKT_TRAP);
|
||||
} else if (sdma_ptr[0] == SDMA_OP_COPY && sdma_ptr[1] == SDMA_SUBOP_COPY_LINEAR) {
|
||||
SDMA_PKT_COPY_LINEAR *pkt = (SDMA_PKT_COPY_LINEAR *)sdma_ptr;
|
||||
D("SDMA_PKT_COPY_LINEAR: count:0x%x src:0x%lx dst:0x%lx\n", pkt->COUNT_UNION.count+1,
|
||||
(uint64_t)pkt->SRC_ADDR_LO_UNION.src_addr_31_0 | ((uint64_t)pkt->SRC_ADDR_HI_UNION.src_addr_63_32 << 32),
|
||||
(uint64_t)pkt->DST_ADDR_LO_UNION.dst_addr_31_0 | ((uint64_t)pkt->DST_ADDR_HI_UNION.dst_addr_63_32 << 32)
|
||||
);
|
||||
sdma_ptr += sizeof(SDMA_PKT_COPY_LINEAR);
|
||||
} else {
|
||||
D("unhandled packet type %d %d, exiting\n", sdma_ptr[0], sdma_ptr[1]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
//hexdump((void*)(ring_base_address), 0x100);
|
||||
} else if (queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) {
|
||||
hsa_kernel_dispatch_packet_t *pkt = (hsa_kernel_dispatch_packet_t *)(ring_base_address+value*0x40);
|
||||
if ((pkt->header&0xFF) == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
|
||||
D("HSA_PACKET_TYPE_KERNEL_DISPATCH -- setup:%d workgroup[%d, %d, %d] grid[%d, %d, %d] kernel_object:0x%lx kernarg_address:%p\n", pkt->setup, pkt->workgroup_size_x, pkt->workgroup_size_y, pkt->workgroup_size_z, pkt->grid_size_x, pkt->grid_size_y, pkt->grid_size_z, pkt->kernel_object, pkt->kernarg_address);
|
||||
amd_kernel_code_t *code = (amd_kernel_code_t *)pkt->kernel_object;
|
||||
D("kernel_code_entry_byte_offset:%lx\n", code->kernel_code_entry_byte_offset);
|
||||
uint32_t *kernel_code = (uint32_t*)(pkt->kernel_object + code->kernel_code_entry_byte_offset);
|
||||
int code_len = 0;
|
||||
while (kernel_code[code_len] != 0xbf9f0000 && kernel_code[code_len] != 0) code_len++;
|
||||
hexdump(kernel_code, code_len*4);
|
||||
/*FILE *f = fopen("/tmp/kernel_code", "wb");
|
||||
fwrite(kernel_code, 4, code_len, f);
|
||||
fclose(f);
|
||||
system("python -c 'print(\" \".join([(\"0x%02X\"%x) for x in open(\"/tmp/kernel_code\", \"rb\").read()]))' | ../build/llvm-project/bin/llvm-mc --disassemble --arch=amdgcn --mcpu=gfx1100 --show-encoding");*/
|
||||
D("kernargs (kernarg_segment_byte_size:0x%lx)\n", code->kernarg_segment_byte_size);
|
||||
// get length
|
||||
int i;
|
||||
for (i = 0; i < 0x400; i+=0x10) {
|
||||
if (memcmp((void*)((uint64_t)pkt->kernarg_address+i), "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", 0x10) == 0) break;
|
||||
}
|
||||
hexdump((void*)pkt->kernarg_address, i+0x10);
|
||||
} else if ((pkt->header&0xFF) == HSA_PACKET_TYPE_BARRIER_AND) {
|
||||
hsa_barrier_and_packet_t *pkt_and = (hsa_barrier_and_packet_t *)(ring_base_address+value*0x40);
|
||||
D("HSA_PACKET_TYPE_BARRIER_AND completion_signal:0x%lx\n", pkt_and->completion_signal.handle);
|
||||
//hexdump((void*)(ring_base_address+value*0x40), 0x40);
|
||||
} else if ((pkt->header&0xFF) == HSA_PACKET_TYPE_VENDOR_SPECIFIC) {
|
||||
D("HSA_PACKET_TYPE_VENDOR_SPECIFIC\n");
|
||||
hexdump((void*)(ring_base_address+value*0x40), 0x40);
|
||||
} else {
|
||||
hexdump((void*)(ring_base_address+value*0x40), 0x40);
|
||||
}
|
||||
}
|
||||
|
||||
mprotect((void *)((uint64_t)si->si_addr & ~0xFFF), 0x2000, PROT_READ | PROT_WRITE);
|
||||
if (store_size == 8) {
|
||||
*(volatile uint64_t*)(si->si_addr) = value;
|
||||
} else if (store_size == 4) {
|
||||
*(volatile uint32_t*)(si->si_addr) = value;
|
||||
} else if (store_size == 2) {
|
||||
*(volatile uint16_t*)(si->si_addr) = value;
|
||||
} else {
|
||||
D("store size not supported\n");
|
||||
exit(-1);
|
||||
}
|
||||
mprotect((void *)((uint64_t)si->si_addr & ~0xFFF), 0x2000, PROT_NONE);
|
||||
}
|
||||
|
||||
void register_sigsegv_handler() {
|
||||
struct sigaction sa = {0};
|
||||
sa.sa_flags = SA_SIGINFO;
|
||||
sigemptyset(&sa.sa_mask);
|
||||
sa.sa_sigaction = handler;
|
||||
if (sigaction(SIGSEGV, &sa, NULL) == -1) {
|
||||
D("ERROR: failed to register sigsegv handler");
|
||||
exit(-1);
|
||||
}
|
||||
// NOTE: python (or ocl runtime?) blocks the SIGSEGV signal
|
||||
sigset_t x;
|
||||
sigemptyset(&x);
|
||||
sigaddset(&x, SIGSEGV);
|
||||
sigprocmask(SIG_UNBLOCK, &x, NULL);
|
||||
}
|
||||
|
||||
int (*my_open)(const char *pathname, int flags, mode_t mode);
|
||||
#undef open
|
||||
int open(const char *pathname, int flags, mode_t mode) {
|
||||
if (my_open == NULL) my_open = reinterpret_cast<decltype(my_open)>(dlsym(RTLD_NEXT, "open"));
|
||||
int ret = my_open(pathname, flags, mode);
|
||||
//D("open %s (0o%o) = %d\n", pathname, flags, ret);
|
||||
files[ret] = pathname;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
int (*my_open64)(const char *pathname, int flags, mode_t mode);
|
||||
#undef open
|
||||
int open64(const char *pathname, int flags, mode_t mode) {
|
||||
if (my_open64 == NULL) my_open64 = reinterpret_cast<decltype(my_open64)>(dlsym(RTLD_NEXT, "open64"));
|
||||
int ret = my_open64(pathname, flags, mode);
|
||||
//D("open %s (0o%o) = %d\n", pathname, flags, ret);
|
||||
files[ret] = pathname;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void *(*my_mmap)(void *addr, size_t length, int prot, int flags, int fd, off_t offset);
|
||||
#undef mmap
|
||||
void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) {
|
||||
if (my_mmap == NULL) my_mmap = reinterpret_cast<decltype(my_mmap)>(dlsym(RTLD_NEXT, "mmap"));
|
||||
void *ret = my_mmap(addr, length, prot, flags, fd, offset);
|
||||
|
||||
if (doorbell_offset != -1 && offset == doorbell_offset) {
|
||||
D("HIDDEN DOORBELL %p, handled by %p\n", addr, handler);
|
||||
register_sigsegv_handler();
|
||||
mprotect(addr, length, PROT_NONE);
|
||||
}
|
||||
|
||||
if (fd != -1) D("mmapped %p (target %p) with flags 0x%x length 0x%zx fd %d %s offset 0x%lx\n", ret, addr, flags, length, fd, files[fd].c_str(), offset);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void *(*my_mmap64)(void *addr, size_t length, int prot, int flags, int fd, off_t offset);
|
||||
#undef mmap64
|
||||
void *mmap64(void *addr, size_t length, int prot, int flags, int fd, off_t offset) { return mmap(addr, length, prot, flags, fd, offset); }
|
||||
|
||||
int ioctl_num = 1;
|
||||
int (*my_ioctl)(int filedes, unsigned long request, void *argp) = NULL;
|
||||
#undef ioctl
|
||||
int ioctl(int filedes, unsigned long request, void *argp) {
|
||||
if (my_ioctl == NULL) my_ioctl = reinterpret_cast<decltype(my_ioctl)>(dlsym(RTLD_NEXT, "ioctl"));
|
||||
int ret = 0;
|
||||
ret = my_ioctl(filedes, request, argp);
|
||||
if (!files.count(filedes)) return ret;
|
||||
|
||||
uint8_t type = (request >> 8) & 0xFF;
|
||||
uint8_t nr = (request >> 0) & 0xFF;
|
||||
uint16_t size = (request >> 16) & 0xFFF;
|
||||
|
||||
D("%3d: %d = %3d(%20s) 0x%3x ", ioctl_num, ret, filedes, files[filedes].c_str(), size);
|
||||
|
||||
if (request == AMDKFD_IOC_SET_EVENT) {
|
||||
kfd_ioctl_set_event_args *args = (kfd_ioctl_set_event_args *)argp;
|
||||
D("AMDKFD_IOC_SET_EVENT event_id:%d", args->event_id);
|
||||
} else if (request == AMDKFD_IOC_ALLOC_MEMORY_OF_GPU) {
|
||||
kfd_ioctl_alloc_memory_of_gpu_args *args = (kfd_ioctl_alloc_memory_of_gpu_args *)argp;
|
||||
D("AMDKFD_IOC_ALLOC_MEMORY_OF_GPU va_addr:0x%llx size:0x%llx handle:%llX gpu_id:0x%x", args->va_addr, args->size, args->handle, args->gpu_id);
|
||||
} else if (request == AMDKFD_IOC_MAP_MEMORY_TO_GPU) {
|
||||
kfd_ioctl_map_memory_to_gpu_args *args = (kfd_ioctl_map_memory_to_gpu_args *)argp;
|
||||
D("AMDKFD_IOC_MAP_MEMORY_TO_GPU handle:%llX", args->handle);
|
||||
} else if (request == AMDKFD_IOC_CREATE_EVENT) {
|
||||
kfd_ioctl_create_event_args *args = (kfd_ioctl_create_event_args *)argp;
|
||||
D("AMDKFD_IOC_CREATE_EVENT event_page_offset:0x%llx event_type:%d event_id:%d", args->event_page_offset, args->event_type, args->event_id);
|
||||
} else if (request == AMDKFD_IOC_WAIT_EVENTS) {
|
||||
D("AMDKFD_IOC_WAIT_EVENTS");
|
||||
} else if (request == AMDKFD_IOC_SET_XNACK_MODE) {
|
||||
D("AMDKFD_IOC_SET_XNACK_MODE");
|
||||
} else if (request == AMDKFD_IOC_SVM || (type == 0x4b && nr == 0x20)) {
|
||||
// NOTE: this one is variable length
|
||||
kfd_ioctl_svm_args *args = (kfd_ioctl_svm_args *)argp;
|
||||
D("AMDKFD_IOC_SVM start_addr:0x%llx size:0x%llx op:%d", args->start_addr, args->size, args->op);
|
||||
} else if (request == AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU) {
|
||||
kfd_ioctl_unmap_memory_from_gpu_args *args = (kfd_ioctl_unmap_memory_from_gpu_args *)argp;
|
||||
D("AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU handle:%llX", args->handle);
|
||||
} else if (request == AMDKFD_IOC_FREE_MEMORY_OF_GPU) {
|
||||
D("AMDKFD_IOC_FREE_MEMORY_OF_GPU");
|
||||
} else if (request == AMDKFD_IOC_SET_SCRATCH_BACKING_VA) {
|
||||
D("AMDKFD_IOC_SET_SCRATCH_BACKING_VA");
|
||||
} else if (request == AMDKFD_IOC_GET_TILE_CONFIG) {
|
||||
D("AMDKFD_IOC_GET_TILE_CONFIG");
|
||||
} else if (request == AMDKFD_IOC_SET_TRAP_HANDLER) {
|
||||
D("AMDKFD_IOC_SET_TRAP_HANDLER");
|
||||
} else if (request == AMDKFD_IOC_GET_VERSION) {
|
||||
kfd_ioctl_get_version_args *args = (kfd_ioctl_get_version_args *)argp;
|
||||
D("AMDKFD_IOC_GET_VERSION major_version:%d minor_version:%d", args->major_version, args->minor_version);
|
||||
} else if (request == AMDKFD_IOC_GET_PROCESS_APERTURES_NEW) {
|
||||
D("AMDKFD_IOC_GET_PROCESS_APERTURES_NEW");
|
||||
} else if (request == AMDKFD_IOC_ACQUIRE_VM) {
|
||||
D("AMDKFD_IOC_ACQUIRE_VM");
|
||||
} else if (request == AMDKFD_IOC_SET_MEMORY_POLICY) {
|
||||
D("AMDKFD_IOC_SET_MEMORY_POLICY");
|
||||
} else if (request == AMDKFD_IOC_GET_CLOCK_COUNTERS) {
|
||||
D("AMDKFD_IOC_GET_CLOCK_COUNTERS");
|
||||
} else if (request == AMDKFD_IOC_CREATE_QUEUE) {
|
||||
kfd_ioctl_create_queue_args *args = (kfd_ioctl_create_queue_args *)argp;
|
||||
D("AMDKFD_IOC_CREATE_QUEUE\n");
|
||||
D("queue_type:%d ring_base_address:0x%llx\n", args->queue_type, args->ring_base_address);
|
||||
D("eop_buffer_address:0x%llx ctx_save_restore_address:0x%llx\n", args->eop_buffer_address, args->ctx_save_restore_address);
|
||||
D("ring_size:0x%x queue_priority:%d\n", args->ring_size, args->queue_priority);
|
||||
D("RETURNS write_pointer_address:0x%llx read_pointer_address:0x%llx doorbell_offset:0x%llx queue_id:%d\n", args->write_pointer_address, args->read_pointer_address, args->doorbell_offset, args->queue_id);
|
||||
//D("RETURNS *write_pointer_address:0x%llx *read_pointer_address:0x%llx\n", *(uint64_t*)args->write_pointer_address, *(uint64_t*)args->read_pointer_address);
|
||||
ring_base_addresses[args->doorbell_offset&0xFFF] = args->ring_base_address;
|
||||
queue_types[args->doorbell_offset&0xFFF] = args->queue_type;
|
||||
doorbell_offset = args->doorbell_offset&~0xFFF;
|
||||
} else {
|
||||
D("type:0x%x nr:0x%x size:0x%x", type, nr, size);
|
||||
}
|
||||
|
||||
D("\n");
|
||||
ioctl_num++;
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -11,8 +11,8 @@ from tinygrad.runtime.support.compiler_amd import amdgpu_disassemble
|
||||
from tinygrad.renderer import ProgramSpec
|
||||
from tinygrad.engine.realize import CompiledRunner
|
||||
|
||||
from extra.assembly.rdna3.autogen import *
|
||||
from extra.assembly.rdna3.asm import waitcnt
|
||||
from extra.assembly.amd.autogen.rdna3 import *
|
||||
from extra.assembly.amd.asm import waitcnt
|
||||
from test.testextra.test_cfg_viz import template
|
||||
|
||||
def get_output(asm:list, n_threads:int=1, vdst:VGPR=v[1]):
|
||||
|
||||
@@ -21,7 +21,7 @@ class PythonRemu:
|
||||
rsrc2: int = 0x19c # Default: USER_SGPR_COUNT=14, enable X and Y workgroup IDs
|
||||
|
||||
def run_asm(self, lib: int, lib_sz: int, gx: int, gy: int, gz: int, lx: int, ly: int, lz: int, args_ptr: int) -> int:
|
||||
from extra.assembly.rdna3.emu import run_asm, set_valid_mem_ranges
|
||||
from extra.assembly.amd.emu import run_asm, set_valid_mem_ranges
|
||||
# Pad ranges to handle GPU loads that may read past small buffers (e.g. s_load_b128 on 12-byte buffer)
|
||||
set_valid_mem_ranges({(start, size + 4096) for start, size in self.valid_mem_ranges})
|
||||
return run_asm(lib, lib_sz, gx, gy, gz, lx, ly, lz, args_ptr, self.rsrc2)
|
||||
|
||||
@@ -10,7 +10,7 @@ from tinygrad.renderer import ProgramSpec
|
||||
from tinygrad.helpers import TracingKey, getenv
|
||||
from tinygrad.engine.realize import ExecItem, CompiledRunner
|
||||
|
||||
from extra.assembly.rdna3.autogen import *
|
||||
from extra.assembly.amd.autogen.rdna3 import *
|
||||
|
||||
# TODO: use the RDNA3 renderer when it's in master
|
||||
template = """.text
|
||||
|
||||
Reference in New Issue
Block a user