From 2f95c1070220417e8ec6c2a5c5ab0635e4415890 Mon Sep 17 00:00:00 2001
From: qazal <77887910+Qazalin@users.noreply.github.com>
Date: Thu, 23 Oct 2025 11:13:43 +0800
Subject: [PATCH] remu new instructions / use volatile in emulator tests
 (#12862)

* remu new instructions

* start moving to volatile

* test_simple works

* test_exec_mov works and lid is still here

* test_exec_cmp_vopc

* clang did s_mov_b32 exec_lo, 1

* don't hardcode v1

* support volatile in tests

* hw_test passes

* only the volatile version

* subrev saturating behavior
---
 extra/remu/src/thread.rs  |  11 ++-
 extra/remu/test/hwtest.py | 183 +++++++++++++-------------------
 2 files changed, 74 insertions(+), 120 deletions(-)

diff --git a/extra/remu/src/thread.rs b/extra/remu/src/thread.rs
index ca448d7c23..4f73557bbc 100644
--- a/extra/remu/src/thread.rs
+++ b/extra/remu/src/thread.rs
@@ -882,6 +882,11 @@ impl<'a> Thread<'a> {
             let s1 = sign_ext((s1 & 0xffffff) as u64, 24) as i32;
             (s0 * s1) as u32
           }
+          10 => {
+            let s0 = sign_ext((s0 & 0xffffff) as u64, 24) as i64;
+            let s1 = sign_ext((s1 & 0xffffff) as u64, 24) as i64;
+            ((s0 * s1) >> 32) as u32
+          }
           17 | 18 | 26 => {
             let (s0, s1) = (s0 as i32, s1 as i32);
             (match op {
@@ -930,7 +935,7 @@ impl<'a> Thread<'a> {
         let op = ((instr >> 16) & 0x3ff) as u32;
 
         match op {
-          764 | 765 | 288 | 289 | 290 | 766 | 767 | 768 | 769 => {
+          764 | 765 | 288 | 289 | 290 | 766 | 767 | 768 | 769 | 770 => {
             let vdst = (instr & 0xff) as usize;
             let sdst = ((instr >> 8) & 0x7f) as usize;
             let f = |i: u32| -> usize { ((instr >> i) & 0x1ff) as usize };
@@ -996,6 +1001,10 @@ impl<'a> Thread<'a> {
                 let ret = s0.wrapping_sub(s1);
                 (ret as u32, s1 > s0)
               }
+              770 => {
+                let ret = s1.wrapping_sub(s0);
+                (ret as u32, s0 > s1)
+              }
               _ => todo_instr!(instruction)?,
             };
             if self.exec.read() {
diff --git a/extra/remu/test/hwtest.py b/extra/remu/test/hwtest.py
index 76bd2f6e69..769d687045 100644
--- a/extra/remu/test/hwtest.py
+++ b/extra/remu/test/hwtest.py
@@ -1,98 +1,32 @@
 import numpy as np
 import unittest
 import subprocess, struct, math
-from typing import cast
-from tinygrad.runtime.ops_amd import AMDProgram, AMDDevice
-from tinygrad import Tensor, dtypes, Device
-from tinygrad.helpers import diskcache, OSX, getenv
+from tinygrad import Tensor, dtypes, Device, UOp
+from tinygrad.helpers import getenv
+from tinygrad.runtime.support.compiler_amd import amdgpu_disassemble
+from tinygrad.renderer import ProgramSpec
+from tinygrad.engine.realize import CompiledRunner
 
-@diskcache
-def assemble(code:str) -> bytes:
-  try:
-    LLVM_MC = "llvm-mc" if OSX else "/opt/rocm/llvm/bin/llvm-mc"
-    return subprocess.run([LLVM_MC, "--arch=amdgcn", "--mcpu=gfx1100", "--triple=amdgcn-amd-amdhsa", "-filetype=obj", "-o", "-"],
-                          input=code.encode("utf-8"), stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True).stdout
-  except subprocess.CalledProcessError as e:
-    print("stderr:")
-    print(e.stderr.decode())
-    raise
-
-# copied from extra/rdna
-def get_prg(code:str, v_cnt:int, s_cnt:int):
-  function_name = "test"
-  metadata = f"""
-amdhsa.kernels:
-- .args:
-  - .address_space: global
-    .name: buf_0
-    .offset: 0
-    .size: 8
-    .type_name: unsigned int*
-    .value_kind: global_buffer
-  .group_segment_fixed_size: 0
-  .kernarg_segment_align: 8
-  .kernarg_segment_size: 8
-  .language: OpenCL C
-  .language_version:
-  - 1
-  - 2
-  .max_flat_workgroup_size: 256
-  .name: test
-  .private_segment_fixed_size: 0
-  .sgpr_count: {s_cnt}
-  .sgpr_spill_count: 0
-  .symbol: test.kd
-  .uses_dynamic_stack: false
-  .vgpr_count: {v_cnt}
-  .vgpr_spill_count: 0
-  .wavefront_size: 32
-amdhsa.target: amdgcn-amd-amdhsa--gfx1100
-amdhsa.version:
-- 1
-- 2
-  """
-  boilerplate_start = f"""
-  .rodata
-  .global {function_name}.kd
-  .type {function_name}.kd,STT_OBJECT
-  .align 0x10
-  .amdhsa_kernel {function_name}"""
-  kernel_desc = {
-    '.amdhsa_group_segment_fixed_size': 0, '.amdhsa_private_segment_fixed_size': 0, '.amdhsa_kernarg_size': 0,
-    '.amdhsa_next_free_vgpr': v_cnt, # this matters!
-    '.amdhsa_reserve_vcc': 0, '.amdhsa_reserve_xnack_mask': 0,
-    '.amdhsa_next_free_sgpr': s_cnt,
-    '.amdhsa_float_round_mode_32': 0, '.amdhsa_float_round_mode_16_64': 0, '.amdhsa_float_denorm_mode_32': 3, '.amdhsa_float_denorm_mode_16_64': 3,
-    '.amdhsa_dx10_clamp': 1, '.amdhsa_ieee_mode': 1, '.amdhsa_fp16_overflow': 0,
-    '.amdhsa_workgroup_processor_mode': 1, '.amdhsa_memory_ordered': 1, '.amdhsa_forward_progress': 0, '.amdhsa_enable_private_segment': 0,
-    '.amdhsa_system_sgpr_workgroup_id_x': 1, '.amdhsa_system_sgpr_workgroup_id_y': 1, '.amdhsa_system_sgpr_workgroup_id_z': 1,
-    '.amdhsa_system_sgpr_workgroup_info': 0, '.amdhsa_system_vgpr_workitem_id': 2, # is amdhsa_system_vgpr_workitem_id real?
-    '.amdhsa_exception_fp_ieee_invalid_op': 0, '.amdhsa_exception_fp_denorm_src': 0,
-    '.amdhsa_exception_fp_ieee_div_zero': 0, '.amdhsa_exception_fp_ieee_overflow': 0, '.amdhsa_exception_fp_ieee_underflow': 0,
-    '.amdhsa_exception_fp_ieee_inexact': 0, '.amdhsa_exception_int_div_zero': 0,
-    '.amdhsa_user_sgpr_dispatch_ptr': 0, '.amdhsa_user_sgpr_queue_ptr': 0, '.amdhsa_user_sgpr_kernarg_segment_ptr': 1,
-    '.amdhsa_user_sgpr_dispatch_id': 0, '.amdhsa_user_sgpr_private_segment_size': 0, '.amdhsa_wavefront_size32': 1, '.amdhsa_uses_dynamic_stack': 0}
-  code_start = f""".end_amdhsa_kernel
-  .text
-  .global {function_name}
-  .type {function_name},@function
-  .p2align 8
-  {function_name}:
-  """
-  ret = ".amdgpu_metadata\n" + metadata + ".end_amdgpu_metadata" + boilerplate_start + "\n" + '\n'.join("%s %d" % x for x in kernel_desc.items()) \
-    + "\n" + code_start + code + f"\n.size {function_name}, .-{function_name}"
-  return AMDProgram(cast(AMDDevice, Device["AMD"]), function_name, assemble(ret))
-
-def get_output(s:str, n_threads:int=1):
-  assert n_threads <= 32
-  code = "\n".join(["s_load_b64 s[0:1], s[0:1], null", "v_lshlrev_b32_e32 v0, 2, v0", s,
-                    "s_waitcnt 0",
-                    "global_store_b32 v0, v1, s[0:1]",
-                    "s_nop 0", "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)", "s_endpgm"])
-  test = Tensor.zeros((n_threads,), dtype=dtypes.uint32).contiguous().realize().uop.buffer
-  prg = get_prg(code, 32, 32)
-  prg(test._buf, global_size=(1, 1, 1), local_size=(n_threads, 1, 1), wait=True)
-  return test.numpy()
+def get_output(asm:str, n_threads:int=1):
+  input_asm = "\n".join([ln if ln.strip().startswith('asm volatile') else f'asm volatile("{ln.strip().lstrip()}" : "+v"(a), "+v"(b));'
+                         for ln in asm.strip().splitlines() if ln.strip()])
+  src = f"""
+  typedef long unsigned int size_t;
+  extern "C" __attribute__((device, const)) size_t __ockl_get_local_id(unsigned int);
+  extern "C" __attribute__((global)) void __attribute__((amdgpu_flat_work_group_size(1, {n_threads}))) test(unsigned int* data0_1) {{
+    int l = __ockl_get_local_id(0);
+    unsigned a = 0, b = 0, c = 0;
+    {input_asm}
+    unsigned res;
+    asm volatile("v_mov_b32 %0, %1" : "=v"(res) : "v"(a));
+    *(data0_1+l) = res;
+  }}"""
+  t = Tensor.zeros(n_threads, dtype=dtypes.uint32).contiguous().realize()
+  prg = ProgramSpec("test", src, Device.DEFAULT, UOp.sink(t), global_size=[1, 1, 1], local_size=[n_threads, 1, 1])
+  car = CompiledRunner(prg)
+  if getenv("PRINT_ASM"): amdgpu_disassemble(car.lib)
+  car([t.uop.buffer], {}, wait=True)
+  return t.numpy()
 
 def f16_to_bits(x:float) -> int: return struct.unpack('<H', struct.pack('<e', x))[0]
 def bits_to_f16(x:int) -> float: return struct.unpack('<e', struct.pack('<H', x))[0]
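
For reference, and not part of the patch itself: the reworked get_output wraps every plain mnemonic line as asm volatile("<line>" : "+v"(a), "+v"(b));, so %0 binds to a (whose final value is what gets copied out to the result buffer) and %1 binds to b, while lines that already start with "asm volatile" are passed through verbatim. Below is a minimal sketch of a test that could sit next to get_output in hwtest.py and drive one of the newly emulated instructions; it assumes the new op-10 arm in thread.rs corresponds to v_mul_hi_i32_i24, and the class name and operand values are likewise illustrative, not taken from this PR.

import unittest

class TestNewOpsSketch(unittest.TestCase):
  # hypothetical test, not from this PR: exercises the 24-bit signed multiply-high arm added to thread.rs
  def test_mul_hi_i32_i24(self):
    out = get_output("""
      v_mov_b32 %0, 0x800000
      v_mov_b32 %1, 0x800000
      v_mul_hi_i32_i24 %0, %0, %1
    """)
    # 0x800000 sign-extends (24-bit) to -8388608; the product is 2**46, so bits [63:32] are 0x4000
    self.assertEqual(out[0], 0x4000)

Because each wrapped statement declares a and b as read-write VGPR operands, their values flow explicitly from one asm volatile block to the next, which is what lets the helper split the input into independent per-line statements. Setting PRINT_ASM=1 disassembles the compiled kernel via amdgpu_disassemble, which helps confirm what clang actually emitted for the wrapped lines.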