remu: only write v_cmp result if exec is set (#10084)

2026-01-09 15:08:02 -05:00 · 2025-04-28 15:31:52 +03:00
parent d6b436a815
commit ac37510f60
2 changed files with 157 additions and 9 deletions
--- a/extra/remu/src/thread.rs
+++ b/extra/remu/src/thread.rs
@@ -810,10 +810,12 @@ impl<'a> Thread<'a> {
                _ => todo_instr!(instruction)?,
            };

-            match op >= 128 {
-                true => self.exec.set_lane(ret),
-                false => self.vcc.set_lane(ret),
-            };
+            if self.exec.read() {
+                match op >= 128 {
+                    true => self.exec.set_lane(ret),
+                    false => self.vcc.set_lane(ret),
+                };
+            }
        }
        else if let Instruction::VOP2 { vsrc, src, vdst, op } = decoded {
            let s0 = src as usize;
@@ -1072,11 +1074,13 @@ impl<'a> Thread<'a> {
                                _ => todo_instr!(instruction)?,
                            };

-                            match vdst {
-                                0..=SGPR_SRC | 107 => self.set_sgpr_co(vdst, ret),
-                                VCC => self.vcc.set_lane(ret),
-                                EXEC => self.exec.set_lane(ret),
-                                _ => todo_instr!(instruction)?,
+                            if self.exec.read() {
+                                match vdst {
+                                    0..=SGPR_SRC | 107 => self.set_sgpr_co(vdst, ret),
+                                    VCC => self.vcc.set_lane(ret),
+                                    EXEC => self.exec.set_lane(ret),
+                                    _ => todo_instr!(instruction)?,
+                                }
                            }
                        }
                        828..=830 => {
--- a/extra/remu/test/hwtest.py
+++ b/extra/remu/test/hwtest.py
@@ -0,0 +1,144 @@
+import numpy as np
+import unittest
+import subprocess
+from typing import cast
+from tinygrad.runtime.ops_amd import AMDProgram, AMDDevice
+from tinygrad import Tensor, dtypes, Device
+from tinygrad.helpers import diskcache, OSX, getenv
+
+@diskcache
+def assemble(code:str) -> bytes:
+  try:
+    LLVM_MC = "llvm-mc" if OSX else "/opt/rocm/llvm/bin/llvm-mc"
+    return subprocess.run([LLVM_MC, "--arch=amdgcn", "--mcpu=gfx1100", "--triple=amdgcn-amd-amdhsa", "-filetype=obj", "-o", "-"],
+                           input=code.encode("utf-8"), stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True).stdout
+  except subprocess.CalledProcessError as e:
+    print("stderr:")
+    print(e.stderr.decode())
+    raise
+
+# copied from extra/rdna
+def get_prg(code:str, v_cnt:int, s_cnt:int):
+  function_name = "test"
+  metadata = f"""
+amdhsa.kernels:
+- .args:
+  - .address_space: global
+    .name: buf_0
+    .offset: 0
+    .size: 8
+    .type_name: unsigned int*
+    .value_kind: global_buffer
+  .group_segment_fixed_size: 0
+  .kernarg_segment_align: 8
+  .kernarg_segment_size: 8
+  .language: OpenCL C
+  .language_version:
+  - 1
+  - 2
+  .max_flat_workgroup_size: 256
+  .name: test
+  .private_segment_fixed_size: 0
+  .sgpr_count: {s_cnt}
+  .sgpr_spill_count: 0
+  .symbol: test.kd
+  .uses_dynamic_stack: false
+  .vgpr_count: {v_cnt}
+  .vgpr_spill_count: 0
+  .wavefront_size: 32
+amdhsa.target: amdgcn-amd-amdhsa--gfx1100
+amdhsa.version:
+- 1
+- 2
+  """
+  boilerplate_start = f"""
+  .rodata
+  .global {function_name}.kd
+  .type {function_name}.kd,STT_OBJECT
+  .align 0x10
+  .amdhsa_kernel {function_name}"""
+  kernel_desc = {
+    '.amdhsa_group_segment_fixed_size': 0, '.amdhsa_private_segment_fixed_size': 0, '.amdhsa_kernarg_size': 0,
+    '.amdhsa_next_free_vgpr': v_cnt,   # this matters!
+    '.amdhsa_reserve_vcc': 0, '.amdhsa_reserve_xnack_mask': 0,
+    '.amdhsa_next_free_sgpr': s_cnt,
+    '.amdhsa_float_round_mode_32': 0, '.amdhsa_float_round_mode_16_64': 0, '.amdhsa_float_denorm_mode_32': 3, '.amdhsa_float_denorm_mode_16_64': 3,
+    '.amdhsa_dx10_clamp': 1, '.amdhsa_ieee_mode': 1, '.amdhsa_fp16_overflow': 0,
+    '.amdhsa_workgroup_processor_mode': 1, '.amdhsa_memory_ordered': 1, '.amdhsa_forward_progress': 0, '.amdhsa_enable_private_segment': 0,
+    '.amdhsa_system_sgpr_workgroup_id_x': 1, '.amdhsa_system_sgpr_workgroup_id_y': 1, '.amdhsa_system_sgpr_workgroup_id_z': 1,
+    '.amdhsa_system_sgpr_workgroup_info': 0, '.amdhsa_system_vgpr_workitem_id': 2, # is amdhsa_system_vgpr_workitem_id real?
+    '.amdhsa_exception_fp_ieee_invalid_op': 0, '.amdhsa_exception_fp_denorm_src': 0,
+    '.amdhsa_exception_fp_ieee_div_zero': 0, '.amdhsa_exception_fp_ieee_overflow': 0, '.amdhsa_exception_fp_ieee_underflow': 0,
+    '.amdhsa_exception_fp_ieee_inexact': 0, '.amdhsa_exception_int_div_zero': 0,
+    '.amdhsa_user_sgpr_dispatch_ptr': 0, '.amdhsa_user_sgpr_queue_ptr': 0, '.amdhsa_user_sgpr_kernarg_segment_ptr': 1,
+    '.amdhsa_user_sgpr_dispatch_id': 0, '.amdhsa_user_sgpr_private_segment_size': 0, '.amdhsa_wavefront_size32': 1, '.amdhsa_uses_dynamic_stack': 0}
+  code_start = f""".end_amdhsa_kernel
+  .text
+  .global {function_name}
+  .type {function_name},@function
+  .p2align 8
+  {function_name}:
+  """
+  ret = ".amdgpu_metadata\n" + metadata + ".end_amdgpu_metadata" + boilerplate_start + "\n" + '\n'.join("%s %d" % x for x in kernel_desc.items()) \
+      + "\n" + code_start + code + f"\n.size {function_name}, .-{function_name}"
+  return AMDProgram(cast(AMDDevice, Device["AMD"]), function_name, assemble(ret))
+
+def get_output(s:str, n_threads:int):
+  assert n_threads <= 32
+  code = "\n".join(["s_load_b64 s[0:1], s[0:1], null", "v_lshlrev_b32_e32 v0, 2, v0", s,
+    "s_waitcnt 0",
+    "global_store_b32 v0, v1, s[0:1]",
+    "s_nop 0", "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)", "s_endpgm"])
+  test = Tensor.zeros((n_threads,), dtype=dtypes.uint32).contiguous().realize().lazydata.buffer
+  prg = get_prg(code, 32, 32)
+  prg(test._buf, global_size=(1, 1, 1), local_size=(n_threads, 1, 1), wait=True)
+  return test.numpy()
+
+@unittest.skipUnless(Device.DEFAULT == "AMD", "tests RDNA3")
+class TestHW(unittest.TestCase):
+  def setUp(self):
+    if getenv("MOCKGPU"): subprocess.run(["cargo", "build", "--release", "--manifest-path", "./extra/remu/Cargo.toml"], check=True)
+
+  def test_simple(self):
+    out = get_output("""
+    v_mov_b32_e32 v10 42
+    v_mov_b32_e32 v1 v10
+    """, n_threads=2)
+    np.testing.assert_equal(out, 42)
+
+  def test_exec_mov(self):
+    out = get_output("""
+    v_mov_b32_e32 v10 42
+    s_mov_b32_e32 exec_lo 0b10
+    v_mov_b32_e32 v10 10
+    s_mov_b32_e32 exec_lo 0b11
+    v_mov_b32_e32 v1 v10
+    """, n_threads=2)
+    np.testing.assert_equal(out, [42, 10])
+
+  def test_exec_cmp_vopc(self):
+    out = get_output("""
+    s_mov_b32 vcc_lo 0 // reset vcc
+    v_mov_b32_e32 v10 42
+    v_mov_b32_e32 v11 10
+    s_mov_b32_e32 exec_lo 0b01
+    v_cmp_ne_u32 v10 v11
+    s_mov_b32_e32 exec_lo 0b11
+    v_mov_b32_e32 v1 vcc_lo
+    """, n_threads=2)
+    np.testing.assert_equal(out, 0b01)
+
+  def test_exec_cmpx_vop3(self):
+    out = get_output("""
+    v_mov_b32_e32 v10 42
+    v_mov_b32_e32 v11 10
+    s_mov_b32_e32 exec_lo 0b01
+    v_cmpx_ne_u32 v10 v11
+    s_mov_b32_e32 s10 exec_lo
+    s_mov_b32_e32 exec_lo 0b11
+    v_mov_b32_e32 v1 s10
+    """, n_threads=2)
+    np.testing.assert_equal(out, 0b01)
+
+if __name__ == "__main__":
+  unittest.main()