remu: add new instructions introduced in RANGEIFY (#12363)

* add v_mad_i64_i32 for test_output_padded_conv_transpose2d
* run amd test_ops
* skip test_masked_select
2026-01-09 15:08:02 -05:00 · 2025-09-30 12:36:29 +03:00
parent 360980f1a3
commit e8c595c29e
2 changed files with 12 additions and 1 deletions
--- a/extra/remu/src/thread.rs
+++ b/extra/remu/src/thread.rs
@@ -930,7 +930,7 @@ impl<'a> Thread<'a> {

            let op = ((instr >> 16) & 0x3ff) as u32;
            match op {
-                764 | 765 | 288 | 289 | 290 | 766 | 768 | 769 => {
+                764 | 765 | 288 | 289 | 290 | 766 | 767 | 768 | 769 => {
                    let vdst = (instr & 0xff) as usize;
                    let sdst = ((instr >> 8) & 0x7f) as usize;
                    let f = |i: u32| -> usize { ((instr >> i) & 0x1ff) as usize };
@@ -944,6 +944,16 @@ impl<'a> Thread<'a> {
                    assert_eq!(clmp, 0);

                    let vcc = match op {
+                        767 => { // v_mad_i64_i32: signed 32x32 multiply plus 64-bit addend; this arm's value (the overflow flag) becomes `vcc`
+                            let (s0, s1, s2): (u32, u32, u64) = (self.val(s0), self.val(s1), self.val(s2));
+                            let (mul_result, overflow_mul) = (s0 as i32 as i64).overflowing_mul(s1 as i32 as i64); // go through i32 so negative operands sign-extend instead of zero-extending
+                            let (ret, overflow_add) = mul_result.overflowing_add(s2 as i64); // s2 is reinterpreted bit-for-bit as a signed 64-bit addend
+                            let overflowed = overflow_mul || overflow_add;
+                            if self.exec.read() {
+                                self.vec_reg.write64(vdst, ret as u64);
+                            }
+                            overflowed
+                        },
                        766 => {
                            let (s0, s1, s2): (u32, u32, u64) = (self.val(s0), self.val(s1), self.val(s2));
                            let (mul_result, overflow_mul) = (s0 as u64).overflowing_mul(s1 as u64);
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -3164,6 +3164,7 @@ class TestOps(unittest.TestCase):
    helper_test_op([(32,10)], lambda x: x.masked_fill((x>0.1).detach(), -math.inf))
    helper_test_op([(32,10)], lambda x: x.masked_fill((x<0.1).detach(), -math.inf))

+  @unittest.skipIf(getenv("MOCKGPU") and Device.DEFAULT == "AMD" and RANGEIFY, "very slow on MOCKGPU because reduce does not fold")
  def test_masked_select(self):
    helper_test_op([(32, 10)], lambda x: x.masked_select(x>0.5), lambda x: x.masked_select(x>0.5), forward_only=True)
    helper_test_op([(32, 10)], lambda x: x.masked_select(torch.tensor(True)), lambda x: x.masked_select(Tensor(True)), forward_only=True)