From e8c595c29ebd9f56cde119d7a8721edce73cc70e Mon Sep 17 00:00:00 2001 From: qazal <77887910+Qazalin@users.noreply.github.com> Date: Tue, 30 Sep 2025 12:36:29 +0300 Subject: [PATCH] remu: add new instructions introduced in RANGEIFY (#12363) * add v_mad_i64_i32 for test_output_padded_conv_transpose2d * run amd test_ops * skip test_masked_select --- extra/remu/src/thread.rs | 12 +++++++++++- test/test_ops.py | 1 + 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/extra/remu/src/thread.rs b/extra/remu/src/thread.rs index 4662d3fa3f..ca448d7c23 100644 --- a/extra/remu/src/thread.rs +++ b/extra/remu/src/thread.rs @@ -930,7 +930,7 @@ impl<'a> Thread<'a> { let op = ((instr >> 16) & 0x3ff) as u32; match op { - 764 | 765 | 288 | 289 | 290 | 766 | 768 | 769 => { + 764 | 765 | 288 | 289 | 290 | 766 | 767 | 768 | 769 => { let vdst = (instr & 0xff) as usize; let sdst = ((instr >> 8) & 0x7f) as usize; let f = |i: u32| -> usize { ((instr >> i) & 0x1ff) as usize }; @@ -944,6 +944,16 @@ impl<'a> Thread<'a> { assert_eq!(clmp, 0); let vcc = match op { + 767 => { + let (s0, s1, s2): (u32, u32, u64) = (self.val(s0), self.val(s1), self.val(s2)); + let (mul_result, overflow_mul) = (s0 as i32 as i64).overflowing_mul(s1 as i32 as i64); /* v_mad_i64_i32 operands are signed: go through i32 so the widening sign-extends (u32 as i64 would zero-extend) */ + let (ret, overflow_add) = mul_result.overflowing_add(s2 as i64); + let overflowed = overflow_mul || overflow_add; + if self.exec.read() { + self.vec_reg.write64(vdst, ret as u64); + } + overflowed + }, 766 => { let (s0, s1, s2): (u32, u32, u64) = (self.val(s0), self.val(s1), self.val(s2)); let (mul_result, overflow_mul) = (s0 as u64).overflowing_mul(s1 as u64); diff --git a/test/test_ops.py b/test/test_ops.py index a8513df59d..ddefcc1812 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -3164,6 +3164,7 @@ class TestOps(unittest.TestCase): helper_test_op([(32,10)], lambda x: x.masked_fill((x>0.1).detach(), -math.inf)) helper_test_op([(32,10)], lambda x: x.masked_fill((x<0.1).detach(), -math.inf)) + @unittest.skipIf(getenv("MOCKGPU") and Device.DEFAULT 
== "AMD" and RANGEIFY, "very slow on MOCKGPU because reduce does not fold") def test_masked_select(self): helper_test_op([(32, 10)], lambda x: x.masked_select(x>0.5), lambda x: x.masked_select(x>0.5), forward_only=True) helper_test_op([(32, 10)], lambda x: x.masked_select(torch.tensor(True)), lambda x: x.masked_select(Tensor(True)), forward_only=True)