add v_fmac_f16 vop3 instruction to remu (#10247)

* fmac vop3

* from the box
This commit is contained in:
qazal
2025-05-10 23:48:25 +03:00
committed by GitHub
parent 697259a8a1
commit 9210280811
2 changed files with 16 additions and 3 deletions

View File

@@ -1127,13 +1127,14 @@ impl<'a> Thread<'a> {
self.vec_reg.write64(vdst, ret)
}
}
306 | 309 | 313 | 596 | 584 | 585 | 588 => {
306 | 309 | 310 | 313 | 596 | 584 | 585 | 588 => {
let (s0, s1, s2) = (self.val(src.0), self.val(src.1), self.val(src.2));
let s0 = f16::from_bits(s0).negate(0, neg).absolute(0, abs);
let s1 = f16::from_bits(s1).negate(1, neg).absolute(1, abs);
let s2 = f16::from_bits(s2).negate(1, neg).absolute(1, abs);
let ret = match op {
309 => s0 * s1,
310 => f16::mul_add(s0, s1, f16::from_bits(self.vec_reg[vdst] as u16)),
306 => s0 + s1,
584 => f16::mul_add(s0, s1, s2),
585 => f16::min(f16::min(s0, s1), s2),

View File

@@ -1,6 +1,6 @@
import numpy as np
import unittest
import subprocess
import subprocess, struct
from typing import cast
from tinygrad.runtime.ops_amd import AMDProgram, AMDDevice
from tinygrad import Tensor, dtypes, Device
@@ -83,7 +83,7 @@ amdhsa.version:
+ "\n" + code_start + code + f"\n.size {function_name}, .-{function_name}"
return AMDProgram(cast(AMDDevice, Device["AMD"]), function_name, assemble(ret))
def get_output(s:str, n_threads:int):
def get_output(s:str, n_threads:int=1):
assert n_threads <= 32
code = "\n".join(["s_load_b64 s[0:1], s[0:1], null", "v_lshlrev_b32_e32 v0, 2, v0", s,
"s_waitcnt 0",
@@ -94,6 +94,8 @@ def get_output(s:str, n_threads:int):
prg(test._buf, global_size=(1, 1, 1), local_size=(n_threads, 1, 1), wait=True)
return test.numpy()
def f16_to_bits(x:float) -> int:
  """Return the IEEE 754 half-precision (binary16) bit pattern of ``x`` as an unsigned integer."""
  # pack as little-endian binary16, then read those two bytes back as a little-endian unsigned int
  return int.from_bytes(struct.pack('<e', x), 'little')
@unittest.skipUnless(Device.DEFAULT == "AMD", "tests RDNA3")
class TestHW(unittest.TestCase):
def setUp(self):
@@ -140,5 +142,15 @@ class TestHW(unittest.TestCase):
""", n_threads=2)
np.testing.assert_equal(out, 0b01)
def test_fmac_vop3_modifier(self):
  """Exercise v_fmac_f16_e64 with plain and negated source operands.

  The registers are preloaded as v10 = 4.0, v11 = 3.0 and v1 = 2.0 (f16 bit patterns).
  Each case expects the product of the two (optionally negated) sources added to the
  value already held in v1.
  """
  init_state = f"""
v_mov_b32_e32 v10 {f16_to_bits(4.0)}
v_mov_b32_e32 v11 {f16_to_bits(3.0)}
v_mov_b32_e32 v1 {f16_to_bits(2.0)}
"""
  # (instruction text, expected result as a float) -- checked in this order
  cases = (
    ("v_fmac_f16_e64 v1 v11 v10", 14.),
    ("v_fmac_f16_e64 v1 -v11 v10", -10.),
    ("v_fmac_f16_e64 v1 -v11 -v10", 14.),
  )
  for instruction, expected in cases:
    self.assertEqual(get_output(init_state+"\n"+instruction), f16_to_bits(expected))
if __name__ == "__main__":
unittest.main()