diff --git a/.github/actions/setup-tinygrad/action.yml b/.github/actions/setup-tinygrad/action.yml index 506b84422a..48352760cc 100644 --- a/.github/actions/setup-tinygrad/action.yml +++ b/.github/actions/setup-tinygrad/action.yml @@ -121,9 +121,8 @@ runs: echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600 sudo apt update || true sudo apt install --no-install-recommends --allow-unauthenticated -y hsa-rocr comgr hsa-rocr-dev liburing-dev libc6-dev - curl -s https://api.github.com/repos/Qazalin/remu/releases/latest | \ - jq -r '.assets[] | select(.name == "libremu.so").browser_download_url' | \ - sudo xargs curl -L -o /usr/local/lib/libremu.so + cargo build --release --manifest-path ./extra/remu/Cargo.toml + sudo ln -sf ${{ github.workspace }}/extra/remu/target/release/libremu.so /usr/local/lib/libremu.so sudo tee --append /etc/ld.so.conf.d/rocm.conf <<'EOF' /opt/rocm/lib /opt/rocm/lib64 @@ -137,9 +136,7 @@ runs: curl -s -H "Authorization: token $GH_TOKEN" curl -s https://api.github.com/repos/nimlgen/amdcomgr_dylib/releases/latest | \ jq -r '.assets[] | select(.name == "libamd_comgr.dylib").browser_download_url' | \ sudo xargs curl -L -o /usr/local/lib/libamd_comgr.dylib - curl -s -H "Authorization: token $GH_TOKEN" curl -s https://api.github.com/repos/Qazalin/remu/releases/latest | \ - jq -r '.assets[] | select(.name == "libremu.dylib").browser_download_url' | \ - sudo xargs curl -L -o /usr/local/lib/libremu.dylib + cargo build --release --manifest-path ./extra/remu/Cargo.toml # **** CUDA **** diff --git a/.gitignore b/.gitignore index b3e0505f49..a1b09eff5d 100644 --- a/.gitignore +++ b/.gitignore @@ -60,3 +60,4 @@ comgr_* site/ profile_stats *.log +target diff --git a/extra/remu/Cargo.lock b/extra/remu/Cargo.lock new file mode 100644 index 0000000000..da9b2bf6d0 --- /dev/null +++ b/extra/remu/Cargo.lock @@ -0,0 +1,66 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "float-cmp" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98de4bbd547a563b716d8dfa9aad1cb19bfab00f4fa09a6a4ed21dbcf44ce9c4" +dependencies = [ + "num-traits", +] + +[[package]] +name = "half" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", +] + +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" + +[[package]] +name = "num-traits" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "remu" +version = "0.1.0" +dependencies = [ + "float-cmp", + "half", + "num-traits", +] diff --git a/extra/remu/Cargo.toml b/extra/remu/Cargo.toml new file mode 100644 index 0000000000..85b4e24450 --- /dev/null +++ b/extra/remu/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "remu" +version = "0.1.0" +edition = "2021" +rust-version = "1.80.0" + +[lib] +crate-type = ["cdylib"] + +[dependencies] +half = { version = "2.3.1", features = ["num-traits"] } +num-traits = "0.2.17" + +[dev-dependencies] +float-cmp = "0.9.0" diff --git a/extra/remu/README.md b/extra/remu/README.md new file mode 100644 index 0000000000..dab0b80c5d --- /dev/null +++ b/extra/remu/README.md @@ -0,0 +1,80 @@ +## Intro + +Remu is an RDNA3 emulator built to test correctness of RDNA3 code. It is used in [tinygrad's AMD CI](https://github.com/tinygrad/tinygrad). + +Most of the common instructions are implemented, but some formats like IMG are not supported. + +Remu is only for testing correctness of program output, it is not a cycle accurate simulator. + +## Build Locally + +Remu is written in Rust. Make sure you have [Cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html). + +To build the project, run: + +```bash +cargo build --release --manifest-path ./extra/remu/Cargo.toml +``` + +This will produce a binary in the `extra/remu/target/release` directory. + +## Usage with tinygrad + +The latest binaries are released in https://github.com/Qazalin/remu/releases. Alternatively, you can [build locally](#build-locally). + +Tinygrad does not yet output RDNA3 kernels directly. You can either install comgr or use `AMD_LLVM=1` if you have [LLVM@19](https://github.com/tinygrad/tinygrad/blob/e2ed673c946c8f1774d816c75e52a994c2dd8a88/.github/actions/setup-tinygrad/action.yml#L208). + +`PYTHONPATH="." MOCKGPU=1 AMD=1 python test/test_tiny.py TestTiny.test_plus` runs an emulated RDNA3 kernel with Remu. + +Add `DEBUG=6` to see Remu's logs. + +### DEBUG output + +Remu runs each thread one at a time in a nested for loop, see lib.rs. The DEBUG output prints information about the current thread. + +The DEBUG output has 3 sections: + +``` +<------------ 1 ----------> <--- 2 ---> <--------------------------------------- 3 ------------------------------------------> +[0 0 0 ] [0 0 0 ] 0 F4080100 SMEM sbase=0 sdata=4 op=2 offset=0 soffset=0 +``` + +#### Section 1: Grid info + +`[gid.x, gid.y, gid.z], [lid.x, lid.y, lid.z]` of the current thread. + +#### Section 2: Wave info + +` ` + +RDNA3 divides threads into chunks of 32. Each thread is assigned to a "lane" from 0-31. + +In Remu, even though all threads run one at a time, each 32 thread chunk (a wave) shares state like SGPR, VGPR, LDS, EXEC mask, etc. +Remu can simulate up to one wave sync instruction. +For more details, see work_group.rs. + +Section 2 can have a green or gray color. + +Green = The thread is actively executing the instruction. + +Gray = The thread has been "turned off" by the EXEC mask, it skips execution of some instructions. (refer to "EXECute Mask" on [page 23](https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/instruction-set-architectures/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf#page=23) of ISA docs for more details.) + +To see the colors in action, try running `DEBUG=6 PYTHONPATH="." MOCKGPU=1 AMD=1 python test/test_ops.py TestOps.test_arange_big`. See how only lane 0 writes to global memory: +``` +[255 0 0 ] [0 0 0 ] 0 DC6A0000 GLOBAL offset=0 op=26 addr=8 data=0 saddr=0 vdst=0 +[255 0 0 ] [1 0 0 ] 1 DC6A0000 +[255 0 0 ] [2 0 0 ] 2 DC6A0000 +[255 0 0 ] [3 0 0 ] 3 DC6A0000 +[255 0 0 ] [4 0 0 ] 4 DC6A0000 +``` + +#### Section 3: Decoded Instruction + +This prints the instruction type and all the parsed bitfields. + +Remu output vs llvm-objdump: + +``` +s_load_b64 s[0:1], s[0:1], 0x10 // 00000000160C: F4040000 F8000010 +SMEM sbase=0 sdata=0 op=1 offset=16 soffset=0 +``` diff --git a/extra/remu/rustfmt.toml b/extra/remu/rustfmt.toml new file mode 100644 index 0000000000..bc30bf42fb --- /dev/null +++ b/extra/remu/rustfmt.toml @@ -0,0 +1 @@ +max_width = 150 diff --git a/extra/remu/src/helpers.rs b/extra/remu/src/helpers.rs new file mode 100644 index 0000000000..f360389e72 --- /dev/null +++ b/extra/remu/src/helpers.rs @@ -0,0 +1,170 @@ +use half::f16; +use num_traits::float::FloatCore; + +pub fn nth(val: u32, pos: usize) -> u32 { + (val >> (31 - pos as u32)) & 1 +} +pub fn f16_lo(val: u32) -> f16 { + f16::from_bits((val & 0xffff) as u16) +} +pub fn f16_hi(val: u32) -> f16 { + f16::from_bits(((val >> 16) & 0xffff) as u16) +} + +pub fn sign_ext(num: u64, bits: usize) -> i64 { + let mut value = num; + let is_negative = (value >> (bits - 1)) & 1 != 0; + if is_negative { + value |= !0 << bits; + } + value as i64 +} + +pub trait IEEEClass { + fn exponent(&self) -> T; +} +impl IEEEClass for f32 { + fn exponent(&self) -> u32 { + (self.to_bits() & 0b01111111100000000000000000000000) >> 23 + } +} +impl IEEEClass for f16 { + fn exponent(&self) -> u16 { + (self.to_bits() & 0b0111110000000000) >> 10 + } +} +impl IEEEClass for f64 { + fn exponent(&self) -> u64 { + (self.to_bits() & 0b0111111111110000000000000000000000000000000000000000000000000000) >> 52 + } +} + +pub trait VOPModifier { + fn negate(&self, pos: usize, modifier: usize) -> T; + fn absolute(&self, pos: usize, modifier: usize) -> T; +} +impl VOPModifier for T +where + T: FloatCore, +{ + fn negate(&self, pos: usize, modifier: usize) -> T { + match (modifier >> pos) & 1 { + 1 => match self.is_zero() { + true => *self, + false => -*self, + }, + _ => *self, + } + } + fn absolute(&self, pos: usize, modifier: usize) -> T { + match (modifier >> pos) & 1 { + 1 => self.abs(), + _ => *self, + } + } +} + +pub fn extract_mantissa(x: f64) -> f64 { + if x.is_infinite() || x.is_nan() { + return x; + } + let bits = x.to_bits(); + let mantissa_mask: u64 = 0x000FFFFFFFFFFFFF; + let bias: u64 = 1023; + let normalized_mantissa_bits = (bits & mantissa_mask) | ((bias - 1) << 52); + return f64::from_bits(normalized_mantissa_bits); +} +pub fn ldexp(x: f64, exp: i32) -> f64 { + x * 2f64.powi(exp) +} + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_extract_mantissa() { + assert_eq!(extract_mantissa(2.0f64), 0.5); + } + + #[test] + fn test_normal_exponent() { + assert_eq!(2.5f32.exponent(), 128); + assert_eq!(1.17549435e-38f32.exponent(), 1); + assert_eq!(f32::INFINITY.exponent(), 255); + assert_eq!(f32::NEG_INFINITY.exponent(), 255); + } + + #[test] + fn test_denormal_exponent() { + assert_eq!(1.0e-40f32.exponent(), 0); + assert_eq!(1.0e-42f32.exponent(), 0); + assert_eq!(1.0e-44f32.exponent(), 0); + assert_eq!((1.17549435e-38f32 / 2.0).exponent(), 0); + } + + #[test] + fn test_normal_exponent_f16() { + assert_eq!(f16::from_f32(3.14f32).exponent(), 16); + assert_eq!(f16::NEG_INFINITY.exponent(), 31); + assert_eq!(f16::INFINITY.exponent(), 31); + } + + #[test] + fn test_neg() { + assert_eq!(0.3_f32.negate(0, 0b001), -0.3_f32); + assert_eq!(0.3_f32.negate(1, 0b010), -0.3_f32); + assert_eq!(0.3_f32.negate(2, 0b100), -0.3_f32); + assert_eq!(0.3_f32.negate(0, 0b110), 0.3_f32); + assert_eq!(0.3_f32.negate(1, 0b010), -0.3_f32); + assert_eq!(0.0_f32.negate(0, 0b001).to_bits(), 0); + } + + #[test] + fn test_sign_ext() { + assert_eq!(sign_ext(0b000000000000000101000, 21), 40); + assert_eq!(sign_ext(0b111111111111111011000, 21), -40); + assert_eq!(sign_ext(0b000000000000000000000, 21), 0); + assert_eq!(sign_ext(0b111111111111111111111, 21), -1); + assert_eq!(sign_ext(0b111000000000000000000, 21), -262144); + assert_eq!(sign_ext(0b000111111111111111111, 21), 262143); + assert_eq!(sign_ext(7608, 13), -584); + } +} + +use std::sync::LazyLock; +pub static DEBUG: LazyLock = LazyLock::new(|| std::env::var("DEBUG").map(|v| v.parse::().unwrap_or(0) >= 6).unwrap_or(false)); + +pub trait Colorize { + fn color(self, color: &str) -> String; +} +impl<'a> Colorize for &'a str { + fn color(self, color: &str) -> String { + let ansi_code = match color { + "blue" => format!("\x1b[{};2;112;184;255m", 38), + "green" => format!("\x1b[{};2;39;176;139m", 38), + "gray" => format!("\x1b[{};2;169;169;169m", 38), + _ => format!("\x1b[{};2;255;255;255m", 38), + }; + format!("{}{}{}", ansi_code, self, "\x1b[0m") + } +} + +#[macro_export] +macro_rules! todo_instr { + ($x:expr) => {{ + println!("{:08X}", $x); + Err(1) + }}; +} + +#[macro_export] +macro_rules! print_instr { + ($name:expr $(, $arg:ident)* $(,)?) => { + if *DEBUG { + print!("{}", format!("{:<8}", $name).color("blue")); + $( + print!(" {:<16}", format!("{}={:?}", stringify!($arg), $arg)); + )* + } + }; +} diff --git a/extra/remu/src/lib.rs b/extra/remu/src/lib.rs new file mode 100644 index 0000000000..a41ec8144d --- /dev/null +++ b/extra/remu/src/lib.rs @@ -0,0 +1,31 @@ +use crate::work_group::WorkGroup; +use std::os::raw::c_char; +use std::slice; +mod helpers; +mod state; +mod thread; +mod work_group; + +#[no_mangle] +pub extern "C" fn run_asm(lib: *const c_char, lib_sz: u32, gx: u32, gy: u32, gz: u32, lx: u32, ly: u32, lz: u32, args_ptr: *const u64) -> i32 { + if lib.is_null() || (lib_sz % 4) != 0 { + panic!("Pointer is null or length is not properly aligned to 4 bytes"); + } + let kernel = unsafe { slice::from_raw_parts(lib as *const u32, (lib_sz / 4) as usize).to_vec() }; + let dispatch_dim = match (gy != 1, gz != 1) { + (true, true) => 3, + (true, false) => 2, + _ => 1, + }; + for gx in 0..gx { + for gy in 0..gy { + for gz in 0..gz { + let mut wg = WorkGroup::new(dispatch_dim, [gx, gy, gz], [lx, ly, lz], &kernel, args_ptr); + if let Err(err) = wg.exec_waves() { + return err; + } + } + } + } + 0 +} diff --git a/extra/remu/src/state.rs b/extra/remu/src/state.rs new file mode 100644 index 0000000000..1dc7cd86e4 --- /dev/null +++ b/extra/remu/src/state.rs @@ -0,0 +1,256 @@ +use std::ops::{Index, IndexMut}; + +pub trait Register { + fn read64(&self, idx: usize) -> u64; + fn write64(&mut self, idx: usize, addr: u64); +} +impl Register for T +where + T: Index + IndexMut, +{ + fn read64(&self, idx: usize) -> u64 { + let addr_lsb = self[idx]; + let addr_msb = self[idx + 1]; + ((addr_msb as u64) << 32) | addr_lsb as u64 + } + fn write64(&mut self, idx: usize, addr: u64) { + self[idx] = (addr & 0xffffffff) as u32; + self[idx + 1] = ((addr & (0xffffffff << 32)) >> 32) as u32; + } +} + +#[derive(Clone)] +pub struct VGPR { + values: [[u32; 256]; 32], + pub default_lane: Option, +} +impl Index for VGPR { + type Output = u32; + fn index(&self, index: usize) -> &Self::Output { + &self.values[self.default_lane.unwrap()][index] + } +} +impl IndexMut for VGPR { + fn index_mut(&mut self, index: usize) -> &mut Self::Output { + &mut self.values[self.default_lane.unwrap()][index] + } +} +impl VGPR { + pub fn new() -> Self { + VGPR { + values: [[0; 256]; 32], + default_lane: None, + } + } + pub fn get_lane(&self, lane: usize) -> [u32; 256] { + *self.values.get(lane).unwrap() + } + pub fn get_lane_mut(&mut self, lane: usize) -> &mut [u32; 256] { + self.values.get_mut(lane).unwrap() + } +} + +pub trait Value { + fn mut_hi16(&mut self, val: u16); + fn mut_lo16(&mut self, val: u16); +} +impl Value for u32 { + fn mut_hi16(&mut self, val: u16) { + *self = ((val as u32) << 16) | (*self as u16 as u32); + } + fn mut_lo16(&mut self, val: u16) { + *self = ((((*self & (0xffff << 16)) >> 16) as u32) << 16) | val as u32; + } +} + +#[derive(Debug, Clone, Copy)] +pub struct WaveValue { + pub value: u32, + pub warp_size: usize, + pub default_lane: Option, + pub mutations: Option<[bool; 32]>, +} +impl WaveValue { + pub fn new(value: u32, warp_size: usize) -> Self { + Self { + value, + warp_size, + default_lane: None, + mutations: None, + } + } + pub fn read(&self) -> bool { + (self.value >> self.default_lane.unwrap()) & 1 == 1 + } + pub fn set_lane(&mut self, value: bool) { + if self.mutations.is_none() { + self.mutations = Some([false; 32]) + } + self.mutations.as_mut().unwrap()[self.default_lane.unwrap()] = value; + } + pub fn apply_muts(&mut self) { + self.value = 0; + for lane in 0..self.warp_size { + if self.mutations.unwrap()[lane] { + self.value |= 1 << lane; + } + } + } +} + +#[derive(Clone, Debug)] +pub struct VecDataStore { + pub data: Vec, +} + +impl VecDataStore { + pub fn new() -> Self { + Self { data: Vec::new() } + } + pub fn write(&mut self, addr: usize, val: u32) { + if addr + 4 >= self.data.len() { + self.data.resize(self.data.len() + addr + 5, 0); + } + self.data[addr..addr + 4].iter_mut().enumerate().for_each(|(i, x)| { + *x = val.to_le_bytes()[i]; + }); + } + pub fn write64(&mut self, addr: usize, val: u64) { + self.write(addr, (val & 0xffffffff) as u32); + self.write(addr + 4, ((val & (0xffffffff << 32)) >> 32) as u32); + } + pub fn read(&self, addr: usize) -> u32 { + let mut bytes: [u8; 4] = [0; 4]; + bytes.copy_from_slice(&self.data[addr + 0..addr + 4]); + u32::from_le_bytes(bytes) + } + pub fn read64(&mut self, addr: usize) -> u64 { + let lsb = self.read(addr); + let msb = self.read(addr + 4); + ((msb as u64) << 32) | lsb as u64 + } +} + +#[cfg(test)] +mod test_state { + use super::*; + + #[test] + fn test_wave_value() { + let mut val = WaveValue::new(0b11000000000000011111111111101110, 32); + val.default_lane = Some(0); + assert!(!val.read()); + val.default_lane = Some(31); + assert!(val.read()); + } + + #[test] + fn test_wave_value_small() { + let mut val = WaveValue::new(0, 1); + val.default_lane = Some(0); + assert!(!val.read()); + assert_eq!(val.value, 0); + val.set_lane(true); + val.apply_muts(); + assert!(val.read()); + assert_eq!(val.value, 1); + } + + #[test] + fn test_wave_value_small_alt() { + let mut val = WaveValue::new(0, 2); + val.default_lane = Some(0); + assert!(!val.read()); + assert_eq!(val.value, 0); + val.set_lane(true); + val.apply_muts(); + assert!(val.read()); + assert_eq!(val.value, 1); + } + + #[test] + fn test_wave_value_exec() { + let warp_size = 32; + let val = WaveValue::new(u32::MAX, warp_size); + assert_eq!(val.value, u32::MAX); + let warp_size = 3; + let val = WaveValue::new((1 << warp_size) - 1, warp_size); + assert_eq!(val.value, 7) + } + + #[test] + fn test_wave_value_toggle_one() { + let warp_size = 2; + let mut val = WaveValue::new(0b11, warp_size); + // 0 + val.default_lane = Some(0); + val.set_lane(false); + // 1 + val.default_lane = Some(1); + val.set_lane(true); + val.apply_muts(); + assert_eq!(val.value, 2); + } + + #[test] + fn test_wave_value_mutate_small() { + let mut val = WaveValue::new(0, 2); + val.default_lane = Some(0); + assert!(!val.read()); + assert_eq!(val.value, 0); + val.set_lane(true); + val.apply_muts(); + assert!(val.read()); + assert_eq!(val.value, 1); + } + + #[test] + fn test_wave_value_mutations() { + let mut val = WaveValue::new(0b10001, 32); + val.default_lane = Some(0); + val.set_lane(false); + assert!(val.mutations.unwrap().iter().all(|x| !x)); + val.default_lane = Some(1); + val.set_lane(true); + assert_eq!(val.value, 0b10001); + assert_eq!( + val.mutations, + Some([ + false, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, false, false, false, false, false, + ]) + ); + + val.apply_muts(); + assert_eq!(val.value, 0b10); + } + + #[test] + fn test_write16() { + let mut vgpr = VGPR::new(); + vgpr.default_lane = Some(0); + vgpr[0] = 0b11100000000000001111111111111111; + vgpr[0].mut_lo16(0b1011101111111110); + assert_eq!(vgpr[0], 0b11100000000000001011101111111110); + } + + #[test] + fn test_write16hi() { + let mut vgpr = VGPR::new(); + vgpr.default_lane = Some(0); + vgpr[0] = 0b11100000000000001111111111111111; + vgpr[0].mut_hi16(0b1011101111111110); + assert_eq!(vgpr[0], 0b10111011111111101111111111111111); + } + + #[test] + fn test_vgpr() { + let mut vgpr = VGPR::new(); + vgpr.default_lane = Some(0); + vgpr[0] = 42; + vgpr.default_lane = Some(10); + vgpr[0] = 10; + assert_eq!(vgpr.get_lane(0)[0], 42); + assert_eq!(vgpr.get_lane(10)[0], 10); + } +} diff --git a/extra/remu/src/thread.rs b/extra/remu/src/thread.rs new file mode 100644 index 0000000000..1d067c0e9a --- /dev/null +++ b/extra/remu/src/thread.rs @@ -0,0 +1,3716 @@ +use crate::helpers::{extract_mantissa, f16_hi, f16_lo, ldexp, nth, sign_ext, IEEEClass, VOPModifier}; +use crate::helpers::{Colorize, DEBUG}; +use crate::state::{Register, Value, VecDataStore, WaveValue, VGPR}; +use crate::{print_instr, todo_instr}; +use half::{bf16, f16}; +use num_traits::Float; + +const SGPR_COUNT: usize = 105; +const VGPR_COUNT: usize = 256; +const NULL_SRC: u32 = 124; +const SIMM_SRC: usize = 255; + +pub const END_PRG: u32 = 0xbfb00000; + +pub struct Thread<'a> { + pub scalar_reg: &'a mut Vec, + pub scc: &'a mut u32, + + pub vec_reg: &'a mut VGPR, + pub vcc: &'a mut WaveValue, + pub exec: &'a mut WaveValue, + + pub lds: &'a mut VecDataStore, + pub sds: &'a mut VecDataStore, + + pub pc_offset: usize, + pub stream: Vec, + pub simm: Option, + pub sgpr_co: &'a mut Option<(usize, WaveValue)>, + pub warp_size: usize, + pub scalar: bool, +} + +impl<'a> Thread<'a> { + pub fn interpret(&mut self) -> Result<(), i32> { + let instruction = self.stream[self.pc_offset]; + // smem + if instruction >> 26 == 0b111101 { + let instr = self.u64_instr(); + /* addr: s[sbase:sbase+1] */ + let sbase = (instr & 0x3f) * 2; + let sdata = ((instr >> 6) & 0x7f) as usize; + let op = (instr >> 18) & 0xff; + let offset = sign_ext((instr >> 32) & 0x1fffff, 21); + let soffset = match self.val(((instr >> 57) & 0x7f) as usize) { + NULL_SRC => 0, + val => val, + }; + + print_instr!("SMEM", sbase, sdata, op, offset, soffset); + + // TODO: refactor vcc_lo to store in scalar register 106 + let base_addr = match sbase { + 106 => ((self.scalar_reg[107] as u64) << 32) | self.vcc.value as u64, + _ => self.scalar_reg.read64(sbase as usize), + }; + let addr = (base_addr as i64 + offset + soffset as i64) as u64; + + match op { + 0..=4 => (0..2_usize.pow(op as u32)).for_each(|i| { + let ret = unsafe { *((addr + (4 * i as u64)) as *const u32) }; + self.write_to_sdst(sdata + i, ret); + }), + _ => todo_instr!(instruction)?, + }; + self.scalar = true; + } + // sop1 + else if instruction >> 23 == 0b10_1111101 { + let src = (instruction & 0xFF) as usize; + let op = (instruction >> 8) & 0xFF; + let sdst = ((instruction >> 16) & 0x7F) as usize; + + print_instr!("SOP1", src, sdst, op); + + match op { + 1 => { + let s0 = self.val(src); + let ret = match op { + 1 => s0, + _ => todo_instr!(instruction)?, + }; + self.scalar_reg.write64(sdst as usize, ret); + } + _ => { + let s0 = self.val(src); + let ret = match op { + 0 => s0, + 10 => self.clz_i32_u32(s0), + 12 => self.cls_i32(s0), + 4 => s0.reverse_bits(), + 14 => s0 as i8 as i32 as u32, + 15 => s0 as i16 as i32 as u32, + 16 | 18 => { + let sdst: u32 = self.val(sdst as usize); + if op == 16 { + sdst & !(1 << (s0 & 0x1f)) + } else { + sdst | (1 << (s0 & 0x1f)) + } + } + 30 => { + let ret = !s0; + *self.scc = (ret != 0) as u32; + ret + } + 32 | 34 | 48 => { + let saveexec = self.exec.value; + self.exec.value = match op { + 32 => s0 & saveexec, + 34 => s0 | saveexec, + 48 => s0 & !saveexec, + _ => todo_instr!(instruction)?, + }; + *self.scc = (self.exec.value != 0) as u32; + saveexec + } + _ => todo_instr!(instruction)?, + }; + + self.write_to_sdst(sdst, ret); + } + }; + self.scalar = true; + } + // sopc + else if (instruction >> 23) & 0x3ff == 0b101111110 { + let s0 = (instruction & 0xff) as usize; + let s1 = ((instruction >> 8) & 0xff) as usize; + let op = (instruction >> 16) & 0x7f; + + print_instr!("SOPC", s0, s1, op); + + fn scmp(s0: T, s1: T, offset: u32, op: u32) -> bool + where + T: PartialOrd + PartialEq, + { + match op - offset { + 0 => s0 == s1, + 1 => s0 != s1, + 2 => s0 > s1, + 3 => s0 >= s1, + 4 => s0 < s1, + _ => s0 <= s1, + } + } + *self.scc = match op { + 0..=5 => { + let (s0, s1): (u32, u32) = (self.val(s0), self.val(s1)); + scmp(s0 as i32, s1 as i32, 0, op) + } + 6..=11 => { + let (s0, s1): (u32, u32) = (self.val(s0), self.val(s1)); + scmp(s0, s1, 6, op) + } + 12 => { + let (s0, s1): (u32, u32) = (self.val(s0), self.val(s1)); + s0 & (1 << (s1 & 0x1F)) == 0 + } + 13 => { + let (s0, s1): (u32, u32) = (self.val(s0), self.val(s1)); + s0 & (1 << (s1 & 0x1F)) == 1 + } + 16 | 17 => { + let (s0, s1): (u64, u64) = (self.val(s0), self.val(s1)); + scmp(s0, s1, 16, op) + } + _ => todo_instr!(instruction)?, + } as u32; + self.scalar = true; + } + // sopp + else if instruction >> 23 == 0b10_1111111 { + let simm16 = (instruction & 0xffff) as i16; + let op = (instruction >> 16) & 0x7f; + + print_instr!("SOPP", simm16, op); + + match op { + 32..=42 => { + let should_jump = match op { + 32 => true, + 33 => *self.scc == 0, + 34 => *self.scc == 1, + 35 => self.vcc.value == 0, + 36 => self.vcc.value != 0, + 37 => self.exec.value == 0, + 38 => self.exec.value != 0, + _ => todo_instr!(instruction)?, + }; + if should_jump { + self.pc_offset = (self.pc_offset as i64 + simm16 as i64) as usize; + } + } + _ => todo_instr!(instruction)?, + }; + self.scalar = true; + } + // sopk + else if instruction >> 28 == 0b1011 { + let simm = instruction & 0xffff; + let sdst = ((instruction >> 16) & 0x7f) as usize; + let op = (instruction >> 23) & 0x1f; + let s0: u32 = self.val(sdst); + + print_instr!("SOPK", simm, sdst, s0, op); + + match op { + 0 => self.write_to_sdst(sdst, simm as i16 as i32 as u32), + 3..=8 => { + let s1 = simm as i16 as i64; + let s0 = s0 as i32 as i64; + *self.scc = match op { + 3 => s0 == s1, + 4 => s0 != s1, + 5 => s0 > s1, + 7 => s0 < s1, + _ => todo_instr!(instruction)?, + } as u32 + } + 9..=14 => { + let s1 = simm as u16 as u32; + *self.scc = match op { + 9 => s0 == s1, + 10 => s0 != s1, + 11 => s0 > s1, + 12 => s0 >= s1, + 13 => s0 < s1, + 14 => s0 <= s1, + _ => todo_instr!(instruction)?, + } as u32 + } + 15 => { + let temp = s0 as i32; + let simm16 = simm as i16; + let dest = (temp as i64 + simm16 as i64) as i32; + self.write_to_sdst(sdst, dest as u32); + let temp_sign = ((temp >> 31) & 1) as u32; + let simm_sign = ((simm16 >> 15) & 1) as u32; + let dest_sign = ((dest >> 31) & 1) as u32; + *self.scc = ((temp_sign == simm_sign) && (temp_sign != dest_sign)) as u32; + } + 16 => { + let simm16 = simm as i16; + let ret = (s0 as i32 * simm16 as i32) as u32; + self.write_to_sdst(sdst, ret); + } + _ => todo_instr!(instruction)?, + }; + self.scalar = true; + } + // sop2 + else if instruction >> 30 == 0b10 { + let s0 = (instruction & 0xFF) as usize; + let s1 = ((instruction >> 8) & 0xFF) as usize; + let sdst = ((instruction >> 16) & 0x7F) as usize; + let op = (instruction >> 23) & 0xFF; + + print_instr!("SOP2", s0, s1, sdst, op); + + match op { + 23 | 25 | 27 => { + let (s0, s1): (u64, u64) = (self.val(s0), self.val(s1)); + let ret = match op { + 23 => s0 & s1, + 25 => s0 | s1, + 27 => s0 ^ s1, + _ => todo_instr!(instruction)?, + }; + self.scalar_reg.write64(sdst as usize, ret); + *self.scc = (ret != 0) as u32; + } + 9 | 13 | 11 | 40 | 41 => { + let (s0, s1): (u64, u32) = (self.val(s0), self.val(s1)); + let ret = match op { + 9 => { + let ret = s0 << (s1 & 0x3f); + (ret, Some(ret != 0)) + } + 11 => { + let ret = s0 >> (s1 & 0x3f); + (ret as u64, Some(ret != 0)) + } + 13 => { + let ret = (s0 as i64) >> (s1 & 0x3f); + (ret as u64, Some(ret != 0)) + } + 40 => { + let ret = (s0 >> (s1 & 0x3f)) & ((1 << ((s1 >> 16) & 0x7f)) - 1); + (ret as u64, Some(ret != 0)) + } + 41 => { + let s0 = s0 as i64; + let mut ret = (s0 >> (s1 & 0x3f)) & ((1 << ((s1 >> 16) & 0x7f)) - 1); + let shift = 64 - ((s1 >> 16) & 0x7f); + ret = (ret << shift) >> shift; + (ret as u64, Some(ret != 0)) + } + _ => todo_instr!(instruction)?, + }; + self.scalar_reg.write64(sdst as usize, ret.0); + if let Some(val) = ret.1 { + *self.scc = val as u32 + } + } + _ => { + let (s0, s1): (u32, u32) = (self.val(s0), self.val(s1)); + let ret = match op { + 0 | 4 => { + let (s0, s1) = (s0 as u64, s1 as u64); + let ret = match op { + 0 => s0 + s1, + 4 => s0 + s1 + *self.scc as u64, + _ => todo_instr!(instruction)?, + }; + (ret as u32, Some(ret >= 0x100000000)) + } + 1 => (s0 - s1, Some(s1 > s0)), + 5 => (s0 - s1 - *self.scc, Some((s1 as u64 + *self.scc as u64) > s0 as u64)), + 2 | 3 => { + let s0 = s0 as i32 as i64; + let s1 = s1 as i32 as i64; + let ret = match op { + 2 => s0 + s1, + 3 => s0 - s1, + _ => todo_instr!(instruction)?, + }; + let overflow = (nth(s0 as u32, 31) == nth(s1 as u32, 31)) && (nth(s0 as u32, 31) != nth(ret as u32, 31)); + + (ret as i32 as u32, Some(overflow)) + } + (8..=17) => { + let s1 = s1 & 0x1f; + let ret = match op { + 8 => s0 << s1, + 10 => s0 >> s1, + 12 => ((s0 as i32) >> (s1 as i32)) as u32, + _ => todo_instr!(instruction)?, + }; + (ret, Some(ret != 0)) + } + (18..=21) => { + let scc = match op { + 18 => (s0 as i32) < (s1 as i32), + 19 => s0 < s1, + 20 => (s0 as i32) > (s1 as i32), + 21 => s0 > s1, + _ => todo_instr!(instruction)?, + }; + let ret = match scc { + true => s0, + false => s1, + }; + (ret, Some(scc)) + } + (22..=26) | 34 | 36 => { + let ret = match op { + 22 => s0 & s1, + 24 => s0 | s1, + 26 => s0 ^ s1, + 34 => s0 & !s1, + 36 => s0 | !s1, + _ => todo_instr!(instruction)?, + }; + (ret, Some(ret != 0)) + } + 38 => { + let ret = (s0 >> (s1 & 0x1f)) & ((1 << ((s1 >> 16) & 0x7f)) - 1); + (ret, Some(ret != 0)) + } + 39 => { + let s0 = s0 as i32; + let mut ret = (s0 >> (s1 & 0x1f)) & ((1 << ((s1 >> 16) & 0x1f)) - 1); + let shift = 32 - ((s1 >> 16) & 0x7f); + ret = (ret << shift) >> shift; + (ret as u32, Some(ret != 0)) + } + 44 => (((s0 as i32) * (s1 as i32)) as u32, None), + 45 => (((s0 as u64) * (s1 as u64) >> 32) as u32, None), + 46 => ((((s0 as i32 as i64 * s1 as i32 as i64) as u64) >> 32u64) as i32 as u32, None), + 48 => match *self.scc != 0 { + true => (s0, None), + false => (s1, None), + }, + 50..=53 => { + let (s0, s1) = match op { + 50 => (s0 as u16, s1 as u16), + 51 => (s0 as u16, (s1 >> 16) as u16), + 52 => ((s0 >> 16) as u16, (s1 >> 16) as u16), + _ => ((s0 >> 16) as u16, s1 as u16), + }; + (((s1 as u32) << 16) | (s0 as u32), None) + } + _ => todo_instr!(instruction)?, + }; + + self.write_to_sdst(sdst, ret.0); + if let Some(val) = ret.1 { + *self.scc = val as u32 + } + } + }; + self.scalar = true; + } + // vopp + else if instruction >> 24 == 0b11001100 { + let instr = self.u64_instr(); + let vdst = (instr & 0xff) as usize; + let clmp = (instr >> 15) & 0x1; + assert_eq!([clmp], [0]); + let op = (instr >> 16) & 0x7f; + + let mut src = |x: usize| -> (u16, u16, u32) { + let val: u32 = self.val(x); + match x { + 255 => { + let val_lo: u16 = self.val(x); + (val_lo, val_lo, val) + } + (240..=247) => { + let val_lo: u16 = self.val(x); + (val_lo, f16::from_bits(0).to_bits(), val) + } + _ => ((val & 0xffff) as u16, ((val >> 16) & 0xffff) as u16, val), + } + }; + + let s = [32, 41, 50].iter().map(|x| ((instr >> x) & 0x1ff) as usize).collect::>(); + let src_parts = s.iter().map(|x| src(*x)).collect::>(); + + let b = |i: usize| (instr >> i) & 0x1 != 0; + let neg_hi = ((instr >> 8) & 0x7) as usize; + let neg = ((instr >> 61) & 0x7) as usize; + let opsel = [b(11), b(12), b(13)]; + let opsel_hi = [b(59), b(60), b(14)]; + + print_instr!("VOPP", op, vdst, src_parts, opsel, opsel_hi, neg, neg_hi); + + match op { + 0..=18 => { + let fxn = |x, y, z| -> u16 { + match op { + 0 => x * y + z, + 1 => x * y, + 2 => x + y, + 3 => x - y, + 4 => y << (x & 0xf), + 5 => y >> (x & 0xf), + 6 => ((y as i16) >> ((x as i16) & 0xf)) as u16, + 7 => i16::max(x as i16, y as i16) as u16, + 8 => i16::min(x as i16, y as i16) as u16, + 9 => x * y + z, + 10 => x + y, + 11 => x - y, + 12 => u16::max(x, y), + 13 => u16::min(x, y), + _ => { + let (x, y, z) = (f16::from_bits(x), f16::from_bits(y), f16::from_bits(z)); + let ret = match op { + 14 => f16::mul_add(x, y, z), + 15 => x + y, + 16 => x * y, + 17 => f16::min(x, y), + 18 => f16::max(x, y), + _ => unreachable!("op should be in range 0..=18, got {op}"), + }; + ret.to_bits() + } + } + }; + let src = |opsel: [bool; 3]| { + opsel + .iter() + .enumerate() + .map(|(i, sel)| { + if (14..=19).contains(&op) { + let half = |x, n| f16::from_bits(x).negate(i, n).to_bits(); + match sel { + true => half(src_parts[i].1, neg), + false => half(src_parts[i].0, neg_hi), + } + } else { + match sel { + true => src_parts[i].1, + false => src_parts[i].0, + } + } + }) + .collect::>() + }; + let (src_hi, src_lo) = (src(opsel_hi), src(opsel)); + let ret = ((fxn(src_hi[0], src_hi[1], src_hi[2]) as u32) << 16) | (fxn(src_lo[0], src_lo[1], src_lo[2]) as u32); + + if self.exec.read() { + self.vec_reg[vdst] = ret; + } + } + 32..=34 => { + let src: Vec = src_parts + .iter() + .enumerate() + .map(|(i, (lo, hi, full))| { + if !opsel_hi[i] { + f32::from_bits(*full).absolute(i, neg_hi) + } else if opsel[i] { + f32::from(f16::from_bits(*hi)).absolute(i, neg_hi) + } else { + f32::from(f16::from_bits(*lo)).absolute(i, neg_hi) + } + }) + .collect(); + let ret = match op { + 32 => f32::mul_add(src[0], src[1], src[2]).to_bits(), + 33 | 34 => { + let ret = f16::from_f32(f32::mul_add(src[0], src[1], src[2])).to_bits(); + match op { + 33 => (self.vec_reg[vdst] & 0xffff0000) | (ret as u32), + 34 => (self.vec_reg[vdst] & 0x0000ffff) | ((ret as u32) << 16), + _ => todo_instr!(instruction)?, + } + } + _ => todo_instr!(instruction)?, + }; + if self.exec.read() { + self.vec_reg[vdst] = ret; + } + } + 64..=69 => { + match op { + 64 => { + let a = self.wmma_b16_16x16(s[0]).map(|v| f16::from_bits(v).to_f32()); + let b = self.wmma_b16_16x16(s[1]).map(|v| f16::from_bits(v).to_f32()); + let c = self.wmma_b32_16x16(s[2]).map(|v| f32::from_bits(v)); + let ret = wmma(a.collect(), b.collect(), c.collect()); + for (i, val) in ret.into_iter().enumerate() { + let lane = i % 32; + self.vec_reg.get_lane_mut(lane)[(i / 32) + vdst] = val.to_bits(); + } + } + 65 => { + let a = self.wmma_b16_16x16(s[0]).map(|v| bf16::from_bits(v).to_f32()); + let b = self.wmma_b16_16x16(s[1]).map(|v| bf16::from_bits(v).to_f32()); + let c = self.wmma_b32_16x16(s[2]).map(|v| f32::from_bits(v)); + let ret = wmma(a.collect(), b.collect(), c.collect()); + for (i, val) in ret.into_iter().enumerate() { + let register = (i / 32) + vdst; + let lane = i % 32; + self.vec_reg.get_lane_mut(lane)[register] = val.to_bits() + } + } + 66 => { + let a = self.wmma_b16_16x16(s[0]).map(|v| f16::from_bits(v)); + let b = self.wmma_b16_16x16(s[1]).map(|v| f16::from_bits(v)); + let c = self.wmma_b32_16x16(s[2]).map(|v| f16::from_bits(v as u16)); + let ret = wmma(a.collect(), b.collect(), c.collect()); + for (i, val) in ret.into_iter().enumerate() { + let register = (i / 32) + vdst; + let lane = i % 32; + self.vec_reg.get_lane_mut(lane)[register].mut_lo16(val.to_bits()); + } + } + _ => todo_instr!(instruction)?, + }; + self.scalar = true; + } + _ => todo_instr!(instruction)?, + } + } + // vop1 + else if instruction >> 25 == 0b0111111 { + let s0 = (instruction & 0x1ff) as usize; + let op = (instruction >> 9) & 0xff; + let vdst = ((instruction >> 17) & 0xff) as usize; + + print_instr!("VOP1", s0, op, vdst); + + match op { + 3 | 15 | 21 | 23 | 25 | 26 | 60 | 61 | 47 | 49 => { + let s0: u64 = self.val(s0); + match op { + 3 | 15 | 21 | 23 | 25 | 26 | 60 | 61 | 47 | 49 => { + let s0 = f64::from_bits(s0); + match op { + 23 | 25 | 26 | 61 | 47 | 49 => { + let ret = match op { + 23 => f64::trunc(s0), + 25 => { + let mut temp = f64::floor(s0 + 0.5); + if f64::floor(s0) % 2.0 != 0.0 && f64::fract(s0) == 0.5 { + temp -= 1.0; + } + temp + } + 26 => f64::floor(s0), + 47 => 1.0 / s0, + 49 => 1.0 / f64::sqrt(s0), + 61 => extract_mantissa(s0), + _ => todo_instr!(instruction)?, + }; + if self.exec.read() { + self.vec_reg.write64(vdst, ret.to_bits()) + } + } + _ => { + let ret = match op { + 3 => s0 as i32 as u32, + 15 => (s0 as f32).to_bits(), + 21 => s0 as u32, + 60 => match (s0 == f64::INFINITY) || (s0 == f64::NEG_INFINITY) || s0.is_nan() { + true => 0, + false => (s0.exponent() as i32 - 1023 + 1) as u32, + }, + _ => todo_instr!(instruction)?, + }; + if self.exec.read() { + self.vec_reg[vdst] = ret; + } + } + } + } + _ => todo_instr!(instruction)?, + } + } + 84..=97 => { + let s0 = f16::from_bits(self.val(s0)); + let ret = match op { + 84 => f16::recip(s0), + 85 => f16::sqrt(s0), + 87 => f16::log2(s0), + 88 => f16::exp2(s0), + _ => todo_instr!(instruction)?, + }; + if self.exec.read() { + self.vec_reg[vdst] = ret.to_bits() as u32; + } + } + _ => { + let s0: u32 = self.val(s0); + match op { + 4 | 16 | 22 => { + let ret = match op { + 4 => (s0 as i32 as f64).to_bits(), + 22 => (s0 as f64).to_bits(), + 16 => (f32::from_bits(s0) as f64).to_bits(), + _ => todo_instr!(instruction)?, + }; + if self.exec.read() { + self.vec_reg.write64(vdst, ret) + } + } + 2 => { + let idx = self.exec.value.trailing_zeros() as usize; + self.scalar_reg[vdst] = self.vec_reg.get_lane(idx)[(instruction & 0x1ff) as usize - VGPR_COUNT]; + } + _ => { + let ret = match op { + 1 => s0, + 5 => (s0 as i32 as f32).to_bits(), + 6 => (s0 as f32).to_bits(), + 7 => f32::from_bits(s0) as u32, + 8 => f32::from_bits(s0) as i32 as u32, + 10 => f16::from_f32(f32::from_bits(s0)).to_bits() as u32, + 11 => f32::from(f16::from_bits(s0 as u16)).to_bits(), + 17 => ((s0 & 0xff) as f32).to_bits(), + 18 => (((s0 >> 8) & 0xff) as f32).to_bits(), + 19 => (((s0 >> 16) & 0xff) as f32).to_bits(), + 20 => (((s0 >> 24) & 0xff) as f32).to_bits(), + 56 => s0.reverse_bits(), + 57 => self.clz_i32_u32(s0), + 33..=51 => { + let s0 = f32::from_bits(s0); + match op { + 33 => s0.trunc(), + 34 => { + let mut d0 = s0.trunc(); + if s0 > 0.0 && s0 != d0 { + d0 += 1.0; + } + d0 + } + 35 => { + let mut temp = f32::floor(s0 + 0.5); + if f32::floor(s0) % 2.0 != 0.0 && f32::fract(s0) == 0.5 { + temp -= 1.0; + } + temp + } + 36 => { + let mut d0 = s0.trunc(); + if s0 < 0.0 && s0 != d0 { + d0 -= 1.0; + } + d0 + } + 37 => f32::exp2(s0), + 39 => f32::log2(s0), + 42 => 1.0 / s0, + 43 => 1.0 / s0, + 51 => f32::sqrt(s0), + _ => todo_instr!(instruction)?, + } + .to_bits() + } + 55 => !s0, + 59 => self.cls_i32(s0), + 80 => f16::from_f32(s0 as u16 as f32).to_bits() as u32, + 81 => f16::from_f32(s0 as i16 as f32).to_bits() as u32, + 82 => f32::from(f16::from_bits(s0 as u16)) as u32, + 83 => f32::from(f16::from_bits(s0 as u16)) as i16 as u32, + _ => todo_instr!(instruction)?, + }; + if self.exec.read() { + self.vec_reg[vdst] = ret; + } + } + } + } + } + } + // vopd + else if instruction >> 26 == 0b110010 { + let instr = self.u64_instr(); + let sx = instr & 0x1ff; + let vx = (instr >> 9) & 0xff; + let srcx0 = self.val(sx as usize); + let vsrcx1 = self.vec_reg[(vx) as usize] as u32; + let opy = (instr >> 17) & 0x1f; + let sy = (instr >> 32) & 0x1ff; + let vy = (instr >> 41) & 0xff; + let opx = (instr >> 22) & 0xf; + let srcy0 = match sy { + 255 => match sx { + 255 => srcx0, + _ => self.val(sy as usize), + }, + _ => self.val(sy as usize), + }; + let vsrcy1 = self.vec_reg[(vy) as usize]; + + let vdstx = ((instr >> 56) & 0xff) as usize; + // LSB is the opposite of VDSTX[0] + let vdsty = (((instr >> 49) & 0x7f) << 1 | ((vdstx as u64 & 1) ^ 1)) as usize; + + print_instr!("VOPD", opx, vdstx, sx, srcx0, vx, vsrcx1, opy, vdsty, sy, srcy0, vy, vsrcy1); + + for (op, s0, s1, dst) in ([(opx, srcx0, vsrcx1, vdstx), (opy, srcy0, vsrcy1, vdsty)]).iter() { + let ret = match *op { + 0 | 1 | 2 | 3 | 4 | 5 | 6 | 10 | 11 => { + let s0 = f32::from_bits(*s0 as u32); + let s1 = f32::from_bits(*s1 as u32); + match *op { + 0 => f32::mul_add(s0, s1, f32::from_bits(self.vec_reg[*dst])), + 1 => f32::mul_add(s0, s1, f32::from_bits(self.val(SIMM_SRC))), + 2 => f32::mul_add(s0, f32::from_bits(self.val(SIMM_SRC)), s1), + 3 => s0 * s1, + 4 => s0 + s1, + 5 => s0 - s1, + 6 => s1 - s0, + 10 => f32::max(s0, s1), + 11 => f32::min(s0, s1), + _ => todo_instr!(instruction)?, + } + .to_bits() + } + 8 => *s0, + 9 => match self.vcc.read() { + true => *s1, + false => *s0, + }, + 16 => s0 + s1, + 17 => s1 << s0, + 18 => s0 & s1, + _ => todo_instr!(instruction)?, + }; + if self.exec.read() { + self.vec_reg[*dst] = ret; + }; + } + } + // vopc + else if instruction >> 25 == 0b0111110 { + let s0 = (instruction & 0x1ff) as usize; + let s1 = ((instruction >> 9) & 0xff) as usize; + let op = (instruction >> 17) & 0xff; + + print_instr!("VOPC", s0, s1, op); + + let dest_offset = if op >= 128 { 128 } else { 0 }; + let ret = match op { + (0..=15) | 125 | (128..=143) => { + let s0 = f16::from_bits(self.val(s0)); + let s1 = f16::from_bits(self.vec_reg[s1] as u16); + match op { + 125 => self.cmp_class_f16(s0, s1.to_bits()), + _ => self.cmpf(s0, s1, op - dest_offset), + } + } + (16..=31) | 126 | (144..=159) => { + let s0 = f32::from_bits(self.val(s0)); + let s1 = f32::from_bits(self.vec_reg[s1]); + match op { + 126 => self.cmp_class_f32(s0, s1.to_bits()), + _ => self.cmpf(s0, s1, op - 16 - dest_offset), + } + } + (32..=47) | 127 | (160..=174) => { + let s0 = self.val(s0); + match op { + 127 => { + let s1 = self.val(s1); + self.cmp_class_f64(s0, s1) + } + _ => { + let s1 = f64::from_bits(self.vec_reg.read64(s1)); + self.cmpf(s0, s1, op - 32 - dest_offset) + } + } + } + (49..=54) | (177..=182) => { + let (s0, s1): (u16, u16) = (self.val(s0), self.vec_reg[s1] as u16); + self.cmpi(s0 as i16, s1 as i16, op - 48 - dest_offset) + } + (57..=62) | (185..=190) => { + let (s0, s1): (u16, u16) = (self.val(s0), self.vec_reg[s1] as u16); + self.cmpi(s0, s1, op - 56 - dest_offset) + } + (64..=71) | (192..=199) => { + let (s0, s1): (u32, u32) = (self.val(s0), self.vec_reg[s1]); + self.cmpi(s0 as i32, s1 as i32, op - 64 - dest_offset) + } + (72..=79) | (200..=207) => { + let (s0, s1): (u32, u32) = (self.val(s0), self.vec_reg[s1]); + self.cmpi(s0, s1, op - 72 - dest_offset) + } + (80..=87) | (208..=215) => { + let (s0, s1): (u64, u64) = (self.val(s0), self.vec_reg.read64(s1)); + self.cmpi(s0 as i64, s1 as i64, op - 80 - dest_offset) + } + (88..=95) | (216..=223) => { + let (s0, s1): (u64, u64) = (self.val(s0), self.vec_reg.read64(s1)); + self.cmpi(s0, s1, op - 88 - dest_offset) + } + _ => todo_instr!(instruction)?, + }; + + match op >= 128 { + true => self.exec.set_lane(ret), + false => self.vcc.set_lane(ret), + }; + } + // vop2 + else if instruction >> 31 == 0b0 { + let s0 = (instruction & 0x1FF) as usize; + let s1 = self.vec_reg[((instruction >> 9) & 0xFF) as usize]; + let vdst = ((instruction >> 17) & 0xFF) as usize; + let op = (instruction >> 25) & 0x3F; + + print_instr!("VOP2", s0, s1, vdst, op); + + match op { + (50..=60) => { + let (s0, s1) = (f16::from_bits(self.val(s0)), f16::from_bits(s1 as u16)); + let ret = match op { + 50 => s0 + s1, + 51 => s0 - s1, + 53 => s0 * s1, + 54 => f16::mul_add(s0, s1, f16::from_bits(self.vec_reg[vdst] as u16)), + 55 => f16::mul_add(s0, f16::from_bits(self.val(SIMM_SRC)), s1), + 56 => f16::mul_add(s0, s1, f16::from_bits(self.val(SIMM_SRC))), + 57 => f16::max(s0, s1), + 58 => f16::min(s0, s1), + _ => todo_instr!(instruction)?, + }; + if self.exec.read() { + self.vec_reg[vdst] = ret.to_bits() as u32; + } + } + _ => { + let s0 = self.val(s0); + let ret = match op { + 1 => match self.vcc.read() { + true => s1, + false => s0, + }, + 2 => { + let mut acc = f32::from_bits(self.vec_reg[vdst]); + acc += f32::from(f16_lo(s0)) * f32::from(f16_lo(s1)); + acc += f32::from(f16_hi(s0)) * f32::from(f16_hi(s1)); + acc.to_bits() + } + + 3 | 4 | 5 | 8 | 15 | 16 | 43 | 44 | 45 => { + let (s0, s1) = (f32::from_bits(s0), f32::from_bits(s1)); + match op { + 3 => s0 + s1, + 4 => s0 - s1, + 5 => s1 - s0, + 8 => s0 * s1, + 15 => f32::min(s0, s1), + 16 => f32::max(s0, s1), + 43 => f32::mul_add(s0, s1, f32::from_bits(self.vec_reg[vdst])), + 44 => f32::mul_add(s0, f32::from_bits(self.val(SIMM_SRC)), s1), + 45 => f32::mul_add(s0, s1, f32::from_bits(self.val(SIMM_SRC))), + _ => todo_instr!(instruction)?, + } + .to_bits() + } + 9 => { + let s0 = sign_ext((s0 & 0xffffff) as u64, 24) as i32; + let s1 = sign_ext((s1 & 0xffffff) as u64, 24) as i32; + (s0 * s1) as u32 + } + 17 | 18 | 26 => { + let (s0, s1) = (s0 as i32, s1 as i32); + (match op { + 17 => i32::min(s0, s1), + 18 => i32::max(s0, s1), + 26 => s1 >> s0, + _ => todo_instr!(instruction)?, + }) as u32 + } + 32 => { + let temp = s0 as u64 + s1 as u64 + self.vcc.read() as u64; + self.vcc.set_lane(temp >= 0x100000000); + temp as u32 + } + 33 | 34 => { + let temp = match op { + 33 => s0 - s1 - self.vcc.read() as u32, + 34 => s1 - s0 - self.vcc.read() as u32, + _ => todo_instr!(instruction)?, + }; + self.vcc.set_lane((s1 as u64 + self.vcc.read() as u64) > s0 as u64); + temp + } + 11 => s0 * s1, + 19 => u32::min(s0, s1), + 20 => u32::max(s0, s1), + 24 => s1 << s0, + 25 => s1 >> s0, + 27 => s0 & s1, + 28 => s0 | s1, + 29 => s0 ^ s1, + 37 => s0 + s1, + 38 => s0 - s1, + 39 => s1 - s0, + _ => todo_instr!(instruction)?, + }; + if self.exec.read() { + self.vec_reg[vdst] = ret; + } + } + }; + } + // vop3 + else if instruction >> 26 == 0b110101 { + let instr = self.u64_instr(); + + let op = ((instr >> 16) & 0x3ff) as u32; + match op { + 764 | 765 | 288 | 289 | 290 | 766 | 768 | 769 => { + let vdst = (instr & 0xff) as usize; + let sdst = ((instr >> 8) & 0x7f) as usize; + let f = |i: u32| -> usize { ((instr >> i) & 0x1ff) as usize }; + let (s0, s1, s2) = (f(32), f(41), f(50)); + let mut carry_in = WaveValue::new(self.val(s2), self.warp_size); + carry_in.default_lane = self.vcc.default_lane; + let omod = (instr >> 59) & 0x3; + let _neg = (instr >> 61) & 0x7; + let clmp = (instr >> 15) & 0x1; + assert_eq!(omod, 0); + assert_eq!(clmp, 0); + + print_instr!("VOPSD", vdst, sdst, op, s0, s1, s2); + + let vcc = match op { + 766 => { + let (s0, s1, s2): (u32, u32, u64) = (self.val(s0), self.val(s1), self.val(s2)); + let (mul_result, overflow_mul) = (s0 as u64).overflowing_mul(s1 as u64); + let (ret, overflow_add) = mul_result.overflowing_add(s2); + let overflowed = overflow_mul || overflow_add; + if self.exec.read() { + self.vec_reg.write64(vdst, ret); + } + overflowed + } + 765 => { + assert!(f64::from_bits(self.val(s2)).exponent() <= 1076); + let ret = ldexp(self.val(s0), 128); + if self.exec.read() { + self.vec_reg.write64(vdst, ret.to_bits()); + } + false + } + _ => { + let (s0, s1, _s2): (u32, u32, u32) = (self.val(s0), self.val(s1), self.val(s2)); + let (ret, vcc) = match op { + 288 => { + let ret = s0 as u64 + s1 as u64 + carry_in.read() as u64; + (ret as u32, ret >= 0x100000000) + } + 289 => { + let ret = (s0 as u64).wrapping_sub(s1 as u64).wrapping_sub(carry_in.read() as u64); + (ret as u32, s1 as u64 + (carry_in.read() as u64) > s0 as u64) + } + 290 => { + let ret = (s1 as u64).wrapping_sub(s0 as u64).wrapping_sub(carry_in.read() as u64); + (ret as u32, s1 as u64 + (carry_in.read() as u64) > s0 as u64) + } + 764 => (0, false), // NOTE: div scaling isn't required + 768 => { + let ret = s0 as u64 + s1 as u64; + (ret as u32, ret >= 0x100000000) + } + 769 => { + let ret = s0.wrapping_sub(s1); + (ret as u32, s1 > s0) + } + _ => todo_instr!(instruction)?, + }; + if self.exec.read() { + self.vec_reg[vdst] = ret; + } + vcc + } + }; + + match sdst { + 106 => self.vcc.set_lane(vcc), + 124 => {} + _ => self.set_sgpr_co(sdst, vcc), + } + } + _ => { + let vdst = (instr & 0xff) as usize; + let abs = ((instr >> 8) & 0x7) as usize; + let opsel = ((instr >> 11) & 0xf) as usize; + let cm = (instr >> 15) & 0x1; + + let s = |n: usize| ((instr >> n) & 0x1ff) as usize; + let src = (s(32), s(41), s(50)); + + let omod = (instr >> 59) & 0x3; + let neg = ((instr >> 61) & 0x7) as usize; + assert_eq!(omod, 0); + assert_eq!(cm, 0); + assert_eq!(opsel, 0); + + print_instr!("VOP3", vdst, abs, opsel, op, src, neg); + + match op { + // VOPC using VOP3 encoding + 0..=255 => { + let dest_offset = if op >= 128 { 128 } else { 0 }; + let ret = match op { + (0..=15) | 125 | (128..=143) => { + let (s0, s1) = (self.val(src.0), self.val(src.1)); + let s0 = f16::from_bits(s0).negate(0, neg).absolute(0, abs); + let s1 = f16::from_bits(s1).negate(1, neg).absolute(1, abs); + match op { + 125 => self.cmp_class_f16(s0, s1.to_bits()), + _ => self.cmpf(s0, s1, op - dest_offset), + } + } + (16..=31) | 126 | (144..=159) => { + let (s0, s1) = (self.val(src.0), self.val(src.1)); + let s0 = f32::from_bits(s0).negate(0, neg).absolute(0, abs); + let s1 = f32::from_bits(s1).negate(1, neg).absolute(1, abs); + match op { + 126 => self.cmp_class_f32(s0, s1.to_bits()), + _ => self.cmpf(s0, s1, op - 16 - dest_offset), + } + } + (32..=47) | 127 | (160..=174) => { + let s0: f64 = self.val(src.0); + let s0 = s0.negate(0, neg).absolute(0, abs); + match op { + 127 => { + let s1 = self.val(src.1); + self.cmp_class_f64(s0, s1) + } + _ => { + let s1 = self.val(src.1); + let s1 = f64::from_bits(s1).negate(1, neg).absolute(1, abs); + self.cmpf(s0, s1, op - 32 - dest_offset) + } + } + } + (49..=54) | (177..=182) => { + let (s0, s1): (u16, u16) = (self.val(src.0), self.val(src.1)); + self.cmpi(s0 as i16, s1 as i16, op - 48 - dest_offset) + } + (57..=62) | (185..=190) => { + let (s0, s1): (u16, u16) = (self.val(src.0), self.val(src.1)); + self.cmpi(s0, s1, op - 56 - dest_offset) + } + (64..=71) | (192..=199) => { + let (s0, s1): (u32, u32) = (self.val(src.0), self.val(src.1)); + self.cmpi(s0 as i32, s1 as i32, op - 64 - dest_offset) + } + (72..=79) | (200..=207) => { + let (s0, s1): (u32, u32) = (self.val(src.0), self.val(src.1)); + self.cmpi(s0, s1, op - 72 - dest_offset) + } + (80..=87) | (208..=215) => { + let (s0, s1): (u64, u64) = (self.val(src.0), self.val(src.1)); + self.cmpi(s0 as i64, s1 as i64, op - 80 - dest_offset) + } + (88..=95) | (216..=223) => { + let (s0, s1): (u64, u64) = (self.val(src.0), self.val(src.1)); + self.cmpi(s0, s1, op - 88 - dest_offset) + } + _ => todo_instr!(instruction)?, + }; + + match vdst { + 0..=SGPR_COUNT | 107 => self.set_sgpr_co(vdst, ret), + 106 => self.vcc.set_lane(ret), + 126 => self.exec.set_lane(ret), + _ => todo_instr!(instruction)?, + } + } + 828..=830 => { + let (s0, s1, _s2): (u32, u64, u64) = (self.val(src.0), self.val(src.1), self.val(src.2)); + let shift = s0 & 0x3f; + let ret = match op { + 828 => s1 << shift, + 829 => s1 >> shift, + 830 => ((s1 as i64) >> shift) as u64, + _ => todo_instr!(instruction)?, + }; + if self.exec.read() { + self.vec_reg.write64(vdst, ret) + } + } + 407 | 532 | 552 | 568 | (807..=811) => { + let (_s0, _s1, _s2): (f64, f64, f64) = (self.val(src.0), self.val(src.1), self.val(src.2)); + let s0 = _s0.negate(0, neg).absolute(0, abs); + let s1 = _s1.negate(1, neg).absolute(1, abs); + let s2 = _s2.negate(2, neg).absolute(2, abs); + let ret = match op { + 407 => f64::trunc(s0), + 532 => f64::mul_add(s0, s1, s2), + 552 => { + assert!(s0.is_normal()); + s0 + } + 807 => s0 + s1, + 808 => s0 * s1, + 809 => f64::min(s0, s1), + 810 => f64::max(s0, s1), + 811 => { + let s1: u32 = self.val(src.1); + s0 * 2f64.powi(s1 as i32) + } + 568 => { + assert!(!self.vcc.read()); + f64::mul_add(s0, s1, s2) + } + _ => todo_instr!(instruction)?, + } + .to_bits(); + if self.exec.read() { + self.vec_reg.write64(vdst, ret) + } + } + 306 | 313 | 596 | 584 | 585 | 588 => { + let (s0, s1, s2) = (self.val(src.0), self.val(src.1), self.val(src.2)); + let s0 = f16::from_bits(s0).negate(0, neg).absolute(0, abs); + let s1 = f16::from_bits(s1).negate(1, neg).absolute(1, abs); + let s2 = f16::from_bits(s2).negate(1, neg).absolute(1, abs); + let ret = match op { + 306 => s0 + s1, + 584 => f16::mul_add(s0, s1, s2), + 585 => f16::min(f16::min(s0, s1), s2), + 588 => f16::max(f16::max(s0, s1), s2), + 596 => s2 / s1, + 313 => f16::max(s0, s1), + 314 => f16::min(s0, s1), + _ => todo_instr!(instruction)?, + } + .to_bits(); + if self.exec.read() { + self.vec_reg[vdst] = ret as u32; + } + } + 394 => { + let s0 = f32::from_bits(self.val(src.0)).negate(0, neg).absolute(0, abs); + if self.exec.read() { + self.vec_reg[vdst].mut_lo16(f16::from_f32(s0).to_bits()); + } + } + 467 => { + let s0 = f16::from_bits(self.val(src.0)).negate(0, neg).absolute(0, abs); + if self.exec.read() { + self.vec_reg[vdst] = s0.to_f32() as i16 as u32; + } + } + 395 => { + let s0 = f16::from_bits(self.val(src.0)).negate(0, neg).absolute(0, abs); + if self.exec.read() { + self.vec_reg[vdst] = f32::from(s0).to_bits(); + } + } + 399 => { + let s0: f64 = self.val(src.0); + let s0 = s0.negate(0, neg).absolute(0, abs); + if self.exec.read() { + self.vec_reg[vdst] = (s0 as f32).to_bits(); + } + } + 785 => { + let (s0, s1) = (self.val(src.0), self.val(src.1)); + if self.exec.read() { + self.vec_reg[vdst] = (f16::from_bits(s1).to_bits() as u32) << 16 | f16::from_bits(s0).to_bits() as u32; + } + } + _ => { + let (s0, s1, s2) = (self.val(src.0), self.val(src.1), self.val(src.2)); + match op { + 865 => { + if self.exec.read() { + self.vec_reg.get_lane_mut(s1 as usize)[vdst] = s0; + } + return Ok(()); + } + 864 => { + let val = self.vec_reg.get_lane(s1 as usize)[src.0 - VGPR_COUNT]; + self.write_to_sdst(vdst, val); + return Ok(()); + } + 826 => { + if self.exec.read() { + self.vec_reg[vdst].mut_lo16(((s1 as i16) >> (s0 & 0xf)) as u16); + } + return Ok(()); + } + 587 | 577 | 590 | 771 | 772 | 773 | 777 | 779 | 824 | 825 => { + let (s0, s1, s2) = (s0 as u16, s1 as u16, s2 as u16); + let ret = match op { + 587 => u16::min(u16::min(s0, s1), s2), + 590 => u16::max(u16::max(s0, s1), s2), + 577 => s0 * s1 + s2, + 771 => s0 + s1, + 772 => s0 - s1, + 773 => s0 * s1, + 777 => u16::max(s0, s1), + 779 => u16::min(s0, s1), + 824 => s1 << s0, + 825 => s1 >> s0, + _ => todo_instr!(instruction)?, + }; + if self.exec.read() { + self.vec_reg[vdst].mut_lo16(ret); + } + return Ok(()); + } + 586 | 589 | 778 | 780 | 781 | 782 => { + let (s0, s1, s2) = (s0 as i16, s1 as i16, s2 as i16); + let ret = match op { + 586 => i16::min(i16::min(s0, s1), s2), + 589 => i16::max(i16::max(s0, s1), s2), + 778 => i16::max(s0, s1), + 780 => i16::min(s0, s1), + 781 => s0 + s1, + 782 => s0 - s1, + _ => todo_instr!(instruction)?, + }; + if self.exec.read() { + self.vec_reg[vdst].mut_lo16(ret as u16); + } + return Ok(()); + } + _ => {} + } + + let ret = match op { + 257 | 259 | 299 | 260 | 261 | 264 | 272 | 392 | 531 | 537 | 540 | 551 | 567 | 796 => { + let s0 = f32::from_bits(s0).negate(0, neg).absolute(0, abs); + let s1 = f32::from_bits(s1).negate(1, neg).absolute(1, abs); + let s2 = f32::from_bits(s2).negate(2, neg).absolute(2, abs); + match op { + 259 => s0 + s1, + 260 => s0 - s1, + 261 => s1 - s0, + 264 => s0 * s1, + 272 => f32::max(s0, s1), + 299 => f32::mul_add(s0, s1, f32::from_bits(self.vec_reg[vdst])), + 531 => f32::mul_add(s0, s1, s2), + 537 => f32::min(f32::min(s0, s1), s2), + 540 => f32::max(f32::max(s0, s1), s2), + 551 => s2 / s1, + 567 => { + let ret = f32::mul_add(s0, s1, s2); + match self.vcc.read() { + true => 2.0_f32.powi(32) * ret, + false => ret, + } + } + 796 => s0 * 2f32.powi(s1.to_bits() as i32), + // cnd_mask isn't a float only ALU but supports neg + 257 => { + let mut cond = WaveValue::new(s2.to_bits(), self.warp_size); + cond.default_lane = self.vcc.default_lane; + match cond.read() { + true => s1, + false => s0, + } + } + 392 => f32::from_bits(s0 as i32 as u32), + _ => todo_instr!(instruction)?, + } + .to_bits() + } + _ => { + if neg != 0 { + todo_instr!(instruction)? + } + match op { + 529 => { + let s0 = s0 as i32; + let shift = 32 - (s2 & 0x1f); + let mask: i32 = 1 << (s2 & 0x1f); + let ret = (s0 >> (s1 & 0x1f)) & (mask.wrapping_sub(1)); + ((ret << shift) >> shift) as u32 + } + 522 | 541 | 538 | 544 | 814 => { + let (s0, s1, s2) = (s0 as i32, s1 as i32, s2 as i32); + + (match op { + 522 => { + let s0 = sign_ext((s0 & 0xffffff) as u64, 24) as i32; + let s1 = sign_ext((s1 & 0xffffff) as u64, 24) as i32; + s0 * s1 + s2 + } + 538 => i32::min(i32::min(s0, s1), s2), + 541 => i32::max(i32::max(s0, s1), s2), + 544 => { + if (i32::max(i32::max(s0, s1), s2)) == s0 { + i32::max(s1, s2) + } else if (i32::max(i32::max(s0, s1), s2)) == s1 { + i32::max(s0, s2) + } else { + i32::max(s0, s1) + } + } + 814 => ((s0 as i64) * (s1 as i64) >> 32) as i32, + _ => todo_instr!(instruction)?, + }) as u32 + } + 275 => u32::min(s0, s1), + 276 => u32::max(s0, s1), + 280 => s1 << s0, + 281 => s1 >> s0, + 283 => s0 & s1, + 284 => s0 | s1, + 285 => s0 ^ s1, + 286 => !(s0 ^ s1), + 523 => s0 * s1 + s2, // TODO 24 bit trunc + 528 => (s0 >> s1) & ((1 << s2) - 1), + 530 => (s0 & s1) | (!s0 & s2), + 534 => { + let val = ((s0 as u64) << 32) | (s1 as u64); + let shift = (s2 & 0x1F) as u64; + ((val >> shift) & 0xffffffff) as u32 + } + 542 => u32::max(u32::max(s0, s1), s2), + 576 => s0 ^ s1 ^ s2, + 580 => { + fn byte_permute(data: u64, sel: u32) -> u8 { + let bytes = data.to_ne_bytes(); + match sel { + 13..=u32::MAX => 0xff, + 12 => 0x00, + 11 => ((bytes[7] & 0x80) != 0) as u8 * 0xff, + 10 => ((bytes[5] & 0x80) != 0) as u8 * 0xff, + 9 => ((bytes[3] & 0x80) != 0) as u8 * 0xff, + 8 => ((bytes[1] & 0x80) != 0) as u8 * 0xff, + _ => bytes[sel as usize], + } + } + let combined = ((s0 as u64) << 32) | s1 as u64; + let d0 = ((byte_permute(combined, s2 >> 24) as u32) << 24) + | ((byte_permute(combined, (s2 >> 16) & 0xFF) as u32) << 16) + | ((byte_permute(combined, (s2 >> 8) & 0xFF) as u32) << 8) + | (byte_permute(combined, s2 & 0xFF) as u32); + d0 + } + 581 => (s0 ^ s1) + s2, + 582 => (s0 << s1) + s2, + 583 => (s0 + s1) << s2, + 597 => s0 + s1 + s2, + 598 => (s0 << s1) | s2, + 599 => (s0 & s1) | s2, + 600 => s0 | s1 | s2, + 798 => { + let mut ret = s1; + (0..=31).into_iter().for_each(|i| ret += nth(s0, i)); + ret + } + 812 => s0 * s1, + 813 => ((s0 as u64) * (s1 as u64) >> 32) as u32, + _ => todo_instr!(instruction)?, + } + } + }; + if self.exec.read() { + self.vec_reg[vdst] = ret; + } + } + }; + } + } + } else if instruction >> 26 == 0b110110 { + let instr = self.u64_instr(); + if !self.exec.read() { + return Ok(()); + } + let op = (instr >> 18) & 0xff; + assert_eq!((instr >> 17) & 0x1, 0); + let addr = ((instr >> 32) & 0xff) as usize; + let data0 = ((instr >> 40) & 0xff) as usize; + let data1 = ((instr >> 48) & 0xff) as usize; + let vdst = ((instr >> 56) & 0xff) as usize; + + print_instr!("LDS", op, addr, data0, data1, vdst); + + let lds_base = self.vec_reg[addr]; + let single_addr = || (lds_base + (instr & 0xffff) as u32) as usize; + let double_addr = |adj: u32| { + let addr0 = lds_base + (instr & 0xff) as u32 * adj; + let addr1 = lds_base + ((instr >> 8) & 0xff) as u32 * adj; + (addr0 as usize, addr1 as usize) + }; + + match op { + // load + 54 | 118 | 255 => { + let dwords = match op { + 255 => 4, + 118 => 2, + _ => 1, + }; + (0..dwords).for_each(|i| { + self.vec_reg[vdst + i] = self.lds.read(single_addr() + 4 * i); + }); + } + 60 => self.vec_reg[vdst] = self.lds.read(single_addr()) as u16 as u32, + 55 => { + let (addr0, addr1) = double_addr(4); + self.vec_reg[vdst] = self.lds.read(addr0); + self.vec_reg[vdst + 1] = self.lds.read(addr1); + } + 119 => { + let (addr0, addr1) = double_addr(8); + self.vec_reg.write64(vdst, self.lds.read64(addr0)); + self.vec_reg.write64(vdst + 2, self.lds.read64(addr1)); + } + // store + 13 | 77 | 223 => { + let dwords = match op { + 223 => 4, + 77 => 2, + _ => 1, + }; + (0..dwords).for_each(|i| { + self.lds.write(single_addr() + 4 * i, self.vec_reg[data0 + i]); + }) + } + 30 => { + let addr = single_addr(); + if addr + 1 >= self.lds.data.len() { + self.lds.data.resize(self.lds.data.len() + addr + 2, 0); + } + self.lds.data[addr..addr + 1].iter_mut().enumerate().for_each(|(i, x)| { + *x = (self.vec_reg[data0] as u8).to_le_bytes()[i]; + }); + } + 31 => { + let addr = single_addr(); + if addr + 2 >= self.lds.data.len() { + self.lds.data.resize(self.lds.data.len() + addr + 3, 0); + } + self.lds.data[addr..addr + 2].iter_mut().enumerate().for_each(|(i, x)| { + *x = (self.vec_reg[data0] as u16).to_le_bytes()[i]; + }); + } + 14 => { + let (addr0, addr1) = double_addr(4); + self.lds.write(addr0, self.vec_reg[data0]); + self.lds.write(addr1, self.vec_reg[data1]); + } + 78 => { + let (addr0, addr1) = double_addr(8); + self.lds.write64(addr0, self.vec_reg.read64(data0)); + self.lds.write64(addr1, self.vec_reg.read64(data1)); + } + _ => todo_instr!(instruction)?, + } + } + // global + // flat + else if instruction >> 26 == 0b110111 { + let instr = self.u64_instr(); + if !self.exec.read() { + return Ok(()); + } + let offset = sign_ext(instr & 0x1fff, 13); + let seg = (instr >> 16) & 0x3; + let op = ((instr >> 18) & 0x7f) as usize; + let addr = ((instr >> 32) & 0xff) as usize; + let data = ((instr >> 40) & 0xff) as usize; + let saddr = ((instr >> 48) & 0x7f) as usize; + let vdst = ((instr >> 56) & 0xff) as usize; + + let saddr_val: u32 = self.val(saddr); + let saddr_off = saddr_val == 0x7F || saddr as u32 == NULL_SRC; + + match seg { + 1 => { + let sve = ((instr >> 50) & 0x1) != 0; + + print_instr!("SCRATCH", offset, op, addr, data, saddr, vdst, sve); + + let addr = match (sve, saddr_off) { + (true, true) => offset as u64 as usize, + (false, false) => saddr_val as usize, + _ => todo_instr!(instruction)?, + }; + match op { + // load + 20..=23 => (0..op - 19).for_each(|i| { + self.vec_reg[vdst + i] = self.sds.read(addr + 4 * i); + }), + // store + 26..=29 => (0..op - 25).for_each(|i| { + self.sds.write(addr + 4 * i, self.vec_reg[data + i]); + }), + _ => todo_instr!(instruction)?, + } + } + 2 => { + print_instr!("GLOBAL", offset, op, addr, data, saddr, vdst); + + let addr = match saddr_off { + true => self.vec_reg.read64(addr) as i64 + (offset as i64), + false => { + let scalar_addr = self.scalar_reg.read64(saddr); + let vgpr_offset = self.vec_reg[addr]; + scalar_addr as i64 + vgpr_offset as i64 + offset + } + } as u64; + + unsafe { + match op { + // load + 16 => self.vec_reg[vdst] = *(addr as *const u8) as u32, + 17 => self.vec_reg[vdst] = *(addr as *const i8) as u32, + 18 => self.vec_reg[vdst] = *(addr as *const u16) as u32, + 19 => self.vec_reg[vdst] = *(addr as *const i16) as u32, + + 20..=23 => (0..op - 19).for_each(|i| { + self.vec_reg[vdst + i] = *((addr + 4 * i as u64) as *const u32); + }), + 32 => self.vec_reg[vdst].mut_lo16(*(addr as *const u16)), + 35 => self.vec_reg[vdst].mut_hi16(*(addr as *const u16)), + // store + 24 => *(addr as *mut u8) = self.vec_reg[data] as u8, + 25 => *(addr as *mut u16) = self.vec_reg[data] as u16, + 26..=29 => (0..op - 25).for_each(|i| { + *((addr + 4 * i as u64) as u64 as *mut u32) = self.vec_reg[data + i]; + }), + 37 => *(addr as *mut u16) = ((self.vec_reg[data] >> 16) & 0xffff) as u16, + _ => todo_instr!(instruction)?, + }; + } + } + _ => todo_instr!(instruction)?, + }; + } + // mubuf + else if instruction >> 26 == 0b111000 { + let instr = self.u64_instr(); + let op = ((instr >> 18) & 0x7f) as usize; + print_instr!("MUBUF", op); + match op { + 43 => {} // NOTE: remu doesn't have an l0 cache, it just has the software managed lds + _ => todo_instr!(instruction)?, + }; + } else { + todo_instr!(instruction)?; + } + Ok(()) + } + + fn cmpf(&self, s0: T, s1: T, offset: u32) -> bool + where + T: Float + std::fmt::Display, + { + return match offset { + 0 => true, + 1 => s0 < s1, + 2 => s0 == s1, + 3 => s0 <= s1, + 4 => s0 > s1, + 5 => s0 != s1, + 6 => s0 >= s1, + 7 => (!s0.is_nan()) && (!s1.is_nan()), + 8 => s0.is_nan() || s1.is_nan(), + 9 => !(s0 >= s1), + 10 => !(s0 != s1), + 11 => !(s0 > s1), + 12 => !(s0 <= s1), + 13 => !(s0 == s1), + 14 => !(s0 < s1), + 15 => true, + _ => panic!("invalid offset for float compare {offset}"), + }; + } + fn cmp_class_f64(&self, s0: f64, s1: u32) -> bool { + let offset = match s0 { + _ if s0.is_nan() => 1, + _ if s0.is_infinite() => match s0.signum() == -1.0 { + true => 2, + false => 9, + }, + _ if s0.exponent() > 0 => match s0.signum() == -1.0 { + true => 3, + false => 8, + }, + _ if s0.abs() > 0.0 => match s0.signum() == -1.0 { + true => 4, + false => 7, + }, + _ => match s0.signum() == -1.0 { + true => 5, + false => 6, + }, + }; + ((s1 >> offset) & 1) != 0 + } + fn cmp_class_f32(&self, s0: f32, s1: u32) -> bool { + let offset = match s0 { + _ if (s0 as f64).is_nan() => 1, + _ if s0.exponent() == 255 => match s0.signum() == -1.0 { + true => 2, + false => 9, + }, + _ if s0.exponent() > 0 => match s0.signum() == -1.0 { + true => 3, + false => 8, + }, + _ if s0.abs() as f64 > 0.0 => match s0.signum() == -1.0 { + true => 4, + false => 7, + }, + _ => match s0.signum() == -1.0 { + true => 5, + false => 6, + }, + }; + ((s1 >> offset) & 1) != 0 + } + fn cmp_class_f16(&self, s0: f16, s1: u16) -> bool { + let offset = match s0 { + _ if (f64::from(s0)).is_nan() => 1, + _ if s0.exponent() == 31 => match s0.signum() == f16::NEG_ONE { + true => 2, + false => 9, + }, + _ if s0.exponent() > 0 => match s0.signum() == f16::NEG_ONE { + true => 3, + false => 8, + }, + _ if f64::from(s0.abs()) > 0.0 => match s0.signum() == f16::NEG_ONE { + true => 4, + false => 7, + }, + _ => match s0.signum() == f16::NEG_ONE { + true => 5, + false => 6, + }, + }; + ((s1 >> offset) & 1) != 0 + } + fn cmpi(&self, s0: T, s1: T, offset: u32) -> bool + where + T: PartialOrd + PartialEq, + { + return match offset { + 0 => false, + 1 => s0 < s1, + 2 => s0 == s1, + 3 => s0 <= s1, + 4 => s0 > s1, + 5 => s0 != s1, + 6 => s0 >= s1, + 7 => true, + _ => panic!("invalid offset for integer compare {offset}"), + }; + } + fn cls_i32(&self, s0: u32) -> u32 { + let mut ret: i32 = -1; + let s0 = s0 as i32; + for i in (1..=31).into_iter() { + if s0 >> (31 - i as u32) != s0 >> 31 { + ret = i; + break; + } + } + ret as u32 + } + fn clz_i32_u32(&self, s0: u32) -> u32 { + let mut ret: i32 = -1; + for i in (0..=31).into_iter() { + if s0 >> (31 - i as u32) == 1 { + ret = i; + break; + } + } + ret as u32 + } + + /* ALU utils */ + fn _common_srcs(&mut self, code: usize) -> u32 { + match code { + 106 => self.vcc.value, + 107 => self.scalar_reg[code as usize], + 126 => self.exec.value, + 128 => 0, + 124 => NULL_SRC, + 255 => match self.simm { + None => { + let val = self.stream[self.pc_offset + 1]; + self.simm = Some(val); + self.pc_offset += 1; + val + } + Some(val) => val, + }, + _ => todo!("resolve_src={code}"), + } + } + fn write_to_sdst(&mut self, sdst_bf: usize, val: u32) { + match sdst_bf { + // NOTE: remu is only wave32, vcc_hi is treated as a regular SGPR + 0..=SGPR_COUNT | 107 => self.scalar_reg[sdst_bf] = val, + 106 => self.vcc.value = val, + 126 => self.exec.value = val, + _ => todo!("write to sdst {}", sdst_bf), + } + } + fn set_sgpr_co(&mut self, idx: usize, val: bool) { + let mut wv = self.sgpr_co.map(|(_, wv)| wv).unwrap_or_else(|| WaveValue::new(0, self.warp_size)); + wv.default_lane = self.vcc.default_lane; + wv.set_lane(val); + *self.sgpr_co = Some((idx, wv)); + } + + fn u64_instr(&mut self) -> u64 { + let msb = self.stream[self.pc_offset + 1] as u64; + let instr = msb << 32 | self.stream[self.pc_offset] as u64; + self.pc_offset += 1; + return instr; + } + + fn wmma_b16_16x16(&'a self, vsrc: usize) -> impl Iterator + 'a { + (0..16).flat_map(move |i| { + let lane = self.vec_reg.get_lane(i); + (vsrc..=vsrc + 7).flat_map(move |j| { + let val = lane[j - VGPR_COUNT]; + [(val & 0xffff) as u16, (val >> 16) as u16] + }) + }) + } + + fn wmma_b32_16x16(&'a self, vsrc: usize) -> impl Iterator + 'a { + (0..256).map(move |i| self.vec_reg.get_lane(i % 32)[(i / 32) + vsrc - VGPR_COUNT]) + } +} + +fn wmma(a: Vec, b: Vec, c: Vec) -> [T; 256] { + let mut ret = [T::zero(); 256]; + for row in 0..16 { + for col in 0..16 { + let mut sum = T::zero(); + for k in 0..16 { + let a_val = a[row * 16 + k]; + let b_val = b[col * 16 + k]; + sum = sum + (a_val * b_val); + } + let c_val = c[row * 16 + col]; + ret[row * 16 + col] = sum + c_val; + } + } + ret +} + +pub trait ALUSrc { + fn val(&mut self, code: usize) -> T; +} +impl ALUSrc for Thread<'_> { + fn val(&mut self, code: usize) -> u16 { + match code { + 0..=SGPR_COUNT => self.scalar_reg[code] as u16, + VGPR_COUNT..=511 => self.vec_reg[code - VGPR_COUNT] as u16, + 129..=192 => (code - 128) as u16, + 193..=208 => ((code - 192) as i16 * -1) as u16, + 240..=247 => f16::from_f32( + [ + (240, 0.5_f32), + (241, -0.5_f32), + (242, 1_f32), + (243, -1.0_f32), + (244, 2.0_f32), + (245, -2.0_f32), + (246, 4.0_f32), + (247, -4.0_f32), + ] + .iter() + .find(|x| x.0 == code) + .unwrap() + .1, + ) + .to_bits(), + _ => self._common_srcs(code) as u16, + } + } +} +impl ALUSrc for Thread<'_> { + fn val(&mut self, code: usize) -> u32 { + match code { + 0..=SGPR_COUNT => self.scalar_reg[code], + VGPR_COUNT..=511 => self.vec_reg[code - VGPR_COUNT], + 129..=192 => (code - 128) as u32, + 193..=208 => ((code - 192) as i32 * -1) as u32, + 240..=247 => [ + (240, 0.5_f32), + (241, -0.5_f32), + (242, 1_f32), + (243, -1.0_f32), + (244, 2.0_f32), + (245, -2.0_f32), + (246, 4.0_f32), + (247, -4.0_f32), + ] + .iter() + .find(|x| x.0 == code) + .unwrap() + .1 + .to_bits(), + _ => self._common_srcs(code), + } + } +} +impl ALUSrc for Thread<'_> { + fn val(&mut self, code: usize) -> u64 { + match code { + 0..=SGPR_COUNT => self.scalar_reg.read64(code), + VGPR_COUNT..=511 => self.vec_reg.read64(code - VGPR_COUNT), + 129..=192 => (code - 128) as u64, + 193..=208 => ((code - 192) as i64 * -1) as u64, + 240..=247 => [ + (240, 0.5_f64), + (241, -0.5_f64), + (242, 1_f64), + (243, -1.0_f64), + (244, 2.0_f64), + (245, -2.0_f64), + (246, 4.0_f64), + (247, -4.0_f64), + ] + .iter() + .find(|x| x.0 == code) + .unwrap() + .1 + .to_bits(), + _ => self._common_srcs(code) as u64, + } + } +} +impl ALUSrc for Thread<'_> { + fn val(&mut self, code: usize) -> f64 { + let uret: u64 = self.val(code); + match code { + SIMM_SRC => f64::from_bits(uret << 32), + _ => f64::from_bits(uret), + } + } +} + +#[cfg(test)] +mod test_alu_utils { + use super::*; + + #[test] + fn test_write_to_sdst_sgpr() { + let mut thread = _helper_test_thread(); + thread.write_to_sdst(10, 200); + assert_eq!(thread.scalar_reg[10], 200); + } + + #[test] + fn test_write_to_sdst_vcc_val() { + let mut thread = _helper_test_thread(); + let val = 0b1011101011011011111011101111; + thread.write_to_sdst(106, val); + assert_eq!(thread.vcc.value, 195935983); + } + + #[test] + fn test_clz_i32_u32() { + let thread = _helper_test_thread(); + assert_eq!(thread.clz_i32_u32(0x00000000), 0xffffffff); + assert_eq!(thread.clz_i32_u32(0x0000cccc), 16); + assert_eq!(thread.clz_i32_u32(0xffff3333), 0); + assert_eq!(thread.clz_i32_u32(0x7fffffff), 1); + assert_eq!(thread.clz_i32_u32(0x80000000), 0); + assert_eq!(thread.clz_i32_u32(0xffffffff), 0); + } + + #[test] + fn test_cls_i32() { + let thread = _helper_test_thread(); + assert_eq!(thread.cls_i32(0x00000000), 0xffffffff); + assert_eq!(thread.cls_i32(0x0000cccc), 16); + assert_eq!(thread.cls_i32(0xffff3333), 16); + assert_eq!(thread.cls_i32(0x7fffffff), 1); + assert_eq!(thread.cls_i32(0x80000000), 1); + } + + #[test] + fn test_sgpr_co_init() { + let mut thread = _helper_test_thread(); + thread.vcc.default_lane = Some(0); + thread.set_sgpr_co(10, true); + thread.vcc.default_lane = Some(1); + assert_eq!(thread.sgpr_co.unwrap().0, 10); + assert_eq!(thread.sgpr_co.unwrap().1.mutations.unwrap()[0], true); + thread.set_sgpr_co(10, true); + assert_eq!(thread.sgpr_co.unwrap().0, 10); + assert_eq!(thread.sgpr_co.unwrap().1.mutations.unwrap()[1], true); + assert_eq!(thread.sgpr_co.unwrap().1.mutations.unwrap()[0], true); + } +} + +#[cfg(test)] +mod test_smem { + use super::*; + + #[test] + fn test_s_load_b32_simple() { + let mut thread = _helper_test_thread(); + let mut buf = vec![0u8; 4]; + let a: u32 = 0xDEADBEEF; + unsafe { + *(buf.as_mut_ptr() as *mut u32) = a; + } + let base_addr = buf.as_ptr() as u64; + thread.scalar_reg.write64(0, base_addr); + r(&vec![0xF4000040, 0xF8000000, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[1], a); + std::mem::forget(buf); + } + + #[test] + fn test_s_load_b32_vcc() { + let mut thread = _helper_test_thread(); + let mut buf = vec![0u8; 4]; + let a: u32 = 0xDEADBEEF; + unsafe { + *(buf.as_mut_ptr() as *mut u32) = a; + } + let base_addr = buf.as_ptr() as u64; + thread.scalar_reg.write64(0, base_addr); + r(&vec![0xF4001A80, 0xF8000000, END_PRG], &mut thread); + assert_eq!(thread.vcc.value, a); + std::mem::forget(buf); + } + + #[test] + fn test_s_load_b32_vcc_addr() { + let mut thread = _helper_test_thread(); + let mut buf = vec![0u8; 4]; + let a: u32 = 0xDEADBEEF; + unsafe { + *(buf.as_mut_ptr() as *mut u32) = a; + } + let addr = buf.as_ptr() as u64; + // NOTE: vcc is an alias for s[106:107] + thread.scalar_reg.write64(106, addr); + // TODO: vcc_lo should just read from s106 + thread.vcc.value = (addr & 0xffffffff) as u32; + r(&vec![0xF4000035, 0xF8000000, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[0], a); + std::mem::forget(buf); + } +} + +#[cfg(test)] +mod test_sop1 { + use super::*; + + #[test] + fn test_s_brev_b32() { + let mut thread = _helper_test_thread(); + thread.scalar_reg[5] = 8; + r(&vec![0xBE850405, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[5], 268435456); + } + + #[test] + fn test_s_mov_b64() { + let mut thread = _helper_test_thread(); + thread.scalar_reg.write64(16, 5236523008); + r(&vec![0xBE880110, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg.read64(8), 5236523008); + assert_eq!(thread.scalar, true); + } + + #[test] + fn test_mov_exec() { + let mut thread = _helper_test_thread(); + thread.exec.value = 0b11111111110111111110111111111111; + r(&vec![0xBE80007E, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[0], 0b11111111110111111110111111111111); + } + + #[test] + fn test_s_mov_b32() { + let mut thread = _helper_test_thread(); + thread.scalar_reg[15] = 42; + r(&vec![0xbe82000f, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[2], 42); + } + + #[test] + fn test_s_bitset0_b32() { + [ + [ + 0b11111111111111111111111111111111, + 0b00000000000000000000000000000001, + 0b11111111111111111111111111111101, + ], + [ + 0b11111111111111111111111111111111, + 0b00000000000000000000000000000010, + 0b11111111111111111111111111111011, + ], + ] + .iter() + .for_each(|[a, b, ret]| { + let mut thread = _helper_test_thread(); + thread.scalar_reg[20] = *a; + thread.scalar_reg[10] = *b; + r(&vec![0xBE94100A, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[20], *ret); + }); + } + + #[test] + fn test_s_bitset1_b32() { + [ + [ + 0b00000000000000000000000000000000, + 0b00000000000000000000000000000001, + 0b00000000000000000000000000000010, + ], + [ + 0b00000000000000000000000000000000, + 0b00000000000000000000000000000010, + 0b00000000000000000000000000000100, + ], + ] + .iter() + .for_each(|[a, b, ret]| { + let mut thread = _helper_test_thread(); + thread.scalar_reg[20] = *a; + thread.scalar_reg[10] = *b; + r(&vec![0xbe94120a, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[20], *ret); + }); + } + + #[test] + fn test_s_not_b32() { + [[0, 4294967295, 1], [1, 4294967294, 1], [u32::MAX, 0, 0]] + .iter() + .for_each(|[a, ret, scc]| { + let mut thread = _helper_test_thread(); + thread.scalar_reg[10] = *a; + r(&vec![0xBE8A1E0A, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[10], *ret); + assert_eq!(*thread.scc, *scc); + }); + } +} + +#[cfg(test)] +mod test_sopk { + use super::*; + + #[test] + fn test_cmp_zero_extend() { + let mut thread = _helper_test_thread(); + thread.scalar_reg[20] = 0xcd14; + r(&vec![0xB494CD14, END_PRG], &mut thread); + assert_eq!(*thread.scc, 1); + + r(&vec![0xB194CD14, END_PRG], &mut thread); + assert_eq!(*thread.scc, 0); + } + + #[test] + fn test_cmp_sign_extend() { + let mut thread = _helper_test_thread(); + thread.scalar_reg[6] = 0x2db4; + r(&vec![0xB1862DB4, END_PRG], &mut thread); + assert_eq!(*thread.scc, 1); + + r(&vec![0xB1862DB4, END_PRG], &mut thread); + assert_eq!(*thread.scc, 1); + } +} + +#[cfg(test)] +mod test_sop2 { + use super::*; + + #[test] + fn test_xor_exec() { + let mut thread = _helper_test_thread(); + thread.exec.value = 0b10010010010010010010010010010010; + thread.scalar_reg[2] = 0b11111111111111111111111111111111; + r(&vec![0x8D02027E, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[2], 1840700269); + } + + #[test] + fn test_s_add_u32() { + [[10, 20, 30, 0], [u32::MAX, 10, 9, 1], [u32::MAX, 0, u32::MAX, 0]] + .iter() + .for_each(|[a, b, expected, scc]| { + let mut thread = _helper_test_thread(); + thread.scalar_reg[2] = *a; + thread.scalar_reg[6] = *b; + r(&vec![0x80060206, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[6], *expected); + assert_eq!(*thread.scc, *scc); + }); + } + + #[test] + fn test_s_addc_u32() { + [[10, 20, 31, 1, 0], [10, 20, 30, 0, 0], [u32::MAX, 10, 10, 1, 1]] + .iter() + .for_each(|[a, b, expected, scc_before, scc_after]| { + let mut thread = _helper_test_thread(); + *thread.scc = *scc_before; + thread.scalar_reg[7] = *a; + thread.scalar_reg[3] = *b; + r(&vec![0x82070307, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[7], *expected); + assert_eq!(*thread.scc, *scc_after); + }); + } + + #[test] + fn test_s_add_i32() { + [[-10, 20, 10, 0], [i32::MAX, 1, -2147483648, 1]] + .iter() + .for_each(|[a, b, expected, scc]| { + let mut thread = _helper_test_thread(); + thread.scalar_reg[14] = *a as u32; + thread.scalar_reg[10] = *b as u32; + r(&vec![0x81060E0A, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[6], *expected as u32); + assert_eq!(*thread.scc, *scc as u32); + }); + } + + #[test] + fn test_s_sub_i32() { + [[-10, 20, -30, 0], [i32::MAX, -1, -2147483648, 1]] + .iter() + .for_each(|[a, b, expected, scc]| { + let mut thread = _helper_test_thread(); + thread.scalar_reg[13] = *a as u32; + thread.scalar_reg[8] = *b as u32; + r(&vec![0x818C080D, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[12], *expected as u32); + assert_eq!(*thread.scc, *scc as u32); + }); + } + + #[test] + fn test_s_lshl_b32() { + [[20, 40, 1], [0, 0, 0]].iter().for_each(|[a, expected, scc]| { + let mut thread = _helper_test_thread(); + thread.scalar_reg[15] = *a as u32; + r(&vec![0x8408810F, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[8], *expected as u32); + assert_eq!(*thread.scc, *scc as u32); + }); + } + + #[test] + fn test_s_lshl_b64() { + let mut thread = _helper_test_thread(); + thread.scalar_reg.write64(2, u64::MAX - 30); + r(&vec![0x84828202, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[2], 4294967172); + assert_eq!(thread.scalar_reg[3], 4294967295); + assert_eq!(*thread.scc, 1); + } + + #[test] + fn test_s_ashr_i32() { + let mut thread = _helper_test_thread(); + thread.scalar_reg[2] = 36855; + r(&vec![0x86039F02, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[3], 0); + assert_eq!(*thread.scc, 0); + } + + #[test] + fn test_source_vcc() { + let mut thread = _helper_test_thread(); + thread.scalar_reg[10] = 0x55; + thread.vcc.value = 29; + r(&vec![0x8B140A6A, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[20], 21); + } + + #[test] + fn test_s_min_i32() { + let mut thread = _helper_test_thread(); + thread.scalar_reg[2] = -42i32 as u32; + thread.scalar_reg[3] = -92i32 as u32; + r(&vec![0x89020203, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[2], -92i32 as u32); + assert_eq!(*thread.scc, 1); + } + + #[test] + fn test_s_mul_hi_u32() { + [[u32::MAX, 10, 9], [u32::MAX / 2, 4, 1]].iter().for_each(|[a, b, expected]| { + let mut thread = _helper_test_thread(); + thread.scalar_reg[0] = *a; + thread.scalar_reg[8] = *b; + r(&vec![0x96810800, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[1], *expected); + }); + } + + #[test] + fn test_s_mul_hi_i32() { + [[(u64::MAX) as i32, (u64::MAX / 2) as i32, 0], [2, -2, -1]] + .iter() + .for_each(|[a, b, expected]| { + let mut thread = _helper_test_thread(); + thread.scalar_reg[0] = *a as u32; + thread.scalar_reg[8] = *b as u32; + r(&vec![0x97010800, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[1], *expected as u32); + }); + } + + #[test] + fn test_s_mul_i32() { + [[40, 2, 80], [-10, -10, 100]].iter().for_each(|[a, b, expected]| { + let mut thread = _helper_test_thread(); + thread.scalar_reg[0] = *a as u32; + thread.scalar_reg[6] = *b as u32; + r(&vec![0x96000600, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[0], *expected as u32); + }); + } + + #[test] + fn test_s_bfe_u64() { + [[2, 4, 2, 0], [800, 400, 32, 0], [-10i32 as u32, 3, 246, 0], [u32::MAX, u32::MAX, 255, 0]] + .iter() + .for_each(|[a_lo, a_hi, ret_lo, ret_hi]| { + let mut thread = _helper_test_thread(); + thread.scalar_reg[6] = *a_lo; + thread.scalar_reg[7] = *a_hi; + r(&vec![0x940cff06, 524288, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[12], *ret_lo); + assert_eq!(thread.scalar_reg[13], *ret_hi); + }); + } + + #[test] + fn test_s_bfe_i64() { + [[131073, 0, 1, 0, 0x100000], [-2, 0, -2, -1, 524288], [2, 0, 2, 0, 524288]] + .iter() + .for_each(|[a_lo, a_hi, ret_lo, ret_hi, shift]| { + let mut thread = _helper_test_thread(); + thread.scalar_reg[6] = *a_lo as u32; + thread.scalar_reg[7] = *a_hi as u32; + r(&vec![0x948cff06, *shift as u32, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[12], *ret_lo as u32); + assert_eq!(thread.scalar_reg[13], *ret_hi as u32); + }); + } + + #[test] + fn test_s_bfe_u32() { + [ + [67305985, 2], + [0b100000000110111111100000001, 0b1111111], + [0b100000000100000000000000001, 0b0], + [0b100000000111000000000000001, 0b10000000], + [0b100000000111111111100000001, 0b11111111], + ] + .iter() + .for_each(|[a, ret]| { + let mut thread = _helper_test_thread(); + thread.scalar_reg[0] = *a; + r(&vec![0x9303FF00, 0x00080008, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[3], *ret); + }); + } + + #[test] + fn test_s_pack_xx_b32_b16() { + let mut thread = _helper_test_thread(); + // ll + thread.scalar_reg[0] = 0x12345678; + r(&vec![0x9903ff00, 0x9ABCDEF0, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[3], 0xdef05678); + // lh + r(&vec![0x9983ff00, 0x9ABCDEF0, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[3], 0x9abc5678); + // hh + r(&vec![0x9a03ff00, 0x9ABCDEF0, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[3], 2596016692); + // hl + r(&vec![0x9a83ff00, 0x9ABCDEF0, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[3], 3740275252); + } +} + +#[cfg(test)] +mod test_sopc { + use super::*; + + #[test] + fn test_s_bitcmp0_b32() { + [[0b00, 0b1, 0], [0b01, 0b1, 1], [0b10, 0b1, 1], [0b10000000, 0b1, 0]] + .iter() + .for_each(|[s0, s1, scc]| { + let mut thread = _helper_test_thread(); + thread.scalar_reg[3] = *s0; + thread.scalar_reg[4] = *s1; + r(&vec![0xBF0C0304, END_PRG], &mut thread); + assert_eq!(*thread.scc, *scc); + }) + } +} + +#[cfg(test)] +mod test_vopd { + use super::*; + + #[test] + fn test_inline_const_vopx_only() { + let mut thread = _helper_test_thread(); + thread.vec_reg[0] = f32::to_bits(0.5); + let constant = f32::from_bits(0x39a8b099); + thread.vec_reg[1] = 10; + r(&vec![0xC8D000FF, 0x00000080, 0x39A8B099, END_PRG], &mut thread); + assert_eq!(f32::from_bits(thread.vec_reg[0]), 0.5 * constant); + assert_eq!(thread.vec_reg[1], 0); + } + + #[test] + fn test_inline_const_vopy_only() { + let mut thread = _helper_test_thread(); + thread.vec_reg[0] = 10; + thread.vec_reg[1] = 10; + r(&vec![0xCA100080, 0x000000FF, 0x3E15F480, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[0], 0); + assert_eq!(thread.vec_reg[1], 0x3e15f480); + + let mut thread = _helper_test_thread(); + thread.vec_reg[18] = f32::to_bits(2.0); + thread.vec_reg[32] = f32::to_bits(4.0); + thread.vec_reg[7] = 10; + r(&vec![0xC9204112, 0x00060EFF, 0x0000006E, END_PRG], &mut thread); + assert_eq!(f32::from_bits(thread.vec_reg[0]), 2.0f32 + 4.0f32); + assert_eq!(thread.vec_reg[7], 120); + } + + #[test] + fn test_inline_const_shared() { + let mut thread = _helper_test_thread(); + thread.vec_reg[2] = f32::to_bits(2.0); + thread.vec_reg[3] = f32::to_bits(4.0); + let constant = f32::from_bits(0x3e800000); + r(&vec![0xC8C604FF, 0x020206FF, 0x3E800000, END_PRG], &mut thread); + assert_eq!(f32::from_bits(thread.vec_reg[2]), 2.0 * constant); + assert_eq!(f32::from_bits(thread.vec_reg[3]), 4.0 * constant); + } + + #[test] + fn test_simm_op_shared_1() { + let mut thread = _helper_test_thread(); + thread.vec_reg[23] = f32::to_bits(4.0); + thread.vec_reg[12] = f32::to_bits(2.0); + + thread.vec_reg[13] = f32::to_bits(10.0); + thread.vec_reg[24] = f32::to_bits(3.0); + + let simm = f32::from_bits(0x3e000000); + r(&vec![0xC8841917, 0x0C0C1B18, 0x3E000000, END_PRG], &mut thread); + assert_eq!(f32::from_bits(thread.vec_reg[12]), 4.0 * simm + 2.0); + assert_eq!(f32::from_bits(thread.vec_reg[13]), 3.0 * simm + 10.0); + } + + #[test] + fn test_simm_op_shared_2() { + let mut thread = _helper_test_thread(); + thread.vec_reg[29] = f32::to_bits(4.0); + thread.vec_reg[10] = f32::to_bits(2.0); + + thread.vec_reg[11] = f32::to_bits(10.0); + thread.vec_reg[26] = f32::to_bits(6.5); + + let simm = 0.125; + r(&vec![0xC880151D, 0x0A0A34FF, 0x3E000000, END_PRG], &mut thread); + assert_eq!(f32::from_bits(thread.vec_reg[10]), 4.0 * simm + 2.0); + assert_eq!(f32::from_bits(thread.vec_reg[11]), simm * 6.5 + 10.0); + } + + #[test] + fn test_add_mov() { + let mut thread = _helper_test_thread(); + thread.vec_reg[0] = f32::to_bits(10.5); + r(&vec![0xC9100300, 0x00000080, END_PRG], &mut thread); + assert_eq!(f32::from_bits(thread.vec_reg[0]), 10.5); + assert_eq!(thread.vec_reg[1], 0); + } + + #[test] + fn test_max_add() { + let mut thread = _helper_test_thread(); + thread.vec_reg[0] = f32::to_bits(5.0); + thread.vec_reg[3] = f32::to_bits(2.0); + thread.vec_reg[1] = f32::to_bits(2.0); + r(&vec![0xCA880280, 0x01000700, END_PRG], &mut thread); + assert_eq!(f32::from_bits(thread.vec_reg[0]), 7.0); + assert_eq!(f32::from_bits(thread.vec_reg[1]), 2.0); + } +} +#[cfg(test)] +mod test_vop1 { + use super::*; + use float_cmp::approx_eq; + + #[test] + fn test_v_cvt_f32_f64() { + let mut thread = _helper_test_thread(); + thread.vec_reg.write64(0, 2.0f64.to_bits()); + r(&vec![0xD58F0101, 0x00000100, END_PRG], &mut thread); + assert_eq!(f32::from_bits(thread.vec_reg[1]), 2.0); + thread.vec_reg.write64(0, (-2.0f64).to_bits()); + r(&vec![0xD58F0101, 0x00000100, END_PRG], &mut thread); + assert_eq!(f32::from_bits(thread.vec_reg[1]), 2.0); + } + + #[test] + fn test_v_mov_b32_srrc_const0() { + let mut thread = _helper_test_thread(); + r(&vec![0x7e000280, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[0], 0); + r(&vec![0x7e020280, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[1], 0); + r(&vec![0x7e040280, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[2], 0); + } + + #[test] + fn test_v_mov_b32_srrc_register() { + let mut thread = _helper_test_thread(); + thread.scalar_reg[6] = 31; + r(&vec![0x7e020206, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[1], 31); + } + + fn helper_test_fexp(val: f32) -> f32 { + let mut thread = _helper_test_thread(); + thread.vec_reg[6] = val.to_bits(); + r(&vec![0x7E0C4B06, END_PRG], &mut thread); + f32::from_bits(thread.vec_reg[6]) + } + + #[test] + fn test_fexp_1ulp() { + let test_values = [-2.0, -1.0, 0.0, 1.0, 2.0, 3.0]; + for &val in test_values.iter() { + let expected = (2.0_f32).powf(val); + assert!((helper_test_fexp(val) - expected).abs() <= f32::EPSILON); + } + } + + #[test] + fn test_fexp_flush_denormals() { + assert_eq!(helper_test_fexp(f32::from_bits(0xff800000)), 0.0); + assert_eq!(helper_test_fexp(f32::from_bits(0x80000000)), 1.0); + assert_eq!(helper_test_fexp(f32::from_bits(0x7f800000)), f32::from_bits(0x7f800000)); + } + + #[test] + fn test_cast_f32_i32() { + let mut thread = _helper_test_thread(); + [(10.42, 10i32), (-20.08, -20i32)].iter().for_each(|(src, expected)| { + thread.scalar_reg[2] = f32::to_bits(*src); + r(&vec![0x7E001002, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[0] as i32, *expected); + }) + } + + #[test] + fn test_cast_f32_u32() { + let mut thread = _helper_test_thread(); + thread.scalar_reg[4] = 2; + r(&vec![0x7E000C04, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[0], 1073741824); + } + + #[test] + fn test_cast_u32_f32() { + let mut thread = _helper_test_thread(); + thread.vec_reg[0] = 1325400062; + r(&vec![0x7E000F00, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[0], 2147483392); + } + + #[test] + fn test_cast_i32_f32() { + let mut thread = _helper_test_thread(); + [(10.0, 10i32), (-20.0, -20i32)].iter().for_each(|(expected, src)| { + thread.vec_reg[0] = *src as u32; + r(&vec![0x7E000B00, END_PRG], &mut thread); + assert_eq!(f32::from_bits(thread.vec_reg[0]), *expected); + }) + } + + #[test] + fn test_v_readfirstlane_b32_basic() { + let mut thread = _helper_test_thread(); + thread.vec_reg[0] = 2147483392; + r(&vec![0x7E060500, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[3], 2147483392); + } + + #[test] + fn test_v_readfirstlane_b32_fancy() { + let mut thread = _helper_test_thread(); + thread.vec_reg.get_lane_mut(0)[13] = 44; + thread.vec_reg.get_lane_mut(1)[13] = 22; + thread.exec.value = 0b00000000000000000000000000000010; + thread.exec.default_lane = Some(2); + r(&vec![0x7E1A050D, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[13], 22); + + thread.exec.value = 0b00000000000000000000000000000000; + thread.exec.default_lane = Some(1); + r(&vec![0x7E1A050D, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[13], 44); + + thread.exec.value = 0b10000000000000000000000000000000; + thread.vec_reg.get_lane_mut(31)[13] = 88; + thread.exec.default_lane = Some(1); + r(&vec![0x7E1A050D, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[13], 88); + } + + #[test] + fn test_v_cls_i32() { + fn t(val: u32) -> u32 { + let mut thread = _helper_test_thread(); + thread.vec_reg[2] = val; + r(&vec![0x7E087702, END_PRG], &mut thread); + return thread.vec_reg[4]; + } + + assert_eq!(t(0x00000000), 0xffffffff); + assert_eq!(t(0x40000000), 1); + assert_eq!(t(0x80000000), 1); + assert_eq!(t(0x0fffffff), 4); + assert_eq!(t(0xffff0000), 16); + assert_eq!(t(0xfffffffe), 31); + } + + #[test] + fn test_v_rndne_f32() { + [ + [1.2344, 1.0], + [2.3, 2.0], // [0.5f32, 0.0f32], + [0.51, 1.0], + [f32::from_bits(1186963295), f32::from_bits(1186963456)], + ] + .iter() + .for_each(|[a, ret]| { + let mut thread = _helper_test_thread(); + thread.vec_reg[0] = f32::to_bits(*a); + r(&vec![0x7E024700, END_PRG], &mut thread); + assert_eq!(f32::from_bits(thread.vec_reg[1]), *ret); + }) + } + + #[test] + fn test_v_rndne_f64() { + let mut thread = _helper_test_thread(); + thread.vec_reg[0] = 0x652b82fe; + thread.vec_reg[1] = 0x40071547; + r(&vec![0x7E043300, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[2], 0); + assert_eq!(thread.vec_reg[3], 1074266112); + } + + #[test] + fn test_v_cvt_i32_f64() { + let mut thread = _helper_test_thread(); + thread.vec_reg[2] = 0; + thread.vec_reg[3] = 0x40080000; + r(&vec![0x7E080702, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 3); + } + + #[test] + fn test_v_frexp_mant_f64() { + [[2.0, 0.5], [1.0, 0.5], [0.54, 0.54], [f64::NAN, f64::NAN]] + .iter() + .for_each(|[x, expected]| { + let mut thread = _helper_test_thread(); + thread.vec_reg.write64(0, f64::to_bits(*x)); + r(&vec![0x7E047B00, END_PRG], &mut thread); + let ret = f64::from_bits(thread.vec_reg.read64(2)); + if ret.is_nan() { + assert!(ret.is_nan() && expected.is_nan()); + } else { + assert_eq!(f64::from_bits(thread.vec_reg.read64(2)), *expected) + } + }) + } + + #[test] + fn test_v_rcp_f64() { + let mut thread = _helper_test_thread(); + thread.vec_reg[0] = 0; + thread.vec_reg[1] = 1073741824; + r(&vec![0x7E045F00, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[2], 0); + assert_eq!(thread.vec_reg[3], 1071644672); + } + + #[test] + fn test_v_frexp_exp_i32_f64() { + [(3573412790272.0, 42), (69.0, 7), (2.0, 2), (f64::NEG_INFINITY, 0)] + .iter() + .for_each(|(x, ret)| { + let mut thread = _helper_test_thread(); + thread.vec_reg.write64(0, f64::to_bits(*x)); + r(&vec![0x7E047900, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[2], *ret); + }) + } + + #[test] + fn test_v_rsq_f64() { + [(2.0, 0.707)].iter().for_each(|(x, ret)| { + let mut thread = _helper_test_thread(); + thread.vec_reg.write64(0, f64::to_bits(*x)); + println!("{} {}", thread.vec_reg[0], thread.vec_reg[1]); + r(&vec![0x7E046300, END_PRG], &mut thread); + assert!(approx_eq!(f64, f64::from_bits(thread.vec_reg.read64(2)), *ret, (0.01, 2))); + }) + } +} + +#[cfg(test)] +mod test_vopc { + use super::*; + + #[test] + fn test_v_cmp_gt_i32() { + let mut thread = _helper_test_thread(); + + thread.vec_reg[1] = (4_i32 * -1) as u32; + r(&vec![0x7c8802c1, END_PRG], &mut thread); + assert_eq!(thread.vcc.read(), true); + + thread.vec_reg[1] = 4; + r(&vec![0x7c8802c1, END_PRG], &mut thread); + assert_eq!(thread.vcc.read(), false); + } + + #[test] + fn test_v_cmpx_nlt_f32() { + let mut thread = _helper_test_thread(); + thread.exec.value = 0b010011; + thread.vec_reg[0] = f32::to_bits(0.9); + thread.vec_reg[3] = f32::to_bits(0.4); + r(&vec![0x7D3C0700, END_PRG], &mut thread); + assert_eq!(thread.exec.read(), true); + } + + #[test] + fn test_v_cmpx_gt_i32_e32() { + let mut thread = _helper_test_thread(); + thread.vec_reg[3] = 100; + r(&vec![0x7D8806FF, 0x00000041, END_PRG], &mut thread); + assert_eq!(thread.exec.read(), false); + + thread.vec_reg[3] = -20i32 as u32; + r(&vec![0x7D8806FF, 0x00000041, END_PRG], &mut thread); + assert_eq!(thread.exec.read(), true); + } + + #[test] + fn test_cmp_class_f32() { + let thread = _helper_test_thread(); + assert!(!thread.cmp_class_f32(f32::NAN, 0b00001)); + assert!(thread.cmp_class_f32(f32::NAN, 0b00010)); + + assert!(thread.cmp_class_f32(f32::INFINITY, 0b00000000000000000000001000000000)); + assert!(!thread.cmp_class_f32(f32::INFINITY, 0b00000000000000000000000000000010)); + + assert!(thread.cmp_class_f32(f32::NEG_INFINITY, 0b00000000000000000000000000000100)); + assert!(!thread.cmp_class_f32(f32::NEG_INFINITY, 0b00000000000000000000010000000000)); + + assert!(!thread.cmp_class_f32(0.752, 0b00000000000000000000000000000000)); + assert!(thread.cmp_class_f32(0.752, 0b00000000000000000000000100000000)); + + assert!(!thread.cmp_class_f32(-0.752, 0b00000000000000000000010000000000)); + assert!(thread.cmp_class_f32(-0.752, 0b00000000000000000000010000001000)); + + assert!(!thread.cmp_class_f32(1.0e-42, 0b11111111111111111111111101111111)); + assert!(thread.cmp_class_f32(1.0e-42, 0b00000000000000000000000010000000)); + + assert!(thread.cmp_class_f32(-1.0e-42, 0b00000000000000000000000000010000)); + assert!(!thread.cmp_class_f32(-1.0e-42, 0b11111111111111111111111111101111)); + + assert!(thread.cmp_class_f32(-0.0, 0b00000000000000000000000000100000)); + assert!(thread.cmp_class_f32(0.0, 0b00000000000000000000000001000000)); + } + + #[test] + fn test_cmp_class_f64() { + let thread = _helper_test_thread(); + + assert!(!thread.cmp_class_f64(f64::NAN, 0b00001)); + assert!(thread.cmp_class_f64(f64::NAN, 0b00010)); + + assert!(thread.cmp_class_f64(f64::INFINITY, 0b00000000000000000000001000000000)); + assert!(!thread.cmp_class_f64(f64::INFINITY, 0b00000000000000000000000000000010)); + + assert!(thread.cmp_class_f64(f64::NEG_INFINITY, 0b00000000000000000000000000000100)); + assert!(!thread.cmp_class_f64(f64::NEG_INFINITY, 0b00000000000000000000010000000000)); + + assert!(!thread.cmp_class_f64(0.752, 0b00000000000000000000000000000000)); + assert!(thread.cmp_class_f64(0.752, 0b00000000000000000000000100000000)); + + assert!(!thread.cmp_class_f64(-1.0e-42, 0b00000000000000000000000000010000)); + assert!(thread.cmp_class_f64(-1.0e-42, 0b11111111111111111111111111101111)); + + assert!(thread.cmp_class_f64(-0.0, 0b00000000000000000000000000100000)); + assert!(thread.cmp_class_f64(0.0, 0b00000000000000000000000001000000)); + } +} +#[cfg(test)] +mod test_vop2 { + use super::*; + + #[test] + fn test_v_add_f32_e32() { + let mut thread = _helper_test_thread(); + thread.scalar_reg[2] = f32::to_bits(42.0); + thread.vec_reg[0] = f32::to_bits(1.0); + r(&vec![0x06000002, END_PRG], &mut thread); + assert_eq!(f32::from_bits(thread.vec_reg[0]), 43.0); + } + + #[test] + fn test_v_and_b32() { + let mut thread = _helper_test_thread(); + thread.vec_reg[10] = 15; + r(&vec![0x36141482, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[10], 2); + } + + #[test] + fn test_v_mul_f32_e32() { + let mut thread = _helper_test_thread(); + thread.vec_reg[2] = f32::to_bits(21.0); + thread.vec_reg[4] = f32::to_bits(2.0); + r(&vec![0x10060504, END_PRG], &mut thread); + assert_eq!(f32::from_bits(thread.vec_reg[3]), 42.0); + } + + #[test] + fn test_v_ashrrev_i32() { + let mut thread = _helper_test_thread(); + thread.vec_reg[0] = 4294967295; + r(&vec![0x3402009F, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[1] as i32, -1); + } + + #[test] + fn test_v_mul_i32_i24() { + [ + [18, 0x64, 1800], + [0b10000000000000000000000000, 0b1, 0], + [0b100000000000000000000000, 0b1, 0b11111111100000000000000000000000], + ] + .iter() + .for_each(|[a, b, ret]| { + let mut thread = _helper_test_thread(); + thread.vec_reg[1] = *a; + r(&vec![0x124E02FF, *b, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[39], *ret); + }); + } + + #[test] + fn test_v_add_nc_u32_const() { + let mut thread = _helper_test_thread(); + thread.vec_reg[18] = 7; + r(&vec![0x4A3024B8, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[24], 63); + } + + #[test] + fn test_v_add_nc_u32_sint() { + let mut thread = _helper_test_thread(); + thread.vec_reg[14] = 7; + thread.vec_reg[6] = 4294967279; + r(&vec![0x4A0C1D06, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[6], 4294967286); + } +} + +#[cfg(test)] +mod test_vopsd { + use super::*; + + #[test] + fn test_v_add_co_u32_scalar_co_zero() { + let mut thread = _helper_test_thread(); + thread.scalar_reg[10] = 0; + thread.vcc.default_lane = Some(1); + thread.vec_reg.default_lane = Some(1); + thread.vec_reg[10] = u32::MAX; + thread.vec_reg[20] = 20; + r(&vec![0xD7000A0A, 0x0002290A, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[10], 19); + assert_eq!(thread.scalar_reg[10], 2); + } + + #[test] + fn test_v_add_co_u32_scalar_co_override() { + let mut thread = _helper_test_thread(); + thread.scalar_reg[10] = 0b11111111111111111111111111111111; + thread.vcc.default_lane = Some(2); + thread.vec_reg.default_lane = Some(2); + thread.vec_reg[10] = u32::MAX; + thread.vec_reg[20] = 20; + r(&vec![0xD7000A0A, 0x0002290A, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[10], 19); + // NOTE: the co mask only writes to the bit that it needs to write, then at the _wave_ + // level, the final result accumulates + assert_eq!(thread.scalar_reg[10], 0b100); + } + + #[test] + fn test_v_add_co_ci_u32() { + [[0, 0, 0b0], [1, -1i32 as usize, 0b10]].iter().for_each(|[lane_id, result, carry_out]| { + let mut thread = _helper_test_thread(); + thread.vcc.default_lane = Some(*lane_id); + thread.vec_reg.default_lane = Some(*lane_id); + thread.scalar_reg[20] = 0b10; + thread.vec_reg[1] = 2; + thread.vec_reg[2] = 2; + r(&vec![0xD5211401, 0x00520501, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[1], *result as u32); + assert_eq!(thread.scalar_reg[20], *carry_out as u32); + }) + } + + #[test] + fn test_v_sub_co_ci_u32() { + [[3, 2, 0b1000], [2, 0, 0b100]].iter().for_each(|[lane_id, result, carry_out]| { + let mut thread = _helper_test_thread(); + thread.vcc.default_lane = Some(*lane_id); + thread.vec_reg.default_lane = Some(*lane_id); + thread.scalar_reg[20] = 0b1010; + thread.vec_reg[1] = *lane_id as u32; + thread.vec_reg[2] = u32::MAX - 1; + r(&vec![0xD5201401, 0x00520501, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[1], *result as u32); + assert_eq!(thread.scalar_reg[20], *carry_out as u32); + }) + } + + #[test] + fn test_v_mad_u64_u32() { + let mut thread = _helper_test_thread(); + thread.vec_reg.write64(3, u64::MAX - 3); + thread.scalar_reg[13] = 3; + thread.scalar_reg[10] = 1; + r(&vec![0xD6FE0D06, 0x040C140D, END_PRG], &mut thread); + assert_eq!(thread.vec_reg.read64(6), u64::MAX); + assert_eq!(thread.scalar_reg[13], 0); + + thread.vec_reg.write64(3, u64::MAX - 3); + thread.scalar_reg[13] = 4; + thread.scalar_reg[10] = 1; + r(&vec![0xD6FE0D06, 0x040C140D, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[6], 0); + assert_eq!(thread.vec_reg[7], 0); + assert_eq!(thread.scalar_reg[13], 1); + } + + #[test] + fn test_v_add_co_u32() { + let mut thread = _helper_test_thread(); + thread.vcc.default_lane = Some(1); + thread.vec_reg[2] = u32::MAX; + thread.vec_reg[3] = 3; + r(&vec![0xD7000D02, 0x00020503, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[2], 2); + assert_eq!(thread.scalar_reg[13], 0b10); + } + + #[test] + fn test_v_sub_co_u32() { + [[69, 0, 69, 0], [100, 200, 4294967196, 1]].iter().for_each(|[a, b, ret, scc]| { + let mut thread = _helper_test_thread(); + thread.vec_reg[4] = *a; + thread.vec_reg[15] = *b; + r(&vec![0xD7016A04, 0x00021F04, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], *ret); + assert_eq!(thread.vcc.read(), *scc != 0); + }) + } + + #[test] + fn test_return_value_exec_zero() { + let mut thread = _helper_test_thread(); + thread.exec.value = 0b11111111111111111111111111111101; + thread.vcc.default_lane = Some(1); + thread.exec.default_lane = Some(1); + thread.vec_reg[2] = u32::MAX; + thread.vec_reg[3] = 3; + r(&vec![0xD7000D02, 0x00020503, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[2], u32::MAX); + assert_eq!(thread.scalar_reg[13], 0b10); + } + + #[test] + fn test_v_div_scale_f64() { + let mut thread = _helper_test_thread(); + let v = -0.41614683654714246; + thread.vec_reg.write64(0, f64::to_bits(v)); + thread.vec_reg.write64(2, f64::to_bits(v)); + thread.vec_reg.write64(4, f64::to_bits(0.909)); + r(&vec![0xD6FD7C06, 0x04120500, END_PRG], &mut thread); + thread.vec_reg[6] = 1465086470; + thread.vec_reg[7] = 3218776614; + let ret = f64::from_bits(thread.vec_reg.read64(6)); + assert_eq!(ret, v); + } +} + +#[cfg(test)] +mod test_vop3 { + use super::*; + use float_cmp::approx_eq; + + fn helper_test_vop3(op: u32, a: f32, b: f32) -> f32 { + let mut thread = _helper_test_thread(); + thread.scalar_reg[0] = f32::to_bits(a); + thread.scalar_reg[6] = f32::to_bits(b); + r(&vec![op, 0x00000006, END_PRG], &mut thread); + return f32::from_bits(thread.vec_reg[0]); + } + + #[test] + fn test_v_add_f32() { + assert_eq!(helper_test_vop3(0xd5030000, 0.4, 0.2), 0.6); + } + + #[test] + fn test_v_max_f32() { + assert_eq!(helper_test_vop3(0xd5100000, 0.4, 0.2), 0.4); + assert_eq!(helper_test_vop3(0xd5100000, 0.2, 0.8), 0.8); + } + + #[test] + fn test_v_mul_f32() { + assert_eq!(helper_test_vop3(0xd5080000, 0.4, 0.2), 0.4 * 0.2); + } + + #[test] + fn test_signed_src() { + // v0, max(s2, s2) + let mut thread = _helper_test_thread(); + thread.scalar_reg[2] = f32::to_bits(0.5); + r(&vec![0xd5100000, 0x00000402, END_PRG], &mut thread); + assert_eq!(f32::from_bits(thread.vec_reg[0]), 0.5); + + // v1, max(-s2, -s2) + let mut thread = _helper_test_thread(); + thread.scalar_reg[2] = f32::to_bits(0.5); + r(&vec![0xd5100001, 0x60000402, END_PRG], &mut thread); + assert_eq!(f32::from_bits(thread.vec_reg[1]), -0.5); + } + + #[test] + fn test_cnd_mask_cond_src_sgpr() { + let mut thread = _helper_test_thread(); + thread.scalar_reg[3] = 0b001; + r(&vec![0xD5010000, 0x000D0280, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[0], 1); + + thread.scalar_reg[3] = 0b00; + r(&vec![0xD5010000, 0x000D0280, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[0], 0); + } + + #[test] + fn test_cnd_mask_cond_src_vcclo() { + let mut thread = _helper_test_thread(); + thread.vec_reg[2] = 20; + thread.vec_reg[0] = 100; + r(&vec![0xD5010002, 0x41AA0102, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[2], 20); + } + + #[test] + fn test_cnd_mask_float_const() { + let mut thread = _helper_test_thread(); + thread.vcc.value = 0b00000010; + thread.vcc.default_lane = Some(0); + r(&vec![0xD5010003, 0x01A9E480, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[3], 0); + + thread.vcc.value = 0b00000010; + thread.vcc.default_lane = Some(1); + r(&vec![0xD5010003, 0x01A9E480, END_PRG], &mut thread); + assert_eq!(f32::from_bits(thread.vec_reg[3]), 1.0); + } + + #[test] + fn test_v_cndmask_b32_e64_neg() { + [[0.0f32, 0.0], [1.0f32, -1.0], [-1.0f32, 1.0]].iter().for_each(|[input, ret]| { + let mut thread = _helper_test_thread(); + thread.scalar_reg[0] = false as u32; + thread.vec_reg[3] = input.to_bits(); + r(&vec![0xD5010003, 0x2001FF03, 0x80000000, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[3], ret.to_bits()); + }); + } + + #[test] + fn test_v_mul_hi_i32() { + let mut thread = _helper_test_thread(); + thread.vec_reg[2] = -2i32 as u32; + r(&vec![0xD72E0003, 0x000204FF, 0x2E8BA2E9, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[3] as i32, -1); + + thread.vec_reg[2] = 2; + r(&vec![0xD72E0003, 0x000204FF, 0x2E8BA2E9, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[3], 0); + } + + #[test] + fn test_v_writelane_b32() { + let mut thread = _helper_test_thread(); + thread.scalar_reg[8] = 25056; + r(&vec![0xD7610004, 0x00010008, END_PRG], &mut thread); + assert_eq!(thread.vec_reg.get_lane(0)[4], 25056); + + thread.scalar_reg[9] = 25056; + r(&vec![0xD7610004, 0x00010209, END_PRG], &mut thread); + assert_eq!(thread.vec_reg.get_lane(1)[4], 25056); + } + + #[test] + fn test_v_readlane_b32() { + let mut thread = _helper_test_thread(); + thread.vec_reg.get_lane_mut(15)[4] = 0b1111; + r(&vec![0xD760006A, 0x00011F04, END_PRG], &mut thread); + assert_eq!(thread.vcc.read(), true); + } + + #[test] + fn test_v_lshlrev_b64() { + let mut thread = _helper_test_thread(); + thread.vec_reg.write64(2, 100); + thread.vec_reg[4] = 2; + r(&vec![0xD73C0002, 0x00020504, END_PRG], &mut thread); + assert_eq!(thread.vec_reg.read64(2), 400); + } + + #[test] + fn test_v_lshrrev_b64() { + let mut thread = _helper_test_thread(); + thread.vec_reg.write64(2, 100); + thread.vec_reg[4] = 2; + r(&vec![0xd73d0002, 0x00020504, END_PRG], &mut thread); + assert_eq!(thread.vec_reg.read64(2), 25); + } + + #[test] + fn test_v_add_f64_neg_modifier() { + let mut thread = _helper_test_thread(); + thread.vec_reg[0] = 0x652b82fe; + thread.vec_reg[1] = 0x40071547; + thread.vec_reg[2] = 0; + thread.vec_reg[3] = 0x40080000; + r(&vec![0xD7270004, 0x40020500, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 1519362112); + assert_eq!(thread.vec_reg[5], 3216856851); + } + + #[test] + fn test_v_cvt_f32_f16_abs_modifier() { + [[0.4, 0.4], [-0.4, 0.4]].iter().for_each(|[a, ret]| { + let mut thread = _helper_test_thread(); + thread.vec_reg[1] = f16::from_f32_const(*a).to_bits() as u32; + r(&vec![0xD58B0102, 0x00000101, END_PRG], &mut thread); + assert!(approx_eq!(f32, f32::from_bits(thread.vec_reg[2]), *ret, (0.01, 2))); + }); + } + + #[test] + fn test_v_alignbit_b32() { + let mut thread = _helper_test_thread(); + thread.scalar_reg[4] = 5340353; + thread.scalar_reg[10] = 3072795146; + thread.vec_reg[0] = 8; + r(&vec![0xD6160001, 0x04001404, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[1], 3250005794); + } + + #[test] + fn test_v_bfe_i32() { + [ + [0b00000000000000000000000000000001, -1], + [0b00000000000000000000000000000000, 0], + [0b00000000000000000000000000000010, 0], + ] + .iter() + .for_each(|[a, ret]| { + let mut thread = _helper_test_thread(); + thread.vec_reg[2] = *a as u32; + r(&vec![0xD6110005, 0x02050102, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[5] as i32, *ret); + }); + + [ + [0b00000000000000000000000000000010, -2], + [0b00000000000000000000000000000001, 1], + [0b00000000000000000000000000000100, 0], + ] + .iter() + .for_each(|[a, ret]| { + let mut thread = _helper_test_thread(); + thread.vec_reg[2] = *a as u32; + r(&vec![0xD6110005, 0x02090102, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[5] as i32, *ret); + }); + + [ + [0b00100000000000000000000000000000, 0b100000000000000000000000000000], + [0b00000000000000001000000000000000, 0b1000000000000000], + [-1, -1], + ] + .iter() + .for_each(|[a, ret]| { + let mut thread = _helper_test_thread(); + thread.vec_reg[2] = *a as u32; + r(&vec![0xD6110005, 0x03050102, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[5] as i32, *ret); + }); + } + + #[test] + fn test_v_ashrrev_i16() { + let mut thread = _helper_test_thread(); + [ + [0b10000000000000000000000000000000, 0], + [0b10000000000000000000000000000111, 3], + [0b0000000000000000, 0], + [0b1000000000000000, 0b1100000000000000], + [0b0100000000000000, 0b0010000000000000], + [0b0010000000000000, 0b0001000000000000], + [0b1010000000000000, 0b1101000000000000], + [0b1110000000000000, 0b1111000000000000], + [0b0110000000000000, 0b0011000000000000], + ] + .iter() + .for_each(|[a, ret]| { + thread.vec_reg[2] = *a; + thread.scalar_reg[1] = 1; + r(&vec![0xd73a0005, 0b11000001100000010000000001, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[5], *ret); + }); + + [ + [0b1000000000000000, 0b1111, 0b1111111111111111], + [0b1000000000000000, 0b11111, 0b1111111111111111], + [0b1000000000000000, 0b0111, 0b1111111100000000], + ] + .iter() + .for_each(|[a, shift, ret]| { + thread.vec_reg[2] = *a; + thread.scalar_reg[1] = *shift; + r(&vec![0xd73a0005, 0b11000001100000010000000001, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[5], *ret); + }); + + thread.vec_reg[5] = 0b11100000000000001111111111111111; + thread.vec_reg[2] = 0b0100000000000000; + thread.scalar_reg[1] = 1; + r(&vec![0xd73a0005, 0b11000001100000010000000001, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[5], 0b11100000000000000010000000000000); + } + + #[test] + fn test_v_add_nc_u16() { + let mut thread = _helper_test_thread(); + thread.vec_reg[5] = 10; + thread.vec_reg[8] = 20; + r(&vec![0xD7030005, 0x00021105, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[5], 30); + } + + #[test] + fn test_v_mul_lo_u16() { + let mut thread = _helper_test_thread(); + thread.vec_reg[5] = 2; + thread.vec_reg[15] = 0; + r(&vec![0xD705000F, 0x00010B05, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[15], 10); + + thread.vec_reg[5] = 2; + thread.vec_reg[15] = 0b10000000000000000000000000000000; + r(&vec![0xD705000F, 0x00010B05, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[15], 0b10000000000000000000000000000000 + 10); + } + + #[test] + fn test_v_cmp_gt_u16() { + let mut thread = _helper_test_thread(); + thread.vec_reg[1] = 52431; + thread.scalar_reg[5] = 0; + r(&vec![0xD43C0005, 0x000202FF, 0x00003334, END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[5], 0); + } + + #[test] + fn test_v_cmp_ngt_f32_abs() { + [(0.5f32, 0.5f32, 1), (-0.5, 0.5, 1), (0.1, 0.2, 0), (-0.1, 0.2, 0)] + .iter() + .for_each(|(x, y, ret)| { + let mut thread = _helper_test_thread(); + thread.scalar_reg[2] = x.to_bits(); + r(&vec![0xD41B0203, 0x000004FF, y.to_bits(), END_PRG], &mut thread); + assert_eq!(thread.scalar_reg[3], *ret); + }) + } + #[test] + fn test_fma() { + fn v_fma_f32(a: u32, b: u32, c: u32, ret: u32) { + let mut thread = _helper_test_thread(); + thread.vec_reg[1] = b; + thread.scalar_reg[3] = c; + r(&vec![0xD6130000, 0x000E02FF, a, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[0], ret); + } + fn v_fmac_f32(a: u32, b: u32, c: u32, ret: u32) { + let mut thread = _helper_test_thread(); + thread.scalar_reg[1] = a; + thread.scalar_reg[2] = b; + thread.vec_reg[0] = c; + r(&vec![0xd52b0000, 0x401, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[0], ret); + } + [[0xbfc90fda, 1186963456, 1192656896, 3204127872]].iter().for_each(|[a, b, c, ret]| { + v_fma_f32(*a, *b, *c, *ret); + v_fmac_f32(*a, *b, *c, *ret); + }) + } + + #[test] + fn test_v_perm_b32() { + let mut thread = _helper_test_thread(); + thread.vec_reg[1] = 15944; + thread.vec_reg[0] = 84148480; + r(&vec![0xD644000F, 0x03FE0101, 0x05040100, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[15], 1044906240); + } + + #[test] + fn test_v_mul_f64() { + let mut thread = _helper_test_thread(); + thread.vec_reg[0] = 0x5a8fa040; + thread.vec_reg[1] = 0xbfbd5713; + thread.vec_reg[2] = 0x3b39803f; + thread.vec_reg[3] = 0x3c7abc9e; + r(&vec![0xD7280004, 0x00020500, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 1602589062); + assert_eq!(thread.vec_reg[5], 3158868912); + } + + #[test] + fn test_v_fma_f64() { + let mut thread = _helper_test_thread(); + thread.vec_reg[0] = 0x5a8fa040; + thread.vec_reg[1] = 0xbfbd5713; + thread.vec_reg[2] = 0xfefa39ef; + thread.vec_reg[3] = 0x3fe62e42; + thread.vec_reg[4] = 0x5f859186; + thread.vec_reg[5] = 0xbc4883b0; + r(&vec![0xD6140006, 0x04120500, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[6], 3883232879); + assert_eq!(thread.vec_reg[7], 3216266823); + } + + #[test] + fn test_v_fma_f64_const() { + let mut thread = _helper_test_thread(); + thread.vec_reg[0] = 0xf690ecbf; + thread.vec_reg[1] = 0x3fdf2b4f; + thread.vec_reg[2] = 0xe7756e6f; + thread.vec_reg[3] = 0xbfb45647; + r(&vec![0xD6140004, 0x03CA0500, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 962012421); + assert_eq!(thread.vec_reg[5], 1072612110); + } + + #[test] + fn test_v_ldexp_f64() { + let mut thread = _helper_test_thread(); + thread.vec_reg.write64(0, f64::to_bits(5.0)); + thread.vec_reg[2] = 3; + thread.vec_reg[3] = 3; + r(&vec![0xD72B0000, 0x00020500, END_PRG], &mut thread); + let val = f64::from_bits(thread.vec_reg.read64(0)); + assert_eq!(val, 40.0); + } + + #[test] + fn test_simm_resolve_int_in_double_op() { + let mut thread = _helper_test_thread(); + thread.vec_reg.write64(0, 3.0f64.to_bits()); + let simm = 0xFFFFFFE0; + r(&vec![0xD72B0002, 0x0001FF00, simm, END_PRG], &mut thread); + assert_eq!(f64::from_bits(thread.vec_reg.read64(2)), 3.0 * 2.0.powi(-32)); + } + + #[test] + fn test_simm_resolve_double_in_double_op() { + let mut thread = _helper_test_thread(); + thread.vec_reg.write64(0, 2.0f64.to_bits()); + let simm = 0x40080000; + r(&vec![0xD7280000, 0x000200FF, simm, END_PRG], &mut thread); + assert_eq!(f64::from_bits(thread.vec_reg.read64(0)), 6.0); + } +} + +#[cfg(test)] +mod test_vopp { + use super::*; + + #[test] + fn test_v_fma_mix_f32() { + let mut thread = _helper_test_thread(); + thread.vec_reg[2] = 1065353216; + thread.scalar_reg[2] = 3217620992; + thread.vec_reg[1] = 15360; + r(&vec![0xCC204403, 0x04040502, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[3], 3205627904); + + thread.vec_reg[2] = 1065353216; + thread.scalar_reg[2] = 3217620992; + thread.vec_reg[1] = 48128; + r(&vec![0xCC204403, 0x04040502, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[3], 3205627904); + } + + #[test] + fn test_packed_opsel_000_op_000() { + let mut thread = _helper_test_thread(); + thread.vec_reg[1] = 1; + thread.vec_reg[2] = 2; + thread.vec_reg[3] = 3; + r(&vec![0xCC090004, 0x040E0501, 0xBFB00000, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 0b1010000000000000101); + } + + #[test] + fn test_packed_opsel_001_op_100() { + let mut thread = _helper_test_thread(); + thread.vec_reg[1] = 1; + thread.vec_reg[2] = 2; + thread.vec_reg[3] = 3; + r(&vec![0xCC092004, 0x0C0E0501, 0xBFB00000, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 0b110000000000000010); + } + + #[test] + fn test_packed_inline_const_int() { + let mut thread = _helper_test_thread(); + thread.vec_reg[1] = 1; + thread.vec_reg[2] = 2; + thread.vec_reg[3] = 3; + + r(&vec![0xCC090004, 0x020E0501, 0xBFB00000, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 0b1010000000000000101); + + r(&vec![0xCC090804, 0x0A0E0501, 0xBFB00000, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 0b110000000000000011); + + r(&vec![0xCC096004, 0x020E0501, 0xBFB00000, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 0b100000000000000010); + + r(&vec![0xCC090004, 0x03FE0501, 0x00000080, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 8519810); + } + + #[test] + fn test_pk_fma_f16_inline_const() { + let mut thread = _helper_test_thread(); + thread.vec_reg[2] = 0x393a35f6; + thread.vec_reg[3] = 0x2800; + + r(&vec![0xCC0E0004, 0x03FE0702, 0x0000A400, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 2618596372); + + r(&vec![0xCC0E0004, 0x0BFE0702, 0x0000A400, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 485006356); + + r(&vec![0xCC0E0004, 0x1BFE0702, 0x0000A400, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 2751503380); + + r(&vec![0xCC0E0804, 0x03FE0702, 0x0000A400, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 2618563816); + + r(&vec![0xCC0E1804, 0x03FE0702, 0x0000A400, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 2618598400); + } + + #[test] + fn test_v_fma_mixhilo_f16() { + let mut thread = _helper_test_thread(); + thread.vec_reg[11] = 1065353216; + thread.vec_reg[7] = 3047825943; + thread.vec_reg[16] = 3047825943; + + thread.vec_reg[14] = 0b10101010101010101111111111111111; + r(&vec![0xCC21000E, 0x04420F0B, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[14], 0b10101010101010101000000000101011); + + thread.vec_reg[14] = 0b10101010101010101111111111111111; + r(&vec![0xCC22000E, 0x04420F0B, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[14], 0b10000000001010111111111111111111); + } + + #[test] + fn test_v_pk_lshlrev_b16() { + let mut thread = _helper_test_thread(); + thread.vec_reg[3] = 0b1010101011101101; + + r(&vec![0xCC044004, 0x0002068E, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 0b1000000000000000100000000000000); + + r(&vec![0xCC044004, 0x1002068E, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 0b100000000000000); + + r(&vec![0xCC044004, 0x100206FF, 0x00010002, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 0b1010101110110100); + r(&vec![0xCC044004, 0x100206FF, 0x05012002, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 0b1010101110110100); + + r(&vec![0xCC044004, 0x100206FF, 0x0503E00F, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 0b1000000000000000); + r(&vec![0xCC044004, 0x100206FF, 0x0503E007, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 0b111011010000000); + r(&vec![0xCC044004, 0x100206FF, 0x0503E01F, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[4], 0b1000000000000000); + } + + #[test] + fn test_pk_fma_with_neg() { + let mut thread = _helper_test_thread(); + let a1 = f16::from_f32(1.0); + let b1 = f16::from_f32(2.0); + let c1 = f16::from_f32(3.0); + + let a2 = f16::from_f32(4.0); + let b2 = f16::from_f32(5.0); + let c2 = f16::from_f32(6.0); + + thread.vec_reg[0] = (a1.to_bits() as u32) << 16 | (a2.to_bits() as u32); + thread.vec_reg[9] = (b1.to_bits() as u32) << 16 | (b2.to_bits() as u32); + thread.vec_reg[10] = (c1.to_bits() as u32) << 16 | (c2.to_bits() as u32); + + r(&vec![0xCC0E3805, 0x042A1300, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[5], 1317029120); + + r(&vec![0xCC0E3805, 0x242A1300, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[5], 1317026816); + + r(&vec![0xCC0E3B05, 0x042A1300, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[5], 1317029120); + + r(&vec![0xCC0E3905, 0x042A1300, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[5], 3405792512); + } + + #[test] + fn test_pk_add_f16_with_float_const() { + let mut thread = _helper_test_thread(); + let a1 = f16::from_f32(5.0); + let a2 = f16::from_f32(10.0); + + thread.vec_reg[1] = (a1.to_bits() as u32) << 16 | (a2.to_bits() as u32); + r(&vec![0xCC0F4002, 0x0001E501, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[2], 1233144192); + + r(&vec![0xCC0F5002, 0x0001E501, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[2], 1233144064); + + r(&vec![0xCC0F5002, 0x1001E501, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[2], 1224755456); + + r(&vec![0xCC0F5802, 0x1801E501, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[2], 1157645568); + } +} + +#[cfg(test)] +mod test_flat { + use super::*; + use std::alloc::{alloc, handle_alloc_error, Layout}; + + #[test] + fn test_scratch_swap_values() { + let mut thread = _helper_test_thread(); + thread.vec_reg[13] = 42; + thread.vec_reg[14] = 10; + r( + &vec![ + 0xDC690096, 0x007C0D00, 0xDC69001E, 0x007C0E00, 0xDC51001E, 0x0D7C0000, 0xDC510096, 0x0E7C0000, END_PRG, + ], + &mut thread, + ); + assert_eq!(thread.vec_reg[13], 10); + assert_eq!(thread.vec_reg[14], 42); + } + + #[test] + fn test_scratch_load_dword_offset() { + let mut thread = _helper_test_thread(); + thread.vec_reg[14] = 14; + thread.vec_reg[15] = 23; + r(&vec![0xDC6D000A, 0x007C0E00, 0xDC51000A, 0x0E7C0000, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[14], 14); + + r(&vec![0xDC6D000A, 0x007C0E00, 0xDC51000E, 0x0E7C0000, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[14], 23); + } + + #[test] + fn test_global_load_d16_hi_b16() { + let mut thread = _helper_test_thread(); + thread.vec_reg[13] = 0b10101011101101001111111111111111; + unsafe { + let layout = Layout::new::(); + let ptr = alloc(layout); + if ptr.is_null() { + handle_alloc_error(layout) + } + *(ptr as *mut u16) = 42; + thread.vec_reg.write64(10, ptr as u64); + } + r(&vec![0xDC8E0000, 0x0D7C000A, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[13], 0b00000000001010101111111111111111); + } +} + +#[cfg(test)] +mod test_lds { + use super::*; + #[test] + fn test_ds_load_offset() { + let mut thread = _helper_test_thread(); + thread.lds.write(256, 69); + thread.vec_reg[9] = 0; + r(&vec![0xD8D80100, 0x01000009, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[1], 69); + + thread.lds.write(800, 69); + thread.vec_reg[9] = 0; + r(&vec![0xD8D80320, 0x01000009, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[1], 69); + + thread.lds.write(3, 69); + thread.vec_reg[9] = 0; + r(&vec![0xD8D80003, 0x01000009, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[1], 69); + } + + #[test] + fn test_ds_load_dwords() { + let mut thread = _helper_test_thread(); + thread.lds.write(0, 100); + thread.lds.write(4, 200); + thread.vec_reg[9] = 0; + r(&vec![0xD9D80000, 0x00000009, END_PRG], &mut thread); + assert_eq!(thread.vec_reg.read64(0), 858993459300); + + thread.lds.write(0, 1); + thread.lds.write(4, 2); + thread.lds.write(8, 3); + thread.lds.write(12, 4); + thread.vec_reg[9] = 0; + r(&vec![0xDBFC0000, 0x00000009, END_PRG], &mut thread); + assert_eq!(thread.vec_reg[0], 1); + assert_eq!(thread.vec_reg[1], 2); + assert_eq!(thread.vec_reg[2], 3); + assert_eq!(thread.vec_reg[3], 4); + } + + #[test] + fn test_ds_store_dwords() { + let mut thread = _helper_test_thread(); + thread.vec_reg[9] = 69; + thread.vec_reg[0] = 0; + r(&vec![0xD83403E8, 0x00000900, END_PRG], &mut thread); + assert_eq!(thread.lds.read(1000), 69); + } +} +#[allow(dead_code)] +fn r(prg: &Vec, thread: &mut Thread) { + let mut pc = 0; + let instructions = prg.to_vec(); + thread.pc_offset = 0; + if thread.exec.value == 0 { + thread.exec.value = u32::MAX; + } + + loop { + if instructions[pc] == END_PRG { + break; + } + if instructions[pc] == 0xbfb60003 || instructions[pc] >> 20 == 0xbf8 { + pc += 1; + continue; + } + thread.pc_offset = 0; + thread.stream = instructions[pc..instructions.len()].to_vec(); + thread.interpret().unwrap(); + thread.simm = None; + if thread.vcc.mutations.is_some() { + thread.vcc.apply_muts(); + thread.vcc.mutations = None; + } + if thread.exec.mutations.is_some() { + thread.exec.apply_muts(); + thread.exec.mutations = None; + } + if let Some((idx, mut wv)) = thread.sgpr_co { + wv.apply_muts(); + thread.scalar_reg[*idx] = wv.value; + } + pc = ((pc as isize) + 1 + (thread.pc_offset as isize)) as usize; + } +} +fn _helper_test_thread() -> Thread<'static> { + let static_lds: &'static mut VecDataStore = Box::leak(Box::new(VecDataStore::new())); + let static_sgpr: &'static mut Vec = Box::leak(Box::new(vec![0; 256])); + let static_vgpr: &'static mut VGPR = Box::leak(Box::new(VGPR::new())); + let static_scc: &'static mut u32 = Box::leak(Box::new(0)); + let static_exec: &'static mut WaveValue = Box::leak(Box::new(WaveValue::new(u32::MAX, 32))); + let static_vcc: &'static mut WaveValue = Box::leak(Box::new(WaveValue::new(0, 32))); + let static_sds: &'static mut VecDataStore = Box::leak(Box::new(VecDataStore::new())); + let static_co: &'static mut Option<(usize, WaveValue)> = Box::leak(Box::new(None)); + + let thread = Thread { + scalar_reg: static_sgpr, + vec_reg: static_vgpr, + scc: static_scc, + vcc: static_vcc, + exec: static_exec, + lds: static_lds, + sds: static_sds, + simm: None, + pc_offset: 0, + stream: vec![], + sgpr_co: static_co, + warp_size: 32, + scalar: false, + }; + thread.vec_reg.default_lane = Some(0); + thread.vcc.default_lane = Some(0); + thread.exec.default_lane = Some(0); + return thread; +} diff --git a/extra/remu/src/work_group.rs b/extra/remu/src/work_group.rs new file mode 100644 index 0000000000..69bd010113 --- /dev/null +++ b/extra/remu/src/work_group.rs @@ -0,0 +1,268 @@ +use crate::helpers::{Colorize, DEBUG}; +use crate::state::{Register, VecDataStore, WaveValue, VGPR}; +use crate::thread::{Thread, END_PRG}; +use std::collections::HashMap; + +pub struct WorkGroup<'a> { + dispatch_dim: u32, + id: [u32; 3], + lds: VecDataStore, + kernel: &'a Vec, + kernel_args: *const u64, + launch_bounds: [u32; 3], + wave_state: HashMap, +} + +struct WaveState { + scalar_reg: Vec, + scc: u32, + vec_reg: VGPR, + vcc: WaveValue, + exec: WaveValue, + pc: usize, + sds: HashMap, +} + +const SYNCS: [u32; 5] = [0xBF89FC07, 0xBFBD0000, 0xBC7C0000, 0xBF890007, 0xbFB60003]; +const BARRIERS: [[u32; 2]; 5] = [ + [SYNCS[0], SYNCS[0]], + [SYNCS[0], SYNCS[1]], + [SYNCS[0], SYNCS[2]], + [SYNCS[3], SYNCS[1]], + [SYNCS[1], SYNCS[0]], +]; +impl<'a> WorkGroup<'a> { + pub fn new(dispatch_dim: u32, id: [u32; 3], launch_bounds: [u32; 3], kernel: &'a Vec, kernel_args: *const u64) -> Self { + return Self { + dispatch_dim, + id, + kernel, + launch_bounds, + kernel_args, + lds: VecDataStore::new(), + wave_state: HashMap::new(), + }; + } + + pub fn exec_waves(&mut self) -> Result<(), i32> { + let mut blocks = vec![]; + for z in 0..self.launch_bounds[2] { + for y in 0..self.launch_bounds[1] { + for x in 0..self.launch_bounds[0] { + blocks.push([x, y, z]) + } + } + } + let waves = blocks.chunks(32).map(|w| w.to_vec()).collect::>(); + + let mut sync = false; + for (i, x) in self.kernel.iter().enumerate() { + if i != 0 && BARRIERS.contains(&[*x, self.kernel[i - 1]]) { + sync = true; + break; + } + } + for _ in 0..=(sync as usize) { + for w in waves.iter().enumerate() { + self.exec_wave(w)? + } + } + Ok(()) + } + + fn exec_wave(&mut self, (wave_id, threads): (usize, &Vec<[u32; 3]>)) -> Result<(), i32> { + let wave_state = self.wave_state.get(&wave_id); + let mut sds = match wave_state { + Some(val) => val.sds.clone(), + None => { + let mut sds = HashMap::new(); + for i in 0..=31 { + sds.insert(i, VecDataStore::new()); + } + sds + } + }; + let (mut scalar_reg, mut scc, mut pc) = match wave_state { + Some(val) => (val.scalar_reg.to_vec(), val.scc, val.pc), + None => { + let mut scalar_reg = vec![0; 256]; + scalar_reg.write64(0, self.kernel_args as u64); + let [gx, gy, gz] = self.id; + match self.dispatch_dim { + 3 => (scalar_reg[13], scalar_reg[14], scalar_reg[15]) = (gx, gy, gz), + 2 => (scalar_reg[14], scalar_reg[15]) = (gx, gy), + _ => scalar_reg[15] = gx, + } + (scalar_reg, 0, 0) + } + }; + let (mut vec_reg, mut vcc) = match wave_state { + Some(val) => (val.vec_reg.clone(), val.vcc.clone()), + None => (VGPR::new(), WaveValue::new(0, threads.len())), + }; + let mut exec = match wave_state { + Some(val) => val.exec.clone(), + None => { + let active = match threads.len() == 32 { + true => u32::MAX, + false => (1 << threads.len()) - 1, + }; + WaveValue::new(active, threads.len()) + } + }; + + let mut seeded_lanes = vec![]; + loop { + if self.kernel[pc] == END_PRG { + break Ok(()); + } + if BARRIERS.contains(&[self.kernel[pc], self.kernel[pc + 1]]) && wave_state.is_none() { + self.wave_state.insert( + wave_id, + WaveState { + scalar_reg, + scc, + vec_reg, + vcc, + exec, + pc, + sds, + }, + ); + break Ok(()); + } + if SYNCS.contains(&self.kernel[pc]) || self.kernel[pc] >> 20 == 0xbf8 || self.kernel[pc] == 0x7E000000 { + pc += 1; + continue; + } + + let mut sgpr_co = None; + for (lane_id, [x, y, z]) in threads.iter().enumerate() { + vec_reg.default_lane = Some(lane_id); + vcc.default_lane = Some(lane_id); + exec.default_lane = Some(lane_id); + if *DEBUG { + let lane = format!("{:<2} {:08X} ", lane_id, self.kernel[pc]); + let state = match exec.read() { + true => "green", + false => "gray", + }; + let [id0, id1, id2] = self.id; + print!("[{id0:<3} {id1:<3} {id2:<3}] [{x:<3} {y:<3} {z:<3}] {}", lane.color(state)); + } + if !seeded_lanes.contains(&lane_id) && self.wave_state.get(&wave_id).is_none() { + match (self.launch_bounds[1] != 1, self.launch_bounds[2] != 1) { + (false, false) => vec_reg[0] = *x, + _ => vec_reg[0] = (z << 20) | (y << 10) | x, + } + seeded_lanes.push(lane_id); + } + let mut thread = Thread { + scalar_reg: &mut scalar_reg, + scc: &mut scc, + vec_reg: &mut vec_reg, + vcc: &mut vcc, + exec: &mut exec, + lds: &mut self.lds, + sds: &mut sds.get_mut(&lane_id).unwrap(), + pc_offset: 0, + stream: self.kernel[pc..self.kernel.len()].to_vec(), + scalar: false, + simm: None, + warp_size: threads.len(), + sgpr_co: &mut sgpr_co, + }; + thread.interpret()?; + if *DEBUG { + println!(); + } + if thread.scalar { + pc = ((pc as isize) + 1 + (thread.pc_offset as isize)) as usize; + break; + } + if lane_id == threads.len() - 1 { + pc = ((pc as isize) + 1 + (thread.pc_offset as isize)) as usize; + } + } + + if vcc.mutations.is_some() { + vcc.apply_muts(); + vcc.mutations = None; + } + if exec.mutations.is_some() { + exec.apply_muts(); + exec.mutations = None; + } + if let Some((idx, mut wv)) = sgpr_co.take() { + wv.apply_muts(); + scalar_reg[idx] = wv.value; + } + } + } +} + +#[cfg(test)] +mod test_workgroup { + use super::*; + + // TODO: make this generic by adding the assembler + fn global_store_sgpr(addr: u64, instructions: Vec, src: u32) -> Vec { + [ + instructions, + vec![ + 0x7E020200 + src, + 0x7E0402FF, + addr as u32, + 0x7E0602FF, + (addr >> 32) as u32, + 0xDC6A0000, + 0x007C0102, + ], + vec![END_PRG], + ] + .concat() + } + + #[test] + fn test_wave_value_state_vcc() { + let mut ret: u32 = 0; + let kernel = vec![ + 0xBEEA00FF, + 0b11111111111111111111111111111111, // initial vcc state + 0x7E140282, + 0x7C94010A, // cmp blockDim.x == 2 + ]; + let addr = (&mut ret as *mut u32) as u64; + let kernel = global_store_sgpr(addr, kernel, 106); + let mut wg = WorkGroup::new(1, [0, 0, 0], [3, 1, 1], &kernel, [addr].as_ptr()); + wg.exec_waves().unwrap(); + assert_eq!(ret, 0b100); + } + + #[test] + fn test_wave_value_state_exec() { + let mut ret: u32 = 0; + let kernel = vec![ + 0xBEFE00FF, + 0b11111111111111111111111111111111, + 0x7E140282, + 0x7D9C010A, // cmpx blockDim.x <= 2 + ]; + let addr = (&mut ret as *mut u32) as u64; + let kernel = global_store_sgpr(addr, kernel, 126); + let mut wg = WorkGroup::new(1, [0, 0, 0], [4, 1, 1], &kernel, [addr].as_ptr()); + wg.exec_waves().unwrap(); + assert_eq!(ret, 0b0111); + } + + #[test] + fn test_wave_value_sgpr_co() { + let mut ret: u32 = 0; + let kernel = vec![0xBE8D00FF, 0x7FFFFFFF, 0x7E1402FF, u32::MAX, 0xD700000A, 0x0002010A]; + let addr = (&mut ret as *mut u32) as u64; + let kernel = global_store_sgpr(addr, kernel, 0); + let mut wg = WorkGroup::new(1, [0, 0, 0], [5, 1, 1], &kernel, [addr].as_ptr()); + wg.exec_waves().unwrap(); + assert_eq!(ret, 0b11110); + } +} diff --git a/test/mockgpu/amd/amdgpu.py b/test/mockgpu/amd/amdgpu.py index 974ed9e667..c3433f6fcb 100644 --- a/test/mockgpu/amd/amdgpu.py +++ b/test/mockgpu/amd/amdgpu.py @@ -18,7 +18,8 @@ WAIT_REG_MEM_FUNCTION_ALWAYS = 0 WAIT_REG_MEM_FUNCTION_EQ = 3 # == WAIT_REG_MEM_FUNCTION_GEQ = 5 # >= -REMU_PATHS = ["libremu.so", "/usr/local/lib/libremu.so", "libremu.dylib", "/usr/local/lib/libremu.dylib", "/opt/homebrew/lib/libremu.dylib"] +REMU_PATHS = ["extra/remu/target/release/libremu.so", "libremu.so", "/usr/local/lib/libremu.so", + "extra/remu/target/release/libremu.dylib", "libremu.dylib", "/usr/local/lib/libremu.dylib", "/opt/homebrew/lib/libremu.dylib"] def _try_dlopen_remu(): for path in REMU_PATHS: try: