upstream remu (#9921)

2026-01-09 15:08:02 -05:00 · 2025-04-18 01:57:36 +03:00
parent d287afe3b1
commit 16dfe0a902
12 changed files with 4609 additions and 7 deletions
--- a/.github/actions/setup-tinygrad/action.yml
+++ b/.github/actions/setup-tinygrad/action.yml
@@ -121,9 +121,8 @@ runs:
        echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
        sudo apt update || true
        sudo apt install --no-install-recommends --allow-unauthenticated -y hsa-rocr comgr hsa-rocr-dev liburing-dev libc6-dev
-        curl -s https://api.github.com/repos/Qazalin/remu/releases/latest | \
-        jq -r '.assets[] | select(.name == "libremu.so").browser_download_url' | \
-        sudo xargs curl -L -o /usr/local/lib/libremu.so
+        cargo build --release --manifest-path ./extra/remu/Cargo.toml
+        sudo ln -sf ${{ github.workspace }}/extra/remu/target/release/libremu.so /usr/local/lib/libremu.so
        sudo tee --append /etc/ld.so.conf.d/rocm.conf <<'EOF'
          /opt/rocm/lib
          /opt/rocm/lib64
@@ -137,9 +136,7 @@ runs:
        curl -s -H "Authorization: token $GH_TOKEN" curl -s https://api.github.com/repos/nimlgen/amdcomgr_dylib/releases/latest | \
          jq -r '.assets[] | select(.name == "libamd_comgr.dylib").browser_download_url' | \
          sudo xargs curl -L -o /usr/local/lib/libamd_comgr.dylib
-        curl -s -H "Authorization: token $GH_TOKEN" curl -s https://api.github.com/repos/Qazalin/remu/releases/latest | \
-          jq -r '.assets[] | select(.name == "libremu.dylib").browser_download_url' | \
-          sudo xargs curl -L -o /usr/local/lib/libremu.dylib
+        cargo build --release --manifest-path ./extra/remu/Cargo.toml

    # **** CUDA ****

--- a/.gitignore
+++ b/.gitignore
@@ -60,3 +60,4 @@ comgr_*
 site/
 profile_stats
 *.log
+target
--- a/extra/remu/Cargo.lock
+++ b/extra/remu/Cargo.lock
@@ -0,0 +1,66 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "autocfg"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "crunchy"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
+
+[[package]]
+name = "float-cmp"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "98de4bbd547a563b716d8dfa9aad1cb19bfab00f4fa09a6a4ed21dbcf44ce9c4"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "half"
+version = "2.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872"
+dependencies = [
+ "cfg-if",
+ "crunchy",
+ "num-traits",
+]
+
+[[package]]
+name = "libm"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
+
+[[package]]
+name = "num-traits"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
+dependencies = [
+ "autocfg",
+ "libm",
+]
+
+[[package]]
+name = "remu"
+version = "0.1.0"
+dependencies = [
+ "float-cmp",
+ "half",
+ "num-traits",
+]
--- a/extra/remu/Cargo.toml
+++ b/extra/remu/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "remu"
+version = "0.1.0"
+edition = "2021"
+rust-version = "1.80.0"
+
+[lib]
+crate-type = ["cdylib"]
+
+[dependencies]
+half = { version = "2.3.1", features = ["num-traits"] }
+num-traits = "0.2.17"
+
+[dev-dependencies]
+float-cmp = "0.9.0"
--- a/extra/remu/README.md
+++ b/extra/remu/README.md
@@ -0,0 +1,80 @@
+## Intro
+
+Remu is an RDNA3 emulator built to test correctness of RDNA3 code. It is used in [tinygrad's AMD CI](https://github.com/tinygrad/tinygrad).
+
+Most of the common instructions are implemented, but some formats like IMG are not supported.
+
+Remu is only for testing correctness of program output, it is not a cycle accurate simulator.
+
+## Build Locally
+
+Remu is written in Rust. Make sure you have [Cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html).
+
+To build the project, run:
+
+```bash
+cargo build --release --manifest-path ./extra/remu/Cargo.toml
+```
+
+This will produce a binary in the `extra/remu/target/release` directory.
+
+## Usage with tinygrad
+
+The latest binaries are released in https://github.com/Qazalin/remu/releases. Alternatively, you can [build locally](#build-locally).
+
+Tinygrad does not yet output RDNA3 kernels directly. You can either install comgr or use `AMD_LLVM=1` if you have [LLVM@19](https://github.com/tinygrad/tinygrad/blob/e2ed673c946c8f1774d816c75e52a994c2dd8a88/.github/actions/setup-tinygrad/action.yml#L208).
+
+`PYTHONPATH="." MOCKGPU=1 AMD=1 python test/test_tiny.py TestTiny.test_plus` runs an emulated RDNA3 kernel with Remu.
+
+Add `DEBUG=6` to see Remu's logs.
+
+### DEBUG output
+
+Remu runs each thread one at a time in a nested for loop, see lib.rs. The DEBUG output prints information about the current thread.
+
+The DEBUG output has 3 sections:
+
+```
+<------------ 1 ----------> <--- 2 ---> <--------------------------------------- 3 ------------------------------------------>
+[0   0   0  ] [0   0   0  ] 0  F4080100 SMEM     sbase=0          sdata=4          op=2             offset=0         soffset=0
+```
+
+#### Section 1: Grid info
+
+`[gid.x, gid.y, gid.z], [lid.x, lid.y, lid.z]` of the current thread.
+
+#### Section 2: Wave info
+
+`<lane> <instruction hex>`
+
+RDNA3 divides threads into chunks of 32. Each thread is assigned to a "lane" from 0-31.
+
+In Remu, even though all threads run one at a time, each 32 thread chunk (a wave) shares state like SGPR, VGPR, LDS, EXEC mask, etc.
+Remu can simulate up to one wave sync instruction.
+For more details, see work_group.rs.
+
+Section 2 can have a green or gray color.
+
+Green = The thread is actively executing the instruction.
+
+Gray = The thread has been "turned off" by the EXEC mask, it skips execution of some instructions. (refer to "EXECute Mask" on [page 23](https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/instruction-set-architectures/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf#page=23) of ISA docs for more details.)
+
+To see the colors in action, try running `DEBUG=6 PYTHONPATH="." MOCKGPU=1 AMD=1 python test/test_ops.py TestOps.test_arange_big`. See how only lane 0 writes to global memory:
+```
+[255 0   0  ] [0   0   0  ] 0  DC6A0000 GLOBAL   offset=0         op=26            addr=8           data=0           saddr=0          vdst=0
+[255 0   0  ] [1   0   0  ] 1  DC6A0000
+[255 0   0  ] [2   0   0  ] 2  DC6A0000
+[255 0   0  ] [3   0   0  ] 3  DC6A0000
+[255 0   0  ] [4   0   0  ] 4  DC6A0000
+```
+
+#### Section 3: Decoded Instruction
+
+This prints the instruction type and all the parsed bitfields.
+
+Remu output vs llvm-objdump:
+
+```
+s_load_b64 s[0:1], s[0:1], 0x10                            // 00000000160C: F4040000 F8000010
+SMEM       sbase=0          sdata=0          op=1             offset=16        soffset=0
+```
--- a/extra/remu/rustfmt.toml
+++ b/extra/remu/rustfmt.toml
@@ -0,0 +1 @@
+max_width = 150
--- a/extra/remu/src/helpers.rs
+++ b/extra/remu/src/helpers.rs
@@ -0,0 +1,170 @@
+use half::f16;
+use num_traits::float::FloatCore;
+
+pub fn nth(val: u32, pos: usize) -> u32 {
+    (val >> (31 - pos as u32)) & 1
+}
+pub fn f16_lo(val: u32) -> f16 {
+    f16::from_bits((val & 0xffff) as u16)
+}
+pub fn f16_hi(val: u32) -> f16 {
+    f16::from_bits(((val >> 16) & 0xffff) as u16)
+}
+
+pub fn sign_ext(num: u64, bits: usize) -> i64 {
+    let mut value = num;
+    let is_negative = (value >> (bits - 1)) & 1 != 0;
+    if is_negative {
+        value |= !0 << bits;
+    }
+    value as i64
+}
+
+pub trait IEEEClass<T> {
+    fn exponent(&self) -> T;
+}
+impl IEEEClass<u32> for f32 {
+    fn exponent(&self) -> u32 {
+        (self.to_bits() & 0b01111111100000000000000000000000) >> 23
+    }
+}
+impl IEEEClass<u16> for f16 {
+    fn exponent(&self) -> u16 {
+        (self.to_bits() & 0b0111110000000000) >> 10
+    }
+}
+impl IEEEClass<u64> for f64 {
+    fn exponent(&self) -> u64 {
+        (self.to_bits() & 0b0111111111110000000000000000000000000000000000000000000000000000) >> 52
+    }
+}
+
+pub trait VOPModifier<T> {
+    fn negate(&self, pos: usize, modifier: usize) -> T;
+    fn absolute(&self, pos: usize, modifier: usize) -> T;
+}
+impl<T> VOPModifier<T> for T
+where
+    T: FloatCore,
+{
+    fn negate(&self, pos: usize, modifier: usize) -> T {
+        match (modifier >> pos) & 1 {
+            1 => match self.is_zero() {
+                true => *self,
+                false => -*self,
+            },
+            _ => *self,
+        }
+    }
+    fn absolute(&self, pos: usize, modifier: usize) -> T {
+        match (modifier >> pos) & 1 {
+            1 => self.abs(),
+            _ => *self,
+        }
+    }
+}
+
+pub fn extract_mantissa(x: f64) -> f64 {
+    if x.is_infinite() || x.is_nan() {
+        return x;
+    }
+    let bits = x.to_bits();
+    let mantissa_mask: u64 = 0x000FFFFFFFFFFFFF;
+    let bias: u64 = 1023;
+    let normalized_mantissa_bits = (bits & mantissa_mask) | ((bias - 1) << 52);
+    return f64::from_bits(normalized_mantissa_bits);
+}
+pub fn ldexp(x: f64, exp: i32) -> f64 {
+    x * 2f64.powi(exp)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    #[test]
+    fn test_extract_mantissa() {
+        assert_eq!(extract_mantissa(2.0f64), 0.5);
+    }
+
+    #[test]
+    fn test_normal_exponent() {
+        assert_eq!(2.5f32.exponent(), 128);
+        assert_eq!(1.17549435e-38f32.exponent(), 1);
+        assert_eq!(f32::INFINITY.exponent(), 255);
+        assert_eq!(f32::NEG_INFINITY.exponent(), 255);
+    }
+
+    #[test]
+    fn test_denormal_exponent() {
+        assert_eq!(1.0e-40f32.exponent(), 0);
+        assert_eq!(1.0e-42f32.exponent(), 0);
+        assert_eq!(1.0e-44f32.exponent(), 0);
+        assert_eq!((1.17549435e-38f32 / 2.0).exponent(), 0);
+    }
+
+    #[test]
+    fn test_normal_exponent_f16() {
+        assert_eq!(f16::from_f32(3.14f32).exponent(), 16);
+        assert_eq!(f16::NEG_INFINITY.exponent(), 31);
+        assert_eq!(f16::INFINITY.exponent(), 31);
+    }
+
+    #[test]
+    fn test_neg() {
+        assert_eq!(0.3_f32.negate(0, 0b001), -0.3_f32);
+        assert_eq!(0.3_f32.negate(1, 0b010), -0.3_f32);
+        assert_eq!(0.3_f32.negate(2, 0b100), -0.3_f32);
+        assert_eq!(0.3_f32.negate(0, 0b110), 0.3_f32);
+        assert_eq!(0.3_f32.negate(1, 0b010), -0.3_f32);
+        assert_eq!(0.0_f32.negate(0, 0b001).to_bits(), 0);
+    }
+
+    #[test]
+    fn test_sign_ext() {
+        assert_eq!(sign_ext(0b000000000000000101000, 21), 40);
+        assert_eq!(sign_ext(0b111111111111111011000, 21), -40);
+        assert_eq!(sign_ext(0b000000000000000000000, 21), 0);
+        assert_eq!(sign_ext(0b111111111111111111111, 21), -1);
+        assert_eq!(sign_ext(0b111000000000000000000, 21), -262144);
+        assert_eq!(sign_ext(0b000111111111111111111, 21), 262143);
+        assert_eq!(sign_ext(7608, 13), -584);
+    }
+}
+
+use std::sync::LazyLock;
+pub static DEBUG: LazyLock<bool> = LazyLock::new(|| std::env::var("DEBUG").map(|v| v.parse::<usize>().unwrap_or(0) >= 6).unwrap_or(false));
+
+pub trait Colorize {
+    fn color(self, color: &str) -> String;
+}
+impl<'a> Colorize for &'a str {
+    fn color(self, color: &str) -> String {
+        let ansi_code = match color {
+            "blue" => format!("\x1b[{};2;112;184;255m", 38),
+            "green" => format!("\x1b[{};2;39;176;139m", 38),
+            "gray" => format!("\x1b[{};2;169;169;169m", 38),
+            _ => format!("\x1b[{};2;255;255;255m", 38),
+        };
+        format!("{}{}{}", ansi_code, self, "\x1b[0m")
+    }
+}
+
+#[macro_export]
+macro_rules! todo_instr {
+    ($x:expr) => {{
+        println!("{:08X}", $x);
+        Err(1)
+    }};
+}
+
+#[macro_export]
+macro_rules! print_instr {
+  ($name:expr $(, $arg:ident)* $(,)?) => {
+    if *DEBUG {
+      print!("{}", format!("{:<8}", $name).color("blue"));
+      $(
+        print!(" {:<16}", format!("{}={:?}", stringify!($arg), $arg));
+      )*
+    }
+  };
+}
--- a/extra/remu/src/lib.rs
+++ b/extra/remu/src/lib.rs
@@ -0,0 +1,31 @@
+use crate::work_group::WorkGroup;
+use std::os::raw::c_char;
+use std::slice;
+mod helpers;
+mod state;
+mod thread;
+mod work_group;
+
+#[no_mangle]
+pub extern "C" fn run_asm(lib: *const c_char, lib_sz: u32, gx: u32, gy: u32, gz: u32, lx: u32, ly: u32, lz: u32, args_ptr: *const u64) -> i32 {
+    if lib.is_null() || (lib_sz % 4) != 0 {
+        panic!("Pointer is null or length is not properly aligned to 4 bytes");
+    }
+    let kernel = unsafe { slice::from_raw_parts(lib as *const u32, (lib_sz / 4) as usize).to_vec() };
+    let dispatch_dim = match (gy != 1, gz != 1) {
+        (true, true) => 3,
+        (true, false) => 2,
+        _ => 1,
+    };
+    for gx in 0..gx {
+        for gy in 0..gy {
+            for gz in 0..gz {
+                let mut wg = WorkGroup::new(dispatch_dim, [gx, gy, gz], [lx, ly, lz], &kernel, args_ptr);
+                if let Err(err) = wg.exec_waves() {
+                    return err;
+                }
+            }
+        }
+    }
+    0
+}
--- a/extra/remu/src/state.rs
+++ b/extra/remu/src/state.rs
@@ -0,0 +1,256 @@
+use std::ops::{Index, IndexMut};
+
+pub trait Register {
+    fn read64(&self, idx: usize) -> u64;
+    fn write64(&mut self, idx: usize, addr: u64);
+}
+impl<T> Register for T
+where
+    T: Index<usize, Output = u32> + IndexMut<usize>,
+{
+    fn read64(&self, idx: usize) -> u64 {
+        let addr_lsb = self[idx];
+        let addr_msb = self[idx + 1];
+        ((addr_msb as u64) << 32) | addr_lsb as u64
+    }
+    fn write64(&mut self, idx: usize, addr: u64) {
+        self[idx] = (addr & 0xffffffff) as u32;
+        self[idx + 1] = ((addr & (0xffffffff << 32)) >> 32) as u32;
+    }
+}
+
+#[derive(Clone)]
+pub struct VGPR {
+    values: [[u32; 256]; 32],
+    pub default_lane: Option<usize>,
+}
+impl Index<usize> for VGPR {
+    type Output = u32;
+    fn index(&self, index: usize) -> &Self::Output {
+        &self.values[self.default_lane.unwrap()][index]
+    }
+}
+impl IndexMut<usize> for VGPR {
+    fn index_mut(&mut self, index: usize) -> &mut Self::Output {
+        &mut self.values[self.default_lane.unwrap()][index]
+    }
+}
+impl VGPR {
+    pub fn new() -> Self {
+        VGPR {
+            values: [[0; 256]; 32],
+            default_lane: None,
+        }
+    }
+    pub fn get_lane(&self, lane: usize) -> [u32; 256] {
+        *self.values.get(lane).unwrap()
+    }
+    pub fn get_lane_mut(&mut self, lane: usize) -> &mut [u32; 256] {
+        self.values.get_mut(lane).unwrap()
+    }
+}
+
+pub trait Value {
+    fn mut_hi16(&mut self, val: u16);
+    fn mut_lo16(&mut self, val: u16);
+}
+impl Value for u32 {
+    fn mut_hi16(&mut self, val: u16) {
+        *self = ((val as u32) << 16) | (*self as u16 as u32);
+    }
+    fn mut_lo16(&mut self, val: u16) {
+        *self = ((((*self & (0xffff << 16)) >> 16) as u32) << 16) | val as u32;
+    }
+}
+
+#[derive(Debug, Clone, Copy)]
+pub struct WaveValue {
+    pub value: u32,
+    pub warp_size: usize,
+    pub default_lane: Option<usize>,
+    pub mutations: Option<[bool; 32]>,
+}
+impl WaveValue {
+    pub fn new(value: u32, warp_size: usize) -> Self {
+        Self {
+            value,
+            warp_size,
+            default_lane: None,
+            mutations: None,
+        }
+    }
+    pub fn read(&self) -> bool {
+        (self.value >> self.default_lane.unwrap()) & 1 == 1
+    }
+    pub fn set_lane(&mut self, value: bool) {
+        if self.mutations.is_none() {
+            self.mutations = Some([false; 32])
+        }
+        self.mutations.as_mut().unwrap()[self.default_lane.unwrap()] = value;
+    }
+    pub fn apply_muts(&mut self) {
+        self.value = 0;
+        for lane in 0..self.warp_size {
+            if self.mutations.unwrap()[lane] {
+                self.value |= 1 << lane;
+            }
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct VecDataStore {
+    pub data: Vec<u8>,
+}
+
+impl VecDataStore {
+    pub fn new() -> Self {
+        Self { data: Vec::new() }
+    }
+    pub fn write(&mut self, addr: usize, val: u32) {
+        if addr + 4 >= self.data.len() {
+            self.data.resize(self.data.len() + addr + 5, 0);
+        }
+        self.data[addr..addr + 4].iter_mut().enumerate().for_each(|(i, x)| {
+            *x = val.to_le_bytes()[i];
+        });
+    }
+    pub fn write64(&mut self, addr: usize, val: u64) {
+        self.write(addr, (val & 0xffffffff) as u32);
+        self.write(addr + 4, ((val & (0xffffffff << 32)) >> 32) as u32);
+    }
+    pub fn read(&self, addr: usize) -> u32 {
+        let mut bytes: [u8; 4] = [0; 4];
+        bytes.copy_from_slice(&self.data[addr + 0..addr + 4]);
+        u32::from_le_bytes(bytes)
+    }
+    pub fn read64(&mut self, addr: usize) -> u64 {
+        let lsb = self.read(addr);
+        let msb = self.read(addr + 4);
+        ((msb as u64) << 32) | lsb as u64
+    }
+}
+
+#[cfg(test)]
+mod test_state {
+    use super::*;
+
+    #[test]
+    fn test_wave_value() {
+        let mut val = WaveValue::new(0b11000000000000011111111111101110, 32);
+        val.default_lane = Some(0);
+        assert!(!val.read());
+        val.default_lane = Some(31);
+        assert!(val.read());
+    }
+
+    #[test]
+    fn test_wave_value_small() {
+        let mut val = WaveValue::new(0, 1);
+        val.default_lane = Some(0);
+        assert!(!val.read());
+        assert_eq!(val.value, 0);
+        val.set_lane(true);
+        val.apply_muts();
+        assert!(val.read());
+        assert_eq!(val.value, 1);
+    }
+
+    #[test]
+    fn test_wave_value_small_alt() {
+        let mut val = WaveValue::new(0, 2);
+        val.default_lane = Some(0);
+        assert!(!val.read());
+        assert_eq!(val.value, 0);
+        val.set_lane(true);
+        val.apply_muts();
+        assert!(val.read());
+        assert_eq!(val.value, 1);
+    }
+
+    #[test]
+    fn test_wave_value_exec() {
+        let warp_size = 32;
+        let val = WaveValue::new(u32::MAX, warp_size);
+        assert_eq!(val.value, u32::MAX);
+        let warp_size = 3;
+        let val = WaveValue::new((1 << warp_size) - 1, warp_size);
+        assert_eq!(val.value, 7)
+    }
+
+    #[test]
+    fn test_wave_value_toggle_one() {
+        let warp_size = 2;
+        let mut val = WaveValue::new(0b11, warp_size);
+        // 0
+        val.default_lane = Some(0);
+        val.set_lane(false);
+        // 1
+        val.default_lane = Some(1);
+        val.set_lane(true);
+        val.apply_muts();
+        assert_eq!(val.value, 2);
+    }
+
+    #[test]
+    fn test_wave_value_mutate_small() {
+        let mut val = WaveValue::new(0, 2);
+        val.default_lane = Some(0);
+        assert!(!val.read());
+        assert_eq!(val.value, 0);
+        val.set_lane(true);
+        val.apply_muts();
+        assert!(val.read());
+        assert_eq!(val.value, 1);
+    }
+
+    #[test]
+    fn test_wave_value_mutations() {
+        let mut val = WaveValue::new(0b10001, 32);
+        val.default_lane = Some(0);
+        val.set_lane(false);
+        assert!(val.mutations.unwrap().iter().all(|x| !x));
+        val.default_lane = Some(1);
+        val.set_lane(true);
+        assert_eq!(val.value, 0b10001);
+        assert_eq!(
+            val.mutations,
+            Some([
+                false, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
+                false, false, false, false, false, false, false, false, false, false, false, false, false,
+            ])
+        );
+
+        val.apply_muts();
+        assert_eq!(val.value, 0b10);
+    }
+
+    #[test]
+    fn test_write16() {
+        let mut vgpr = VGPR::new();
+        vgpr.default_lane = Some(0);
+        vgpr[0] = 0b11100000000000001111111111111111;
+        vgpr[0].mut_lo16(0b1011101111111110);
+        assert_eq!(vgpr[0], 0b11100000000000001011101111111110);
+    }
+
+    #[test]
+    fn test_write16hi() {
+        let mut vgpr = VGPR::new();
+        vgpr.default_lane = Some(0);
+        vgpr[0] = 0b11100000000000001111111111111111;
+        vgpr[0].mut_hi16(0b1011101111111110);
+        assert_eq!(vgpr[0], 0b10111011111111101111111111111111);
+    }
+
+    #[test]
+    fn test_vgpr() {
+        let mut vgpr = VGPR::new();
+        vgpr.default_lane = Some(0);
+        vgpr[0] = 42;
+        vgpr.default_lane = Some(10);
+        vgpr[0] = 10;
+        assert_eq!(vgpr.get_lane(0)[0], 42);
+        assert_eq!(vgpr.get_lane(10)[0], 10);
+    }
+}
--- a/extra/remu/src/thread.rs
+++ b/extra/remu/src/thread.rs
--- a/extra/remu/src/work_group.rs
+++ b/extra/remu/src/work_group.rs
@@ -0,0 +1,268 @@
+use crate::helpers::{Colorize, DEBUG};
+use crate::state::{Register, VecDataStore, WaveValue, VGPR};
+use crate::thread::{Thread, END_PRG};
+use std::collections::HashMap;
+
+pub struct WorkGroup<'a> {
+    dispatch_dim: u32,
+    id: [u32; 3],
+    lds: VecDataStore,
+    kernel: &'a Vec<u32>,
+    kernel_args: *const u64,
+    launch_bounds: [u32; 3],
+    wave_state: HashMap<usize, WaveState>,
+}
+
+struct WaveState {
+    scalar_reg: Vec<u32>,
+    scc: u32,
+    vec_reg: VGPR,
+    vcc: WaveValue,
+    exec: WaveValue,
+    pc: usize,
+    sds: HashMap<usize, VecDataStore>,
+}
+
+const SYNCS: [u32; 5] = [0xBF89FC07, 0xBFBD0000, 0xBC7C0000, 0xBF890007, 0xbFB60003];
+const BARRIERS: [[u32; 2]; 5] = [
+    [SYNCS[0], SYNCS[0]],
+    [SYNCS[0], SYNCS[1]],
+    [SYNCS[0], SYNCS[2]],
+    [SYNCS[3], SYNCS[1]],
+    [SYNCS[1], SYNCS[0]],
+];
+impl<'a> WorkGroup<'a> {
+    pub fn new(dispatch_dim: u32, id: [u32; 3], launch_bounds: [u32; 3], kernel: &'a Vec<u32>, kernel_args: *const u64) -> Self {
+        return Self {
+            dispatch_dim,
+            id,
+            kernel,
+            launch_bounds,
+            kernel_args,
+            lds: VecDataStore::new(),
+            wave_state: HashMap::new(),
+        };
+    }
+
+    pub fn exec_waves(&mut self) -> Result<(), i32> {
+        let mut blocks = vec![];
+        for z in 0..self.launch_bounds[2] {
+            for y in 0..self.launch_bounds[1] {
+                for x in 0..self.launch_bounds[0] {
+                    blocks.push([x, y, z])
+                }
+            }
+        }
+        let waves = blocks.chunks(32).map(|w| w.to_vec()).collect::<Vec<_>>();
+
+        let mut sync = false;
+        for (i, x) in self.kernel.iter().enumerate() {
+            if i != 0 && BARRIERS.contains(&[*x, self.kernel[i - 1]]) {
+                sync = true;
+                break;
+            }
+        }
+        for _ in 0..=(sync as usize) {
+            for w in waves.iter().enumerate() {
+                self.exec_wave(w)?
+            }
+        }
+        Ok(())
+    }
+
+    fn exec_wave(&mut self, (wave_id, threads): (usize, &Vec<[u32; 3]>)) -> Result<(), i32> {
+        let wave_state = self.wave_state.get(&wave_id);
+        let mut sds = match wave_state {
+            Some(val) => val.sds.clone(),
+            None => {
+                let mut sds = HashMap::new();
+                for i in 0..=31 {
+                    sds.insert(i, VecDataStore::new());
+                }
+                sds
+            }
+        };
+        let (mut scalar_reg, mut scc, mut pc) = match wave_state {
+            Some(val) => (val.scalar_reg.to_vec(), val.scc, val.pc),
+            None => {
+                let mut scalar_reg = vec![0; 256];
+                scalar_reg.write64(0, self.kernel_args as u64);
+                let [gx, gy, gz] = self.id;
+                match self.dispatch_dim {
+                    3 => (scalar_reg[13], scalar_reg[14], scalar_reg[15]) = (gx, gy, gz),
+                    2 => (scalar_reg[14], scalar_reg[15]) = (gx, gy),
+                    _ => scalar_reg[15] = gx,
+                }
+                (scalar_reg, 0, 0)
+            }
+        };
+        let (mut vec_reg, mut vcc) = match wave_state {
+            Some(val) => (val.vec_reg.clone(), val.vcc.clone()),
+            None => (VGPR::new(), WaveValue::new(0, threads.len())),
+        };
+        let mut exec = match wave_state {
+            Some(val) => val.exec.clone(),
+            None => {
+                let active = match threads.len() == 32 {
+                    true => u32::MAX,
+                    false => (1 << threads.len()) - 1,
+                };
+                WaveValue::new(active, threads.len())
+            }
+        };
+
+        let mut seeded_lanes = vec![];
+        loop {
+            if self.kernel[pc] == END_PRG {
+                break Ok(());
+            }
+            if BARRIERS.contains(&[self.kernel[pc], self.kernel[pc + 1]]) && wave_state.is_none() {
+                self.wave_state.insert(
+                    wave_id,
+                    WaveState {
+                        scalar_reg,
+                        scc,
+                        vec_reg,
+                        vcc,
+                        exec,
+                        pc,
+                        sds,
+                    },
+                );
+                break Ok(());
+            }
+            if SYNCS.contains(&self.kernel[pc]) || self.kernel[pc] >> 20 == 0xbf8 || self.kernel[pc] == 0x7E000000 {
+                pc += 1;
+                continue;
+            }
+
+            let mut sgpr_co = None;
+            for (lane_id, [x, y, z]) in threads.iter().enumerate() {
+                vec_reg.default_lane = Some(lane_id);
+                vcc.default_lane = Some(lane_id);
+                exec.default_lane = Some(lane_id);
+                if *DEBUG {
+                    let lane = format!("{:<2} {:08X} ", lane_id, self.kernel[pc]);
+                    let state = match exec.read() {
+                        true => "green",
+                        false => "gray",
+                    };
+                    let [id0, id1, id2] = self.id;
+                    print!("[{id0:<3} {id1:<3} {id2:<3}] [{x:<3} {y:<3} {z:<3}] {}", lane.color(state));
+                }
+                if !seeded_lanes.contains(&lane_id) && self.wave_state.get(&wave_id).is_none() {
+                    match (self.launch_bounds[1] != 1, self.launch_bounds[2] != 1) {
+                        (false, false) => vec_reg[0] = *x,
+                        _ => vec_reg[0] = (z << 20) | (y << 10) | x,
+                    }
+                    seeded_lanes.push(lane_id);
+                }
+                let mut thread = Thread {
+                    scalar_reg: &mut scalar_reg,
+                    scc: &mut scc,
+                    vec_reg: &mut vec_reg,
+                    vcc: &mut vcc,
+                    exec: &mut exec,
+                    lds: &mut self.lds,
+                    sds: &mut sds.get_mut(&lane_id).unwrap(),
+                    pc_offset: 0,
+                    stream: self.kernel[pc..self.kernel.len()].to_vec(),
+                    scalar: false,
+                    simm: None,
+                    warp_size: threads.len(),
+                    sgpr_co: &mut sgpr_co,
+                };
+                thread.interpret()?;
+                if *DEBUG {
+                    println!();
+                }
+                if thread.scalar {
+                    pc = ((pc as isize) + 1 + (thread.pc_offset as isize)) as usize;
+                    break;
+                }
+                if lane_id == threads.len() - 1 {
+                    pc = ((pc as isize) + 1 + (thread.pc_offset as isize)) as usize;
+                }
+            }
+
+            if vcc.mutations.is_some() {
+                vcc.apply_muts();
+                vcc.mutations = None;
+            }
+            if exec.mutations.is_some() {
+                exec.apply_muts();
+                exec.mutations = None;
+            }
+            if let Some((idx, mut wv)) = sgpr_co.take() {
+                wv.apply_muts();
+                scalar_reg[idx] = wv.value;
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod test_workgroup {
+    use super::*;
+
+    // TODO: make this generic by adding the assembler
+    fn global_store_sgpr(addr: u64, instructions: Vec<u32>, src: u32) -> Vec<u32> {
+        [
+            instructions,
+            vec![
+                0x7E020200 + src,
+                0x7E0402FF,
+                addr as u32,
+                0x7E0602FF,
+                (addr >> 32) as u32,
+                0xDC6A0000,
+                0x007C0102,
+            ],
+            vec![END_PRG],
+        ]
+        .concat()
+    }
+
+    #[test]
+    fn test_wave_value_state_vcc() {
+        let mut ret: u32 = 0;
+        let kernel = vec![
+            0xBEEA00FF,
+            0b11111111111111111111111111111111, // initial vcc state
+            0x7E140282,
+            0x7C94010A, // cmp blockDim.x == 2
+        ];
+        let addr = (&mut ret as *mut u32) as u64;
+        let kernel = global_store_sgpr(addr, kernel, 106);
+        let mut wg = WorkGroup::new(1, [0, 0, 0], [3, 1, 1], &kernel, [addr].as_ptr());
+        wg.exec_waves().unwrap();
+        assert_eq!(ret, 0b100);
+    }
+
+    #[test]
+    fn test_wave_value_state_exec() {
+        let mut ret: u32 = 0;
+        let kernel = vec![
+            0xBEFE00FF,
+            0b11111111111111111111111111111111,
+            0x7E140282,
+            0x7D9C010A, // cmpx blockDim.x <= 2
+        ];
+        let addr = (&mut ret as *mut u32) as u64;
+        let kernel = global_store_sgpr(addr, kernel, 126);
+        let mut wg = WorkGroup::new(1, [0, 0, 0], [4, 1, 1], &kernel, [addr].as_ptr());
+        wg.exec_waves().unwrap();
+        assert_eq!(ret, 0b0111);
+    }
+
+    #[test]
+    fn test_wave_value_sgpr_co() {
+        let mut ret: u32 = 0;
+        let kernel = vec![0xBE8D00FF, 0x7FFFFFFF, 0x7E1402FF, u32::MAX, 0xD700000A, 0x0002010A];
+        let addr = (&mut ret as *mut u32) as u64;
+        let kernel = global_store_sgpr(addr, kernel, 0);
+        let mut wg = WorkGroup::new(1, [0, 0, 0], [5, 1, 1], &kernel, [addr].as_ptr());
+        wg.exec_waves().unwrap();
+        assert_eq!(ret, 0b11110);
+    }
+}
--- a/test/mockgpu/amd/amdgpu.py
+++ b/test/mockgpu/amd/amdgpu.py
@@ -18,7 +18,8 @@ WAIT_REG_MEM_FUNCTION_ALWAYS = 0
 WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
 WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=

-REMU_PATHS = ["libremu.so", "/usr/local/lib/libremu.so", "libremu.dylib", "/usr/local/lib/libremu.dylib", "/opt/homebrew/lib/libremu.dylib"]
+REMU_PATHS = ["extra/remu/target/release/libremu.so", "libremu.so", "/usr/local/lib/libremu.so",
+              "extra/remu/target/release/libremu.dylib", "libremu.dylib", "/usr/local/lib/libremu.dylib", "/opt/homebrew/lib/libremu.dylib"]
 def _try_dlopen_remu():
  for path in REMU_PATHS:
    try: