Powdr openvm extension with new hints (#3100)

Extend our openvm guest/host with support for new hints. Includes hints for `k256` affine coordinate inverse and sqrt.
2026-01-09 14:48:16 -05:00 · 2025-07-30 12:50:23 -03:00
parent 5c8ecd2a46
commit 4be51aa95a
21 changed files with 1571 additions and 65 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -37,6 +37,9 @@ members = [
  "autoprecompiles",
  "openvm",
  "cli-openvm",
+  "openvm/extensions/hints-guest",
+  "openvm/extensions/hints-transpiler",
+  "openvm/extensions/hints-circuit",
 ]

 exclude = ["riscv-runtime"]
@@ -82,6 +85,10 @@ powdr-schemas = { path = "./schemas", version = "0.1.4" }
 powdr-autoprecompiles = { path = "./autoprecompiles", version = "0.1.4" }
 powdr-openvm = { path = "./openvm", version = "0.1.4" }

+powdr-openvm-hints-guest = { path = "./openvm/extensions/hints-guest", version = "0.1.4" }
+powdr-openvm-hints-transpiler = { path = "./openvm/extensions/hints-transpiler", version = "0.1.4" }
+powdr-openvm-hints-circuit = { path = "./openvm/extensions/hints-circuit", version = "0.1.4" }
+
 # openvm
 openvm = { git = "https://github.com/powdr-labs/openvm.git", rev = "391b737" }
 openvm-build = { git = "https://github.com/powdr-labs/openvm.git", rev = "391b737" }
@@ -115,6 +122,8 @@ openvm-pairing-circuit = { git = "https://github.com/powdr-labs/openvm.git", rev
 openvm-pairing-transpiler = { git = "https://github.com/powdr-labs/openvm.git", rev = "391b737" }
 openvm-native-circuit = { git = "https://github.com/powdr-labs/openvm.git", rev = "391b737", default-features = false }
 openvm-native-recursion = { git = "https://github.com/powdr-labs/openvm.git", rev = "391b737", default-features = false }
+openvm-platform = { git = "https://github.com/powdr-labs/openvm.git", rev = "391b737" }
+openvm-custom-insn = { git = "https://github.com/powdr-labs/openvm.git", rev = "391b737" }

 # stark-backend
 openvm-stark-sdk = { git = "https://github.com/powdr-labs/stark-backend.git", rev = "ee4e22b", default-features = false, features = [
--- a/openvm/Cargo.toml
+++ b/openvm/Cargo.toml
@@ -44,6 +44,9 @@ powdr-riscv-elf.workspace = true
 powdr-autoprecompiles.workspace = true
 powdr-constraint-solver.workspace = true

+powdr-openvm-hints-transpiler.workspace = true
+powdr-openvm-hints-circuit.workspace = true
+
 eyre = "0.6.12"
 serde = "1.0.217"
 derive_more = { version = "2.0.1", default-features = false, features = [
--- a/openvm/extensions/hints-circuit/Cargo.toml
+++ b/openvm/extensions/hints-circuit/Cargo.toml
@@ -0,0 +1,18 @@
+[package]
+name = "powdr-openvm-hints-circuit"
+version.workspace = true
+edition.workspace = true
+license.workspace = true
+homepage.workspace = true
+repository.workspace = true
+
+[dependencies]
+openvm-circuit = { workspace = true }
+openvm-instructions = { workspace = true }
+openvm-rv32im-circuit = { workspace = true }
+openvm-stark-backend = { workspace = true }
+openvm-stark-sdk = { workspace = true }
+powdr-openvm-hints-transpiler = { workspace = true }
+eyre = "0.6.12"
+crypto-bigint = "0.6.1"
+elliptic-curve = "0.13.8"
--- a/openvm/extensions/hints-circuit/src/executors.rs
+++ b/openvm/extensions/hints-circuit/src/executors.rs
@@ -0,0 +1,218 @@
+use openvm_circuit::arch::{PhantomSubExecutor, Streams};
+use openvm_circuit::system::memory::MemoryController;
+use openvm_instructions::riscv::RV32_MEMORY_AS;
+use openvm_instructions::PhantomDiscriminant;
+use openvm_rv32im_circuit::adapters::unsafe_read_rv32_register;
+use openvm_stark_backend::p3_field::PrimeField32;
+
+use crate::field10x26_k256;
+
+/// Example hint implementation.
+///
+/// Register `a` holds a pointer into RV32 memory. The four memory cells at that address
+/// are read and written to the hint stream in reverse order.
+pub struct ReverseBytesSubEx;
+
+impl<F: PrimeField32> PhantomSubExecutor<F> for ReverseBytesSubEx {
+    fn phantom_execute(
+        &mut self,
+        memory: &MemoryController<F>,
+        streams: &mut Streams<F>,
+        _discriminant: PhantomDiscriminant,
+        a: F,
+        _b: F,
+        c_upper: u16,
+    ) -> eyre::Result<()> {
+        assert_eq!(c_upper, 0);
+        // The register contains the address of the input word.
+        let input_ptr = unsafe_read_rv32_register(memory, a);
+        let address_space = F::from_canonical_u32(RV32_MEMORY_AS);
+        let cells = memory.unsafe_read::<4>(address_space, F::from_canonical_u32(input_ptr));
+        // The hint is the same four cells, last one first.
+        streams.hint_stream = cells.into_iter().rev().collect();
+        Ok(())
+    }
+}
+
+/// Takes as input a pointer to 32 bytes, the SEC1 encoding (i.e., big-endian) of a k256 coordinate field element.
+/// Sets the hint to be the inverse of the field element in the same encoding (if not zero).
+/// Sets the hint to zero when the input is zero.
+pub struct K256InverseFieldSubEx;
+
+use crypto_bigint::const_monty_form;
+use crypto_bigint::impl_modulus;
+use crypto_bigint::modular::ConstMontyParams;
+use crypto_bigint::Encoding;
+use crypto_bigint::Zero;
+use crypto_bigint::U256;
+// Declares `K256Mod`, the modulus parameters used for Montgomery arithmetic below.
+// The hex constant is the secp256k1 coordinate field prime, 2^256 - 2^32 - 977.
+impl_modulus!(
+    K256Mod,
+    U256,
+    "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F"
+);
+
+impl<F: PrimeField32> PhantomSubExecutor<F> for K256InverseFieldSubEx {
+    /// Reads the 32-byte big-endian field element pointed to by register `a` and sets the
+    /// hint stream to the big-endian bytes of its inverse modulo `K256Mod` (all zeros when
+    /// the input reduces to zero).
+    ///
+    /// Panics if `c_upper` is non-zero or if a memory cell does not hold a byte value.
+    fn phantom_execute(
+        &mut self,
+        memory: &MemoryController<F>,
+        streams: &mut Streams<F>,
+        _: PhantomDiscriminant,
+        a: F,
+        _b: F,
+        c_upper: u16,
+    ) -> eyre::Result<()> {
+        assert_eq!(c_upper, 0);
+        // The register holds the address of the encoded field element.
+        let elem_ptr = unsafe_read_rv32_register(memory, a);
+        let encoded: [u8; 32] = memory
+            .unsafe_read::<32>(
+                F::from_canonical_u32(RV32_MEMORY_AS),
+                F::from_canonical_u32(elem_ptr),
+            )
+            .map(|cell| u8::try_from(cell.as_canonical_u32()).expect("value not a byte"));
+
+        // `const_monty_form!` needs an identifier, so bind the integer first.
+        let value = U256::from_be_bytes(encoded);
+        let value_mont = const_monty_form!(value, K256Mod);
+
+        // Zero has no inverse; by convention the hint is zero in that case.
+        let inverse = if bool::from(value_mont.is_zero()) {
+            U256::ZERO
+        } else {
+            value_mont.inv().unwrap().retrieve()
+        };
+
+        streams.hint_stream = inverse
+            .to_be_bytes()
+            .into_iter()
+            .map(F::from_canonical_u8)
+            .collect();
+
+        Ok(())
+    }
+}
+
+/// Size in bytes of the k256 field element in 10x26 representation:
+/// ten `u32` limbs of four bytes each.
+const FIELD10X26_BYTES: usize = 40; // [u32;10]
+
+/// Takes as input a pointer to the inner representation of a k256 coordinate field element (in 32-bit architectures).
+/// Sets the hint to be the inverse of the input (if not zero), in the same representation.
+/// If the input is zero (normalized or not), the hint is also set, but undefined.
+///
+/// The representation is ten `u32` limbs. They are decoded from and encoded to guest memory
+/// explicitly as little-endian words, so the result does not depend on the host's endianness.
+pub struct K256InverseField10x26SubEx;
+
+impl<F: PrimeField32> PhantomSubExecutor<F> for K256InverseField10x26SubEx {
+    /// Reads the limbs pointed to by register `a`, inverts the element and writes the
+    /// resulting limbs (as bytes) to the hint stream.
+    ///
+    /// Panics if `c_upper` is non-zero or if a memory cell does not hold a byte value.
+    fn phantom_execute(
+        &mut self,
+        memory: &MemoryController<F>,
+        streams: &mut Streams<F>,
+        _: PhantomDiscriminant,
+        a: F,
+        _b: F,
+        c_upper: u16,
+    ) -> eyre::Result<()> {
+        assert_eq!(c_upper, 0);
+        // read register
+        let rs1 = unsafe_read_rv32_register(memory, a);
+        // read the k256 field_10x26 as raw bytes
+        let bytes: [u8; FIELD10X26_BYTES] = memory
+            .unsafe_read::<{ FIELD10X26_BYTES }>(
+                F::from_canonical_u32(RV32_MEMORY_AS),
+                F::from_canonical_u32(rs1),
+            )
+            .map(|f| u8::try_from(f.as_canonical_u32()).expect("value not a byte"));
+        // Rebuild the limbs from little-endian words; no `unsafe` reinterpretation needed.
+        let mut limbs = [0u32; 10];
+        for (limb, chunk) in limbs.iter_mut().zip(bytes.chunks_exact(4)) {
+            *limb = u32::from_le_bytes(
+                chunk
+                    .try_into()
+                    .expect("chunks_exact(4) yields 4-byte chunks"),
+            );
+        }
+        let inv = field10x26_k256::FieldElement10x26(limbs).invert();
+        // Serialize the limbs back in the same little-endian layout.
+        streams.hint_stream = inv
+            .0
+            .into_iter()
+            .flat_map(u32::to_le_bytes)
+            .map(F::from_canonical_u8)
+            .collect();
+
+        Ok(())
+    }
+}
+
+/// Takes as input a pointer to the inner representation of a k256 coordinate field element (in 32-bit architectures).
+/// If the number is square, sets the hint to a u32 of value one, followed by a square root in the same inner representation.
+/// If the number is not square, sets the hint to a u32 of value zero.
+///
+/// The representation is ten `u32` limbs. They are decoded from and encoded to guest memory
+/// explicitly as little-endian words, so the result does not depend on the host's endianness.
+pub struct K256SqrtField10x26SubEx;
+
+impl<F: PrimeField32> PhantomSubExecutor<F> for K256SqrtField10x26SubEx {
+    /// Reads the limbs pointed to by register `a`, attempts a square root and writes the
+    /// flag word (plus the root's limbs when one exists) to the hint stream as bytes.
+    ///
+    /// Panics if `c_upper` is non-zero or if a memory cell does not hold a byte value.
+    fn phantom_execute(
+        &mut self,
+        memory: &MemoryController<F>,
+        streams: &mut Streams<F>,
+        _: PhantomDiscriminant,
+        a: F,
+        _b: F,
+        c_upper: u16,
+    ) -> eyre::Result<()> {
+        assert_eq!(c_upper, 0);
+        // read register
+        let rs1 = unsafe_read_rv32_register(memory, a);
+        // read the k256 field_10x26 as raw bytes
+        let bytes: [u8; FIELD10X26_BYTES] = memory
+            .unsafe_read::<{ FIELD10X26_BYTES }>(
+                F::from_canonical_u32(RV32_MEMORY_AS),
+                F::from_canonical_u32(rs1),
+            )
+            .map(|f| u8::try_from(f.as_canonical_u32()).expect("value not a byte"));
+        // Rebuild the limbs from little-endian words; no `unsafe` reinterpretation needed.
+        let mut limbs = [0u32; 10];
+        for (limb, chunk) in limbs.iter_mut().zip(bytes.chunks_exact(4)) {
+            *limb = u32::from_le_bytes(
+                chunk
+                    .try_into()
+                    .expect("chunks_exact(4) yields 4-byte chunks"),
+            );
+        }
+        let res = field10x26_k256::FieldElement10x26(limbs).sqrt();
+        let hint: Vec<u8> = if res.is_some().into() {
+            // 1 (a square root exists) followed by the root's limbs
+            1u32.to_le_bytes()
+                .into_iter()
+                .chain(res.unwrap().0.into_iter().flat_map(u32::to_le_bytes))
+                .collect()
+        } else {
+            // no square root, return a 0
+            0u32.to_le_bytes().to_vec()
+        };
+        streams.hint_stream = hint.into_iter().map(F::from_canonical_u8).collect();
+
+        Ok(())
+    }
+}
--- a/openvm/extensions/hints-circuit/src/field10x26_k256.rs
+++ b/openvm/extensions/hints-circuit/src/field10x26_k256.rs
@@ -0,0 +1,812 @@
+//! The code here has been mostly copied from the `k256` crate.
+//! It is the 32-bit implementation of the field element.
+
+use elliptic_curve::consts::U32;
+use elliptic_curve::{
+    subtle::{Choice, ConditionallySelectable, ConstantTimeEq, CtOption},
+    zeroize::Zeroize,
+    FieldBytesEncoding,
+};
+// use crypto_bigint::U256;
+use elliptic_curve::bigint::ArrayEncoding;
+use elliptic_curve::bigint::U256;
+
+pub type FieldBytes = elliptic_curve::FieldBytes<Secp256k1>;
+
+/// Order of the secp256k1 elliptic curve in hexadecimal.
+const ORDER_HEX: &str = "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141";
+
+/// Order of the secp256k1 elliptic curve.
+const ORDER: U256 = U256::from_be_hex(ORDER_HEX);
+
+/// Marker type for the secp256k1 curve; it parameterizes `elliptic_curve` types such as
+/// [`FieldBytes`].
+#[derive(Copy, Clone, Debug, Default, Eq, PartialEq, PartialOrd, Ord)]
+pub struct Secp256k1;
+
+impl elliptic_curve::Curve for Secp256k1 {
+    /// 32-byte serialized field elements.
+    type FieldBytesSize = U32;
+
+    /// 256-bit unsigned integer type associated with this curve (also the type of `ORDER`).
+    type Uint = U256;
+
+    /// Curve order.
+    const ORDER: U256 = ORDER;
+}
+
+/// Big-endian conversion between `U256` and the 32-byte [`FieldBytes`] encoding.
+impl FieldBytesEncoding<Secp256k1> for U256 {
+    /// Interprets `field_bytes` as a big-endian integer.
+    fn decode_field_bytes(field_bytes: &FieldBytes) -> Self {
+        U256::from_be_byte_array(*field_bytes)
+    }
+
+    /// Serializes `self` as big-endian bytes.
+    fn encode_field_bytes(&self) -> FieldBytes {
+        self.to_be_byte_array()
+    }
+}
+
+impl elliptic_curve::PrimeCurve for Secp256k1 {}
+
+// -----------------------------------------------------------------------------------------------------
+
+/// Field elements modulo the secp256k1 field modulus (2^256 - 2^32 - 2^9 - 2^8 - 2^7 - 2^6 - 2^4 - 1).
+/// Uses 10 32-bit limbs (little-endian limb order), where in the normalized form
+/// the first 9 contain 26 bits of the value each, and the last one contains 22 bits.
+/// Arithmetic operations can be done without modulo reduction for some time,
+/// using the remaining overflow bits.
+#[derive(Clone, Copy, Debug)]
+pub struct FieldElement10x26(pub(crate) [u32; 10]);
+
+// TODO: maybe instead clean this file up and only keep code that is used?
+#[allow(unused)]
+impl FieldElement10x26 {
+    /// Zero element.
+    pub const ZERO: Self = Self([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+
+    /// Multiplicative identity.
+    pub const ONE: Self = Self([1, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+
+    /// Parses the given byte array as an SEC1-encoded (big-endian) field element.
+    /// Does not check the result for being in the correct range.
+    ///
+    /// The 256 input bits are split into limbs of 26 bits starting from the least
+    /// significant end (`bytes[31]`); the last limb receives the remaining 22 bits.
+    pub(crate) const fn from_bytes_unchecked(bytes: &[u8; 32]) -> Self {
+        // w0: bits 0..26 (three full bytes plus the low 2 bits of bytes[28])
+        let w0 = (bytes[31] as u32)
+            | ((bytes[30] as u32) << 8)
+            | ((bytes[29] as u32) << 16)
+            | (((bytes[28] & 0x3) as u32) << 24);
+        // w1: bits 26..52
+        let w1 = (((bytes[28] >> 2) as u32) & 0x3f)
+            | ((bytes[27] as u32) << 6)
+            | ((bytes[26] as u32) << 14)
+            | (((bytes[25] & 0xf) as u32) << 22);
+        // w2: bits 52..78
+        let w2 = (((bytes[25] >> 4) as u32) & 0xf)
+            | ((bytes[24] as u32) << 4)
+            | ((bytes[23] as u32) << 12)
+            | (((bytes[22] & 0x3f) as u32) << 20);
+        // w3: bits 78..104
+        let w3 = (((bytes[22] >> 6) as u32) & 0x3)
+            | ((bytes[21] as u32) << 2)
+            | ((bytes[20] as u32) << 10)
+            | ((bytes[19] as u32) << 18);
+        // w4: bits 104..130 (byte-aligned again, same shape as w0)
+        let w4 = (bytes[18] as u32)
+            | ((bytes[17] as u32) << 8)
+            | ((bytes[16] as u32) << 16)
+            | (((bytes[15] & 0x3) as u32) << 24);
+        // w5: bits 130..156
+        let w5 = (((bytes[15] >> 2) as u32) & 0x3f)
+            | ((bytes[14] as u32) << 6)
+            | ((bytes[13] as u32) << 14)
+            | (((bytes[12] & 0xf) as u32) << 22);
+        // w6: bits 156..182
+        let w6 = (((bytes[12] >> 4) as u32) & 0xf)
+            | ((bytes[11] as u32) << 4)
+            | ((bytes[10] as u32) << 12)
+            | (((bytes[9] & 0x3f) as u32) << 20);
+        // w7: bits 182..208
+        let w7 = (((bytes[9] >> 6) as u32) & 0x3)
+            | ((bytes[8] as u32) << 2)
+            | ((bytes[7] as u32) << 10)
+            | ((bytes[6] as u32) << 18);
+        // w8: bits 208..234
+        let w8 = (bytes[5] as u32)
+            | ((bytes[4] as u32) << 8)
+            | ((bytes[3] as u32) << 16)
+            | (((bytes[2] & 0x3) as u32) << 24);
+        // w9: bits 234..256 (only 22 bits)
+        let w9 = (((bytes[2] >> 2) as u32) & 0x3f)
+            | ((bytes[1] as u32) << 6)
+            | ((bytes[0] as u32) << 14);
+
+        Self([w0, w1, w2, w3, w4, w5, w6, w7, w8, w9])
+    }
+
+    /// Parses an SEC1-encoded (big-endian) field element, rejecting out-of-range values.
+    ///
+    /// The returned `CtOption` is "none" unless the bytes encode an integer in the range
+    /// [0, p).
+    pub fn from_bytes(bytes: &FieldBytes) -> CtOption<Self> {
+        let parsed = Self::from_bytes_unchecked(bytes.as_ref());
+        // Valid exactly when the parsed value does not reach the modulus.
+        let in_range = !parsed.get_overflow();
+
+        CtOption::new(parsed, in_range)
+    }
+
+    /// Builds a field element from a `u64`: bits 0..26 go to limb 0, bits 26..52 to limb 1
+    /// and the remaining 12 bits to limb 2. All other limbs are zero.
+    pub const fn from_u64(val: u64) -> Self {
+        let low = (val as u32) & 0x3FFFFFF;
+        let mid = ((val >> 26) as u32) & 0x3FFFFFF;
+        let high = (val >> 52) as u32;
+        Self([low, mid, high, 0, 0, 0, 0, 0, 0, 0])
+    }
+
+    /// Returns the SEC1 encoding of this field element.
+    ///
+    /// This is the inverse of the limb packing in `from_bytes_unchecked`: output byte 0 is
+    /// the most significant. Only the low 26 bits of limbs 0..=8 and the low 22 bits of
+    /// limb 9 contribute (higher bits are dropped by the masks and `as u8` truncation), so
+    /// the element is presumably expected to be normalized before calling this.
+    pub fn to_bytes(self) -> FieldBytes {
+        let mut r = FieldBytes::default();
+        r[0] = (self.0[9] >> 14) as u8;
+        r[1] = (self.0[9] >> 6) as u8;
+        // bytes that straddle two limbs combine the low bits of one with the high bits of the next
+        r[2] = ((self.0[9] as u8 & 0x3Fu8) << 2) | ((self.0[8] >> 24) as u8 & 0x3);
+        r[3] = (self.0[8] >> 16) as u8;
+        r[4] = (self.0[8] >> 8) as u8;
+        r[5] = self.0[8] as u8;
+        r[6] = (self.0[7] >> 18) as u8;
+        r[7] = (self.0[7] >> 10) as u8;
+        r[8] = (self.0[7] >> 2) as u8;
+        r[9] = ((self.0[7] as u8 & 0x3u8) << 6) | ((self.0[6] >> 20) as u8 & 0x3fu8);
+        r[10] = (self.0[6] >> 12) as u8;
+        r[11] = (self.0[6] >> 4) as u8;
+        r[12] = ((self.0[6] as u8 & 0xfu8) << 4) | ((self.0[5] >> 22) as u8 & 0xfu8);
+        r[13] = (self.0[5] >> 14) as u8;
+        r[14] = (self.0[5] >> 6) as u8;
+        r[15] = ((self.0[5] as u8 & 0x3fu8) << 2) | ((self.0[4] >> 24) as u8 & 0x3u8);
+        r[16] = (self.0[4] >> 16) as u8;
+        r[17] = (self.0[4] >> 8) as u8;
+        r[18] = self.0[4] as u8;
+        r[19] = (self.0[3] >> 18) as u8;
+        r[20] = (self.0[3] >> 10) as u8;
+        r[21] = (self.0[3] >> 2) as u8;
+        r[22] = ((self.0[3] as u8 & 0x3u8) << 6) | ((self.0[2] >> 20) as u8 & 0x3fu8);
+        r[23] = (self.0[2] >> 12) as u8;
+        r[24] = (self.0[2] >> 4) as u8;
+        r[25] = ((self.0[2] as u8 & 0xfu8) << 4) | ((self.0[1] >> 22) as u8 & 0xfu8);
+        r[26] = (self.0[1] >> 14) as u8;
+        r[27] = (self.0[1] >> 6) as u8;
+        r[28] = ((self.0[1] as u8 & 0x3fu8) << 2) | ((self.0[0] >> 24) as u8 & 0x3u8);
+        r[29] = (self.0[0] >> 16) as u8;
+        r[30] = (self.0[0] >> 8) as u8;
+        r[31] = self.0[0] as u8;
+        r
+    }
+
+    /// Adds `x * (2^256 - modulus)`.
+    ///
+    /// `2^256 - modulus` equals `2^32 + 0x3D1`, so `x * 0x3D1` is added to limb 0 and
+    /// `x << 6` (i.e. `x * 2^32` relative to limb 1's weight of `2^26`) to limb 1. Carries
+    /// are then propagated upwards, masking limbs 0..=8 to 26 bits. Limb 9 is left
+    /// unmasked, so it may end up with bits above bit 21.
+    fn add_modulus_correction(&self, x: u32) -> Self {
+        // add the low part of (2^256 - modulus) * x to the first limb
+        let t0 = self.0[0] + x * 0x3D1u32;
+
+        // Propagate excess bits up the limbs
+        let t1 = self.0[1] + (x << 6); // add `x` times the high bit of correction (2^32)
+        let t1 = t1 + (t0 >> 26);
+        let t0 = t0 & 0x3FFFFFFu32;
+
+        let t2 = self.0[2] + (t1 >> 26);
+        let t1 = t1 & 0x3FFFFFFu32;
+
+        let t3 = self.0[3] + (t2 >> 26);
+        let t2 = t2 & 0x3FFFFFFu32;
+
+        let t4 = self.0[4] + (t3 >> 26);
+        let t3 = t3 & 0x3FFFFFFu32;
+
+        let t5 = self.0[5] + (t4 >> 26);
+        let t4 = t4 & 0x3FFFFFFu32;
+
+        let t6 = self.0[6] + (t5 >> 26);
+        let t5 = t5 & 0x3FFFFFFu32;
+
+        let t7 = self.0[7] + (t6 >> 26);
+        let t6 = t6 & 0x3FFFFFFu32;
+
+        let t8 = self.0[8] + (t7 >> 26);
+        let t7 = t7 & 0x3FFFFFFu32;
+
+        // the final carry lands in t9, which is intentionally not masked here
+        let t9 = self.0[9] + (t8 >> 26);
+        let t8 = t8 & 0x3FFFFFFu32;
+
+        Self([t0, t1, t2, t3, t4, t5, t6, t7, t8, t9])
+    }
+
+    /// Strips everything above bit 21 of the last limb and returns it alongside the
+    /// truncated element.
+    ///
+    /// Since bit 22 of limb 9 has weight 2^256, this is the same as subtracting
+    /// `overflow * 2^256` from the value.
+    fn subtract_modulus_approximation(&self) -> (Self, u32) {
+        let mut limbs = self.0;
+        let overflow = limbs[9] >> 22;
+        // keep only the 22 value bits of the top limb
+        limbs[9] &= 0x03FFFFFu32;
+        (Self(limbs), overflow)
+    }
+
+    /// Checks if the field element is greater or equal to the modulus.
+    ///
+    /// The result is set when limb 9 has any bit at or above bit 22 (value >= 2^256), or
+    /// when limb 9 and limbs 2..=8 are all ones and limbs 0 and 1 are at least the low
+    /// limbs of the modulus (detected by adding `0x3D1` / `0x40` and checking for a carry
+    /// out of limb 1).
+    ///
+    /// NOTE(review): the all-ones test compares against `0x3FFFFFF`, so limbs 2..=8 are
+    /// presumably expected to fit in 26 bits; the callers visible here pass freshly parsed
+    /// or weakly normalized elements.
+    fn get_overflow(&self) -> Choice {
+        // AND of the middle limbs: equals 0x3FFFFFF only if each of them is all ones
+        let m = self.0[2] & self.0[3] & self.0[4] & self.0[5] & self.0[6] & self.0[7] & self.0[8];
+        let x = (self.0[9] >> 22 != 0)
+            | ((self.0[9] == 0x3FFFFFu32)
+                & (m == 0x3FFFFFFu32)
+                & ((self.0[1] + 0x40u32 + ((self.0[0] + 0x3D1u32) >> 26)) > 0x3FFFFFFu32));
+        Choice::from(x as u8)
+    }
+
+    /// Reduces the magnitude of the element to 1 without guaranteeing a fully normalized
+    /// (canonical) result.
+    pub fn normalize_weak(&self) -> Self {
+        // Strip the bits above 2^256 first, so the correction pass below produces at most
+        // one carry.
+        let (truncated, high_bits) = self.subtract_modulus_approximation();
+
+        // Fold the stripped bits back in as multiples of (2^256 - modulus).
+        let weak = truncated.add_modulus_correction(high_bits);
+
+        // The only excess that may remain is a carry into bit 22 of the last limb
+        // (bit 256 of the field element).
+        debug_assert!(weak.0[9] >> 23 == 0);
+
+        weak
+    }
+
+    /// Fully normalizes the field element.
+    ///
+    /// Afterwards the first nine limbs fit in 26 bits, the last limb fits in 22 bits and
+    /// the value is less than the modulus.
+    pub fn normalize(&self) -> Self {
+        let weak = self.normalize_weak();
+
+        // After weak normalization one more subtraction of the modulus is enough, and
+        // only if the value is still >= the field characteristic.
+        let needs_reduction = weak.get_overflow();
+
+        // Always compute the reduced candidate (keeps the behaviour constant-time): add
+        // (2^256 - modulus), then drop the resulting multiple of 2^256.
+        let (reduced, carry) = weak
+            .add_modulus_correction(1u32)
+            .subtract_modulus_approximation();
+
+        // The reduction carries out of bit 256 exactly when it was needed.
+        debug_assert!(carry == (needs_reduction.unwrap_u8() as u32));
+
+        Self::conditional_select(&weak, &reduced, needs_reduction)
+    }
+
+    /// Checks if the field element becomes zero if normalized.
+    ///
+    /// After weak normalization the element is zero modulo p when its raw limbs are either
+    /// all zero or exactly the limbs of the modulus.
+    pub fn normalizes_to_zero(&self) -> Choice {
+        let [t0, t1, t2, t3, t4, t5, t6, t7, t8, t9] = self.normalize_weak().0;
+
+        // Zero only if no limb has any bit set.
+        let any_bits = t0 | t1 | t2 | t3 | t4 | t5 | t6 | t7 | t8 | t9;
+        // Each XOR turns the matching modulus limb into all ones, so the AND is all ones
+        // only when every limb equals the corresponding limb of the modulus.
+        let modulus_match = (t0 ^ 0x3D0u32)
+            & (t1 ^ 0x40u32)
+            & t2
+            & t3
+            & t4
+            & t5
+            & t6
+            & t7
+            & t8
+            & (t9 ^ 0x3C00000u32);
+
+        let raw_zero = any_bits == 0;
+        let raw_modulus = modulus_match == 0x3FFFFFFu32;
+        Choice::from((raw_zero | raw_modulus) as u8)
+    }
+
+    /// Determine if this `FieldElement10x26` is zero.
+    ///
+    /// Only the raw limbs are inspected; no normalization is performed.
+    ///
+    /// # Returns
+    ///
+    /// If zero, return `Choice(1)`.  Otherwise, return `Choice(0)`.
+    pub fn is_zero(&self) -> Choice {
+        // OR all limbs together: the result is zero only if every limb is zero.
+        let combined = self.0.iter().fold(0u32, |acc, &limb| acc | limb);
+        Choice::from((combined == 0) as u8)
+    }
+
+    /// Determine if this `FieldElement10x26` is odd in the SEC1 sense: `self mod 2 == 1`.
+    ///
+    /// Looks only at the lowest bit of the first limb.
+    ///
+    /// # Returns
+    ///
+    /// If odd, return `Choice(1)`.  Otherwise, return `Choice(0)`.
+    pub fn is_odd(&self) -> Choice {
+        let lowest_bit = (self.0[0] & 1) as u8;
+        Choice::from(lowest_bit)
+    }
+
+    /// The maximum number `m` for which `0x3FFFFFF * 2 * (m + 1) < 2^32`,
+    /// i.e. the largest magnitude for which the per-limb products in `negate`
+    /// still fit in a `u32`.
+    pub const fn max_magnitude() -> u32 {
+        31u32
+    }
+
+    /// Returns -self, treating it as a value of given magnitude.
+    /// The provided magnitude must be equal or greater than the actual magnitude of `self`.
+    ///
+    /// Each result limb is `2 * (magnitude + 1)` times the corresponding limb of the modulus,
+    /// minus the limb of `self`; the result is therefore not normalized.
+    pub const fn negate(&self, magnitude: u32) -> Self {
+        let m: u32 = magnitude + 1;
+        // the hex constants are the limbs of the modulus in 10x26 form
+        let r0 = 0x3FFFC2Fu32 * 2 * m - self.0[0];
+        let r1 = 0x3FFFFBFu32 * 2 * m - self.0[1];
+        let r2 = 0x3FFFFFFu32 * 2 * m - self.0[2];
+        let r3 = 0x3FFFFFFu32 * 2 * m - self.0[3];
+        let r4 = 0x3FFFFFFu32 * 2 * m - self.0[4];
+        let r5 = 0x3FFFFFFu32 * 2 * m - self.0[5];
+        let r6 = 0x3FFFFFFu32 * 2 * m - self.0[6];
+        let r7 = 0x3FFFFFFu32 * 2 * m - self.0[7];
+        let r8 = 0x3FFFFFFu32 * 2 * m - self.0[8];
+        let r9 = 0x03FFFFFu32 * 2 * m - self.0[9];
+        Self([r0, r1, r2, r3, r4, r5, r6, r7, r8, r9])
+    }
+
+    /// Returns self + rhs mod p.
+    ///
+    /// Limbs are added pairwise without any reduction, so the magnitudes of the operands
+    /// add up.
+    pub const fn add(&self, rhs: &Self) -> Self {
+        let mut sum = [0u32; 10];
+        // `while` rather than an iterator, because this is a `const fn`
+        let mut i = 0;
+        while i < 10 {
+            sum[i] = self.0[i] + rhs.0[i];
+            i += 1;
+        }
+        Self(sum)
+    }
+
+    /// Multiplies by a single-limb integer.
+    ///
+    /// Every limb is scaled by `rhs` without any reduction, so the magnitude is multiplied
+    /// by the same value.
+    pub const fn mul_single(&self, rhs: u32) -> Self {
+        let mut scaled = [0u32; 10];
+        // `while` rather than an iterator, because this is a `const fn`
+        let mut i = 0;
+        while i < 10 {
+            scaled[i] = self.0[i] * rhs;
+            i += 1;
+        }
+        Self(scaled)
+    }
+
+    #[inline(always)]
+    fn mul_inner(&self, rhs: &Self) -> Self {
+        /*
+        `square()` is just `mul()` with equal arguments. Rust compiler is smart enough
+        to do all the necessary optimizations for this case, but it needs to have this information
+        inside a function. If a function is just *called* with the same arguments,
+        this information cannot be used, so the function must be inlined while using the same arguments.
+
+        Now `mul()` is quite long and therefore expensive to inline. So we have an inner (inlined)
+        function, that is used inside `mul()` and `square()`, and when it is used with the same
+        arguments in `square()`, compiler is able to use that fact after inlining.
+        */
+
+        let m = 0x3FFFFFFu64;
+        let rr0 = 0x3D10u64;
+        let rr1 = 0x400u64;
+
+        let a0 = self.0[0] as u64;
+        let a1 = self.0[1] as u64;
+        let a2 = self.0[2] as u64;
+        let a3 = self.0[3] as u64;
+        let a4 = self.0[4] as u64;
+        let a5 = self.0[5] as u64;
+        let a6 = self.0[6] as u64;
+        let a7 = self.0[7] as u64;
+        let a8 = self.0[8] as u64;
+        let a9 = self.0[9] as u64;
+
+        let b0 = rhs.0[0] as u64;
+        let b1 = rhs.0[1] as u64;
+        let b2 = rhs.0[2] as u64;
+        let b3 = rhs.0[3] as u64;
+        let b4 = rhs.0[4] as u64;
+        let b5 = rhs.0[5] as u64;
+        let b6 = rhs.0[6] as u64;
+        let b7 = rhs.0[7] as u64;
+        let b8 = rhs.0[8] as u64;
+        let b9 = rhs.0[9] as u64;
+
+        // [... a b c] is a shorthand for ... + a<<52 + b<<26 + c<<0 mod n.
+        // for 0 <= x <= 9, px is a shorthand for sum(a[i]*b[x-i], i=0..x).
+        // for 9 <= x <= 18, px is a shorthand for sum(a[i]*b[x-i], i=(x-9)..9)
+        // Note that [x 0 0 0 0 0 0 0 0 0 0] = [x*rr1 x*rr0].
+
+        let mut c: u64;
+        let mut d: u64;
+
+        d = a0 * b9
+            + a1 * b8
+            + a2 * b7
+            + a3 * b6
+            + a4 * b5
+            + a5 * b4
+            + a6 * b3
+            + a7 * b2
+            + a8 * b1
+            + a9 * b0;
+        // [d 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0]
+        let t9 = (d & m) as u32;
+        d >>= 26;
+        debug_assert!(t9 >> 26 == 0);
+        debug_assert!(d >> 38 == 0);
+        // [d t9 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0]
+
+        c = a0 * b0;
+        debug_assert!(c >> 60 == 0);
+        // [d t9 0 0 0 0 0 0 0 0 c] = [p9 0 0 0 0 0 0 0 0 p0]
+        d +=
+            a1 * b9 + a2 * b8 + a3 * b7 + a4 * b6 + a5 * b5 + a6 * b4 + a7 * b3 + a8 * b2 + a9 * b1;
+        debug_assert!(d >> 63 == 0);
+        // [d t9 0 0 0 0 0 0 0 0 c] = [p10 p9 0 0 0 0 0 0 0 0 p0]
+        let u0 = (d & m) as u32;
+        d >>= 26;
+        c += u0 as u64 * rr0;
+        debug_assert!(u0 >> 26 == 0);
+        debug_assert!(d >> 37 == 0);
+        debug_assert!(c >> 61 == 0);
+        // [d u0 t9 0 0 0 0 0 0 0 0 c-u0*rr0] = [p10 p9 0 0 0 0 0 0 0 0 p0]
+        let t0 = (c & m) as u32;
+        c >>= 26;
+        c += u0 as u64 * rr1;
+        debug_assert!(t0 >> 26 == 0);
+        debug_assert!(c >> 37 == 0);
+        // [d u0 t9 0 0 0 0 0 0 0 c-u0*rr1 t0-u0*rr0] = [p10 p9 0 0 0 0 0 0 0 0 p0]
+        // [d 0 t9 0 0 0 0 0 0 0 c t0] = [p10 p9 0 0 0 0 0 0 0 0 p0]
+
+        c += a0 * b1 + a1 * b0;
+        debug_assert!(c >> 62 == 0);
+        // [d 0 t9 0 0 0 0 0 0 0 c t0] = [p10 p9 0 0 0 0 0 0 0 p1 p0]
+        d += a2 * b9 + a3 * b8 + a4 * b7 + a5 * b6 + a6 * b5 + a7 * b4 + a8 * b3 + a9 * b2;
+        debug_assert!(d >> 63 == 0);
+        // [d 0 t9 0 0 0 0 0 0 0 c t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0]
+        let u1 = (d & m) as u32;
+        d >>= 26;
+        c += u1 as u64 * rr0;
+        debug_assert!(u1 >> 26 == 0);
+        debug_assert!(d >> 37 == 0);
+        debug_assert!(c >> 63 == 0);
+        // [d u1 0 t9 0 0 0 0 0 0 0 c-u1*rr0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0]
+        let t1 = (c & m) as u32;
+        c >>= 26;
+        c += u1 as u64 * rr1;
+        debug_assert!(t1 >> 26 == 0);
+        debug_assert!(c >> 38 == 0);
+        // [d u1 0 t9 0 0 0 0 0 0 c-u1*rr1 t1-u1*rr0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0]
+        // [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0]
+
+        c += a0 * b2 + a1 * b1 + a2 * b0;
+        debug_assert!(c >> 62 == 0);
+        // [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p11 p10 p9 0 0 0 0 0 0 p2 p1 p0]
+        d += a3 * b9 + a4 * b8 + a5 * b7 + a6 * b6 + a7 * b5 + a8 * b4 + a9 * b3;
+        debug_assert!(d >> 63 == 0);
+        // [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0]
+        let u2 = (d & m) as u32;
+        d >>= 26;
+        c += u2 as u64 * rr0;
+        debug_assert!(u2 >> 26 == 0);
+        debug_assert!(d >> 37 == 0);
+        debug_assert!(c >> 63 == 0);
+        // [d u2 0 0 t9 0 0 0 0 0 0 c-u2*rr0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0]
+        let t2 = (c & m) as u32;
+        c >>= 26;
+        c += u2 as u64 * rr1;
+        debug_assert!(t2 >> 26 == 0);
+        debug_assert!(c >> 38 == 0);
+        // [d u2 0 0 t9 0 0 0 0 0 c-u2*rr1 t2-u2*rr0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0]
+        // [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0]
+
+        c += a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0;
+        debug_assert!(c >> 63 == 0);
+        // [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0]
+        d += a4 * b9 + a5 * b8 + a6 * b7 + a7 * b6 + a8 * b5 + a9 * b4;
+        debug_assert!(d >> 63 == 0);
+        // [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0]
+        let u3 = (d & m) as u32;
+        d >>= 26;
+        c += u3 as u64 * rr0;
+        debug_assert!(u3 >> 26 == 0);
+        debug_assert!(d >> 37 == 0);
+        // [d u3 0 0 0 t9 0 0 0 0 0 c-u3*rr0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0]
+        let t3 = (c & m) as u32;
+        c >>= 26;
+        c += u3 as u64 * rr1;
+        debug_assert!(t3 >> 26 == 0);
+        debug_assert!(c >> 39 == 0);
+        // [d u3 0 0 0 t9 0 0 0 0 c-u3*rr1 t3-u3*rr0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0]
+        // [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0]
+
+        c += a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0;
+        debug_assert!(c >> 63 == 0);
+        // [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0]
+        d += a5 * b9 + a6 * b8 + a7 * b7 + a8 * b6 + a9 * b5;
+        debug_assert!(d >> 62 == 0);
+        // [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0]
+        let u4 = (d & m) as u32;
+        d >>= 26;
+        c += u4 as u64 * rr0;
+        debug_assert!(u4 >> 26 == 0);
+        debug_assert!(d >> 36 == 0);
+        // [d u4 0 0 0 0 t9 0 0 0 0 c-u4*rr0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0]
+        let t4 = (c & m) as u32;
+        c >>= 26;
+        c += u4 as u64 * rr1;
+        debug_assert!(t4 >> 26 == 0);
+        debug_assert!(c >> 39 == 0);
+        // [d u4 0 0 0 0 t9 0 0 0 c-u4*rr1 t4-u4*rr0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0]
+        // [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0]
+
+        c += a0 * b5 + a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1 + a5 * b0;
+        debug_assert!(c >> 63 == 0);
+        // [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0]
+        d += a6 * b9 + a7 * b8 + a8 * b7 + a9 * b6;
+        debug_assert!(d >> 62 == 0);
+        // [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0]
+        let u5 = (d & m) as u32;
+        d >>= 26;
+        c += u5 as u64 * rr0;
+        debug_assert!(u5 >> 26 == 0);
+        debug_assert!(d >> 36 == 0);
+        // [d u5 0 0 0 0 0 t9 0 0 0 c-u5*rr0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0]
+        let t5 = (c & m) as u32;
+        c >>= 26;
+        c += u5 as u64 * rr1;
+        debug_assert!(t5 >> 26 == 0);
+        debug_assert!(c >> 39 == 0);
+        // [d u5 0 0 0 0 0 t9 0 0 c-u5*rr1 t5-u5*rr0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0]
+        // [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0]
+
+        c += a0 * b6 + a1 * b5 + a2 * b4 + a3 * b3 + a4 * b2 + a5 * b1 + a6 * b0;
+        debug_assert!(c >> 63 == 0);
+        // [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0]
+        d += a7 * b9 + a8 * b8 + a9 * b7;
+        debug_assert!(d >> 61 == 0);
+        // [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0]
+        let u6 = (d & m) as u32;
+        d >>= 26;
+        c += u6 as u64 * rr0;
+        debug_assert!(u6 >> 26 == 0);
+        debug_assert!(d >> 35 == 0);
+        // [d u6 0 0 0 0 0 0 t9 0 0 c-u6*rr0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0]
+        let t6 = (c & m) as u32;
+        c >>= 26;
+        c += u6 as u64 * rr1;
+        debug_assert!(t6 >> 26 == 0);
+        debug_assert!(c >> 39 == 0);
+        // [d u6 0 0 0 0 0 0 t9 0 c-u6*rr1 t6-u6*rr0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0]
+        // [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0]
+
+        c += a0 * b7 + a1 * b6 + a2 * b5 + a3 * b4 + a4 * b3 + a5 * b2 + a6 * b1 + a7 * b0;
+        debug_assert!(c <= 0x8000007C00000007u64);
+        // [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0]
+        d += a8 * b9 + a9 * b8;
+        debug_assert!(d >> 58 == 0);
+        // [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0]
+        let u7 = (d & m) as u32;
+        d >>= 26;
+        c += u7 as u64 * rr0;
+        debug_assert!(u7 >> 26 == 0);
+        debug_assert!(d >> 32 == 0);
+        let d32 = d as u32;
+        debug_assert!(c <= 0x800001703FFFC2F7u64);
+        // [d u7 0 0 0 0 0 0 0 t9 0 c-u7*rr0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0]
+        let t7 = (c & m) as u32;
+        c >>= 26;
+        c += u7 as u64 * rr1;
+        debug_assert!(t7 >> 26 == 0);
+        debug_assert!(c >> 38 == 0);
+        // [d u7 0 0 0 0 0 0 0 t9 c-u7*rr1 t7-u7*rr0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0]
+        // [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0]
+
+        c +=
+            a0 * b8 + a1 * b7 + a2 * b6 + a3 * b5 + a4 * b4 + a5 * b3 + a6 * b2 + a7 * b1 + a8 * b0;
+        debug_assert!(c <= 0x9000007B80000008u64);
+        // [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+        d = d32 as u64 + a9 * b9;
+        debug_assert!(d >> 57 == 0);
+        // [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+        let u8 = (d & m) as u32;
+        d >>= 26;
+        c += u8 as u64 * rr0;
+        debug_assert!(u8 >> 26 == 0);
+        debug_assert!(d >> 31 == 0);
+        let d32 = d as u32;
+        debug_assert!(c <= 0x9000016FBFFFC2F8u64);
+        // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*rr0 t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+
+        let r3 = t3;
+        debug_assert!(r3 >> 26 == 0);
+        // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*rr0 t7 t6 t5 t4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+        let r4 = t4;
+        debug_assert!(r4 >> 26 == 0);
+        // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*rr0 t7 t6 t5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+        let r5 = t5;
+        debug_assert!(r5 >> 26 == 0);
+        // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*rr0 t7 t6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+        let r6 = t6;
+        debug_assert!(r6 >> 26 == 0);
+        // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*rr0 t7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+        let r7 = t7;
+        debug_assert!(r7 >> 26 == 0);
+        // [d u8 0 0 0 0 0 0 0 0 t9 c-u8*rr0 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+
+        let r8 = (c & m) as u32;
+        c >>= 26;
+        c += u8 as u64 * rr1;
+        debug_assert!(r8 >> 26 == 0);
+        debug_assert!(c >> 39 == 0);
+        // [d u8 0 0 0 0 0 0 0 0 t9+c-u8*rr1 r8-u8*rr0 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+        // [d 0 0 0 0 0 0 0 0 0 t9+c r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+        c += d32 as u64 * rr0 + t9 as u64;
+        debug_assert!(c >> 45 == 0);
+        // [d 0 0 0 0 0 0 0 0 0 c-d*rr0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+        let r9 = (c & (m >> 4)) as u32;
+        c >>= 22;
+        c += d * (rr1 << 4);
+        debug_assert!(r9 >> 22 == 0);
+        debug_assert!(c >> 46 == 0);
+        // [d 0 0 0 0 0 0 0 0 r9+((c-d*rr1<<4)<<22)-d*rr0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+        // [d 0 0 0 0 0 0 0 -d*rr1 r9+(c<<22)-d*rr0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+        // [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+
+        d = c * (rr0 >> 4) + t0 as u64;
+        debug_assert!(d >> 56 == 0);
+        // [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1 d-c*rr0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+        let r0 = (d & m) as u32;
+        d >>= 26;
+        debug_assert!(r0 >> 26 == 0);
+        debug_assert!(d >> 30 == 0);
+        let d32 = d as u32;
+        // [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1+d r0-c*rr0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+        d = d32 as u64 + c * (rr1 >> 4) + t1 as u64;
+        debug_assert!(d >> 53 == 0);
+        debug_assert!(d <= 0x10000003FFFFBFu64);
+        // [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 d-c*rr1>>4 r0-c*rr0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+        // [r9 r8 r7 r6 r5 r4 r3 t2 d r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+        let r1 = (d & m) as u32;
+        d >>= 26;
+        debug_assert!(r1 >> 26 == 0);
+        debug_assert!(d >> 27 == 0);
+        let d32 = d as u32;
+        debug_assert!(d <= 0x4000000u64);
+        // [r9 r8 r7 r6 r5 r4 r3 t2+d r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+        d = d32 as u64 + t2 as u64;
+        debug_assert!(d >> 27 == 0);
+        // [r9 r8 r7 r6 r5 r4 r3 d r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+        let r2 = d as u32;
+        debug_assert!(r2 >> 27 == 0);
+        // [r9 r8 r7 r6 r5 r4 r3 r2 r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0]
+
+        Self([r0, r1, r2, r3, r4, r5, r6, r7, r8, r9])
+    }
+
+    /// Field multiplication: computes `self * rhs mod p`.
+    ///
+    /// The product has magnitude 1 but is not normalized. Both operands are
+    /// expected to have magnitude at most 8.
+    pub fn mul(&self, rhs: &Self) -> Self {
+        Self::mul_inner(self, rhs)
+    }
+
+    /// Field squaring: computes `self * self mod p`.
+    ///
+    /// The result has magnitude 1 but is not normalized. The operand is
+    /// expected to have magnitude at most 8.
+    pub fn square(&self) -> Self {
+        Self::mul_inner(self, self)
+    }
+
+    /// Squares `self` `k` times in a row, i.e. computes `self^(2^k)`.
+    ///
+    /// With `k == 0` a copy of `self` is returned unchanged.
+    pub fn pow2k(&self, k: u32) -> Self {
+        (0..k).fold(*self, |acc, _| acc.square())
+    }
+
+    /// Returns the multiplicative inverse of self, if self is non-zero.
+    /// The result has magnitude 1, but is not normalized.
+    ///
+    /// The inverse is obtained by exponentiation with a fixed addition chain.
+    /// The exponent assembled below is, from the most significant bit:
+    /// 223 ones, `0`, 22 ones, `0000`, `1`, `0`, `11`, `0`, `1`
+    /// which equals `p - 2` for `p = 2^256 - 2^32 - 977`.
+    pub fn invert(&self) -> Self {
+        // Naming: `xN` holds `self^(2^N - 1)`, i.e. an exponent of N one-bits.
+        // Each step shifts the exponent left with `pow2k` and fills the new low
+        // bits by multiplying with a shorter, already computed run of ones.
+        let x2 = self.pow2k(1).mul(self);
+        let x3 = x2.pow2k(1).mul(self);
+        let x6 = x3.pow2k(3).mul(&x3);
+        let x9 = x6.pow2k(3).mul(&x3);
+        let x11 = x9.pow2k(2).mul(&x2);
+        let x22 = x11.pow2k(11).mul(&x11);
+        let x44 = x22.pow2k(22).mul(&x22);
+        let x88 = x44.pow2k(44).mul(&x44);
+        let x176 = x88.pow2k(88).mul(&x88);
+        let x220 = x176.pow2k(44).mul(&x44);
+        let x223 = x220.pow2k(3).mul(&x3);
+
+        // The final result is then assembled using a sliding window over the blocks.
+        // Shift by 23 and append x22 (one zero bit, then 22 ones), shift by 5
+        // and append a single one, shift by 3 and append two ones, shift by 2
+        // and append a single one.
+        x223.pow2k(23)
+            .mul(&x22)
+            .pow2k(5)
+            .mul(self)
+            .pow2k(3)
+            .mul(&x2)
+            .pow2k(2)
+            .mul(self)
+    }
+
+    /// Returns the square root of self mod p, or `None` if no square root exists.
+    /// The result has magnitude 1, but is not normalized.
+    ///
+    /// A candidate root is computed by exponentiation with a fixed addition
+    /// chain. The exponent assembled below is, from the most significant bit:
+    /// 223 ones, `0`, 22 ones, `0000`, `11`, `00`
+    /// which equals `(p + 1) / 4` for `p = 2^256 - 2^32 - 977`. The candidate is
+    /// then squared and compared against `self`; the returned `CtOption` is
+    /// "some" only when that check succeeds.
+    pub fn sqrt(&self) -> CtOption<Self> {
+        // Naming: `xN` holds `self^(2^N - 1)`, i.e. an exponent of N one-bits.
+        let x2 = self.pow2k(1).mul(self);
+        let x3 = x2.pow2k(1).mul(self);
+        let x6 = x3.pow2k(3).mul(&x3);
+        let x9 = x6.pow2k(3).mul(&x3);
+        let x11 = x9.pow2k(2).mul(&x2);
+        let x22 = x11.pow2k(11).mul(&x11);
+        let x44 = x22.pow2k(22).mul(&x22);
+        let x88 = x44.pow2k(44).mul(&x44);
+        let x176 = x88.pow2k(88).mul(&x88);
+        let x220 = x176.pow2k(44).mul(&x44);
+        let x223 = x220.pow2k(3).mul(&x3);
+
+        // The final result is then assembled using a sliding window over the blocks.
+        let res = x223.pow2k(23).mul(&x22).pow2k(6).mul(&x2).pow2k(2);
+
+        // Checks `self - res^2 == 0`.
+        // NOTE(review): the argument of `negate` is presumably the magnitude of
+        // the value being negated (1 after `mul`) - confirm against `negate`.
+        let is_root = (res.mul(&res).negate(1).add(self)).normalizes_to_zero();
+
+        // Only return Some if it's the square root.
+        CtOption::new(res, is_root)
+    }
+}
+
+/// The default field element is `ZERO`.
+impl Default for FieldElement10x26 {
+    fn default() -> Self {
+        FieldElement10x26::ZERO
+    }
+}
+
+/// Limb-wise conditional selection between two field elements.
+impl ConditionallySelectable for FieldElement10x26 {
+    /// Builds a new element whose every limb is chosen between the matching
+    /// limbs of `a` and `b` with `u32::conditional_select`, so the meaning of
+    /// `choice` is exactly the one `u32::conditional_select` gives it.
+    #[inline(always)]
+    fn conditional_select(a: &Self, b: &Self, choice: Choice) -> Self {
+        Self(core::array::from_fn(|limb| {
+            u32::conditional_select(&a.0[limb], &b.0[limb], choice)
+        }))
+    }
+}
+
+/// Limb-wise equality of the raw representation.
+impl ConstantTimeEq for FieldElement10x26 {
+    /// Compares all ten limbs with `u32::ct_eq` and ANDs the outcomes together.
+    ///
+    /// The comparison is on the stored limbs as they are; no normalization is
+    /// performed here.
+    fn ct_eq(&self, other: &Self) -> Choice {
+        let first_limb_equal = self.0[0].ct_eq(&other.0[0]);
+        self.0[1..]
+            .iter()
+            .zip(other.0[1..].iter())
+            .fold(first_limb_equal, |all_equal, (lhs, rhs)| {
+                all_equal & lhs.ct_eq(rhs)
+            })
+    }
+}
+
+/// Zeroization of the underlying limb array.
+impl Zeroize for FieldElement10x26 {
+    /// Delegates to the `Zeroize` implementation of the inner limb array.
+    fn zeroize(&mut self) {
+        let Self(limbs) = self;
+        limbs.zeroize();
+    }
+}
--- a/openvm/extensions/hints-circuit/src/lib.rs
+++ b/openvm/extensions/hints-circuit/src/lib.rs
@@ -0,0 +1,56 @@
+use openvm_circuit::arch::{VmExtension, VmInventory};
+use openvm_circuit::circuit_derive::{Chip, ChipUsageGetter};
+use openvm_circuit::derive::{AnyEnum, InstructionExecutor};
+use openvm_circuit::system::phantom::PhantomChip;
+use openvm_instructions::PhantomDiscriminant;
+use openvm_stark_backend::p3_field::PrimeField32;
+use powdr_openvm_hints_transpiler::HintsPhantom;
+
+// NOTE: `field10x26_k256` (declared below) is mostly copy/pasted from k256: its field element representation for 32-bit architectures.
+mod executors;
+mod field10x26_k256;
+
+/// OpenVM extension with miscellaneous hint implementations.
+///
+/// Building it returns an empty inventory; its only effect is to register one
+/// phantom sub-executor per `HintsPhantom` discriminant on the builder.
+pub struct HintsExtension;
+
+/// Executor type named by `HintsExtension` as its `VmExtension::Executor`.
+///
+/// NOTE(review): `HintsExtension::build` never adds an executor to its
+/// inventory, so the `Phantom` variant presumably exists only to give the
+/// derives a concrete chip type - confirm.
+#[derive(ChipUsageGetter, Chip, InstructionExecutor, AnyEnum)]
+pub enum HintsExecutor<F: PrimeField32> {
+    Phantom(PhantomChip<F>),
+}
+
+/// Periphery type named by `HintsExtension` as its `VmExtension::Periphery`.
+///
+/// NOTE(review): `HintsExtension::build` never adds a periphery chip to its
+/// inventory, so the `Phantom` variant presumably exists only to give the
+/// derives a concrete chip type - confirm.
+#[derive(ChipUsageGetter, Chip, AnyEnum)]
+pub enum HintsPeriphery<F: PrimeField32> {
+    Phantom(PhantomChip<F>),
+}
+
+/// Wires the hint sub-executors into an OpenVM chip complex.
+impl<F: PrimeField32> VmExtension<F> for HintsExtension {
+    type Executor = HintsExecutor<F>;
+    type Periphery = HintsPeriphery<F>;
+
+    /// Registers one phantom sub-executor for each `HintsPhantom` discriminant
+    /// and returns a freshly created inventory to which nothing is added.
+    ///
+    /// # Errors
+    /// Returns the `VmInventoryError` of the first `add_phantom_sub_executor`
+    /// call that fails.
+    fn build(
+        &self,
+        builder: &mut openvm_circuit::arch::VmInventoryBuilder<F>,
+    ) -> Result<
+        openvm_circuit::arch::VmInventory<Self::Executor, Self::Periphery>,
+        openvm_circuit::arch::VmInventoryError,
+    > {
+        let inventory = VmInventory::new();
+        // The discriminants must match the ones the transpiler puts into the
+        // phantom instructions; both sides take them from `HintsPhantom`.
+        builder.add_phantom_sub_executor(
+            executors::ReverseBytesSubEx,
+            PhantomDiscriminant(HintsPhantom::HintReverseBytes as u16),
+        )?;
+        builder.add_phantom_sub_executor(
+            executors::K256InverseFieldSubEx,
+            PhantomDiscriminant(HintsPhantom::HintK256InverseField as u16),
+        )?;
+        builder.add_phantom_sub_executor(
+            executors::K256InverseField10x26SubEx,
+            PhantomDiscriminant(HintsPhantom::HintK256InverseField10x26 as u16),
+        )?;
+        builder.add_phantom_sub_executor(
+            executors::K256SqrtField10x26SubEx,
+            PhantomDiscriminant(HintsPhantom::HintK256SqrtField10x26 as u16),
+        )?;
+        Ok(inventory)
+    }
+}
--- a/openvm/extensions/hints-guest/Cargo.toml
+++ b/openvm/extensions/hints-guest/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "powdr-openvm-hints-guest"
+version.workspace = true
+edition.workspace = true
+license.workspace = true
+homepage.workspace = true
+repository.workspace = true
+
+[target.'cfg(target_os = "zkvm")'.dependencies]
+openvm-platform = { workspace = true, features = ["rust-runtime"] }
+openvm-rv32im-guest.workspace = true
+openvm-custom-insn.workspace = true
+
+[dependencies]
+strum_macros = "0.27"
--- a/openvm/extensions/hints-guest/src/lib.rs
+++ b/openvm/extensions/hints-guest/src/lib.rs
@@ -0,0 +1,134 @@
+#![no_std]
+#[cfg(target_os = "zkvm")]
+use openvm_custom_insn; // needed for the hint_store_u32 macro
+use strum_macros::FromRepr;
+
+/// Major opcode of every hint instruction: `custom-2` as defined in the RISC-V spec.
+pub const OPCODE: u8 = 0x5b;
+/// `funct3` value shared by every hint instruction.
+pub const HINTS_FUNCT3: u8 = 0b000;
+
+/// `funct7` values that select the individual hint.
+///
+/// Only the first discriminant is written out; the remaining variants take the
+/// following values (1, 2, 3). `FromRepr` provides `from_repr` for decoding a
+/// raw `funct7` back into a variant.
+#[derive(Debug, Copy, Clone, PartialEq, Eq, FromRepr)]
+#[repr(u8)]
+pub enum HintsFunct7 {
+    ReverseBytes = 0,
+    K256InverseField,
+    K256InverseField10x26,
+    K256SqrtField10x26,
+}
+
+/// Emits the `ReverseBytes` hint instruction: a custom R-type instruction with
+/// `rs1` holding the `bytes` pointer and `rd`/`rs2` fixed to `x0`.
+#[cfg(target_os = "zkvm")]
+#[inline(always)]
+fn insn_reverse_bytes(bytes: *const u8) {
+    openvm_platform::custom_insn_r!(
+        opcode = OPCODE,
+        funct3 = HINTS_FUNCT3,
+        funct7 = HintsFunct7::ReverseBytes as u8,
+        rd = Const "x0",
+        rs1 = In bytes,
+        rs2 = Const "x0"
+    );
+}
+
+/// Emits the `K256InverseField` hint instruction: a custom R-type instruction
+/// with `rs1` holding the `bytes` pointer and `rd`/`rs2` fixed to `x0`.
+#[cfg(target_os = "zkvm")]
+#[inline(always)]
+fn insn_k256_inverse_field(bytes: *const u8) {
+    openvm_platform::custom_insn_r!(
+        opcode = OPCODE,
+        funct3 = HINTS_FUNCT3,
+        funct7 = HintsFunct7::K256InverseField as u8,
+        rd = Const "x0",
+        rs1 = In bytes,
+        rs2 = Const "x0"
+    );
+}
+
+/// Emits the `K256InverseField10x26` hint instruction: a custom R-type
+/// instruction with `rs1` holding the `bytes` pointer and `rd`/`rs2` fixed to `x0`.
+#[cfg(target_os = "zkvm")]
+#[inline(always)]
+fn insn_k256_inverse_field_10x26(bytes: *const u8) {
+    openvm_platform::custom_insn_r!(
+        opcode = OPCODE,
+        funct3 = HINTS_FUNCT3,
+        funct7 = HintsFunct7::K256InverseField10x26 as u8,
+        rd = Const "x0",
+        rs1 = In bytes,
+        rs2 = Const "x0",
+    );
+}
+
+/// Emits the `K256SqrtField10x26` hint instruction: a custom R-type
+/// instruction with `rs1` holding the `bytes` pointer and `rd`/`rs2` fixed to `x0`.
+#[cfg(target_os = "zkvm")]
+#[inline(always)]
+fn insn_k256_sqrt_field_10x26(bytes: *const u8) {
+    openvm_platform::custom_insn_r!(
+        opcode = OPCODE,
+        funct3 = HINTS_FUNCT3,
+        funct7 = HintsFunct7::K256SqrtField10x26 as u8,
+        rd = Const "x0",
+        rs1 = In bytes,
+        rs2 = Const "x0",
+    );
+}
+
+/// Example hint: returns `val` with its four bytes in reversed order.
+///
+/// On the zkVM target the hint instruction is issued with a pointer to `val`
+/// and the answer is read back with `hint_store_u32!`. On every other target
+/// the value is computed directly.
+pub fn hint_reverse_bytes(val: u32) -> u32 {
+    #[cfg(target_os = "zkvm")]
+    {
+        // The hint store writes through this pointer, so the slot has to be a
+        // mutable binding and the pointer has to come from `as_mut_ptr`:
+        // writing through `MaybeUninit::as_ptr` is undefined behavior.
+        let mut result = core::mem::MaybeUninit::<u32>::uninit();
+        insn_reverse_bytes(&val as *const u32 as *const u8);
+        // SAFETY: `result` is a valid, aligned slot for one `u32`; it is only
+        // read after `hint_store_u32!` has been issued for it.
+        unsafe {
+            openvm_rv32im_guest::hint_store_u32!(result.as_mut_ptr());
+            result.assume_init()
+        }
+    }
+    #[cfg(not(target_os = "zkvm"))]
+    {
+        val.swap_bytes()
+    }
+}
+
+/// Inverse of field element in SECP256k1 modulus (if not zero).
+///
+/// The caller is responsible for handling the zero input case, and the
+/// returned value is zero in that case.
+///
+/// NOTE(review): the length of `sec1_bytes` is not checked here; the host side
+/// presumably reads 32 bytes starting at the slice pointer - confirm.
+#[cfg(target_os = "zkvm")]
+pub fn hint_k256_inverse_field(sec1_bytes: &[u8]) -> [u8; 32] {
+    insn_k256_inverse_field(sec1_bytes.as_ptr());
+    // The hint buffer writes through this pointer, so the slot has to be a
+    // mutable binding and the pointer has to come from `as_mut_ptr`: writing
+    // through `MaybeUninit::as_ptr` is undefined behavior.
+    // NOTE(review): `[u8; 32]` only guarantees byte alignment while the hint is
+    // written in 32-bit words - confirm that no stricter alignment is needed.
+    let mut inverse = core::mem::MaybeUninit::<[u8; 32]>::uninit();
+    // SAFETY: 8 words cover exactly the 32 bytes of `inverse`, which is only
+    // read after `hint_buffer_u32!` has been issued for it.
+    unsafe {
+        openvm_rv32im_guest::hint_buffer_u32!(inverse.as_mut_ptr() as *mut u8, 8);
+        inverse.assume_init()
+    }
+}
+
+/// Inverse of field element in SECP256k1 modulus (if not zero).
+///
+/// Takes in the raw 32-bit architecture representation of the field element
+/// from k256 (`FieldElement10x26`) and returns the inverse in the same layout.
+/// The caller is responsible for handling the zero input case, and the
+/// returned value is undefined in that case.
+#[cfg(target_os = "zkvm")]
+pub fn hint_k256_inverse_field_10x26(elem: [u32; 10]) -> [u32; 10] {
+    insn_k256_inverse_field_10x26(elem.as_ptr() as *const u8);
+    // The hint buffer writes through this pointer, so the slot has to be a
+    // mutable binding and the pointer has to come from `as_mut_ptr`: writing
+    // through `MaybeUninit::as_ptr` is undefined behavior.
+    let mut inverse = core::mem::MaybeUninit::<[u32; 10]>::uninit();
+    // SAFETY: 10 words cover exactly the ten limbs of `inverse`, which is only
+    // read after `hint_buffer_u32!` has been issued for it.
+    unsafe {
+        openvm_rv32im_guest::hint_buffer_u32!(inverse.as_mut_ptr() as *mut u8, 10);
+        inverse.assume_init()
+    }
+}
+
+/// Square root of a field element in SECP256k1 modulus (if exists).
+///
+/// Takes and returns the raw ten-limb representation. The hint stream first
+/// carries one word saying whether a root exists (0 means no) and, only if it
+/// does, the ten limbs of the root. Returns `None` when the flag word is 0.
+#[cfg(target_os = "zkvm")]
+pub fn hint_k256_sqrt_field_10x26(elem: [u32; 10]) -> Option<[u32; 10]> {
+    insn_k256_sqrt_field_10x26(elem.as_ptr() as *const u8);
+    // read "boolean" result of whether the square root exists
+    // Both slots below are written through their pointers, so they have to be
+    // mutable bindings and the pointers have to come from `as_mut_ptr`: writing
+    // through `MaybeUninit::as_ptr` is undefined behavior.
+    let mut has_sqrt = core::mem::MaybeUninit::<u32>::uninit();
+    // SAFETY: `has_sqrt` is a valid, aligned slot for one `u32`; it is only
+    // read after `hint_store_u32!` has been issued for it.
+    unsafe {
+        openvm_rv32im_guest::hint_store_u32!(has_sqrt.as_mut_ptr());
+        if has_sqrt.assume_init() == 0 {
+            return None;
+        }
+    }
+    // read actual square root value
+    let mut sqrt = core::mem::MaybeUninit::<[u32; 10]>::uninit();
+    // SAFETY: 10 words cover exactly the ten limbs of `sqrt`, which is only
+    // read after `hint_buffer_u32!` has been issued for it.
+    unsafe {
+        openvm_rv32im_guest::hint_buffer_u32!(sqrt.as_mut_ptr() as *mut u8, 10);
+        Some(sqrt.assume_init())
+    }
+}
--- a/openvm/extensions/hints-transpiler/Cargo.toml
+++ b/openvm/extensions/hints-transpiler/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "powdr-openvm-hints-transpiler"
+version.workspace = true
+edition.workspace = true
+license.workspace = true
+homepage.workspace = true
+repository.workspace = true
+
+[dependencies]
+openvm-stark-backend = { workspace = true }
+openvm-instructions = { workspace = true }
+openvm-transpiler = { workspace = true }
+openvm-instructions-derive = { workspace = true }
+rrs-lib = "0.1.0"
+strum = { version = "0.27", features = ["derive"] }
+
+powdr-openvm-hints-guest = { workspace = true }
--- a/openvm/extensions/hints-transpiler/src/lib.rs
+++ b/openvm/extensions/hints-transpiler/src/lib.rs
@@ -0,0 +1,68 @@
+use openvm_instructions::{
+    instruction::Instruction, riscv::RV32_REGISTER_NUM_LIMBS, LocalOpcode, PhantomDiscriminant,
+};
+use openvm_instructions_derive::LocalOpcode;
+use openvm_stark_backend::p3_field::PrimeField32;
+use openvm_transpiler::{TranspilerExtension, TranspilerOutput};
+use powdr_openvm_hints_guest::{HintsFunct7, HINTS_FUNCT3, OPCODE};
+use rrs_lib::instruction_formats::RType;
+use strum::{EnumCount, EnumIter, FromRepr};
+
+/// Local opcode of the hints extension, placed at offset `0x800`.
+///
+/// NOTE(review): nothing in this file refers to `HintsOpcode`; the transpiler
+/// below only emits phantom instructions - confirm whether it is still needed.
+#[derive(
+    Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, EnumCount, EnumIter, FromRepr, LocalOpcode,
+)]
+#[opcode_offset = 0x800]
+#[repr(usize)]
+pub enum HintsOpcode {
+    HINTS,
+}
+
+/// Phantom discriminants of the hint instructions, one per `HintsFunct7` variant.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, FromRepr)]
+#[repr(u16)]
+pub enum HintsPhantom {
+    // It is unclear whether there is a "proper" way to avoid conflicts between
+    // these numbers. The values were chosen by looking at the OpenVM code and
+    // taking the next range that did not appear to be in use.
+    HintReverseBytes = 0x60,
+    HintK256InverseField = 0x61,
+    HintK256InverseField10x26 = 0x62,
+    HintK256SqrtField10x26 = 0x63,
+}
+
+/// Transpiler extension that turns the custom hint instructions (opcode
+/// `OPCODE`, `funct3 == HINTS_FUNCT3`) into phantom instructions.
+#[derive(Default)]
+pub struct HintsTranspilerExtension;
+
+/// Decoding of the custom hint instructions into phantom instructions.
+impl<F: PrimeField32> TranspilerExtension<F> for HintsTranspilerExtension {
+    /// Tries to transpile the first word of `instruction_stream`.
+    ///
+    /// Returns `None` when the stream is empty, or when the word does not use
+    /// `OPCODE`, `HINTS_FUNCT3`, or a `funct7` known to `HintsFunct7`.
+    /// Otherwise returns a single phantom instruction with the matching
+    /// `HintsPhantom` discriminant.
+    fn process_custom(&self, instruction_stream: &[u32]) -> Option<TranspilerOutput<F>> {
+        let word = *instruction_stream.first()?;
+
+        // The major opcode lives in the low seven bits of the word.
+        if (word & 0x7f) as u8 != OPCODE {
+            return None;
+        }
+
+        let rtype = RType::new(word);
+        if rtype.funct3 as u8 != HINTS_FUNCT3 {
+            return None;
+        }
+
+        let discriminant = match HintsFunct7::from_repr(rtype.funct7 as u8)? {
+            HintsFunct7::ReverseBytes => HintsPhantom::HintReverseBytes,
+            HintsFunct7::K256InverseField => HintsPhantom::HintK256InverseField,
+            HintsFunct7::K256InverseField10x26 => HintsPhantom::HintK256InverseField10x26,
+            HintsFunct7::K256SqrtField10x26 => HintsPhantom::HintK256SqrtField10x26,
+        };
+
+        // First operand: the `rs1` register index scaled by the number of
+        // limbs per register.
+        let rs1_operand = F::from_canonical_usize(RV32_REGISTER_NUM_LIMBS * rtype.rs1);
+
+        Some(TranspilerOutput::one_to_one(Instruction::phantom(
+            PhantomDiscriminant(discriminant as u16),
+            rs1_operand,
+            F::ZERO,
+            0,
+        )))
+    }
+}
--- a/openvm/guest-hints-test/Cargo.toml
+++ b/openvm/guest-hints-test/Cargo.toml
@@ -0,0 +1,15 @@
+[workspace]
+[package]
+name = "powdr-openvm-guest-hints-test"
+version = "0.0.0"
+edition = "2021"
+
+[dependencies]
+# The `rev` here must point to the same version used in the workspace.
+# Otherwise, there is conflict with the `powdr-openvm-hints-guest` dependency (which is part of the workspace).
+openvm = { git = "https://github.com/powdr-labs/openvm.git", rev = "391b737" }
+powdr-openvm-hints-guest = { path = "../extensions/hints-guest/" }
+
+[profile.release-with-debug]
+inherits = "release"
+debug = true
--- a/openvm/guest-hints-test/src/main.rs
+++ b/openvm/guest-hints-test/src/main.rs
@@ -0,0 +1,10 @@
+#![cfg_attr(target_os = "zkvm", no_main)]
+#![cfg_attr(target_os = "zkvm", no_std)]
+
+openvm::entry!(main);
+use powdr_openvm_hints_guest::hint_reverse_bytes;
+
+/// Guest entry point: asks for the byte-reversal hint of a known constant and
+/// asserts that the expected reversed value comes back.
+pub fn main() {
+    let res = hint_reverse_bytes(0x11223344);
+    assert_eq!(res, 0x44332211);
+}
--- a/openvm/src/customize_exe.rs
+++ b/openvm/src/customize_exe.rs
@@ -143,17 +143,14 @@ impl<'a, F: PrimeField32> Program<Instr<F>> for Prog<'a, F> {
 }

 pub fn customize(
-    OriginalCompiledProgram {
-        mut exe,
-        sdk_vm_config,
-    }: OriginalCompiledProgram,
+    OriginalCompiledProgram { mut exe, vm_config }: OriginalCompiledProgram,
    labels: &BTreeSet<u32>,
    debug_info: &DebugInfo,
    config: PowdrConfig,
    implementation: PrecompileImplementation,
    pgo_config: PgoConfig,
 ) -> CompiledProgram {
-    let original_config = OriginalVmConfig::new(sdk_vm_config.clone());
+    let original_config = OriginalVmConfig::new(vm_config.clone());
    let airs = original_config.airs().expect("Failed to convert the AIR of an OpenVM instruction, even after filtering by the blacklist!");
    let bus_map = original_config.bus_map();

--- a/openvm/src/extraction_utils.rs
+++ b/openvm/src/extraction_utils.rs
@@ -5,12 +5,14 @@ use crate::air_builder::AirKeygenBuilder;
 use crate::bus_map::{BusMap, OpenVmBusType};
 use crate::opcode::branch_opcodes_set;
 use crate::{opcode::instruction_allowlist, BabyBearSC, SpecializedConfig};
-use crate::{AirMetrics, Instr, SpecializedExecutor, APP_LOG_BLOWUP};
+use crate::{
+    AirMetrics, ExtendedVmConfig, ExtendedVmConfigExecutor, ExtendedVmConfigPeriphery, Instr,
+    SpecializedExecutor, APP_LOG_BLOWUP,
+};
 use openvm_circuit::arch::{VmChipComplex, VmConfig, VmInventoryError};
 use openvm_circuit_primitives::bitwise_op_lookup::SharedBitwiseOperationLookupChip;
 use openvm_circuit_primitives::range_tuple::SharedRangeTupleCheckerChip;
 use openvm_instructions::VmOpcode;
-use openvm_sdk::config::{SdkVmConfig, SdkVmConfigExecutor, SdkVmConfigPeriphery};
 use openvm_stark_backend::air_builders::symbolic::SymbolicRapBuilder;
 use openvm_stark_backend::interaction::fri_log_up::find_interaction_chunks;
 use openvm_stark_backend::{
@@ -109,8 +111,13 @@ fn to_option<T>(mut v: Vec<T>) -> Option<T> {
 }

 /// A lazy chip complex that is initialized on the first access
-type LazyChipComplex =
-    Option<VmChipComplex<BabyBear, SdkVmConfigExecutor<BabyBear>, SdkVmConfigPeriphery<BabyBear>>>;
+type LazyChipComplex = Option<
+    VmChipComplex<
+        BabyBear,
+        ExtendedVmConfigExecutor<BabyBear>,
+        ExtendedVmConfigPeriphery<BabyBear>,
+    >,
+>;

 /// A shared and mutable reference to a `LazyChipComplex`.
 type CachedChipComplex = Arc<Mutex<LazyChipComplex>>;
@@ -121,8 +128,11 @@ pub struct ChipComplexGuard<'a> {
 }

 impl<'a> Deref for ChipComplexGuard<'a> {
-    type Target =
-        VmChipComplex<BabyBear, SdkVmConfigExecutor<BabyBear>, SdkVmConfigPeriphery<BabyBear>>;
+    type Target = VmChipComplex<
+        BabyBear,
+        ExtendedVmConfigExecutor<BabyBear>,
+        ExtendedVmConfigPeriphery<BabyBear>,
+    >;

    fn deref(&self) -> &Self::Target {
        // Unwrap is safe here because we ensure that the chip complex is initialized
@@ -132,27 +142,27 @@ impl<'a> Deref for ChipComplexGuard<'a> {
    }
 }

-/// A wrapper around the `SdkVmConfig` that caches a chip complex.
+/// A wrapper around the `ExtendedVmConfig` that caches a chip complex.
 #[derive(Serialize, Deserialize, Clone)]
 pub struct OriginalVmConfig {
-    sdk_config: SdkVmConfig,
+    sdk_config: ExtendedVmConfig,
    #[serde(skip)]
    chip_complex: CachedChipComplex,
 }

 impl OriginalVmConfig {
-    pub fn new(sdk_config: SdkVmConfig) -> Self {
+    pub fn new(sdk_config: ExtendedVmConfig) -> Self {
        Self {
            sdk_config,
            chip_complex: Default::default(),
        }
    }

-    pub fn config(&self) -> &SdkVmConfig {
+    pub fn config(&self) -> &ExtendedVmConfig {
        &self.sdk_config
    }

-    pub fn config_mut(&mut self) -> &mut SdkVmConfig {
+    pub fn config_mut(&mut self) -> &mut ExtendedVmConfig {
        let mut guard = self.chip_complex.lock().expect("Mutex poisoned");
        *guard = None; // Invalidate cache
        &mut self.sdk_config
@@ -268,7 +278,11 @@ impl OriginalVmConfig {
    pub fn create_chip_complex(
        &self,
    ) -> Result<
-        VmChipComplex<BabyBear, SdkVmConfigExecutor<BabyBear>, SdkVmConfigPeriphery<BabyBear>>,
+        VmChipComplex<
+            BabyBear,
+            ExtendedVmConfigExecutor<BabyBear>,
+            ExtendedVmConfigPeriphery<BabyBear>,
+        >,
        VmInventoryError,
    > {
        // Clear the cache
@@ -490,7 +504,7 @@ mod tests {
    use openvm_ecc_circuit::{WeierstrassExtension, SECP256K1_CONFIG};
    use openvm_pairing_circuit::{PairingCurve, PairingExtension};
    use openvm_rv32im_circuit::Rv32M;
-    use openvm_sdk::config::SdkSystemConfig;
+    use openvm_sdk::config::{SdkSystemConfig, SdkVmConfig};

    #[test]
    fn test_get_bus_map() {
@@ -524,7 +538,7 @@ mod tests {
            supported_curves.push(bls_config.clone());
            supported_pairing_curves.push(PairingCurve::Bls12_381);
        }
-        let vm_config = SdkVmConfig::builder()
+        let sdk_vm_config = SdkVmConfig::builder()
            .system(system_config.into())
            .rv32i(Default::default())
            .rv32m(rv32m)
@@ -538,17 +552,18 @@ mod tests {
            .pairing(PairingExtension::new(supported_pairing_curves))
            .build();

-        let _ = OriginalVmConfig::new(vm_config).bus_map();
+        let _ = OriginalVmConfig::new(ExtendedVmConfig { sdk_vm_config }).bus_map();
    }

    #[test]
    fn test_export_pil() {
        let writer = &mut Vec::new();
-        let base_config = OriginalVmConfig::new(
-            SdkVmConfig::builder()
+        let ext_config = ExtendedVmConfig {
+            sdk_vm_config: SdkVmConfig::builder()
                .system(SdkSystemConfig::default())
                .build(),
-        );
+        };
+        let base_config = OriginalVmConfig::new(ext_config);
        let specialized_config = SpecializedConfig::new(
            base_config,
            vec![],
--- a/openvm/src/lib.rs
+++ b/openvm/src/lib.rs
@@ -27,6 +27,8 @@ use openvm_stark_sdk::openvm_stark_backend::p3_field::PrimeField32;
 use openvm_stark_sdk::p3_baby_bear::BabyBear;
 use powdr_autoprecompiles::{execution_profile::execution_profile, PowdrConfig};
 use powdr_extension::{PowdrExecutor, PowdrExtension, PowdrPeriphery};
+use powdr_openvm_hints_circuit::{HintsExecutor, HintsExtension, HintsPeriphery};
+use powdr_openvm_hints_transpiler::HintsTranspilerExtension;
 use serde::{Deserialize, Serialize};
 use std::cmp::Reverse;
 use std::fs::File;
@@ -145,7 +147,7 @@ impl InitFileGenerator for SpecializedConfig {
 #[derive(ChipUsageGetter, From, AnyEnum, InstructionExecutor, Chip)]
 pub enum SpecializedExecutor<F: PrimeField32> {
    #[any_enum]
-    SdkExecutor(SdkVmConfigExecutor<F>),
+    SdkExecutor(ExtendedVmConfigExecutor<F>),
    #[any_enum]
    PowdrExecutor(PowdrExecutor<F>),
 }
@@ -153,7 +155,7 @@ pub enum SpecializedExecutor<F: PrimeField32> {
 #[derive(From, ChipUsageGetter, Chip, AnyEnum)]
 pub enum MyPeriphery<F: PrimeField32> {
    #[any_enum]
-    SdkPeriphery(SdkVmConfigPeriphery<F>),
+    SdkPeriphery(ExtendedVmConfigPeriphery<F>),
    #[any_enum]
    PowdrPeriphery(PowdrPeriphery<F>),
 }
@@ -263,10 +265,17 @@ pub fn compile_openvm(
        Default::default(),
    )?;

-    // Transpile the ELF into a VmExe. Note that this happens using the sdk transpiler only, our extension does not use a transpiler.
-    let exe = sdk.transpile(elf, sdk_vm_config.transpiler())?;
+    // Transpile the ELF into a VmExe.
+    let mut transpiler = sdk_vm_config.transpiler();

-    Ok(OriginalCompiledProgram { exe, sdk_vm_config })
+    // Add our custom transpiler extensions
+    transpiler = transpiler.with_extension(HintsTranspilerExtension {});
+
+    let exe = sdk.transpile(elf, transpiler)?;
+
+    let vm_config = ExtendedVmConfig { sdk_vm_config };
+
+    Ok(OriginalCompiledProgram { exe, vm_config })
 }

 /// Determines how the precompile (a circuit with algebraic gates and bus interactions)
@@ -409,9 +418,72 @@ pub struct CompiledProgram {
 #[derive(Clone)]
 pub struct OriginalCompiledProgram {
    pub exe: VmExe<BabyBear>,
+    pub vm_config: ExtendedVmConfig,
+}
+
+/// An `SdkVmConfig` plus custom OpenVM extensions, before autoprecompile transformations.
+/// For now, the only custom extension is the hints extension.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct ExtendedVmConfig {
    pub sdk_vm_config: SdkVmConfig,
 }

+/// `VmConfig` implementation that layers the hints extension on top of the wrapped SDK config.
+impl VmConfig<BabyBear> for ExtendedVmConfig {
+    type Executor = ExtendedVmConfigExecutor<BabyBear>;
+    type Periphery = ExtendedVmConfigPeriphery<BabyBear>;
+
+    /// Returns the system config stored inside the wrapped `sdk_vm_config`.
+    fn system(&self) -> &SystemConfig {
+        &self.sdk_vm_config.system.config
+    }
+
+    /// Mutable counterpart of `system`: borrows the same nested system config mutably.
+    fn system_mut(&mut self) -> &mut SystemConfig {
+        &mut self.sdk_vm_config.system.config
+    }
+
+    /// Builds the chip complex of the wrapped SDK config and extends it with `HintsExtension`.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the `VmInventoryError` of either the SDK chip-complex creation or the
+    /// extension step.
+    fn create_chip_complex(
+        &self,
+    ) -> std::result::Result<
+        VmChipComplex<BabyBear, Self::Executor, Self::Periphery>,
+        VmInventoryError,
+    > {
+        // NOTE(review): `transmute()` carries no type annotation; presumably its target type is
+        // inferred through the reassignment below and the return type — confirm before refactoring.
+        let mut complex = self.sdk_vm_config.create_chip_complex()?.transmute();
+        // Add our custom extension on top of the SDK chips.
+        complex = complex.extend(&HintsExtension)?;
+        Ok(complex)
+    }
+}
+
+/// Init-file generation is delegated unchanged to the wrapped SDK config.
+impl InitFileGenerator for ExtendedVmConfig {
+    /// Forwards to `generate_init_file_contents` of the wrapped `sdk_vm_config`.
+    fn generate_init_file_contents(&self) -> Option<String> {
+        let inner = &self.sdk_vm_config;
+        inner.generate_init_file_contents()
+    }
+
+    /// Forwards to `write_to_init_file` of the wrapped `sdk_vm_config` with the same arguments.
+    fn write_to_init_file(
+        &self,
+        manifest_dir: &Path,
+        init_file_name: Option<&str>,
+    ) -> eyre::Result<()> {
+        let inner = &self.sdk_vm_config;
+        inner.write_to_init_file(manifest_dir, init_file_name)
+    }
+}
+
+/// Executor type of `ExtendedVmConfig`: either an executor of the wrapped SDK config or one
+/// of the hints extension.
+#[derive(ChipUsageGetter, Chip, InstructionExecutor, From, AnyEnum)]
+// NOTE(review): presumably allowed because the `Sdk` variant is much larger than `Hints` — confirm.
+#[allow(clippy::large_enum_variant)]
+pub enum ExtendedVmConfigExecutor<F: PrimeField32> {
+    #[any_enum]
+    Sdk(SdkVmConfigExecutor<F>),
+    #[any_enum]
+    Hints(HintsExecutor<F>),
+}
+
+/// Periphery type of `ExtendedVmConfig`: either a periphery chip of the wrapped SDK config or
+/// one of the hints extension.
+#[derive(From, ChipUsageGetter, Chip, AnyEnum)]
+pub enum ExtendedVmConfigPeriphery<F: PrimeField32> {
+    #[any_enum]
+    Sdk(SdkVmConfigPeriphery<F>),
+    #[any_enum]
+    Hints(HintsPeriphery<F>),
+}
+
 #[derive(Clone, Serialize, Deserialize, Default, Debug, Eq, PartialEq)]
 pub struct AirMetrics {
    pub widths: AirWidths,
@@ -520,6 +592,7 @@ pub fn prove(
        vm_config
            .sdk_config
            .config_mut()
+            .sdk_vm_config
            .system
            .config
            .segmentation_strategy = Arc::new(
@@ -613,14 +686,14 @@ pub fn execution_profile_from_guest(
    guest_opts: GuestOptions,
    inputs: StdIn,
 ) -> HashMap<u64, u32> {
-    let OriginalCompiledProgram { exe, sdk_vm_config } = compile_openvm(guest, guest_opts).unwrap();
+    let OriginalCompiledProgram { exe, vm_config } = compile_openvm(guest, guest_opts).unwrap();
    let program = Prog::from(&exe.program);

    // prepare for execute
    let sdk = Sdk::default();

    execution_profile::<BabyBearOpenVmApcAdapter>(&program, || {
-        sdk.execute(exe.clone(), sdk_vm_config.clone(), inputs.clone())
+        sdk.execute(exe.clone(), vm_config.clone(), inputs.clone())
            .unwrap();
    })
 }
@@ -740,6 +813,8 @@ mod tests {
    const GUEST_SHA256_APC_PGO_LARGE: u64 = 50;
    const GUEST_SHA256_SKIP: u64 = 0;

+    const GUEST_HINTS_TEST: &str = "guest-hints-test";
+
    #[test]
    fn guest_prove_simple() {
        let mut stdin = StdIn::default();
@@ -1168,6 +1243,23 @@ mod tests {
        );
    }

+    /// Checks that the hints test guest compiles and proves successfully.
+    #[test]
+    fn hints_test_prove() {
+        let mut stdin = StdIn::default();
+        // NOTE(review): this feeds the guest *name* to the guest as input; confirm the hints
+        // guest actually reads it, otherwise the write can be dropped.
+        stdin.write(&GUEST_HINTS_TEST);
+        let config = default_powdr_openvm_config(0, 0);
+
+        // The guest under test is the hints guest, so that the custom hint instructions are
+        // exercised end to end (transpiler, executor and prover).
+        prove_simple(
+            GUEST_HINTS_TEST,
+            config,
+            PrecompileImplementation::SingleRowChip,
+            stdin,
+            PgoConfig::None,
+            None,
+        );
+    }
+
    // #[test]
    // #[ignore = "Too much RAM"]
    // // TODO: This test currently panics because the kzg params are not set up correctly. Fix this.
--- a/openvm/src/powdr_extension/chip.rs
+++ b/openvm/src/powdr_extension/chip.rs
@@ -7,7 +7,7 @@ use std::{

 use crate::{
    extraction_utils::OriginalAirs, powdr_extension::executor::PowdrPeripheryInstances,
-    utils::algebraic_to_symbolic,
+    utils::algebraic_to_symbolic, ExtendedVmConfig,
 };

 use super::{executor::PowdrExecutor, opcode::PowdrOpcode, PowdrPrecompile};
@@ -18,7 +18,6 @@ use openvm_circuit::{
    system::memory::OfflineMemory,
 };
 use openvm_instructions::{instruction::Instruction, LocalOpcode};
-use openvm_sdk::config::SdkVmConfig;
 use openvm_stark_backend::{
    air_builders::symbolic::{
        symbolic_expression::{SymbolicEvaluator, SymbolicExpression},
@@ -54,7 +53,7 @@ impl<F: PrimeField32> PowdrChip<F> {
        precompile: PowdrPrecompile<F>,
        original_airs: OriginalAirs<F>,
        memory: Arc<Mutex<OfflineMemory<F>>>,
-        base_config: SdkVmConfig,
+        base_config: ExtendedVmConfig,
        periphery: PowdrPeripheryInstances,
    ) -> Self {
        let PowdrPrecompile {
--- a/openvm/src/powdr_extension/executor/inventory.rs
+++ b/openvm/src/powdr_extension/executor/inventory.rs
@@ -8,9 +8,10 @@ use openvm_circuit_primitives::{
    bitwise_op_lookup::SharedBitwiseOperationLookupChip, range_tuple::SharedRangeTupleCheckerChip,
    var_range::SharedVariableRangeCheckerChip, Chip, ChipUsageGetter,
 };
-use openvm_sdk::config::{SdkVmConfigExecutor, SdkVmConfigPeriphery};
 use openvm_stark_backend::p3_field::PrimeField32;

+use crate::{ExtendedVmConfigExecutor, ExtendedVmConfigPeriphery};
+
 /// A dummy inventory used for execution of autoprecompiles
 /// It extends the `SdkVmConfigExecutor` and `SdkVmConfigPeriphery`, providing them with shared, pre-loaded periphery chips to avoid memory allocations by each SDK chip
 pub type DummyInventory<F> = VmInventory<DummyExecutor<F>, DummyPeriphery<F>>;
@@ -20,7 +21,7 @@ pub type DummyChipComplex<F> = VmChipComplex<F, DummyExecutor<F>, DummyPeriphery
 #[derive(ChipUsageGetter, Chip, InstructionExecutor, AnyEnum, From)]
 pub enum DummyExecutor<F: PrimeField32> {
    #[any_enum]
-    Sdk(SdkVmConfigExecutor<F>),
+    Sdk(ExtendedVmConfigExecutor<F>),
    #[any_enum]
    Shared(SharedExecutor<F>),
    #[any_enum]
@@ -32,7 +33,7 @@ pub enum DummyExecutor<F: PrimeField32> {
 #[derive(ChipUsageGetter, Chip, AnyEnum, From)]
 pub enum DummyPeriphery<F: PrimeField32> {
    #[any_enum]
-    Sdk(SdkVmConfigPeriphery<F>),
+    Sdk(ExtendedVmConfigPeriphery<F>),
    #[any_enum]
    Shared(SharedPeriphery<F>),
    #[any_enum]
@@ -75,6 +76,11 @@ mod from_implementations {
        Rv32MPeriphery,
    };
    use openvm_sha256_circuit::{Sha256Executor, Sha256Periphery};
+    use powdr_openvm_hints_circuit::HintsExecutor;
+    use powdr_openvm_hints_circuit::HintsPeriphery;
+
+    use crate::ExtendedVmConfigExecutor;
+    use crate::ExtendedVmConfigPeriphery;

    /// Defines `From<T> for DummyExecutor` and `From<T> for DummyPeriphery`
    /// by mapping to the appropriate `SdkVmConfigExecutor` and `SdkVmConfigPeriphery` variant.
@@ -84,19 +90,31 @@ mod from_implementations {
            $(
                impl<F: PrimeField32> From<$executor_ty> for DummyExecutor<F> {
                    fn from(executor: $executor_ty) -> Self {
-                        DummyExecutor::Sdk(SdkVmConfigExecutor::$variant(executor))
+                        DummyExecutor::Sdk(ExtendedVmConfigExecutor::Sdk(SdkVmConfigExecutor::$variant(executor)))
                    }
                }

                impl<F: PrimeField32> From<$periphery_ty> for DummyPeriphery<F> {
                    fn from(periphery: $periphery_ty) -> Self {
-                        DummyPeriphery::Sdk(SdkVmConfigPeriphery::$variant(periphery))
+                        DummyPeriphery::Sdk(ExtendedVmConfigPeriphery::Sdk(SdkVmConfigPeriphery::$variant(periphery)))
                    }
                }
            )*
        };
    }

+    /// Wraps a hints executor as `DummyExecutor::Sdk(ExtendedVmConfigExecutor::Hints(..))`.
+    impl<F: PrimeField32> From<HintsExecutor<F>> for DummyExecutor<F> {
+        fn from(hints_executor: HintsExecutor<F>) -> Self {
+            let extended = ExtendedVmConfigExecutor::Hints(hints_executor);
+            Self::Sdk(extended)
+        }
+    }
+
+    /// Wraps a hints periphery as `DummyPeriphery::Sdk(ExtendedVmConfigPeriphery::Hints(..))`.
+    impl<F: PrimeField32> From<HintsPeriphery<F>> for DummyPeriphery<F> {
+        fn from(periphery: HintsPeriphery<F>) -> Self {
+            DummyPeriphery::Sdk(ExtendedVmConfigPeriphery::Hints(periphery))
+        }
+    }
+
    impl_zero_cost_conversions!(
        (Rv32i, Rv32IExecutor<F>, Rv32IPeriphery<F>),
        (Io, Rv32IoExecutor<F>, Rv32IoPeriphery<F>),
--- a/openvm/src/powdr_extension/executor/mod.rs
+++ b/openvm/src/powdr_extension/executor/mod.rs
@@ -9,7 +9,7 @@ use crate::{
        inventory::{DummyChipComplex, DummyInventory},
        periphery::SharedPeripheryChips,
    },
-    Instr,
+    ExtendedVmConfig, Instr,
 };

 use super::{
@@ -28,7 +28,6 @@ use openvm_circuit::{
    },
 };
 use openvm_native_circuit::CastFExtension;
-use openvm_sdk::config::SdkVmConfig;
 use openvm_stark_backend::{
    p3_field::FieldAlgebra, p3_matrix::Matrix, p3_maybe_rayon::prelude::ParallelIterator,
 };
@@ -55,6 +54,7 @@ mod inventory;
 mod periphery;

 pub use periphery::PowdrPeripheryInstances;
+use powdr_openvm_hints_circuit::HintsExtension;

 /// A struct which holds the state of the execution based on the original instructions in this block and a dummy inventory.
 pub struct PowdrExecutor<F: PrimeField32> {
@@ -72,7 +72,7 @@ impl<F: PrimeField32> PowdrExecutor<F> {
        air_by_opcode_id: OriginalAirs<F>,
        is_valid_column: AlgebraicReference,
        memory: Arc<Mutex<OfflineMemory<F>>>,
-        base_config: SdkVmConfig,
+        base_config: ExtendedVmConfig,
        periphery: PowdrPeripheryInstances,
    ) -> Self {
        Self {
@@ -388,7 +388,7 @@ fn global_index<F>(
 fn create_chip_complex_with_memory<F: PrimeField32>(
    memory: Arc<Mutex<OfflineMemory<F>>>,
    shared_chips: SharedPeripheryChips,
-    base_config: SdkVmConfig,
+    base_config: ExtendedVmConfig,
 ) -> std::result::Result<DummyChipComplex<F>, VmInventoryError> {
    use openvm_keccak256_circuit::Keccak256;
    use openvm_native_circuit::Native;
@@ -396,7 +396,12 @@ fn create_chip_complex_with_memory<F: PrimeField32>(
    use openvm_sha256_circuit::Sha256;

    let this = base_config;
-    let mut complex: DummyChipComplex<F> = this.system.config.create_chip_complex()?.transmute();
+    let mut complex: DummyChipComplex<F> = this
+        .sdk_vm_config
+        .system
+        .config
+        .create_chip_complex()?
+        .transmute();

    // CHANGE: inject the correct memory here to be passed to the chips, to be accessible in their get_proof_input
    complex.base.memory_controller.offline_memory = memory.clone();
@@ -407,28 +412,28 @@ fn create_chip_complex_with_memory<F: PrimeField32>(
    complex = complex.extend(&shared_chips)?;
    // END CHANGE

-    if this.rv32i.is_some() {
+    if this.sdk_vm_config.rv32i.is_some() {
        complex = complex.extend(&Rv32I)?;
    }
-    if this.io.is_some() {
+    if this.sdk_vm_config.io.is_some() {
        complex = complex.extend(&Rv32Io)?;
    }
-    if this.keccak.is_some() {
+    if this.sdk_vm_config.keccak.is_some() {
        complex = complex.extend(&Keccak256)?;
    }
-    if this.sha256.is_some() {
+    if this.sdk_vm_config.sha256.is_some() {
        complex = complex.extend(&Sha256)?;
    }
-    if this.native.is_some() {
+    if this.sdk_vm_config.native.is_some() {
        complex = complex.extend(&Native)?;
    }
-    if this.castf.is_some() {
+    if this.sdk_vm_config.castf.is_some() {
        complex = complex.extend(&CastFExtension)?;
    }

-    if let Some(rv32m) = this.rv32m {
+    if let Some(rv32m) = this.sdk_vm_config.rv32m {
        let mut rv32m = rv32m;
-        if let Some(ref bigint) = this.bigint {
+        if let Some(ref bigint) = this.sdk_vm_config.bigint {
            rv32m.range_tuple_checker_sizes[0] =
                rv32m.range_tuple_checker_sizes[0].max(bigint.range_tuple_checker_sizes[0]);
            rv32m.range_tuple_checker_sizes[1] =
@@ -436,9 +441,9 @@ fn create_chip_complex_with_memory<F: PrimeField32>(
        }
        complex = complex.extend(&rv32m)?;
    }
-    if let Some(bigint) = this.bigint {
+    if let Some(bigint) = this.sdk_vm_config.bigint {
        let mut bigint = bigint;
-        if let Some(ref rv32m) = this.rv32m {
+        if let Some(ref rv32m) = this.sdk_vm_config.rv32m {
            bigint.range_tuple_checker_sizes[0] =
                rv32m.range_tuple_checker_sizes[0].max(bigint.range_tuple_checker_sizes[0]);
            bigint.range_tuple_checker_sizes[1] =
@@ -446,18 +451,21 @@ fn create_chip_complex_with_memory<F: PrimeField32>(
        }
        complex = complex.extend(&bigint)?;
    }
-    if let Some(ref modular) = this.modular {
+    if let Some(ref modular) = this.sdk_vm_config.modular {
        complex = complex.extend(modular)?;
    }
-    if let Some(ref fp2) = this.fp2 {
+    if let Some(ref fp2) = this.sdk_vm_config.fp2 {
        complex = complex.extend(fp2)?;
    }
-    if let Some(ref pairing) = this.pairing {
+    if let Some(ref pairing) = this.sdk_vm_config.pairing {
        complex = complex.extend(pairing)?;
    }
-    if let Some(ref ecc) = this.ecc {
+    if let Some(ref ecc) = this.sdk_vm_config.ecc {
        complex = complex.extend(ecc)?;
    }

+    // add custom extensions
+    complex = complex.extend(&HintsExtension)?;
+
    Ok(complex)
 }
--- a/openvm/src/powdr_extension/plonk/chip.rs
+++ b/openvm/src/powdr_extension/plonk/chip.rs
@@ -11,6 +11,7 @@ use crate::powdr_extension::plonk::air::PlonkColumns;
 use crate::powdr_extension::plonk::copy_constraint::generate_permutation_columns;
 use crate::powdr_extension::PowdrOpcode;
 use crate::powdr_extension::PowdrPrecompile;
+use crate::ExtendedVmConfig;
 use itertools::Itertools;
 use openvm_circuit::utils::next_power_of_two_or_zero;
 use openvm_circuit::{
@@ -19,7 +20,6 @@ use openvm_circuit::{
 };
 use openvm_instructions::instruction::Instruction;
 use openvm_instructions::LocalOpcode;
-use openvm_sdk::config::SdkVmConfig;
 use openvm_stark_backend::p3_air::BaseAir;
 use openvm_stark_backend::p3_field::FieldAlgebra;
 use openvm_stark_backend::p3_matrix::dense::RowMajorMatrix;
@@ -51,7 +51,7 @@ impl<F: PrimeField32> PlonkChip<F> {
        precompile: PowdrPrecompile<F>,
        original_airs: OriginalAirs<F>,
        memory: Arc<Mutex<OfflineMemory<F>>>,
-        base_config: SdkVmConfig,
+        base_config: ExtendedVmConfig,
        periphery: PowdrPeripheryInstances,
        bus_map: BusMap,
        copy_constraint_bus_id: u16,
--- a/openvm/src/powdr_extension/vm.rs
+++ b/openvm/src/powdr_extension/vm.rs
@@ -22,7 +22,6 @@ use openvm_circuit_primitives::range_tuple::SharedRangeTupleCheckerChip;
 use openvm_circuit_primitives::var_range::SharedVariableRangeCheckerChip;
 use openvm_instructions::VmOpcode;
 use openvm_instructions::{instruction::Instruction, LocalOpcode};
-use openvm_sdk::config::{SdkVmConfig, SdkVmConfigPeriphery};
 use openvm_stark_backend::{
    p3_field::{Field, PrimeField32},
    ChipUsageGetter,
@@ -30,7 +29,7 @@ use openvm_stark_backend::{
 use powdr_autoprecompiles::SymbolicMachine;
 use serde::{Deserialize, Serialize};

-use crate::PrecompileImplementation;
+use crate::{ExtendedVmConfig, ExtendedVmConfigPeriphery, PrecompileImplementation};

 use super::plonk::chip::PlonkChip;
 use super::{chip::PowdrChip, PowdrOpcode};
@@ -39,7 +38,7 @@ use super::{chip::PowdrChip, PowdrOpcode};
 #[serde(bound = "F: Field")]
 pub struct PowdrExtension<F> {
    pub precompiles: Vec<PowdrPrecompile<F>>,
-    pub base_config: SdkVmConfig,
+    pub base_config: ExtendedVmConfig,
    pub implementation: PrecompileImplementation,
    pub bus_map: BusMap,
    pub airs: OriginalAirs<F>,
@@ -102,7 +101,7 @@ impl<F> PowdrPrecompile<F> {
 impl<F> PowdrExtension<F> {
    pub fn new(
        precompiles: Vec<PowdrPrecompile<F>>,
-        base_config: SdkVmConfig,
+        base_config: ExtendedVmConfig,
        implementation: PrecompileImplementation,
        bus_map: BusMap,
        airs: OriginalAirs<F>,
@@ -135,7 +134,7 @@ impl<F: PrimeField32> PowdrExecutor<F> {

 #[derive(From, ChipUsageGetter, Chip, AnyEnum)]
 pub enum PowdrPeriphery<F: PrimeField32> {
-    Sdk(SdkVmConfigPeriphery<F>),
+    Sdk(ExtendedVmConfigPeriphery<F>),
    Phantom(PhantomChip<F>),
 }

--- a/openvm/tests/apc_builder.rs
+++ b/openvm/tests/apc_builder.rs
@@ -8,6 +8,7 @@ use powdr_openvm::bus_interaction_handler::OpenVmBusInteractionHandler;
 use powdr_openvm::extraction_utils::OriginalVmConfig;
 use powdr_openvm::instruction_formatter::openvm_instruction_formatter;
 use powdr_openvm::BabyBearOpenVmApcAdapter;
+use powdr_openvm::ExtendedVmConfig;
 use powdr_openvm::Instr;
 use powdr_openvm::{bus_map::default_openvm_bus_map, OPENVM_DEGREE_BOUND};
 use pretty_assertions::assert_eq;
@@ -23,7 +24,9 @@ fn compile(basic_block: Vec<Instruction<BabyBear>>) -> String {
        .io(Default::default())
        .build();

-    let original_config = OriginalVmConfig::new(sdk_vm_config);
+    let ext_vm_config = ExtendedVmConfig { sdk_vm_config };
+
+    let original_config = OriginalVmConfig::new(ext_vm_config);

    let airs = original_config.airs().unwrap();
    let bus_map = original_config.bus_map();