Merge branch 'main' into ac/artifact-version-warning

refactor: batched poly reads (#897)
chore: version mismatch warnings for artifacts
2026-01-13 08:17:57 -05:00 · 2025-01-06 15:49:58 +00:00 · 2025-01-06 15:49:47 +00:00 · 2025-01-06 15:35:37 +00:00 · 2024-12-31 07:28:02 -05:00 · 2024-12-30 13:44:03 -05:00
19 changed files with 586 additions and 327 deletions
--- a/.github/workflows/pypi-gpu.yml
+++ b/.github/workflows/pypi-gpu.yml
@@ -34,6 +34,7 @@ jobs:
        run: |
            mv pyproject.toml pyproject.toml.orig
            sed "s/ezkl/ezkl-gpu/" pyproject.toml.orig >pyproject.toml
+            sed -i.bak "s/0\\.0\\.0/${RELEASE_TAG//v}/" pyproject.toml

      - uses: actions-rs/toolchain@v1
        with:
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -233,6 +233,14 @@ jobs:
          python-version: 3.12
          architecture: x64

+      - name: Set pyproject.toml version to match github tag
+        shell: bash
+        env:
+          RELEASE_TAG: ${{ github.ref_name }}
+        run: |
+          mv pyproject.toml pyproject.toml.orig
+          sed "s/0\\.0\\.0/${RELEASE_TAG//v}/" pyproject.toml.orig >pyproject.toml
+
      - name: Set Cargo.toml version to match github tag
        shell: bash
        env:
@@ -242,7 +250,6 @@ jobs:
          sed "s/0\\.0\\.0/${RELEASE_TAG//v}/" Cargo.toml.orig >Cargo.toml
          mv Cargo.lock Cargo.lock.orig
          sed "s/0\\.0\\.0/${RELEASE_TAG//v}/" Cargo.lock.orig >Cargo.lock
-
      - name: Install required libraries
        shell: bash
        run: |
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2377,7 +2377,7 @@ dependencies = [
 [[package]]
 name = "halo2_gadgets"
 version = "0.2.0"
-source = "git+https://github.com/zkonduit/halo2#0654e92bdf725fd44d849bfef3643870a8c7d50b"
+source = "git+https://github.com/zkonduit/halo2#6d72498928cdb69ce0de9f2230d2873ca2cf5324"
 dependencies = [
 "arrayvec 0.7.4",
 "bitvec",
@@ -2394,7 +2394,7 @@ dependencies = [
 [[package]]
 name = "halo2_proofs"
 version = "0.3.0"
-source = "git+https://github.com/zkonduit/halo2#0654e92bdf725fd44d849bfef3643870a8c7d50b#0654e92bdf725fd44d849bfef3643870a8c7d50b"
+source = "git+https://github.com/zkonduit/halo2#6d72498928cdb69ce0de9f2230d2873ca2cf5324#6d72498928cdb69ce0de9f2230d2873ca2cf5324"
 dependencies = [
 "bincode",
 "blake2b_simd",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -147,6 +147,10 @@ shellexpand = "3.1.0"
 runner = 'wasm-bindgen-test-runner'


+[[bench]]
+name = "zero_finder"
+harness = false
+
 [[bench]]
 name = "accum_dot"
 harness = false
@@ -276,7 +280,10 @@ no-update = []


 [patch.'https://github.com/zkonduit/halo2']
-halo2_proofs = { git = "https://github.com/zkonduit/halo2#0654e92bdf725fd44d849bfef3643870a8c7d50b", package = "halo2_proofs" }
+halo2_proofs = { git = "https://github.com/zkonduit/halo2#6d72498928cdb69ce0de9f2230d2873ca2cf5324", package = "halo2_proofs" }
+
+[patch.'https://github.com/zkonduit/halo2#0654e92bdf725fd44d849bfef3643870a8c7d50b']
+halo2_proofs = { git = "https://github.com/zkonduit/halo2#6d72498928cdb69ce0de9f2230d2873ca2cf5324", package = "halo2_proofs" }

 [patch.crates-io]
 uniffi_testing = { git = "https://github.com/ElusAegis/uniffi-rs", branch = "feat/testing-feature-build-fix" }
--- a/benches/zero_finder.rs
+++ b/benches/zero_finder.rs
@@ -0,0 +1,116 @@
+use std::thread;
+
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use halo2curves::{bn256::Fr as F, ff::Field};
+use maybe_rayon::{
+    iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator},
+    slice::ParallelSlice,
+};
+use rand::Rng;
+
+// Local stand-in for the value enum scanned for constant zeros (presumably mirrors the crate's `ValType`)
+#[derive(Clone)]
+enum ValType {
+    Constant(F),
+    AssignedConstant(usize, F),
+    Other,
+}
+
+// Helper to generate test data
+fn generate_test_data(size: usize, zero_probability: f64) -> Vec<ValType> {
+    let mut rng = rand::thread_rng();
+    (0..size)
+        .map(|_i| {
+            if rng.gen::<f64>() < zero_probability {
+                ValType::Constant(F::ZERO)
+            } else {
+                ValType::Constant(F::ONE) // Or some other non-zero value
+            }
+        })
+        .collect()
+}
+
+fn bench_zero_finding(c: &mut Criterion) {
+    let sizes = [
+        1_000,         // 1K
+        10_000,        // 10K
+        100_000,       // 100K
+        256 * 256 * 2, // Our specific case
+        1_000_000,     // 1M
+        10_000_000,    // 10M
+    ];
+
+    let zero_probability = 0.1; // 10% zeros
+
+    let mut group = c.benchmark_group("zero_finding");
+    group.sample_size(10); // Adjust based on your needs
+
+    for &size in &sizes {
+        let data = generate_test_data(size, zero_probability);
+
+        // Benchmark sequential version
+        group.bench_function(format!("sequential_{}", size), |b| {
+            b.iter(|| {
+                let result = data
+                    .iter()
+                    .enumerate()
+                    .filter_map(|(i, e)| match e {
+                        ValType::Constant(r) | ValType::AssignedConstant(_, r) => {
+                            (*r == F::ZERO).then_some(i)
+                        }
+                        _ => None,
+                    })
+                    .collect::<Vec<_>>();
+                black_box(result)
+            })
+        });
+
+        // Benchmark parallel version
+        group.bench_function(format!("parallel_{}", size), |b| {
+            b.iter(|| {
+                let result = data
+                    .par_iter()
+                    .enumerate()
+                    .filter_map(|(i, e)| match e {
+                        ValType::Constant(r) | ValType::AssignedConstant(_, r) => {
+                            (*r == F::ZERO).then_some(i)
+                        }
+                        _ => None,
+                    })
+                    .collect::<Vec<_>>();
+                black_box(result)
+            })
+        });
+
+        // Benchmark chunked parallel version
+        group.bench_function(format!("chunked_parallel_{}", size), |b| {
+            b.iter(|| {
+                let num_cores = thread::available_parallelism()
+                    .map(|n| n.get())
+                    .unwrap_or(1);
+                let chunk_size = (size / num_cores).max(100);
+
+                let result = data
+                    .par_chunks(chunk_size)
+                    .enumerate()
+                    .flat_map(|(chunk_idx, chunk)| {
+                        chunk
+                            .par_iter() // Make sure we use par_iter() here
+                            .enumerate()
+                            .filter_map(move |(i, e)| match e {
+                                ValType::Constant(r) | ValType::AssignedConstant(_, r) => {
+                                    (*r == F::ZERO).then_some(chunk_idx * chunk_size + i)
+                                }
+                                _ => None,
+                            })
+                    })
+                    .collect::<Vec<_>>();
+                black_box(result)
+            })
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(benches, bench_zero_finding);
+criterion_main!(benches);
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,6 +12,7 @@ asyncio_mode = "auto"

 [project]
 name = "ezkl"
+version = "0.0.0"
 requires-python = ">=3.7"
 classifiers = [
    "Programming Language :: Rust",
--- a/src/circuit/ops/layouts.rs
+++ b/src/circuit/ops/layouts.rs
@@ -30,6 +30,8 @@ use crate::{
 use super::*;
 use crate::circuit::ops::lookup::LookupOp;

+const ASCII_ALPHABET: &str = "abcdefghijklmnopqrstuvwxyz";
+
 /// Calculate the L1 distance between two tensors.
 /// ```
 /// use ezkl::tensor::Tensor;
@@ -418,10 +420,6 @@ pub fn dot<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
    values[0].remove_indices(&mut removal_indices, true)?;
    values[1].remove_indices(&mut removal_indices, true)?;

-    let elapsed = global_start.elapsed();
-    trace!("filtering const zero indices took: {:?}", elapsed);
-
-    let start = instant::Instant::now();
    let mut inputs = vec![];
    let block_width = config.custom_gates.output.num_inner_cols();

@@ -429,37 +427,22 @@ pub fn dot<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
    for (i, input) in values.iter_mut().enumerate() {
        input.pad_to_zero_rem(block_width, ValType::Constant(F::ZERO))?;
        let inp = {
-            let (res, len) = region.assign_with_duplication(
-                &config.custom_gates.inputs[i],
-                input,
-                &config.check_mode,
-                false,
-            )?;
+            let (res, len) = region
+                .assign_with_duplication_unconstrained(&config.custom_gates.inputs[i], input)?;
            assigned_len = len;
            res.get_inner()?
        };
        inputs.push(inp);
    }

-    let elapsed = start.elapsed();
-    trace!("assigning inputs took: {:?}", elapsed);
-
    // Now we can assign the dot product
    // time this step
-    let start = instant::Instant::now();
    let accumulated_dot = accumulated::dot(&[inputs[0].clone(), inputs[1].clone()], block_width)?;
-    let elapsed = start.elapsed();
-    trace!("calculating accumulated dot took: {:?}", elapsed);
-
-    let start = instant::Instant::now();
-    let (output, output_assigned_len) = region.assign_with_duplication(
+    let (output, output_assigned_len) = region.assign_with_duplication_constrained(
        &config.custom_gates.output,
        &accumulated_dot.into(),
        &config.check_mode,
-        true,
    )?;
-    let elapsed = start.elapsed();
-    trace!("assigning output took: {:?}", elapsed);

    // enable the selectors
    if !region.is_dummy() {
@@ -1000,7 +983,6 @@ fn select<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
    region: &mut RegionCtx<F>,
    values: &[ValTensor<F>; 2],
 ) -> Result<ValTensor<F>, CircuitError> {
-    let start = instant::Instant::now();
    let (mut input, index) = (values[0].clone(), values[1].clone());
    input.flatten();

@@ -1028,9 +1010,6 @@ fn select<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
    let (_, assigned_output) =
        dynamic_lookup(config, region, &[index, output], &[dim_indices, input])?;

-    let end = start.elapsed();
-    trace!("select took: {:?}", end);
-
    Ok(assigned_output)
 }

@@ -1092,7 +1071,6 @@ pub(crate) fn dynamic_lookup<F: PrimeField + TensorType + PartialOrd + std::hash
    lookups: &[ValTensor<F>; 2],
    tables: &[ValTensor<F>; 2],
 ) -> Result<(ValTensor<F>, ValTensor<F>), CircuitError> {
-    let start = instant::Instant::now();
    // if not all lookups same length err
    if lookups[0].len() != lookups[1].len() {
        return Err(CircuitError::MismatchedLookupLength(
@@ -1126,28 +1104,20 @@ pub(crate) fn dynamic_lookup<F: PrimeField + TensorType + PartialOrd + std::hash
    }
    let table_len = table_0.len();

-    trace!("assigning tables took: {:?}", start.elapsed());
-
    // now create a vartensor of constants for the dynamic lookup index
    let table_index = create_constant_tensor(F::from(dynamic_lookup_index as u64), table_len);
    let _table_index =
        region.assign_dynamic_lookup(&config.dynamic_lookups.tables[2], &table_index)?;

-    trace!("assigning table index took: {:?}", start.elapsed());
-
    let lookup_0 = region.assign(&config.dynamic_lookups.inputs[0], &lookup_0)?;
    let lookup_1 = region.assign(&config.dynamic_lookups.inputs[1], &lookup_1)?;
    let lookup_len = lookup_0.len();

-    trace!("assigning lookups took: {:?}", start.elapsed());
-
    // now set the lookup index
    let lookup_index = create_constant_tensor(F::from(dynamic_lookup_index as u64), lookup_len);

    let _lookup_index = region.assign(&config.dynamic_lookups.inputs[2], &lookup_index)?;

-    trace!("assigning lookup index took: {:?}", start.elapsed());
-
    let mut lookup_block = 0;

    if !region.is_dummy() {
@@ -1194,9 +1164,6 @@ pub(crate) fn dynamic_lookup<F: PrimeField + TensorType + PartialOrd + std::hash
    region.increment_dynamic_lookup_index(1);
    region.increment(lookup_len);

-    let end = start.elapsed();
-    trace!("dynamic lookup took: {:?}", end);
-
    Ok((lookup_0, lookup_1))
 }

@@ -1441,7 +1408,6 @@ pub(crate) fn linearize_element_index<F: PrimeField + TensorType + PartialOrd +
    dim: usize,
    is_flat_index: bool,
 ) -> Result<ValTensor<F>, CircuitError> {
-    let start_time = instant::Instant::now();
    let index = values[0].clone();
    if !is_flat_index {
        assert_eq!(index.dims().len(), dims.len());
@@ -1515,9 +1481,6 @@ pub(crate) fn linearize_element_index<F: PrimeField + TensorType + PartialOrd +

    region.apply_in_loop(&mut output, inner_loop_function)?;

-    let elapsed = start_time.elapsed();
-    trace!("linearize_element_index took: {:?}", elapsed);
-
    Ok(output.into())
 }

@@ -1949,16 +1912,11 @@ pub fn sum<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(

    region.flush()?;
    // time this entire function run
-    let global_start = instant::Instant::now();
-
    let mut values = values.clone();

    // this section has been optimized to death, don't mess with it
    values[0].remove_const_zero_values();

-    let elapsed = global_start.elapsed();
-    trace!("filtering const zero indices took: {:?}", elapsed);
-
    // if empty return a const
    if values[0].is_empty() {
        return Ok(create_zero_tensor(1));
@@ -1970,12 +1928,8 @@ pub fn sum<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
    let input = {
        let mut input = values[0].clone();
        input.pad_to_zero_rem(block_width, ValType::Constant(F::ZERO))?;
-        let (res, len) = region.assign_with_duplication(
-            &config.custom_gates.inputs[1],
-            &input,
-            &config.check_mode,
-            false,
-        )?;
+        let (res, len) =
+            region.assign_with_duplication_unconstrained(&config.custom_gates.inputs[1], &input)?;
        assigned_len = len;
        res.get_inner()?
    };
@@ -1983,11 +1937,10 @@ pub fn sum<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
    // Now we can assign the dot product
    let accumulated_sum = accumulated::sum(&input, block_width)?;

-    let (output, output_assigned_len) = region.assign_with_duplication(
+    let (output, output_assigned_len) = region.assign_with_duplication_constrained(
        &config.custom_gates.output,
        &accumulated_sum.into(),
        &config.check_mode,
-        true,
    )?;

    // enable the selectors
@@ -2053,13 +2006,10 @@ pub fn prod<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
 ) -> Result<ValTensor<F>, CircuitError> {
    region.flush()?;
    // time this entire function run
-    let global_start = instant::Instant::now();

    // this section has been optimized to death, don't mess with it
    let removal_indices = values[0].get_const_zero_indices();

-    let elapsed = global_start.elapsed();
-    trace!("finding const zero indices took: {:?}", elapsed);
    // if empty return a const
    if !removal_indices.is_empty() {
        return Ok(create_zero_tensor(1));
@@ -2070,12 +2020,8 @@ pub fn prod<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
    let input = {
        let mut input = values[0].clone();
        input.pad_to_zero_rem(block_width, ValType::Constant(F::ONE))?;
-        let (res, len) = region.assign_with_duplication(
-            &config.custom_gates.inputs[1],
-            &input,
-            &config.check_mode,
-            false,
-        )?;
+        let (res, len) =
+            region.assign_with_duplication_unconstrained(&config.custom_gates.inputs[1], &input)?;
        assigned_len = len;
        res.get_inner()?
    };
@@ -2083,11 +2029,10 @@ pub fn prod<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
    // Now we can assign the dot product
    let accumulated_prod = accumulated::prod(&input, block_width)?;

-    let (output, output_assigned_len) = region.assign_with_duplication(
+    let (output, output_assigned_len) = region.assign_with_duplication_constrained(
        &config.custom_gates.output,
        &accumulated_prod.into(),
        &config.check_mode,
-        true,
    )?;

    // enable the selectors
@@ -2440,7 +2385,6 @@ pub(crate) fn pairwise<F: PrimeField + TensorType + PartialOrd + std::hash::Hash
    let orig_lhs = lhs.clone();
    let orig_rhs = rhs.clone();

-    let start = instant::Instant::now();
    let first_zero_indices = HashSet::from_iter(lhs.get_const_zero_indices());
    let second_zero_indices = HashSet::from_iter(rhs.get_const_zero_indices());

@@ -2455,7 +2399,6 @@ pub(crate) fn pairwise<F: PrimeField + TensorType + PartialOrd + std::hash::Hash
        BaseOp::Sub => second_zero_indices.clone(),
        _ => return Err(CircuitError::UnsupportedOp),
    };
-    trace!("setting up indices took {:?}", start.elapsed());

    if lhs.len() != rhs.len() {
        return Err(CircuitError::DimMismatch(format!(
@@ -2480,7 +2423,6 @@ pub(crate) fn pairwise<F: PrimeField + TensorType + PartialOrd + std::hash::Hash

    // Now we can assign the dot product
    // time the calc
-    let start = instant::Instant::now();
    let op_result = match op {
        BaseOp::Add => add(&inputs),
        BaseOp::Sub => sub(&inputs),
@@ -2491,20 +2433,13 @@ pub(crate) fn pairwise<F: PrimeField + TensorType + PartialOrd + std::hash::Hash
        error!("{}", e);
        halo2_proofs::plonk::Error::Synthesis
    })?;
-    trace!("pairwise {} calc took {:?}", op.as_str(), start.elapsed());

-    let start = instant::Instant::now();
    let assigned_len = op_result.len() - removal_indices.len();
    let mut output = region.assign_with_omissions(
        &config.custom_gates.output,
        &op_result.into(),
        &removal_indices,
    )?;
-    trace!(
-        "pairwise {} input assign took {:?}",
-        op.as_str(),
-        start.elapsed()
-    );

    // Enable the selectors
    if !region.is_dummy() {
@@ -2671,9 +2606,7 @@ pub fn greater<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
    rhs.expand(&broadcasted_shape)?;

    let diff = pairwise(config, region, &[lhs, rhs], BaseOp::Sub)?;
-
    let sign = sign(config, region, &[diff])?;
-
    equals(config, region, &[sign, create_unit_tensor(1)])
 }

@@ -5286,75 +5219,72 @@ pub(crate) fn decompose<F: PrimeField + TensorType + PartialOrd + std::hash::Has
    base: &usize,
    n: &usize,
 ) -> Result<ValTensor<F>, CircuitError> {
-    let input = values[0].clone();
+    let mut input = values[0].clone();

    let is_assigned = !input.all_prev_assigned();

-    let bases: ValTensor<F> = Tensor::from(
-        (0..*n)
-            .rev()
-            .map(|x| ValType::Constant(integer_rep_to_felt(base.pow(x as u32) as IntegerRep))),
+    if !is_assigned {
+        input = region.assign(&config.custom_gates.inputs[0], &input)?;
+    }
+
+    let mut bases: ValTensor<F> = Tensor::from(
+        // repeat it input.len() times
+        (0..input.len()).flat_map(|_| {
+            (0..*n)
+                .rev()
+                .map(|x| ValType::Constant(integer_rep_to_felt(base.pow(x as u32) as IntegerRep)))
+        }),
    )
    .into();
+    let mut bases_dims = input.dims().to_vec();
+    bases_dims.push(*n);
+    bases.reshape(&bases_dims)?;

-    let cartesian_coord = input
-        .dims()
-        .iter()
-        .map(|x| 0..*x)
-        .multi_cartesian_product()
-        .collect::<Vec<_>>();
+    let mut decomposed_dims = input.dims().to_vec();
+    decomposed_dims.push(*n + 1);

-    let mut output: Tensor<Tensor<ValType<F>>> = Tensor::new(None, input.dims())?;
+    let claimed_output = if region.witness_gen() {
+        input.decompose(*base, *n)?
+    } else {
+        let decomposed_len = decomposed_dims.iter().product();
+        let claimed_output = Tensor::new(
+            Some(&vec![ValType::Value(Value::unknown()); decomposed_len]),
+            &decomposed_dims,
+        )?;

-    let inner_loop_function =
-        |i: usize, region: &mut RegionCtx<F>| -> Result<Tensor<ValType<F>>, CircuitError> {
-            let coord = cartesian_coord[i].clone();
-            let slice = coord.iter().map(|x| *x..*x + 1).collect::<Vec<_>>();
-            let mut sliced_input = input.get_slice(&slice)?;
-            sliced_input.flatten();
+        claimed_output.into()
+    };
+    let claimed_output = region.assign(&config.custom_gates.output, &claimed_output)?;
+    region.increment(claimed_output.len());

-            if !is_assigned {
-                sliced_input = region.assign(&config.custom_gates.inputs[0], &sliced_input)?;
-            }
+    let input_slice = input.dims().iter().map(|x| 0..*x).collect::<Vec<_>>();
+    let mut sign_slice = input_slice.clone();
+    sign_slice.push(0..1);
+    let mut rest_slice = input_slice.clone();
+    rest_slice.push(1..n + 1);

-            let mut claimed_output_slice = if region.witness_gen() {
-                sliced_input.decompose(*base, *n)?
-            } else {
-                Tensor::from(vec![ValType::Value(Value::unknown()); *n + 1].into_iter()).into()
-            };
+    let sign = claimed_output.get_slice(&sign_slice)?;
+    let rest = claimed_output.get_slice(&rest_slice)?;

-            claimed_output_slice =
-                region.assign(&config.custom_gates.inputs[1], &claimed_output_slice)?;
-            claimed_output_slice.flatten();
+    let sign = range_check(config, region, &[sign], &(-1, 1))?;
+    let rest = range_check(config, region, &[rest], &(0, (*base - 1) as i128))?;

-            region.increment(claimed_output_slice.len());
+    // equation needs to be constructed as ij,ij->i but for arbitrary n dims we need to construct this dynamically
+    // indices should map in order of the alphabet
+    // `lhs` labels every axis of both operands; `rhs` (the output) omits the last axis, which is summed over
+    let lhs = ASCII_ALPHABET.chars().take(rest.dims().len()).join("");
+    let rhs = ASCII_ALPHABET.chars().take(rest.dims().len() - 1).join("");
+    let equation = format!("{},{}->{}", lhs, lhs, rhs);

-            // get the sign bit and make sure it is valid
-            let sign = claimed_output_slice.first()?;
-            let sign = range_check(config, region, &[sign], &(-1, 1))?;
+    // contract `rest` against `bases` over the last axis

-            // get the rest of the thing and make sure it is in the correct range
-            let rest = claimed_output_slice.get_slice(&[1..claimed_output_slice.len()])?;
+    let prod_decomp = einsum(config, region, &[rest.clone(), bases], &equation)?;

-            let rest = range_check(config, region, &[rest], &(0, (base - 1) as i128))?;
+    let signed_decomp = pairwise(config, region, &[prod_decomp, sign], BaseOp::Mult)?;

-            let prod_decomp = dot(config, region, &[rest, bases.clone()])?;
+    enforce_equality(config, region, &[input, signed_decomp])?;

-            let signed_decomp = pairwise(config, region, &[prod_decomp, sign], BaseOp::Mult)?;
-
-            enforce_equality(config, region, &[sliced_input, signed_decomp])?;
-
-            Ok(claimed_output_slice.get_inner_tensor()?.clone())
-        };
-
-    region.apply_in_loop(&mut output, inner_loop_function)?;
-
-    let mut combined_output = output.combine()?;
-    let mut output_dims = input.dims().to_vec();
-    output_dims.push(*n + 1);
-    combined_output.reshape(&output_dims)?;
-
-    Ok(combined_output.into())
+    Ok(claimed_output)
 }

 pub(crate) fn sign<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
--- a/src/circuit/ops/region.rs
+++ b/src/circuit/ops/region.rs
@@ -671,22 +671,17 @@ impl<'a, F: PrimeField + TensorType + PartialOrd + std::hash::Hash> RegionCtx<'a
    }

    /// Assign a valtensor to a vartensor with duplication
-    pub fn assign_with_duplication(
+    pub fn assign_with_duplication_unconstrained(
        &mut self,
        var: &VarTensor,
        values: &ValTensor<F>,
-        check_mode: &crate::circuit::CheckMode,
-        single_inner_col: bool,
    ) -> Result<(ValTensor<F>, usize), Error> {
        if let Some(region) = &self.region {
            // duplicates every nth element to adjust for column overflow
-            let (res, len) = var.assign_with_duplication(
+            let (res, len) = var.assign_with_duplication_unconstrained(
                &mut region.borrow_mut(),
-                self.row,
                self.linear_coord,
                values,
-                check_mode,
-                single_inner_col,
                &mut self.assigned_constants,
            )?;
            Ok((res, len))
@@ -695,7 +690,37 @@ impl<'a, F: PrimeField + TensorType + PartialOrd + std::hash::Hash> RegionCtx<'a
                self.row,
                self.linear_coord,
                values,
-                single_inner_col,
+                false,
+                &mut self.assigned_constants,
+            )?;
+            Ok((values.clone(), len))
+        }
+    }
+
+    /// Assign a valtensor to a vartensor with duplication, forwarding `check_mode` to the constrained assignment
+    pub fn assign_with_duplication_constrained(
+        &mut self,
+        var: &VarTensor,
+        values: &ValTensor<F>,
+        check_mode: &crate::circuit::CheckMode,
+    ) -> Result<(ValTensor<F>, usize), Error> {
+        if let Some(region) = &self.region {
+            // duplicates every nth element to adjust for column overflow
+            let (res, len) = var.assign_with_duplication_constrained(
+                &mut region.borrow_mut(),
+                self.row,
+                self.linear_coord,
+                values,
+                check_mode,
+                &mut self.assigned_constants,
+            )?;
+            Ok((res, len))
+        } else {
+            let (_, len) = var.dummy_assign_with_duplication(
+                self.row,
+                self.linear_coord,
+                values,
+                true,
                &mut self.assigned_constants,
            )?;
            Ok((values.clone(), len))
--- a/src/eth.rs
+++ b/src/eth.rs
@@ -488,7 +488,7 @@ pub async fn deploy_da_verifier_via_solidity(
        }
    }

-    let contract = match call_to_account {
+    match call_to_account {
        Some(call) => {
            deploy_single_da_contract(
                client,
@@ -514,8 +514,7 @@ pub async fn deploy_da_verifier_via_solidity(
            )
            .await
        }
-    };
-    return contract;
+    }
 }

 async fn deploy_multi_da_contract(
@@ -630,7 +629,7 @@ async fn deploy_single_da_contract(
            // bytes memory _callData,
            PackedSeqToken(call_data.as_ref()),
            // uint256 _decimals,
-            WordToken(B256::from(decimals).into()),
+            WordToken(B256::from(decimals)),
            // uint[] memory _scales,
            DynSeqToken(
                scales
--- a/src/graph/mod.rs
+++ b/src/graph/mod.rs
@@ -280,7 +280,13 @@ impl GraphWitness {
        })?;

        let reader = std::io::BufReader::with_capacity(*EZKL_BUF_CAPACITY, file);
-        serde_json::from_reader(reader).map_err(|e| e.into())
+        let witness: GraphWitness =
+            serde_json::from_reader(reader).map_err(|e| Into::<GraphError>::into(e))?;
+
+        // check versions match
+        crate::check_version_string_matches(witness.version.as_deref().unwrap_or(""));
+
+        Ok(witness)
    }

    /// Save the model input to a file
@@ -572,10 +578,14 @@ impl GraphSettings {
        // buf reader
        let reader =
            std::io::BufReader::with_capacity(*EZKL_BUF_CAPACITY, std::fs::File::open(path)?);
-        serde_json::from_reader(reader).map_err(|e| {
+        let settings: GraphSettings = serde_json::from_reader(reader).map_err(|e| {
            error!("failed to load settings file at {}", e);
            std::io::Error::new(std::io::ErrorKind::Other, e)
-        })
+        })?;
+
+        crate::check_version_string_matches(&settings.version);
+
+        Ok(settings)
    }

    /// Export the ezkl configuration as json
@@ -697,6 +707,9 @@ impl GraphCircuit {
        let reader = std::io::BufReader::with_capacity(*EZKL_BUF_CAPACITY, f);
        let result: GraphCircuit = bincode::deserialize_from(reader)?;

+        // check the versions match
+        crate::check_version_string_matches(&result.core.settings.version);
+
        Ok(result)
    }
 }
--- a/src/graph/model.rs
+++ b/src/graph/model.rs
@@ -1226,6 +1226,7 @@ impl Model {
                values.iter().map(|v| v.dims()).collect_vec()
            );

+            let start = instant::Instant::now();
            match &node {
                NodeType::Node(n) => {
                    let res = if node.is_constant() && node.num_uses() == 1 {
@@ -1363,6 +1364,7 @@ impl Model {
                    results.insert(*idx, full_results);
                }
            }
+            debug!("------------ layout of {} took {:?}", idx, start.elapsed());
        }

        // we do this so we can support multiple passes of the same model and have deterministic results (Non-assigned inputs etc... etc...)
--- a/src/graph/utilities.rs
+++ b/src/graph/utilities.rs
@@ -142,8 +142,6 @@ use tract_onnx::prelude::SymbolValues;
 pub fn extract_tensor_value(
    input: Arc<tract_onnx::prelude::Tensor>,
 ) -> Result<Tensor<f32>, GraphError> {
-    use maybe_rayon::prelude::{IntoParallelRefIterator, ParallelIterator};
-
    let dt = input.datum_type();
    let dims = input.shape().to_vec();

@@ -156,7 +154,7 @@ pub fn extract_tensor_value(
    match dt {
        DatumType::F16 => {
            let vec = input.as_slice::<tract_onnx::prelude::f16>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| (*x).into()).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| (*x).into()).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::F32 => {
@@ -165,61 +163,61 @@ pub fn extract_tensor_value(
        }
        DatumType::F64 => {
            let vec = input.as_slice::<f64>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| *x as f32).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| *x as f32).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::I64 => {
            // Generally a shape or hyperparam
            let vec = input.as_slice::<i64>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| *x as f32).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| *x as f32).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::I32 => {
            // Generally a shape or hyperparam
            let vec = input.as_slice::<i32>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| *x as f32).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| *x as f32).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::I16 => {
            // Generally a shape or hyperparam
            let vec = input.as_slice::<i16>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| *x as f32).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| *x as f32).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::I8 => {
            // Generally a shape or hyperparam
            let vec = input.as_slice::<i8>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| *x as f32).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| *x as f32).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::U8 => {
            // Generally a shape or hyperparam
            let vec = input.as_slice::<u8>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| *x as f32).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| *x as f32).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::U16 => {
            // Generally a shape or hyperparam
            let vec = input.as_slice::<u16>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| *x as f32).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| *x as f32).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::U32 => {
            // Generally a shape or hyperparam
            let vec = input.as_slice::<u32>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| *x as f32).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| *x as f32).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::U64 => {
            // Generally a shape or hyperparam
            let vec = input.as_slice::<u64>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| *x as f32).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| *x as f32).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::Bool => {
            // Generally a shape or hyperparam
            let vec = input.as_slice::<bool>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| *x as usize as f32).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| *x as usize as f32).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::TDim => {
@@ -227,7 +225,7 @@ pub fn extract_tensor_value(
            let vec = input.as_slice::<tract_onnx::prelude::TDim>()?.to_vec();

            let cast: Result<Vec<f32>, GraphError> = vec
-                .par_iter()
+                .iter()
                .map(|x| match x.to_i64() {
                    Ok(v) => Ok(v as f32),
                    Err(_) => match x.to_i64() {
@@ -1136,23 +1134,21 @@ pub fn new_op_from_onnx(
                        a: crate::circuit::utils::F32(exponent),
                    })
                }
-            } else {
-                if let Some(c) = inputs[0].opkind().get_mutable_constant() {
-                    inputs[0].decrement_use();
-                    deleted_indices.push(0);
-                    if c.raw_values.len() > 1 {
-                        unimplemented!("only support scalar base")
-                    }
-
-                    let base = c.raw_values[0];
-
-                    SupportedOp::Nonlinear(LookupOp::Exp {
-                        scale: scale_to_multiplier(input_scales[1]).into(),
-                        base: base.into(),
-                    })
-                } else {
-                    unimplemented!("only support constant base or pow for now")
+            } else if let Some(c) = inputs[0].opkind().get_mutable_constant() {
+                inputs[0].decrement_use();
+                deleted_indices.push(0);
+                if c.raw_values.len() > 1 {
+                    unimplemented!("only support scalar base")
                }
+
+                let base = c.raw_values[0];
+
+                SupportedOp::Nonlinear(LookupOp::Exp {
+                    scale: scale_to_multiplier(input_scales[1]).into(),
+                    base: base.into(),
+                })
+            } else {
+                unimplemented!("only support constant base or pow for now")
            }
        }
        "Div" => {
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -420,3 +420,30 @@ where
    let b = s[pos + 2..].parse()?;
    Ok((a, b))
 }
+
+/// Warn when an artifact was produced by a different version than the running build.
+/// Artifacts without a release version ("0.0.0", the source placeholder, or empty) skip the check.
+pub fn check_version_string_matches(artifact_version: &str) {
+    if artifact_version == "0.0.0"
+        || artifact_version == "source - no compatibility guaranteed"
+        || artifact_version.is_empty()
+    {
+        log::warn!("Artifact has no release version ({:?}), skipping check", artifact_version);
+        return;
+    }
+
+    let version = crate::version();
+    // running from a source build: the placeholder version cannot be compared meaningfully
+    if version == "source - no compatibility guaranteed" {
+        log::warn!("Compiled source version is not guaranteed to match artifact version");
+        return;
+    }
+
+    if version != artifact_version {
+        log::warn!(
+            "Version mismatch: CLI version is {} but artifact version is {}",
+            version,
+            artifact_version
+        );
+    }
+}
--- a/src/pfsys/mod.rs
+++ b/src/pfsys/mod.rs
@@ -822,6 +822,7 @@ where
    Scheme::Scalar: PrimeField + SerdeObject + FromUniformBytes<64>,
 {
    debug!("loading proving key from {:?}", path);
+    let start = instant::Instant::now();
    let f = File::open(path.clone()).map_err(|e| PfsysError::LoadPk(format!("{}", e)))?;
    let mut reader = BufReader::with_capacity(*EZKL_BUF_CAPACITY, f);
    let pk = ProvingKey::<Scheme::Curve>::read::<_, C>(
@@ -830,7 +831,8 @@ where
        params,
    )
    .map_err(|e| PfsysError::LoadPk(format!("{}", e)))?;
-    info!("loaded proving key ✅");
+    let elapsed = start.elapsed();
+    info!("loaded proving key in {:?}", elapsed);
    Ok(pk)
 }

--- a/src/tensor/mod.rs
+++ b/src/tensor/mod.rs
@@ -638,42 +638,44 @@ impl<T: Clone + TensorType> Tensor<T> {
    where
        T: Send + Sync,
    {
-        if indices.is_empty() {
+        // Fast path: empty indices or full tensor slice
+        if indices.is_empty()
+            || indices.iter().map(|x| x.end - x.start).collect::<Vec<_>>() == self.dims
+        {
            return Ok(self.clone());
        }
+
+        // Validate dimensions
        if self.dims.len() < indices.len() {
            return Err(TensorError::DimError(format!(
                "The dimensionality of the slice {:?} is greater than the tensor's {:?}",
                indices, self.dims
            )));
-        } else if indices.iter().map(|x| x.end - x.start).collect::<Vec<_>>() == self.dims {
-            // else if slice is the same as dims, return self
-            return Ok(self.clone());
        }

-        // if indices weren't specified we fill them in as required
-        let mut full_indices = indices.to_vec();
+        // Pre-allocate the full indices vector with capacity
+        let mut full_indices = Vec::with_capacity(self.dims.len());
+        full_indices.extend_from_slice(indices);

-        for i in 0..(self.dims.len() - indices.len()) {
-            full_indices.push(0..self.dims()[indices.len() + i])
-        }
+        // Fill remaining dimensions
+        full_indices.extend((indices.len()..self.dims.len()).map(|i| 0..self.dims[i]));

-        let cartesian_coord: Vec<Vec<usize>> = full_indices
+        // Pre-calculate total size and allocate result vector
+        let total_size: usize = full_indices
            .iter()
-            .cloned()
-            .multi_cartesian_product()
-            .collect();
-
-        let res: Vec<T> = cartesian_coord
-            .par_iter()
-            .map(|e| {
-                let index = self.get_index(e);
-                self[index].clone()
-            })
-            .collect();
+            .map(|range| range.end - range.start)
+            .product();
+        let mut res = Vec::with_capacity(total_size);

+        // Calculate new dimensions once
        let dims: Vec<usize> = full_indices.iter().map(|e| e.end - e.start).collect();

+        // Use iterator directly without collecting into intermediate Vec
+        for coord in full_indices.iter().cloned().multi_cartesian_product() {
+            let index = self.get_index(&coord);
+            res.push(self[index].clone());
+        }
+
        Tensor::new(Some(&res), &dims)
    }

@@ -831,7 +833,7 @@ impl<T: Clone + TensorType> Tensor<T> {
        num_repeats: usize,
        initial_offset: usize,
    ) -> Result<Tensor<T>, TensorError> {
-        let mut inner: Vec<T> = vec![];
+        let mut inner: Vec<T> = Vec::with_capacity(self.inner.len());
        let mut offset = initial_offset;
        for (i, elem) in self.inner.clone().into_iter().enumerate() {
            if (i + offset + 1) % n == 0 {
@@ -860,20 +862,22 @@ impl<T: Clone + TensorType> Tensor<T> {
        num_repeats: usize,
        initial_offset: usize,
    ) -> Result<Tensor<T>, TensorError> {
-        let mut inner: Vec<T> = vec![];
-        let mut indices_to_remove = std::collections::HashSet::new();
-        for i in 0..self.inner.len() {
-            if (i + initial_offset + 1) % n == 0 {
-                for j in 1..(1 + num_repeats) {
-                    indices_to_remove.insert(i + j);
-                }
-            }
-        }
+        // Pre-calculate capacity to avoid reallocations
+        let estimated_size = self.inner.len() - (self.inner.len() / n) * num_repeats;
+        let mut inner = Vec::with_capacity(estimated_size);

-        let old_inner = self.inner.clone();
-        for (i, elem) in old_inner.into_iter().enumerate() {
-            if !indices_to_remove.contains(&i) {
-                inner.push(elem.clone());
+        // Walk the elements by index, jumping over duplicates, instead of building a set of indices to remove
+        let mut i = 0;
+        while i < self.inner.len() {
+            // Add the current element
+            inner.push(self.inner[i].clone());
+
+            // If this is an nth position (accounting for offset)
+            if (i + initial_offset + 1) % n == 0 {
+                // Skip the next num_repeats elements
+                i += num_repeats + 1;
+            } else {
+                i += 1;
            }
        }

--- a/src/tensor/val.rs
+++ b/src/tensor/val.rs
@@ -1,12 +1,12 @@
 use crate::{circuit::region::ConstantsMap, fieldutils::felt_to_integer_rep};
-use maybe_rayon::slice::Iter;
+use maybe_rayon::slice::{Iter, ParallelSlice};

 use super::{
    ops::{intercalate_values, pad, resize},
    *,
 };
 use halo2_proofs::{arithmetic::Field, circuit::Cell, plonk::Instance};
-use maybe_rayon::iter::{FilterMap, IntoParallelIterator, ParallelIterator};
+use maybe_rayon::iter::{FilterMap, ParallelIterator};

 pub(crate) fn create_constant_tensor<
    F: PrimeField + TensorType + std::marker::Send + std::marker::Sync + PartialOrd,
@@ -455,7 +455,7 @@ impl<F: PrimeField + TensorType + PartialOrd + std::hash::Hash> ValTensor<F> {
        }
    }

-    /// Returns the number of constants in the [ValTensor].
+    /// Returns an iterator over the [ValTensor]'s constants.
    pub fn create_constants_map_iterator(
        &self,
    ) -> FilterMap<Iter<'_, ValType<F>>, fn(&ValType<F>) -> Option<(F, ValType<F>)>> {
@@ -473,20 +473,48 @@ impl<F: PrimeField + TensorType + PartialOrd + std::hash::Hash> ValTensor<F> {
        }
    }

-    /// Returns the number of constants in the [ValTensor].
+    /// Returns a map of the constants in the [ValTensor].
    pub fn create_constants_map(&self) -> ConstantsMap<F> {
-        match self {
-            ValTensor::Value { inner, .. } => inner
-                .par_iter()
-                .filter_map(|x| {
-                    if let ValType::Constant(v) = x {
-                        Some((*v, x.clone()))
-                    } else {
-                        None
-                    }
-                })
-                .collect(),
-            ValTensor::Instance { .. } => ConstantsMap::new(),
+        let threshold = 1_000_000; // Tuned using the benchmarks
+
+        if self.len() < threshold {
+            match self {
+                ValTensor::Value { inner, .. } => inner
+                    .par_iter()
+                    .filter_map(|x| {
+                        if let ValType::Constant(v) = x {
+                            Some((*v, x.clone()))
+                        } else {
+                            None
+                        }
+                    })
+                    .collect(),
+                ValTensor::Instance { .. } => ConstantsMap::new(),
+            }
+        } else {
+            // Use parallel for larger arrays
+            let num_cores = std::thread::available_parallelism()
+                .map(|n| n.get())
+                .unwrap_or(1);
+            let chunk_size = (self.len() / num_cores).max(100_000);
+
+            match self {
+                ValTensor::Value { inner, .. } => inner
+                    .par_chunks(chunk_size)
+                    .flat_map(|chunk| {
+                        chunk
+                            .par_iter() // Make sure we use par_iter() here
+                            .filter_map(|x| {
+                                if let ValType::Constant(v) = x {
+                                    Some((*v, x.clone()))
+                                } else {
+                                    None
+                                }
+                            })
+                    })
+                    .collect(),
+                ValTensor::Instance { .. } => ConstantsMap::new(),
+            }
        }
    }

@@ -878,70 +906,161 @@ impl<F: PrimeField + TensorType + PartialOrd + std::hash::Hash> ValTensor<F> {

    /// remove constant zero values constants
    pub fn remove_const_zero_values(&mut self) {
-        match self {
-            ValTensor::Value { inner: v, dims, .. } => {
-                *v = v
-                    .clone()
-                    .into_par_iter()
-                    .filter_map(|e| {
-                        if let ValType::Constant(r) = e {
-                            if r == F::ZERO {
-                                return None;
+        let size_threshold = 1_000_000; // Tuned using the benchmarks
+
+        if self.len() < size_threshold {
+            match self {
+                ValTensor::Value { inner: v, dims, .. } => {
+                    *v = v
+                        .clone()
+                        .into_iter()
+                        .filter_map(|e| {
+                            if let ValType::Constant(r) = e {
+                                if r == F::ZERO {
+                                    return None;
+                                }
+                            } else if let ValType::AssignedConstant(_, r) = e {
+                                if r == F::ZERO {
+                                    return None;
+                                }
                            }
-                        } else if let ValType::AssignedConstant(_, r) = e {
-                            if r == F::ZERO {
-                                return None;
-                            }
-                        }
-                        Some(e)
-                    })
-                    .collect();
-                *dims = v.dims().to_vec();
+                            Some(e)
+                        })
+                        .collect();
+                    *dims = v.dims().to_vec();
+                }
+                ValTensor::Instance { .. } => {}
+            }
+        } else {
+            // Use parallel for larger arrays
+            let num_cores = std::thread::available_parallelism()
+                .map(|n| n.get())
+                .unwrap_or(1);
+            let chunk_size = (self.len() / num_cores).max(100_000);
+
+            match self {
+                ValTensor::Value { inner: v, dims, .. } => {
+                    *v = v
+                        .par_chunks_mut(chunk_size)
+                        .flat_map(|chunk| {
+                            chunk
+                                .par_iter_mut() // Make sure we use par_iter_mut() here
+                                .filter_map(|e| {
+                                    if let ValType::Constant(r) = e {
+                                        if *r == F::ZERO {
+                                            return None;
+                                        }
+                                    } else if let ValType::AssignedConstant(_, r) = e {
+                                        if *r == F::ZERO {
+                                            return None;
+                                        }
+                                    }
+                                    Some(e.clone())
+                                })
+                        })
+                        .collect();
+                    *dims = v.dims().to_vec();
+                }
+                ValTensor::Instance { .. } => {}
            }
-            ValTensor::Instance { .. } => {}
        }
    }

-    /// gets constants
+    /// gets the indices of the constant zero values
    pub fn get_const_zero_indices(&self) -> Vec<usize> {
-        match self {
-            ValTensor::Value { inner: v, .. } => v
-                .par_iter()
-                .enumerate()
-                .filter_map(|(i, e)| {
-                    if let ValType::Constant(r) = e {
-                        if *r == F::ZERO {
-                            return Some(i);
+        let size_threshold = 1_000_000; // Tuned using the benchmarks
+
+        if self.len() < size_threshold {
+            // Use single-threaded for smaller arrays
+            match &self {
+                ValTensor::Value { inner: v, .. } => v
+                    .iter()
+                    .enumerate()
+                    .filter_map(|(i, e)| {
+                        match e {
+                            // Combine both match arms to reduce branching
+                            ValType::Constant(r) | ValType::AssignedConstant(_, r) => {
+                                (*r == F::ZERO).then_some(i)
+                            }
+                            _ => None,
                        }
-                    } else if let ValType::AssignedConstant(_, r) = e {
-                        if *r == F::ZERO {
-                            return Some(i);
-                        }
-                    }
-                    None
-                })
-                .collect(),
-            ValTensor::Instance { .. } => vec![],
+                    })
+                    .collect(),
+                ValTensor::Instance { .. } => vec![],
+            }
+        } else {
+            // Use parallel for larger arrays
+            let num_cores = std::thread::available_parallelism()
+                .map(|n| n.get())
+                .unwrap_or(1);
+            let chunk_size = (self.len() / num_cores).max(100_000);
+
+            match &self {
+                ValTensor::Value { inner: v, .. } => v
+                    .par_chunks(chunk_size)
+                    .enumerate()
+                    .flat_map(|(chunk_idx, chunk)| {
+                        chunk
+                            .par_iter() // Make sure we use par_iter() here
+                            .enumerate()
+                            .filter_map(move |(i, e)| match e {
+                                ValType::Constant(r) | ValType::AssignedConstant(_, r) => {
+                                    (*r == F::ZERO).then_some(chunk_idx * chunk_size + i)
+                                }
+                                _ => None,
+                            })
+                    })
+                    .collect::<Vec<_>>(),
+                ValTensor::Instance { .. } => vec![],
+            }
        }
    }

-    /// gets constants
+    /// gets constant indices
    pub fn get_const_indices(&self) -> Vec<usize> {
-        match self {
-            ValTensor::Value { inner: v, .. } => v
-                .par_iter()
-                .enumerate()
-                .filter_map(|(i, e)| {
-                    if let ValType::Constant(_) = e {
-                        Some(i)
-                    } else if let ValType::AssignedConstant(_, _) = e {
-                        Some(i)
-                    } else {
-                        None
-                    }
-                })
-                .collect(),
-            ValTensor::Instance { .. } => vec![],
+        let size_threshold = 1_000_000; // Tuned using the benchmarks
+
+        if self.len() < size_threshold {
+            // Use single-threaded for smaller arrays
+            match &self {
+                ValTensor::Value { inner: v, .. } => v
+                    .iter()
+                    .enumerate()
+                    .filter_map(|(i, e)| {
+                        match e {
+                            // Combine both match arms to reduce branching
+                            ValType::Constant(_) | ValType::AssignedConstant(_, _) => Some(i),
+                            _ => None,
+                        }
+                    })
+                    .collect(),
+                ValTensor::Instance { .. } => vec![],
+            }
+        } else {
+            // Use parallel for larger arrays
+            let num_cores = std::thread::available_parallelism()
+                .map(|n| n.get())
+                .unwrap_or(1);
+            let chunk_size = (self.len() / num_cores).max(100_000);
+
+            match &self {
+                ValTensor::Value { inner: v, .. } => v
+                    .par_chunks(chunk_size)
+                    .enumerate()
+                    .flat_map(|(chunk_idx, chunk)| {
+                        chunk
+                            .par_iter() // Make sure we use par_iter() here
+                            .enumerate()
+                            .filter_map(move |(i, e)| match e {
+                                ValType::Constant(_) | ValType::AssignedConstant(_, _) => {
+                                    Some(chunk_idx * chunk_size + i)
+                                }
+                                _ => None,
+                            })
+                    })
+                    .collect::<Vec<_>>(),
+                ValTensor::Instance { .. } => vec![],
+            }
        }
    }

--- a/src/tensor/var.rs
+++ b/src/tensor/var.rs
@@ -494,16 +494,56 @@ impl VarTensor {
        }
    }

+    /// Assigns `values` to the columns of the inner tensor, duplicating elements so accumulated operations can wrap across columns; this function itself issues no `constrain_equal` between the duplicates.
+    pub fn assign_with_duplication_unconstrained<
+        F: PrimeField + TensorType + PartialOrd + std::hash::Hash,
+    >(
+        &self,
+        region: &mut Region<F>,
+        offset: usize,
+        values: &ValTensor<F>,
+        constants: &mut ConstantsMap<F>,
+    ) -> Result<(ValTensor<F>, usize), halo2_proofs::plonk::Error> {
+        match values {
+            ValTensor::Instance { .. } => unimplemented!("duplication is not supported on instance columns. increase K if you require more rows."),
+            ValTensor::Value { inner: v, dims , ..} => {
+                // duplication frequency is one block (`block_size()`)
+                let duplication_freq = self.block_size();
+                // passed as `num_repeats` to the duplicate/remove helpers below
+                let num_repeats = self.num_inner_cols();
+                // positions are counted starting from the caller's `offset`
+                let duplication_offset = offset;
+
+                // duplicates every nth element to adjust for column overflow
+                let v = v.duplicate_every_n(duplication_freq, num_repeats, duplication_offset).unwrap();
+                let mut res: ValTensor<F> = {
+                    v.enum_map(|coord, k| {
+                    let cell = self.assign_value(region, offset, k.clone(), coord, constants)?;
+                    Ok::<_, halo2_proofs::plonk::Error>(cell)
+
+                })?.into()};
+                let total_used_len = res.len();
+                res.remove_every_n(duplication_freq, num_repeats, duplication_offset).unwrap();
+                // with the duplicates removed again, restore the caller's shape and scale
+                res.reshape(dims).unwrap();
+                res.set_scale(values.scale());
+                // `total_used_len` was taken before the removal, so it counts the duplicated cells too
+                Ok((res, total_used_len))
+            }
+        }
+    }
+
    /// Assigns specific values (`ValTensor`) to the columns of the inner tensor but allows for column wrapping for accumulated operations.
    /// Duplication occurs by copying the last cell of the column to the first cell next column and creating a copy constraint between the two.
-    pub fn assign_with_duplication<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
+    pub fn assign_with_duplication_constrained<
+        F: PrimeField + TensorType + PartialOrd + std::hash::Hash,
+    >(
        &self,
        region: &mut Region<F>,
        row: usize,
        offset: usize,
        values: &ValTensor<F>,
        check_mode: &CheckMode,
-        single_inner_col: bool,
        constants: &mut ConstantsMap<F>,
    ) -> Result<(ValTensor<F>, usize), halo2_proofs::plonk::Error> {
        let mut prev_cell = None;
@@ -512,34 +552,16 @@ impl VarTensor {
            ValTensor::Instance { .. } => unimplemented!("duplication is not supported on instance columns. increase K if you require more rows."),
            ValTensor::Value { inner: v, dims , ..} => {

-                let duplication_freq = if single_inner_col {
-                    self.col_size()
-                } else {
-                    self.block_size()
-                };
-
-                let num_repeats = if single_inner_col {
-                    1
-                } else {
-                    self.num_inner_cols()
-                };
-
-                let duplication_offset = if single_inner_col {
-                    row
-                } else {
-                    offset
-                };
+                let duplication_freq = self.col_size();
+                let num_repeats = 1;
+                let duplication_offset = row;

                // duplicates every nth element to adjust for column overflow
                let v = v.duplicate_every_n(duplication_freq, num_repeats, duplication_offset).unwrap();
                let mut res: ValTensor<F> = {
                    v.enum_map(|coord, k| {

-                    let step = if !single_inner_col {
-                        1
-                    } else {
-                        self.num_inner_cols()
-                    };
+                    let step = self.num_inner_cols();

                    let (x, y, z) = self.cartesian_coord(offset + coord * step);
                    if matches!(check_mode, CheckMode::SAFE) && coord > 0 && z == 0 && y == 0 {
@@ -549,11 +571,13 @@ impl VarTensor {

                    let cell = self.assign_value(region, offset, k.clone(), coord * step, constants)?;

-                    if single_inner_col {
-                    if z == 0 {
+                    let at_end_of_column = z == duplication_freq - 1;
+                    let at_beginning_of_column = z == 0;
+
+                    if at_end_of_column {
                        // if we are at the end of the column, we need to copy the cell to the next column
                        prev_cell = Some(cell.clone());
-                    } else if coord > 0 && z == 0 && single_inner_col {
+                    } else if coord > 0 && at_beginning_of_column  {
                        if let Some(prev_cell) = prev_cell.as_ref() {
                            let cell = cell.cell().ok_or({
                                error!("Error getting cell: {:?}", (x,y));
@@ -563,10 +587,10 @@ impl VarTensor {
                                halo2_proofs::plonk::Error::Synthesis})?;
                            region.constrain_equal(prev_cell,cell)?;
                        } else {
-                            error!("Error copy-constraining previous value: {:?}", (x,y));
+                            error!("Previous cell was not set");
                            return Err(halo2_proofs::plonk::Error::Synthesis);
                        }
-                    }}
+                    }

                    Ok(cell)

@@ -577,20 +601,6 @@ impl VarTensor {
                res.reshape(dims).unwrap();
                res.set_scale(values.scale());

-                if matches!(check_mode, CheckMode::SAFE) {
-                     // during key generation this will be 0 so we use this as a flag to check
-                     // TODO: this isn't very safe and would be better to get the phase directly
-                    let res_evals = res.int_evals().unwrap();
-                    let is_assigned = res_evals
-                    .iter()
-                    .all(|&x| x == 0);
-                    if !is_assigned {
-                        assert_eq!(
-                           values.int_evals().unwrap(),
-                           res_evals
-                    )};
-                }
-
                Ok((res, total_used_len))
            }
        }
--- a/tests/assets/pk.key
+++ b/tests/assets/pk.key
--- a/tests/assets/vk.key
+++ b/tests/assets/vk.key
Author	SHA1	Message	Date
dante	9c699e30cb	Merge branch 'main' into ac/artifact-version-warning	2025-01-06 15:49:58 +00:00
dante	e86caca8b6	refactor: batched poly reads (#897 )	2025-01-06 15:49:47 +00:00
dante	3b8e44df9b	chore: version mismatch warnings for artifacts	2025-01-06 15:35:37 +00:00
dante	c839a30ae6	fix: clearer duplication functions (#895 )	2024-12-31 07:28:02 -05:00
dante	352812b9ac	refactor!: simplified decompose op (#892 )	2024-12-30 13:44:03 -05:00
dante	d48d0b0b3e	fix: `get_slice` should not use intermediate `Vec` (#894 )	2024-12-27 23:26:22 -05:00
Jseam	8b223354cc	fix: add version string and sed (#893 )	2024-12-27 14:24:28 -05:00
dante	caa6ef8e16	fix: const filtering strat is size dependent (#891 )	2024-12-27 09:43:59 -05:00