ci: update version string in docs

fix: clearer duplication functions (#895 )
refactor!: simplified decompose op (#892 )
2026-01-13 08:17:57 -05:00 · 2024-12-31 12:28:24 +00:00 · 2024-12-31 07:28:02 -05:00 · 2024-12-30 13:44:03 -05:00 · 2024-12-27 23:26:22 -05:00 · 2024-12-27 14:24:28 -05:00
20 changed files with 1465 additions and 417 deletions
--- a/.github/workflows/pypi-gpu.yml
+++ b/.github/workflows/pypi-gpu.yml
@@ -34,6 +34,7 @@ jobs:
        run: |
            mv pyproject.toml pyproject.toml.orig
            sed "s/ezkl/ezkl-gpu/" pyproject.toml.orig >pyproject.toml
+            sed -e "s/ezkl/ezkl-gpu/" -e "s/0\\.0\\.0/${RELEASE_TAG//v}/" pyproject.toml.orig >pyproject.toml # both edits in one pass: re-reading .orig with only the version edit would discard the ezkl-gpu rename

      - uses: actions-rs/toolchain@v1
        with:
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -233,6 +233,14 @@ jobs:
          python-version: 3.12
          architecture: x64

+      - name: Set pyproject.toml version to match github tag
+        shell: bash
+        env:
+          RELEASE_TAG: ${{ github.ref_name }}
+        run: |
+          mv pyproject.toml pyproject.toml.orig
+          sed "s/0\\.0\\.0/${RELEASE_TAG//v}/" pyproject.toml.orig >pyproject.toml
+
      - name: Set Cargo.toml version to match github tag
        shell: bash
        env:
@@ -242,7 +250,6 @@ jobs:
          sed "s/0\\.0\\.0/${RELEASE_TAG//v}/" Cargo.toml.orig >Cargo.toml
          mv Cargo.lock Cargo.lock.orig
          sed "s/0\\.0\\.0/${RELEASE_TAG//v}/" Cargo.lock.orig >Cargo.lock
-
      - name: Install required libraries
        shell: bash
        run: |
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -650,7 +650,9 @@ jobs:
        run: python -m venv .env --clear; source .env/bin/activate; pip install -r requirements.txt; python -m ensurepip --upgrade
      - name: Build python ezkl
        run: source .env/bin/activate; unset CONDA_PREFIX; maturin develop --features python-bindings --release
-      - name: Postgres tutorials
+      - name: Neural bow
+        run: source .env/bin/activate; cargo nextest run py_tests::tests::neural_bag_of_words_ --no-capture
+      - name: Felt conversion
        run: source .env/bin/activate; cargo nextest run py_tests::tests::felt_conversion_test_ --no-capture
      - name: Postgres tutorials
        run: source .env/bin/activate; cargo nextest run py_tests::tests::postgres_ --no-capture
--- a/.github/workflows/swift-pm.yml
+++ b/.github/workflows/swift-pm.yml
@@ -0,0 +1,129 @@
+name: Build and Publish EZKL iOS SPM package
+
+on:
+  push:
+    tags:
+      # Only support SemVer versioning tags
+      - 'v[0-9]+.[0-9]+.[0-9]+'
+      - '[0-9]+.[0-9]+.[0-9]+'
+
+jobs:
+  build-and-update:
+    runs-on: macos-latest
+    env:
+      EZKL_SWIFT_PACKAGE_REPO: github.com/zkonduit/ezkl-swift-package.git
+
+    steps:
+      - name: Checkout EZKL
+        uses: actions/checkout@v3
+
+      - name: Extract TAG from github.ref_name
+        run: |
+          # github.ref_name is provided by GitHub Actions and contains the tag name directly.
+          TAG="${{ github.ref_name }}"
+          echo "Original TAG: $TAG"
+          # Remove leading 'v' if present to match the Swift Package Manager version format.
+          NEW_TAG=${TAG#v}
+          echo "Stripped TAG: $NEW_TAG"
+          echo "TAG=$NEW_TAG" >> $GITHUB_ENV
+
+      - name: Install Rust (nightly)
+        uses: actions-rs/toolchain@v1
+        with:
+          toolchain: nightly
+          override: true
+
+      - name: Build EzklCoreBindings
+        run: CONFIGURATION=release cargo run --bin ios_gen_bindings --features "ios-bindings uuid camino uniffi_bindgen" --no-default-features
+
+      - name: Clone ezkl-swift-package repository
+        run: |
+          git clone https://${{ env.EZKL_SWIFT_PACKAGE_REPO }}
+
+      - name: Copy EzklCoreBindings
+        run: |
+          rm -rf ezkl-swift-package/Sources/EzklCoreBindings
+          cp -r build/EzklCoreBindings ezkl-swift-package/Sources/
+
+      - name: Copy Test Files
+        run: |
+          rm -rf ezkl-swift-package/Tests/EzklAssets/*
+          cp tests/assets/kzg ezkl-swift-package/Tests/EzklAssets/kzg.srs
+          cp tests/assets/input.json ezkl-swift-package/Tests/EzklAssets/input.json
+          cp tests/assets/model.compiled ezkl-swift-package/Tests/EzklAssets/network.ezkl
+          cp tests/assets/settings.json ezkl-swift-package/Tests/EzklAssets/settings.json
+
+      - name: Check for changes
+        id: check_changes
+        run: |
+          cd ezkl-swift-package
+          if [ -z "$(git status --porcelain -- Sources/EzklCoreBindings Tests/EzklAssets)" ]; then # porcelain output also lists untracked (newly generated) files, which git diff does not report
+            echo "no_changes=true" >> $GITHUB_OUTPUT
+          else
+            echo "no_changes=false" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Set up Xcode environment
+        if: steps.check_changes.outputs.no_changes == 'false'
+        run: |
+          sudo xcode-select -s /Applications/Xcode.app/Contents/Developer
+          sudo xcodebuild -license accept
+
+      - name: Run Package Tests
+        if: steps.check_changes.outputs.no_changes == 'false'
+        run: |
+          cd ezkl-swift-package
+          xcodebuild test \
+            -scheme EzklPackage \
+            -destination 'platform=iOS Simulator,name=iPhone 15 Pro,OS=17.5' \
+            -resultBundlePath ../testResults
+
+      - name: Run Example App Tests
+        if: steps.check_changes.outputs.no_changes == 'false'
+        run: |
+          cd ezkl-swift-package/Example
+          xcodebuild test \
+            -project Example.xcodeproj \
+            -scheme EzklApp \
+            -destination 'platform=iOS Simulator,name=iPhone 15 Pro,OS=17.5' \
+            -parallel-testing-enabled NO \
+            -resultBundlePath ../../exampleTestResults \
+            -skip-testing:EzklAppUITests/EzklAppUITests/testButtonClicksInOrder
+
+      - name: Setup Git
+        run: |
+          cd ezkl-swift-package
+          git config user.name "GitHub Action"
+          git config user.email "action@github.com"
+          git remote set-url origin https://zkonduit:${EZKL_SWIFT_PACKAGE_REPO_TOKEN}@${{ env.EZKL_SWIFT_PACKAGE_REPO }}
+        env:
+          EZKL_SWIFT_PACKAGE_REPO_TOKEN: ${{ secrets.EZKL_PORTER_TOKEN }}
+
+      - name: Commit and Push Changes
+        if: steps.check_changes.outputs.no_changes == 'false'
+        run: |
+          cd ezkl-swift-package
+          git add Sources/EzklCoreBindings Tests/EzklAssets
+          git commit -m "Automatically updated EzklCoreBindings for EZKL"
+          
+          if ! git push origin; then
+            echo "::error::Failed to push changes to ${{ env.EZKL_SWIFT_PACKAGE_REPO }}. Please ensure that EZKL_PORTER_TOKEN has the correct permissions."
+            exit 1
+          fi
+
+      - name: Tag the latest commit
+        run: |
+          cd ezkl-swift-package
+          # TAG reaches this step through the runner environment (written to GITHUB_ENV earlier); fail fast if it is missing.
+          : "${TAG:?TAG is not set; the tag extraction step must run first}"
+          # Tag the latest commit on the current branch (look only under refs/tags so a branch or SHA cannot match)
+          if git rev-parse -q --verify "refs/tags/$TAG" >/dev/null 2>&1; then
+            echo "Tag $TAG already exists locally. Skipping tag creation."
+          else
+            git tag "$TAG"
+          fi
+
+          if ! git push origin "$TAG"; then
+            echo "::error::Failed to push tag '$TAG' to ${{ env.EZKL_SWIFT_PACKAGE_REPO }}. Please ensure EZKL_PORTER_TOKEN has correct permissions."
+            exit 1
+          fi
--- a/.github/workflows/update-ios-package.yml
+++ b/.github/workflows/update-ios-package.yml
@@ -1,85 +0,0 @@
-name: Build and Publish EZKL iOS SPM package
-
-on:
-  workflow_dispatch:
-    inputs:
-      tag:
-        description: "The tag to release"
-        required: true
-  push:
-    tags:
-      - "*"
-
-jobs:
-  build-and-update:
-    runs-on: macos-latest
-
-    steps:
-      - name: Checkout EZKL
-        uses: actions/checkout@v3
-
-      - name: Install Rust
-        uses: actions-rs/toolchain@v1
-        with:
-          toolchain: nightly
-          override: true
-
-      - name: Build EzklCoreBindings
-        run: CONFIGURATION=release cargo run --bin ios_gen_bindings --features "ios-bindings uuid camino uniffi_bindgen" --no-default-features
-
-      - name: Clone ezkl-swift-package repository
-        run: |
-          git clone https://github.com/zkonduit/ezkl-swift-package.git
-
-      - name: Copy EzklCoreBindings
-        run: |
-          rm -rf ezkl-swift-package/Sources/EzklCoreBindings
-          cp -r build/EzklCoreBindings ezkl-swift-package/Sources/
-
-      - name: Copy Test Files
-        run: |
-          rm -rf ezkl-swift-package/Tests/EzklAssets/*
-          
-          cp tests/assets/kzg ezkl-swift-package/Tests/EzklAssets/kzg.srs
-          cp tests/assets/input.json ezkl-swift-package/Tests/EzklAssets/input.json
-          cp tests/assets/model.compiled ezkl-swift-package/Tests/EzklAssets/network.ezkl
-          cp tests/assets/settings.json ezkl-swift-package/Tests/EzklAssets/settings.json
-
-      - name: Set up Xcode environment
-        run: |
-          sudo xcode-select -s /Applications/Xcode.app/Contents/Developer
-          sudo xcodebuild -license accept
-
-      - name: Run Package Tests
-        run: |
-          cd ezkl-swift-package
-          xcodebuild test \
-            -scheme EzklPackage \
-            -destination 'platform=iOS Simulator,name=iPhone 15 Pro,OS=17.5' \
-            -resultBundlePath ../testResults
-
-      - name: Run Example App Tests
-        run: |
-          cd ezkl-swift-package/Example
-          xcodebuild test \
-            -project Example.xcodeproj \
-            -scheme EzklApp \
-            -destination 'platform=iOS Simulator,name=iPhone 15 Pro,OS=17.5' \
-            -parallel-testing-enabled NO \
-            -resultBundlePath ../../exampleTestResults \
-            -skip-testing:EzklAppUITests/EzklAppUITests/testButtonClicksInOrder
-
-      - name: Commit and Push Changes to feat/ezkl-direct-integration
-        run: |
-          cd ezkl-swift-package
-          git config user.name "GitHub Action"
-          git config user.email "action@github.com"
-          git add Sources/EzklCoreBindings
-          git add Tests/EzklAssets
-          git commit -m "Automatically updated EzklCoreBindings for EZKL"
-          git tag ${{ github.event.inputs.tag }}
-          git remote set-url origin https://zkonduit:${EZKL_PORTER_TOKEN}@github.com/zkonduit/ezkl-swift-package.git
-          git push origin
-          git push origin tag ${{ github.event.inputs.tag }}
-        env:
-          EZKL_PORTER_TOKEN: ${{ secrets.EZKL_PORTER_TOKEN }}
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -147,6 +147,10 @@ shellexpand = "3.1.0"
 runner = 'wasm-bindgen-test-runner'


+[[bench]]
+name = "zero_finder"
+harness = false
+
 [[bench]]
 name = "accum_dot"
 harness = false
@@ -286,3 +290,11 @@ rustflags = ["-C", "relocation-model=pic"]
 lto = "fat"
 codegen-units = 1
 # panic = "abort"
+
+
+[package.metadata.wasm-pack.profile.release]
+wasm-opt = [
+    "-O4",
+    "--flexible-inline-max-function-size",
+    "4294967295",
+]
--- a/benches/zero_finder.rs
+++ b/benches/zero_finder.rs
@@ -0,0 +1,116 @@
+use std::thread;
+
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use halo2curves::{bn256::Fr as F, ff::Field};
+use maybe_rayon::{
+    iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator},
+    slice::ParallelSlice,
+};
+use rand::Rng;
+
+// Benchmark-local value type; the zero scans below only inspect the field element held by the two constant variants.
+#[derive(Clone)]
+enum ValType {
+    Constant(F),
+    AssignedConstant(usize, F),
+    Other,
+}
+
+// Builds `size` benchmark inputs, all of them `ValType::Constant`.
+// Each element is zero with probability `zero_probability`, otherwise one.
+fn generate_test_data(size: usize, zero_probability: f64) -> Vec<ValType> {
+    let mut rng = rand::thread_rng();
+    let mut values = Vec::with_capacity(size);
+    for _ in 0..size {
+        // One uniform draw per element; F::ONE stands in for any non-zero value.
+        let is_zero = rng.gen::<f64>() < zero_probability;
+        let element = if is_zero { F::ZERO } else { F::ONE };
+        values.push(ValType::Constant(element));
+    }
+    values
+}
+
+// Benchmarks three ways of collecting the indices of zero-valued constants: sequential, parallel, chunked parallel.
+fn bench_zero_finding(c: &mut Criterion) {
+    let sizes = [
+        1_000,         // 1K
+        10_000,        // 10K
+        100_000,       // 100K
+        256 * 256 * 2, // Our specific case
+        1_000_000,     // 1M
+        10_000_000,    // 10M
+    ];
+    let zero_probability = 0.1; // 10% zeros
+
+    let mut group = c.benchmark_group("zero_finding");
+    group.sample_size(10); // Adjust based on your needs
+
+    for &size in &sizes {
+        let data = generate_test_data(size, zero_probability);
+
+        // Benchmark sequential version
+        group.bench_function(format!("sequential_{}", size), |b| {
+            b.iter(|| {
+                let result = data
+                    .iter()
+                    .enumerate()
+                    .filter_map(|(i, e)| match e {
+                        ValType::Constant(r) | ValType::AssignedConstant(_, r) => {
+                            (*r == F::ZERO).then_some(i)
+                        }
+                        _ => None,
+                    })
+                    .collect::<Vec<_>>();
+                black_box(result)
+            })
+        });
+
+        // Benchmark parallel version
+        group.bench_function(format!("parallel_{}", size), |b| {
+            b.iter(|| {
+                let result = data
+                    .par_iter()
+                    .enumerate()
+                    .filter_map(|(i, e)| match e {
+                        ValType::Constant(r) | ValType::AssignedConstant(_, r) => {
+                            (*r == F::ZERO).then_some(i)
+                        }
+                        _ => None,
+                    })
+                    .collect::<Vec<_>>();
+                black_box(result)
+            })
+        });
+
+        // Benchmark chunked parallel version. The core count and chunk size are computed outside
+        // the timed closure so that only the scan itself is measured, as in the other two variants.
+        let num_cores = thread::available_parallelism()
+            .map(|n| n.get())
+            .unwrap_or(1);
+        let chunk_size = (size / num_cores).max(100);
+        group.bench_function(format!("chunked_parallel_{}", size), |b| {
+            b.iter(|| {
+                let result = data
+                    .par_chunks(chunk_size)
+                    .enumerate()
+                    .flat_map(|(chunk_idx, chunk)| {
+                        chunk
+                            .par_iter() // Make sure we use par_iter() here
+                            .enumerate()
+                            .filter_map(move |(i, e)| match e {
+                                ValType::Constant(r) | ValType::AssignedConstant(_, r) => {
+                                    (*r == F::ZERO).then_some(chunk_idx * chunk_size + i)
+                                }
+                                _ => None,
+                            })
+                    })
+                    .collect::<Vec<_>>();
+                black_box(result)
+            })
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(benches, bench_zero_finding);
+criterion_main!(benches);
--- a/docs/python/src/conf.py
+++ b/docs/python/src/conf.py
@@ -1,7 +1,7 @@
 import ezkl

 project = 'ezkl'
-release = '0.0.0'
+release = '16.2.9'
 version = release


--- a/examples/notebooks/neural_bow.ipynb
+++ b/examples/notebooks/neural_bow.ipynb
@@ -0,0 +1,766 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "This is a zk version of the tutorial found [here](https://github.com/bentrevett/pytorch-sentiment-analysis/blob/main/1%20-%20Neural%20Bag%20of%20Words.ipynb). The original tutorial is part of the PyTorch Sentiment Analysis series by Ben Trevett.\n",
+    "\n",
+    "1 - NBoW\n",
+    "\n",
+    "In this series we'll be building a machine learning model to perform sentiment analysis -- a subset of text classification where the task is to detect if a given sentence is positive or negative -- using PyTorch and torchtext. The dataset used will be movie reviews from the IMDb dataset, which we'll obtain using the datasets library.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "Preparing Data\n",
+    "\n",
+    "Before we can implement our NBoW model, we first have to perform quite a few steps to get our data ready to use. NLP usually requires quite a lot of data wrangling beforehand, though libraries such as datasets and torchtext handle most of this for us.\n",
+    "\n",
+    "The steps to take are:\n",
+    "\n",
+    "    1. importing modules\n",
+    "    2. loading data\n",
+    "    3. tokenizing data\n",
+    "    4. creating data splits\n",
+    "    5. creating a vocabulary\n",
+    "    6. numericalizing data\n",
+    "    7. creating the data loaders\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! pip install torchtext"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import collections\n",
+    "\n",
+    "import datasets\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "import torch.optim as optim\n",
+    "import torchtext\n",
+    "import tqdm\n",
+    "\n",
+    "# It is usually good practice to run your experiments multiple times with different random seeds -- both to measure the variance of your model and also to avoid having results only calculated with either \"good\" or \"bad\" seeds, i.e. being very lucky or unlucky with the randomness in the training process.\n",
+    "\n",
+    "seed = 1234\n",
+    "\n",
+    "np.random.seed(seed)\n",
+    "torch.manual_seed(seed)\n",
+    "torch.cuda.manual_seed(seed)\n",
+    "torch.backends.cudnn.deterministic = True\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_data, test_data = datasets.load_dataset(\"imdb\", split=[\"train\", \"test\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can check the features attribute of a split to get more information about the features. We can see that text is a Value of dtype=string -- in other words, it's a string -- and that label is a ClassLabel. A ClassLabel means the feature is an integer representation of which class the example belongs to. num_classes=2 means that our labels are one of two values, 0 or 1, and names=['neg', 'pos'] gives us the human-readable versions of those values. Thus, a label of 0 means the example is a negative review and a label of 1 means the example is a positive review."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_data.features\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_data[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "One of the first things we need to do to our data is tokenize it. Machine learning models aren't designed to handle strings, they're designed to handle numbers. So what we need to do is break down our string into individual tokens, and then convert these tokens to numbers. We'll get to the conversion later, but first we'll look at tokenization.\n",
+    "\n",
+    "Tokenization involves using a tokenizer to process the strings in our dataset. A tokenizer is a function that goes from a string to a list of strings. There are many types of tokenizers available, but we're going to use a relatively simple one provided by torchtext called the basic_english tokenizer. We load our tokenizer as such:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer = torchtext.data.utils.get_tokenizer(\"basic_english\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tokenize_example(example, tokenizer, max_length):\n",
+    "    tokens = tokenizer(example[\"text\"])[:max_length]\n",
+    "    return {\"tokens\": tokens}\n",
+    "\n",
+    "\n",
+    "max_length = 256\n",
+    "\n",
+    "train_data = train_data.map(\n",
+    "    tokenize_example, fn_kwargs={\"tokenizer\": tokenizer, \"max_length\": max_length}\n",
+    ")\n",
+    "test_data = test_data.map(\n",
+    "    tokenize_example, fn_kwargs={\"tokenizer\": tokenizer, \"max_length\": max_length}\n",
+    ")\n",
+    "\n",
+    "\n",
+    "# create validation data \n",
+    "# Why have both a validation set and a test set? Your test set represents the real world data that you'd see if you actually deployed this model. You won't be able to see what data your model will be fed once deployed, and your test set is supposed to reflect that. Every time we tune our model hyperparameters or training set-up to make it do a bit better on the test set, we are leaking information from the test set into the training process. If we do this too often then we begin to overfit on the test set. Hence, we need some data which can act as a \"proxy\" test set which we can look at more frequently in order to evaluate how well our model actually does on unseen data -- this is the validation set.\n",
+    "\n",
+    "test_size = 0.25\n",
+    "\n",
+    "train_valid_data = train_data.train_test_split(test_size=test_size)\n",
+    "train_data = train_valid_data[\"train\"]\n",
+    "valid_data = train_valid_data[\"test\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, we have to build a vocabulary. This is a look-up table where every unique token in your dataset has a corresponding index (an integer).\n",
+    "\n",
+    "We do this as machine learning models cannot operate on strings, only numerical values. Each index is used to construct a one-hot vector for each token. A one-hot vector is a vector where all the elements are 0, except one, which is 1, and the dimensionality is the total number of unique tokens in your vocabulary, commonly denoted by V."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "min_freq = 5\n",
+    "special_tokens = [\"<unk>\", \"<pad>\"]\n",
+    "\n",
+    "vocab = torchtext.vocab.build_vocab_from_iterator(\n",
+    "    train_data[\"tokens\"],\n",
+    "    min_freq=min_freq,\n",
+    "    specials=special_tokens,\n",
+    ")\n",
+    "\n",
+    "# We store the indices of the unknown and padding tokens (zero and one, respectively) in variables, as we'll use these further on in this notebook.\n",
+    "\n",
+    "unk_index = vocab[\"<unk>\"]\n",
+    "pad_index = vocab[\"<pad>\"]\n",
+    "\n",
+    "\n",
+    "vocab.set_default_index(unk_index)\n",
+    "\n",
+    "# To look-up a list of tokens, we can use the vocabulary's lookup_indices method.\n",
+    "vocab.lookup_indices([\"hello\", \"world\", \"some_token\", \"<pad>\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we have our vocabulary, we can numericalize our data. This involves converting the tokens within our dataset into indices. Similar to how we tokenized our data using the Dataset.map method, we'll define a function that takes an example and our vocabulary, gets the index for each token in each example and then creates an ids field which contains the numericalized tokens."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def numericalize_example(example, vocab):\n",
+    "    ids = vocab.lookup_indices(example[\"tokens\"])\n",
+    "    return {\"ids\": ids}\n",
+    "\n",
+    "train_data = train_data.map(numericalize_example, fn_kwargs={\"vocab\": vocab})\n",
+    "valid_data = valid_data.map(numericalize_example, fn_kwargs={\"vocab\": vocab})\n",
+    "test_data = test_data.map(numericalize_example, fn_kwargs={\"vocab\": vocab})\n",
+    "\n",
+    "train_data = train_data.with_format(type=\"torch\", columns=[\"ids\", \"label\"])\n",
+    "valid_data = valid_data.with_format(type=\"torch\", columns=[\"ids\", \"label\"])\n",
+    "test_data = test_data.with_format(type=\"torch\", columns=[\"ids\", \"label\"])\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The final step of preparing the data is creating the data loaders. We can iterate over a data loader to retrieve batches of examples. This is also where we will perform any padding that is necessary.\n",
+    "\n",
+    "We first need to define a function to collate a batch, consisting of a list of examples, into what we want our data loader to output.\n",
+    "\n",
+    "Here, our desired output from the data loader is a dictionary with keys of \"ids\" and \"label\".\n",
+    "\n",
+    "The value of batch[\"ids\"] should be a tensor of shape [batch size, length], where length is the length of the longest sentence (in terms of tokens) within the batch, and all sentences shorter than this should be padded to that length.\n",
+    "\n",
+    "The value of batch[\"label\"] should be a tensor of shape [batch size] consisting of the label for each sentence in the batch.\n",
+    "\n",
+    "We define a function, get_collate_fn, which is passed the pad token index and returns the actual collate function. Within the actual collate function, collate_fn, we get a list of \"ids\" tensors for each example in the batch, and then use the pad_sequence function, which converts the list of tensors into the desired [batch size, length] shaped tensor and performs padding using the specified pad_index. By default, pad_sequence will return a [length, batch size] shaped tensor, but by setting batch_first=True, these two dimensions are switched. We get a list of \"label\" tensors and convert the list of tensors into a single [batch size] shaped tensor."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_collate_fn(pad_index):\n",
+    "    def collate_fn(batch):\n",
+    "        batch_ids = [i[\"ids\"] for i in batch]\n",
+    "        batch_ids = nn.utils.rnn.pad_sequence(\n",
+    "            batch_ids, padding_value=pad_index, batch_first=True\n",
+    "        )\n",
+    "        batch_label = [i[\"label\"] for i in batch]\n",
+    "        batch_label = torch.stack(batch_label)\n",
+    "        batch = {\"ids\": batch_ids, \"label\": batch_label}\n",
+    "        return batch\n",
+    "\n",
+    "    return collate_fn\n",
+    "\n",
+    "def get_data_loader(dataset, batch_size, pad_index, shuffle=False):\n",
+    "    collate_fn = get_collate_fn(pad_index)\n",
+    "    data_loader = torch.utils.data.DataLoader(\n",
+    "        dataset=dataset,\n",
+    "        batch_size=batch_size,\n",
+    "        collate_fn=collate_fn,\n",
+    "        shuffle=shuffle,\n",
+    "    )\n",
+    "    return data_loader\n",
+    "\n",
+    "\n",
+    "batch_size = 512\n",
+    "\n",
+    "train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)\n",
+    "valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)\n",
+    "test_data_loader = get_data_loader(test_data, batch_size, pad_index)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "class NBoW(nn.Module):\n",
+    "    def __init__(self, vocab_size, embedding_dim, output_dim, pad_index):\n",
+    "        super().__init__()\n",
+    "        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)\n",
+    "        self.fc = nn.Linear(embedding_dim, output_dim)\n",
+    "\n",
+    "    def forward(self, ids):\n",
+    "        # ids = [batch size, seq len]\n",
+    "        embedded = self.embedding(ids)\n",
+    "        # embedded = [batch size, seq len, embedding dim]\n",
+    "        pooled = embedded.mean(dim=1)\n",
+    "        # pooled = [batch size, embedding dim]\n",
+    "        prediction = self.fc(pooled)\n",
+    "        # prediction = [batch size, output dim]\n",
+    "        return prediction\n",
+    "\n",
+    "\n",
+    "vocab_size = len(vocab)\n",
+    "embedding_dim = 300\n",
+    "output_dim = len(train_data.unique(\"label\"))\n",
+    "\n",
+    "model = NBoW(vocab_size, embedding_dim, output_dim, pad_index)\n",
+    "\n",
+    "def count_parameters(model):\n",
+    "    return sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
+    "\n",
+    "\n",
+    "print(f\"The model has {count_parameters(model):,} trainable parameters\")\n",
+    "\n",
+    "vectors = torchtext.vocab.GloVe()\n",
+    "\n",
+    "pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())\n",
+    "model.embedding.weight.data = pretrained_embedding  # use the GloVe vectors loaded above as the initial embedding weights\n",
+    "optimizer = optim.Adam(model.parameters())\n",
+    "\n",
+    "criterion = nn.CrossEntropyLoss()\n",
+    "\n",
+    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+    "\n",
+    "model = model.to(device)\n",
+    "criterion = criterion.to(device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def train(data_loader, model, criterion, optimizer, device):\n",
+    "    model.train()\n",
+    "    epoch_losses = []\n",
+    "    epoch_accs = []\n",
+    "    for batch in tqdm.tqdm(data_loader, desc=\"training...\"):\n",
+    "        ids = batch[\"ids\"].to(device)\n",
+    "        label = batch[\"label\"].to(device)\n",
+    "        prediction = model(ids)\n",
+    "        loss = criterion(prediction, label)\n",
+    "        accuracy = get_accuracy(prediction, label)\n",
+    "        optimizer.zero_grad()\n",
+    "        loss.backward()\n",
+    "        optimizer.step()\n",
+    "        epoch_losses.append(loss.item())\n",
+    "        epoch_accs.append(accuracy.item())\n",
+    "    return np.mean(epoch_losses), np.mean(epoch_accs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def evaluate(data_loader, model, criterion, device):\n",
+    "    model.eval()\n",
+    "    epoch_losses = []\n",
+    "    epoch_accs = []\n",
+    "    with torch.no_grad():\n",
+    "        for batch in tqdm.tqdm(data_loader, desc=\"evaluating...\"):\n",
+    "            ids = batch[\"ids\"].to(device)\n",
+    "            label = batch[\"label\"].to(device)\n",
+    "            prediction = model(ids)\n",
+    "            loss = criterion(prediction, label)\n",
+    "            accuracy = get_accuracy(prediction, label)\n",
+    "            epoch_losses.append(loss.item())\n",
+    "            epoch_accs.append(accuracy.item())\n",
+    "    return np.mean(epoch_losses), np.mean(epoch_accs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_accuracy(prediction, label):\n",
+    "    batch_size, _ = prediction.shape\n",
+    "    predicted_classes = prediction.argmax(dim=-1)\n",
+    "    correct_predictions = predicted_classes.eq(label).sum()\n",
+    "    accuracy = correct_predictions / batch_size\n",
+    "    return accuracy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n_epochs = 10\n",
+    "best_valid_loss = float(\"inf\")\n",
+    "\n",
+    "metrics = collections.defaultdict(list)\n",
+    "\n",
+    "for epoch in range(n_epochs):\n",
+    "    train_loss, train_acc = train(\n",
+    "        train_data_loader, model, criterion, optimizer, device\n",
+    "    )\n",
+    "    valid_loss, valid_acc = evaluate(valid_data_loader, model, criterion, device)\n",
+    "    metrics[\"train_losses\"].append(train_loss)\n",
+    "    metrics[\"train_accs\"].append(train_acc)\n",
+    "    metrics[\"valid_losses\"].append(valid_loss)\n",
+    "    metrics[\"valid_accs\"].append(valid_acc)\n",
+    "    if valid_loss < best_valid_loss:\n",
+    "        best_valid_loss = valid_loss\n",
+    "        torch.save(model.state_dict(), \"nbow.pt\")\n",
+    "    print(f\"epoch: {epoch}\")\n",
+    "    print(f\"train_loss: {train_loss:.3f}, train_acc: {train_acc:.3f}\")\n",
+    "    print(f\"valid_loss: {valid_loss:.3f}, valid_acc: {valid_acc:.3f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plt.figure(figsize=(10, 6))\n",
+    "ax = fig.add_subplot(1, 1, 1)\n",
+    "ax.plot(metrics[\"train_losses\"], label=\"train loss\")\n",
+    "ax.plot(metrics[\"valid_losses\"], label=\"valid loss\")\n",
+    "ax.set_xlabel(\"epoch\")\n",
+    "ax.set_ylabel(\"loss\")\n",
+    "ax.set_xticks(range(n_epochs))\n",
+    "ax.legend()\n",
+    "ax.grid()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = plt.figure(figsize=(10, 6))\n",
+    "ax = fig.add_subplot(1, 1, 1)\n",
+    "ax.plot(metrics[\"train_accs\"], label=\"train accuracy\")\n",
+    "ax.plot(metrics[\"valid_accs\"], label=\"valid accuracy\")\n",
+    "ax.set_xlabel(\"epoch\")\n",
+    "ax.set_ylabel(\"accuracy\")\n",
+    "ax.set_xticks(range(n_epochs))\n",
+    "ax.legend()\n",
+    "ax.grid()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.load_state_dict(torch.load(\"nbow.pt\"))\n",
+    "\n",
+    "test_loss, test_acc = evaluate(test_data_loader, model, criterion, device)\n",
+    "\n",
+    "print(f\"test_loss: {test_loss:.3f}, test_acc: {test_acc:.3f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def predict_sentiment(text, model, tokenizer, vocab, device):\n",
+    "    tokens = tokenizer(text)\n",
+    "    ids = vocab.lookup_indices(tokens)\n",
+    "    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)\n",
+    "    prediction = model(tensor).squeeze(dim=0)\n",
+    "    probability = torch.softmax(prediction, dim=-1)\n",
+    "    predicted_class = prediction.argmax(dim=-1).item()\n",
+    "    predicted_probability = probability[predicted_class].item()\n",
+    "    return predicted_class, predicted_probability"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = \"This film is terrible!\"\n",
+    "\n",
+    "predict_sentiment(text, model, tokenizer, vocab, device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = \"This film is great!\"\n",
+    "\n",
+    "predict_sentiment(text, model, tokenizer, vocab, device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = \"This film is not terrible, it's great!\"\n",
+    "\n",
+    "predict_sentiment(text, model, tokenizer, vocab, device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = \"This film is not great, it's terrible!\"\n",
+    "\n",
+    "predict_sentiment(text, model, tokenizer, vocab, device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def text_to_tensor(text, tokenizer, vocab, device):\n",
+    "    tokens = tokenizer(text)\n",
+    "    ids = vocab.lookup_indices(tokens)\n",
+    "    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)\n",
+    "    return tensor\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we export the trained model to ONNX and serialize a sample input to JSON, so both are ready for the zk-circuit."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "import json\n",
+    "\n",
+    "text = \"This film is terrible!\"\n",
+    "x = text_to_tensor(text, tokenizer, vocab, device)\n",
+    "\n",
+    "# Flips the neural net into inference mode\n",
+    "model.eval()\n",
+    "model.to('cpu')\n",
+    "\n",
+    "model_path = \"network.onnx\"\n",
+    "data_path = \"input.json\"\n",
+    "\n",
+    "    # Export the model\n",
+    "torch.onnx.export(model,               # model being run\n",
+    "                      x,                   # model input (or a tuple for multiple inputs)\n",
+    "                      model_path,            # where to save the model (can be a file or file-like object)\n",
+    "                      export_params=True,        # store the trained parameter weights inside the model file\n",
+    "                      opset_version=10,          # the ONNX version to export the model to\n",
+    "                      do_constant_folding=True,  # whether to execute constant folding for optimization\n",
+    "                      input_names = ['input'],   # the model's input names\n",
+    "                      output_names = ['output'], # the model's output names\n",
+    "                      dynamic_axes={'input' : {0 : 'batch_size'},    # variable length axes\n",
+    "                                    'output' : {0 : 'batch_size'}})\n",
+    "\n",
+    "\n",
+    "\n",
+    "data_array = ((x).detach().numpy()).reshape([-1]).tolist()\n",
+    "\n",
+    "data_json = dict(input_data = [data_array])\n",
+    "\n",
+    "print(data_json)\n",
+    "\n",
+    "    # Serialize data into file:\n",
+    "json.dump(data_json, open(data_path, 'w'))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import ezkl\n",
+    "\n",
+    "run_args = ezkl.PyRunArgs()\n",
+    "run_args.logrows = 23\n",
+    "run_args.scale_rebase_multiplier = 10\n",
+    "# inputs should be auditable by all\n",
+    "run_args.input_visibility = \"public\"\n",
+    "# same with outputs\n",
+    "run_args.output_visibility = \"public\"\n",
+    "# for simplicity, we'll just use the fixed model visibility: i.e it is public and can't be changed by the prover\n",
+    "run_args.param_visibility = \"fixed\"\n",
+    "\n",
+    "\n",
+    "# TODO: Dictionary outputs\n",
+    "res = ezkl.gen_settings(py_run_args=run_args)\n",
+    "assert res == True\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "res = ezkl.compile_circuit()\n",
+    "assert res == True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# srs path\n",
+    "res = await ezkl.get_srs()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# now generate the witness file\n",
+    "res = await ezkl.gen_witness()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "res = ezkl.mock()\n",
+    "assert res == True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# HERE WE SETUP THE CIRCUIT PARAMS\n",
+    "# WE GOT KEYS\n",
+    "# WE GOT CIRCUIT PARAMETERS\n",
+    "# EVERYTHING ANYONE HAS EVER NEEDED FOR ZK\n",
+    "\n",
+    "res = ezkl.setup()\n",
+    "\n",
+    "assert res == True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# GENERATE A PROOF\n",
+    "res = ezkl.prove(proof_path=\"proof.json\")\n",
+    "\n",
+    "print(res)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# VERIFY IT\n",
+    "res = ezkl.verify()\n",
+    "\n",
+    "assert res == True\n",
+    "print(\"verified\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can also verify it on chain by creating an onchain verifier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# check if notebook is in colab\n",
+    "try:\n",
+    "    import google.colab\n",
+    "    import subprocess\n",
+    "    import sys\n",
+    "    subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"solc-select\"])\n",
+    "    !solc-select install 0.8.20\n",
+    "    !solc-select use 0.8.20\n",
+    "    !solc --version\n",
+    "    import os\n",
+    "\n",
+    "# rely on local installation if the notebook is not in colab\n",
+    "except:\n",
+    "    import os\n",
+    "    pass"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "res = await ezkl.create_evm_verifier()\n",
+    "assert res == True\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You should see a `Verifier.sol`. Right-click and save it locally.\n",
+    "\n",
+    "Now go to [https://remix.ethereum.org](https://remix.ethereum.org).\n",
+    "\n",
+    "Create a new file within remix and copy the verifier code over.\n",
+    "\n",
+    "Finally, compile the code and deploy. For the demo you can deploy to the test environment within remix.\n",
+    "\n",
+    "If everything works, you will have deployed your verifier onchain! Copy the values in the cell above to the respective fields to test if the verifier is working."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,6 +12,7 @@ asyncio_mode = "auto"

 [project]
 name = "ezkl"
+version = "0.0.0"
 requires-python = ">=3.7"
 classifiers = [
    "Programming Language :: Rust",
--- a/src/bindings/universal.rs
+++ b/src/bindings/universal.rs
@@ -141,10 +141,11 @@ pub(crate) fn gen_vk(
    .map_err(|e| EZKLError::InternalError(format!("Failed to create verifying key: {}", e)))?;

    let mut serialized_vk = Vec::new();
-    vk.write(&mut serialized_vk, halo2_proofs::SerdeFormat::RawBytes)
-        .map_err(|e| {
-            EZKLError::InternalError(format!("Failed to serialize verifying key: {}", e))
-        })?;
+    vk.write(
+        &mut serialized_vk,
+        halo2_proofs::SerdeFormat::RawBytesUnchecked,
+    )
+    .map_err(|e| EZKLError::InternalError(format!("Failed to serialize verifying key: {}", e)))?;

    Ok(serialized_vk)
 }
@@ -165,7 +166,7 @@ pub(crate) fn gen_pk(
    let mut reader = BufReader::new(&vk[..]);
    let vk = VerifyingKey::<G1Affine>::read::<_, GraphCircuit>(
        &mut reader,
-        halo2_proofs::SerdeFormat::RawBytes,
+        halo2_proofs::SerdeFormat::RawBytesUnchecked,
        circuit.settings().clone(),
    )
    .map_err(|e| EZKLError::InternalError(format!("Failed to deserialize verifying key: {}", e)))?;
@@ -197,7 +198,7 @@ pub(crate) fn verify(
    let mut reader = BufReader::new(&vk[..]);
    let vk = VerifyingKey::<G1Affine>::read::<_, GraphCircuit>(
        &mut reader,
-        halo2_proofs::SerdeFormat::RawBytes,
+        halo2_proofs::SerdeFormat::RawBytesUnchecked,
        circuit_settings.clone(),
    )
    .map_err(|e| EZKLError::InternalError(format!("Failed to deserialize vk: {}", e)))?;
@@ -277,7 +278,7 @@ pub(crate) fn verify_aggr(
    let mut reader = BufReader::new(&vk[..]);
    let vk = VerifyingKey::<G1Affine>::read::<_, AggregationCircuit>(
        &mut reader,
-        halo2_proofs::SerdeFormat::RawBytes,
+        halo2_proofs::SerdeFormat::RawBytesUnchecked,
        (),
    )
    .map_err(|e| EZKLError::InternalError(format!("Failed to deserialize vk: {}", e)))?;
@@ -365,7 +366,7 @@ pub(crate) fn prove(
    let mut reader = BufReader::new(&pk[..]);
    let pk = ProvingKey::<G1Affine>::read::<_, GraphCircuit>(
        &mut reader,
-        halo2_proofs::SerdeFormat::RawBytes,
+        halo2_proofs::SerdeFormat::RawBytesUnchecked,
        circuit.settings().clone(),
    )
    .map_err(|e| EZKLError::InternalError(format!("Failed to deserialize proving key: {}", e)))?;
@@ -487,7 +488,7 @@ pub(crate) fn vk_validation(vk: Vec<u8>, settings: Vec<u8>) -> Result<bool, EZKL
    let mut reader = BufReader::new(&vk[..]);
    let _ = VerifyingKey::<G1Affine>::read::<_, GraphCircuit>(
        &mut reader,
-        halo2_proofs::SerdeFormat::RawBytes,
+        halo2_proofs::SerdeFormat::RawBytesUnchecked,
        circuit_settings,
    )
    .map_err(|e| EZKLError::InternalError(format!("Failed to deserialize verifying key: {}", e)))?;
@@ -504,7 +505,7 @@ pub(crate) fn pk_validation(pk: Vec<u8>, settings: Vec<u8>) -> Result<bool, EZKL
    let mut reader = BufReader::new(&pk[..]);
    let _ = ProvingKey::<G1Affine>::read::<_, GraphCircuit>(
        &mut reader,
-        halo2_proofs::SerdeFormat::RawBytes,
+        halo2_proofs::SerdeFormat::RawBytesUnchecked,
        circuit_settings,
    )
    .map_err(|e| EZKLError::InternalError(format!("Failed to deserialize proving key: {}", e)))?;
--- a/src/circuit/ops/layouts.rs
+++ b/src/circuit/ops/layouts.rs
@@ -30,6 +30,8 @@ use crate::{
 use super::*;
 use crate::circuit::ops::lookup::LookupOp;

+const ASCII_ALPHABET: &str = "abcdefghijklmnopqrstuvwxyz";
+
 /// Calculate the L1 distance between two tensors.
 /// ```
 /// use ezkl::tensor::Tensor;
@@ -418,10 +420,6 @@ pub fn dot<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
    values[0].remove_indices(&mut removal_indices, true)?;
    values[1].remove_indices(&mut removal_indices, true)?;

-    let elapsed = global_start.elapsed();
-    trace!("filtering const zero indices took: {:?}", elapsed);
-
-    let start = instant::Instant::now();
    let mut inputs = vec![];
    let block_width = config.custom_gates.output.num_inner_cols();

@@ -429,37 +427,22 @@ pub fn dot<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
    for (i, input) in values.iter_mut().enumerate() {
        input.pad_to_zero_rem(block_width, ValType::Constant(F::ZERO))?;
        let inp = {
-            let (res, len) = region.assign_with_duplication(
-                &config.custom_gates.inputs[i],
-                input,
-                &config.check_mode,
-                false,
-            )?;
+            let (res, len) = region
+                .assign_with_duplication_unconstrained(&config.custom_gates.inputs[i], input)?;
            assigned_len = len;
            res.get_inner()?
        };
        inputs.push(inp);
    }

-    let elapsed = start.elapsed();
-    trace!("assigning inputs took: {:?}", elapsed);
-
    // Now we can assign the dot product
    // time this step
-    let start = instant::Instant::now();
    let accumulated_dot = accumulated::dot(&[inputs[0].clone(), inputs[1].clone()], block_width)?;
-    let elapsed = start.elapsed();
-    trace!("calculating accumulated dot took: {:?}", elapsed);
-
-    let start = instant::Instant::now();
-    let (output, output_assigned_len) = region.assign_with_duplication(
+    let (output, output_assigned_len) = region.assign_with_duplication_constrained(
        &config.custom_gates.output,
        &accumulated_dot.into(),
        &config.check_mode,
-        true,
    )?;
-    let elapsed = start.elapsed();
-    trace!("assigning output took: {:?}", elapsed);

    // enable the selectors
    if !region.is_dummy() {
@@ -1000,7 +983,6 @@ fn select<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
    region: &mut RegionCtx<F>,
    values: &[ValTensor<F>; 2],
 ) -> Result<ValTensor<F>, CircuitError> {
-    let start = instant::Instant::now();
    let (mut input, index) = (values[0].clone(), values[1].clone());
    input.flatten();

@@ -1028,9 +1010,6 @@ fn select<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
    let (_, assigned_output) =
        dynamic_lookup(config, region, &[index, output], &[dim_indices, input])?;

-    let end = start.elapsed();
-    trace!("select took: {:?}", end);
-
    Ok(assigned_output)
 }

@@ -1092,7 +1071,6 @@ pub(crate) fn dynamic_lookup<F: PrimeField + TensorType + PartialOrd + std::hash
    lookups: &[ValTensor<F>; 2],
    tables: &[ValTensor<F>; 2],
 ) -> Result<(ValTensor<F>, ValTensor<F>), CircuitError> {
-    let start = instant::Instant::now();
    // if not all lookups same length err
    if lookups[0].len() != lookups[1].len() {
        return Err(CircuitError::MismatchedLookupLength(
@@ -1126,28 +1104,20 @@ pub(crate) fn dynamic_lookup<F: PrimeField + TensorType + PartialOrd + std::hash
    }
    let table_len = table_0.len();

-    trace!("assigning tables took: {:?}", start.elapsed());
-
    // now create a vartensor of constants for the dynamic lookup index
    let table_index = create_constant_tensor(F::from(dynamic_lookup_index as u64), table_len);
    let _table_index =
        region.assign_dynamic_lookup(&config.dynamic_lookups.tables[2], &table_index)?;

-    trace!("assigning table index took: {:?}", start.elapsed());
-
    let lookup_0 = region.assign(&config.dynamic_lookups.inputs[0], &lookup_0)?;
    let lookup_1 = region.assign(&config.dynamic_lookups.inputs[1], &lookup_1)?;
    let lookup_len = lookup_0.len();

-    trace!("assigning lookups took: {:?}", start.elapsed());
-
    // now set the lookup index
    let lookup_index = create_constant_tensor(F::from(dynamic_lookup_index as u64), lookup_len);

    let _lookup_index = region.assign(&config.dynamic_lookups.inputs[2], &lookup_index)?;

-    trace!("assigning lookup index took: {:?}", start.elapsed());
-
    let mut lookup_block = 0;

    if !region.is_dummy() {
@@ -1194,9 +1164,6 @@ pub(crate) fn dynamic_lookup<F: PrimeField + TensorType + PartialOrd + std::hash
    region.increment_dynamic_lookup_index(1);
    region.increment(lookup_len);

-    let end = start.elapsed();
-    trace!("dynamic lookup took: {:?}", end);
-
    Ok((lookup_0, lookup_1))
 }

@@ -1441,7 +1408,6 @@ pub(crate) fn linearize_element_index<F: PrimeField + TensorType + PartialOrd +
    dim: usize,
    is_flat_index: bool,
 ) -> Result<ValTensor<F>, CircuitError> {
-    let start_time = instant::Instant::now();
    let index = values[0].clone();
    if !is_flat_index {
        assert_eq!(index.dims().len(), dims.len());
@@ -1515,9 +1481,6 @@ pub(crate) fn linearize_element_index<F: PrimeField + TensorType + PartialOrd +

    region.apply_in_loop(&mut output, inner_loop_function)?;

-    let elapsed = start_time.elapsed();
-    trace!("linearize_element_index took: {:?}", elapsed);
-
    Ok(output.into())
 }

@@ -1949,16 +1912,11 @@ pub fn sum<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(

    region.flush()?;
    // time this entire function run
-    let global_start = instant::Instant::now();
-
    let mut values = values.clone();

    // this section has been optimized to death, don't mess with it
    values[0].remove_const_zero_values();

-    let elapsed = global_start.elapsed();
-    trace!("filtering const zero indices took: {:?}", elapsed);
-
    // if empty return a const
    if values[0].is_empty() {
        return Ok(create_zero_tensor(1));
@@ -1970,12 +1928,8 @@ pub fn sum<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
    let input = {
        let mut input = values[0].clone();
        input.pad_to_zero_rem(block_width, ValType::Constant(F::ZERO))?;
-        let (res, len) = region.assign_with_duplication(
-            &config.custom_gates.inputs[1],
-            &input,
-            &config.check_mode,
-            false,
-        )?;
+        let (res, len) =
+            region.assign_with_duplication_unconstrained(&config.custom_gates.inputs[1], &input)?;
        assigned_len = len;
        res.get_inner()?
    };
@@ -1983,11 +1937,10 @@ pub fn sum<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
    // Now we can assign the dot product
    let accumulated_sum = accumulated::sum(&input, block_width)?;

-    let (output, output_assigned_len) = region.assign_with_duplication(
+    let (output, output_assigned_len) = region.assign_with_duplication_constrained(
        &config.custom_gates.output,
        &accumulated_sum.into(),
        &config.check_mode,
-        true,
    )?;

    // enable the selectors
@@ -2053,13 +2006,10 @@ pub fn prod<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
 ) -> Result<ValTensor<F>, CircuitError> {
    region.flush()?;
    // time this entire function run
-    let global_start = instant::Instant::now();

    // this section has been optimized to death, don't mess with it
    let removal_indices = values[0].get_const_zero_indices();

-    let elapsed = global_start.elapsed();
-    trace!("finding const zero indices took: {:?}", elapsed);
    // if empty return a const
    if !removal_indices.is_empty() {
        return Ok(create_zero_tensor(1));
@@ -2070,12 +2020,8 @@ pub fn prod<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
    let input = {
        let mut input = values[0].clone();
        input.pad_to_zero_rem(block_width, ValType::Constant(F::ONE))?;
-        let (res, len) = region.assign_with_duplication(
-            &config.custom_gates.inputs[1],
-            &input,
-            &config.check_mode,
-            false,
-        )?;
+        let (res, len) =
+            region.assign_with_duplication_unconstrained(&config.custom_gates.inputs[1], &input)?;
        assigned_len = len;
        res.get_inner()?
    };
@@ -2083,11 +2029,10 @@ pub fn prod<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
    // Now we can assign the dot product
    let accumulated_prod = accumulated::prod(&input, block_width)?;

-    let (output, output_assigned_len) = region.assign_with_duplication(
+    let (output, output_assigned_len) = region.assign_with_duplication_constrained(
        &config.custom_gates.output,
        &accumulated_prod.into(),
        &config.check_mode,
-        true,
    )?;

    // enable the selectors
@@ -2440,7 +2385,6 @@ pub(crate) fn pairwise<F: PrimeField + TensorType + PartialOrd + std::hash::Hash
    let orig_lhs = lhs.clone();
    let orig_rhs = rhs.clone();

-    let start = instant::Instant::now();
    let first_zero_indices = HashSet::from_iter(lhs.get_const_zero_indices());
    let second_zero_indices = HashSet::from_iter(rhs.get_const_zero_indices());

@@ -2455,7 +2399,6 @@ pub(crate) fn pairwise<F: PrimeField + TensorType + PartialOrd + std::hash::Hash
        BaseOp::Sub => second_zero_indices.clone(),
        _ => return Err(CircuitError::UnsupportedOp),
    };
-    trace!("setting up indices took {:?}", start.elapsed());

    if lhs.len() != rhs.len() {
        return Err(CircuitError::DimMismatch(format!(
@@ -2480,7 +2423,6 @@ pub(crate) fn pairwise<F: PrimeField + TensorType + PartialOrd + std::hash::Hash

    // Now we can assign the dot product
    // time the calc
-    let start = instant::Instant::now();
    let op_result = match op {
        BaseOp::Add => add(&inputs),
        BaseOp::Sub => sub(&inputs),
@@ -2491,20 +2433,13 @@ pub(crate) fn pairwise<F: PrimeField + TensorType + PartialOrd + std::hash::Hash
        error!("{}", e);
        halo2_proofs::plonk::Error::Synthesis
    })?;
-    trace!("pairwise {} calc took {:?}", op.as_str(), start.elapsed());

-    let start = instant::Instant::now();
    let assigned_len = op_result.len() - removal_indices.len();
    let mut output = region.assign_with_omissions(
        &config.custom_gates.output,
        &op_result.into(),
        &removal_indices,
    )?;
-    trace!(
-        "pairwise {} input assign took {:?}",
-        op.as_str(),
-        start.elapsed()
-    );

    // Enable the selectors
    if !region.is_dummy() {
@@ -2671,9 +2606,7 @@ pub fn greater<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
    rhs.expand(&broadcasted_shape)?;

    let diff = pairwise(config, region, &[lhs, rhs], BaseOp::Sub)?;
-
    let sign = sign(config, region, &[diff])?;
-
    equals(config, region, &[sign, create_unit_tensor(1)])
 }

@@ -5286,75 +5219,72 @@ pub(crate) fn decompose<F: PrimeField + TensorType + PartialOrd + std::hash::Has
    base: &usize,
    n: &usize,
 ) -> Result<ValTensor<F>, CircuitError> {
-    let input = values[0].clone();
+    let mut input = values[0].clone();

    let is_assigned = !input.all_prev_assigned();

-    let bases: ValTensor<F> = Tensor::from(
-        (0..*n)
-            .rev()
-            .map(|x| ValType::Constant(integer_rep_to_felt(base.pow(x as u32) as IntegerRep))),
+    if !is_assigned {
+        input = region.assign(&config.custom_gates.inputs[0], &input)?;
+    }
+
+    let mut bases: ValTensor<F> = Tensor::from(
+        // repeat it input.len() times
+        (0..input.len()).flat_map(|_| {
+            (0..*n)
+                .rev()
+                .map(|x| ValType::Constant(integer_rep_to_felt(base.pow(x as u32) as IntegerRep)))
+        }),
    )
    .into();
+    let mut bases_dims = input.dims().to_vec();
+    bases_dims.push(*n);
+    bases.reshape(&bases_dims)?;

-    let cartesian_coord = input
-        .dims()
-        .iter()
-        .map(|x| 0..*x)
-        .multi_cartesian_product()
-        .collect::<Vec<_>>();
+    let mut decomposed_dims = input.dims().to_vec();
+    decomposed_dims.push(*n + 1);

-    let mut output: Tensor<Tensor<ValType<F>>> = Tensor::new(None, input.dims())?;
+    let claimed_output = if region.witness_gen() {
+        input.decompose(*base, *n)?
+    } else {
+        let decomposed_len = decomposed_dims.iter().product();
+        let claimed_output = Tensor::new(
+            Some(&vec![ValType::Value(Value::unknown()); decomposed_len]),
+            &decomposed_dims,
+        )?;

-    let inner_loop_function =
-        |i: usize, region: &mut RegionCtx<F>| -> Result<Tensor<ValType<F>>, CircuitError> {
-            let coord = cartesian_coord[i].clone();
-            let slice = coord.iter().map(|x| *x..*x + 1).collect::<Vec<_>>();
-            let mut sliced_input = input.get_slice(&slice)?;
-            sliced_input.flatten();
+        claimed_output.into()
+    };
+    region.assign(&config.custom_gates.output, &claimed_output)?;
+    region.increment(claimed_output.len());

-            if !is_assigned {
-                sliced_input = region.assign(&config.custom_gates.inputs[0], &sliced_input)?;
-            }
+    let input_slice = input.dims().iter().map(|x| 0..*x).collect::<Vec<_>>();
+    let mut sign_slice = input_slice.clone();
+    sign_slice.push(0..1);
+    let mut rest_slice = input_slice.clone();
+    rest_slice.push(1..n + 1);

-            let mut claimed_output_slice = if region.witness_gen() {
-                sliced_input.decompose(*base, *n)?
-            } else {
-                Tensor::from(vec![ValType::Value(Value::unknown()); *n + 1].into_iter()).into()
-            };
+    let sign = claimed_output.get_slice(&sign_slice)?;
+    let rest = claimed_output.get_slice(&rest_slice)?;

-            claimed_output_slice =
-                region.assign(&config.custom_gates.inputs[1], &claimed_output_slice)?;
-            claimed_output_slice.flatten();
+    let sign = range_check(config, region, &[sign], &(-1, 1))?;
+    let rest = range_check(config, region, &[rest], &(0, (*base - 1) as i128))?;

-            region.increment(claimed_output_slice.len());
+    // equation needs to be constructed as ij,ij->i but for arbitrary n dims we need to construct this dynamically
+    // indices should map in order of the alphabet
+    // start with lhs
+    let lhs = ASCII_ALPHABET.chars().take(rest.dims().len()).join("");
+    let rhs = ASCII_ALPHABET.chars().take(rest.dims().len() - 1).join("");
+    let equation = format!("{},{}->{}", lhs, lhs, rhs);

-            // get the sign bit and make sure it is valid
-            let sign = claimed_output_slice.first()?;
-            let sign = range_check(config, region, &[sign], &(-1, 1))?;
+    // now add the rhs

-            // get the rest of the thing and make sure it is in the correct range
-            let rest = claimed_output_slice.get_slice(&[1..claimed_output_slice.len()])?;
+    let prod_decomp = einsum(config, region, &[rest.clone(), bases], &equation)?;

-            let rest = range_check(config, region, &[rest], &(0, (base - 1) as i128))?;
+    let signed_decomp = pairwise(config, region, &[prod_decomp, sign], BaseOp::Mult)?;

-            let prod_decomp = dot(config, region, &[rest, bases.clone()])?;
+    enforce_equality(config, region, &[input, signed_decomp])?;

-            let signed_decomp = pairwise(config, region, &[prod_decomp, sign], BaseOp::Mult)?;
-
-            enforce_equality(config, region, &[sliced_input, signed_decomp])?;
-
-            Ok(claimed_output_slice.get_inner_tensor()?.clone())
-        };
-
-    region.apply_in_loop(&mut output, inner_loop_function)?;
-
-    let mut combined_output = output.combine()?;
-    let mut output_dims = input.dims().to_vec();
-    output_dims.push(*n + 1);
-    combined_output.reshape(&output_dims)?;
-
-    Ok(combined_output.into())
+    Ok(claimed_output)
 }

 pub(crate) fn sign<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
--- a/src/circuit/ops/region.rs
+++ b/src/circuit/ops/region.rs
@@ -671,22 +671,17 @@ impl<'a, F: PrimeField + TensorType + PartialOrd + std::hash::Hash> RegionCtx<'a
    }

    /// Assign a valtensor to a vartensor with duplication
-    pub fn assign_with_duplication(
+    pub fn assign_with_duplication_unconstrained(
        &mut self,
        var: &VarTensor,
        values: &ValTensor<F>,
-        check_mode: &crate::circuit::CheckMode,
-        single_inner_col: bool,
    ) -> Result<(ValTensor<F>, usize), Error> {
        if let Some(region) = &self.region {
            // duplicates every nth element to adjust for column overflow
-            let (res, len) = var.assign_with_duplication(
+            let (res, len) = var.assign_with_duplication_unconstrained(
                &mut region.borrow_mut(),
-                self.row,
                self.linear_coord,
                values,
-                check_mode,
-                single_inner_col,
                &mut self.assigned_constants,
            )?;
            Ok((res, len))
@@ -695,7 +690,37 @@ impl<'a, F: PrimeField + TensorType + PartialOrd + std::hash::Hash> RegionCtx<'a
                self.row,
                self.linear_coord,
                values,
-                single_inner_col,
+                false,
+                &mut self.assigned_constants,
+            )?;
+            Ok((values.clone(), len))
+        }
+    }
+
+    /// Assign a valtensor to a vartensor with duplication (constrained variant: forwards `self.row` and `check_mode` to the underlying assignment)
+    pub fn assign_with_duplication_constrained(
+        &mut self,
+        var: &VarTensor,
+        values: &ValTensor<F>,
+        check_mode: &crate::circuit::CheckMode,
+    ) -> Result<(ValTensor<F>, usize), Error> {
+        if let Some(region) = &self.region {
+            // duplicates every nth element to adjust for column overflow
+            let (res, len) = var.assign_with_duplication_constrained(
+                &mut region.borrow_mut(),
+                self.row,
+                self.linear_coord,
+                values,
+                check_mode,
+                &mut self.assigned_constants,
+            )?;
+            Ok((res, len))
+        } else {
+            let (_, len) = var.dummy_assign_with_duplication(
+                self.row,
+                self.linear_coord,
+                values,
+                true,
                &mut self.assigned_constants,
            )?;
            Ok((values.clone(), len))
--- a/src/eth.rs
+++ b/src/eth.rs
@@ -488,7 +488,8 @@ pub async fn deploy_da_verifier_via_solidity(
        }
    }

-    let contract = match call_to_account {
+    
+    match call_to_account {
        Some(call) => {
            deploy_single_da_contract(
                client,
@@ -514,8 +515,7 @@ pub async fn deploy_da_verifier_via_solidity(
            )
            .await
        }
-    };
-    return contract;
+    }
 }

 async fn deploy_multi_da_contract(
@@ -630,7 +630,7 @@ async fn deploy_single_da_contract(
            // bytes memory _callData,
            PackedSeqToken(call_data.as_ref()),
            // uint256 _decimals,
-            WordToken(B256::from(decimals).into()),
+            WordToken(B256::from(decimals)),
            // uint[] memory _scales,
            DynSeqToken(
                scales
--- a/src/graph/model.rs
+++ b/src/graph/model.rs
@@ -1226,6 +1226,7 @@ impl Model {
                values.iter().map(|v| v.dims()).collect_vec()
            );

+            let start = instant::Instant::now();
            match &node {
                NodeType::Node(n) => {
                    let res = if node.is_constant() && node.num_uses() == 1 {
@@ -1363,6 +1364,7 @@ impl Model {
                    results.insert(*idx, full_results);
                }
            }
+            debug!("------------ layout of {} took {:?}", idx, start.elapsed());
        }

        // we do this so we can support multiple passes of the same model and have deterministic results (Non-assigned inputs etc... etc...)
--- a/src/graph/utilities.rs
+++ b/src/graph/utilities.rs
@@ -142,8 +142,6 @@ use tract_onnx::prelude::SymbolValues;
 pub fn extract_tensor_value(
    input: Arc<tract_onnx::prelude::Tensor>,
 ) -> Result<Tensor<f32>, GraphError> {
-    use maybe_rayon::prelude::{IntoParallelRefIterator, ParallelIterator};
-
    let dt = input.datum_type();
    let dims = input.shape().to_vec();

@@ -156,7 +154,7 @@ pub fn extract_tensor_value(
    match dt {
        DatumType::F16 => {
            let vec = input.as_slice::<tract_onnx::prelude::f16>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| (*x).into()).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| (*x).into()).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::F32 => {
@@ -165,61 +163,61 @@ pub fn extract_tensor_value(
        }
        DatumType::F64 => {
            let vec = input.as_slice::<f64>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| *x as f32).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| *x as f32).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::I64 => {
            // Generally a shape or hyperparam
            let vec = input.as_slice::<i64>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| *x as f32).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| *x as f32).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::I32 => {
            // Generally a shape or hyperparam
            let vec = input.as_slice::<i32>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| *x as f32).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| *x as f32).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::I16 => {
            // Generally a shape or hyperparam
            let vec = input.as_slice::<i16>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| *x as f32).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| *x as f32).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::I8 => {
            // Generally a shape or hyperparam
            let vec = input.as_slice::<i8>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| *x as f32).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| *x as f32).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::U8 => {
            // Generally a shape or hyperparam
            let vec = input.as_slice::<u8>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| *x as f32).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| *x as f32).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::U16 => {
            // Generally a shape or hyperparam
            let vec = input.as_slice::<u16>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| *x as f32).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| *x as f32).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::U32 => {
            // Generally a shape or hyperparam
            let vec = input.as_slice::<u32>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| *x as f32).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| *x as f32).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::U64 => {
            // Generally a shape or hyperparam
            let vec = input.as_slice::<u64>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| *x as f32).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| *x as f32).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::Bool => {
            // Generally a shape or hyperparam
            let vec = input.as_slice::<bool>()?.to_vec();
-            let cast: Vec<f32> = vec.par_iter().map(|x| *x as usize as f32).collect();
+            let cast: Vec<f32> = vec.iter().map(|x| *x as usize as f32).collect();
            const_value = Tensor::<f32>::new(Some(&cast), &dims)?;
        }
        DatumType::TDim => {
@@ -227,7 +225,7 @@ pub fn extract_tensor_value(
            let vec = input.as_slice::<tract_onnx::prelude::TDim>()?.to_vec();

            let cast: Result<Vec<f32>, GraphError> = vec
-                .par_iter()
+                .iter()
                .map(|x| match x.to_i64() {
                    Ok(v) => Ok(v as f32),
                    Err(_) => match x.to_i64() {
@@ -1136,23 +1134,21 @@ pub fn new_op_from_onnx(
                        a: crate::circuit::utils::F32(exponent),
                    })
                }
-            } else {
-                if let Some(c) = inputs[0].opkind().get_mutable_constant() {
-                    inputs[0].decrement_use();
-                    deleted_indices.push(0);
-                    if c.raw_values.len() > 1 {
-                        unimplemented!("only support scalar base")
-                    }
-
-                    let base = c.raw_values[0];
-
-                    SupportedOp::Nonlinear(LookupOp::Exp {
-                        scale: scale_to_multiplier(input_scales[1]).into(),
-                        base: base.into(),
-                    })
-                } else {
-                    unimplemented!("only support constant base or pow for now")
+            } else if let Some(c) = inputs[0].opkind().get_mutable_constant() {
+                inputs[0].decrement_use();
+                deleted_indices.push(0);
+                if c.raw_values.len() > 1 {
+                    unimplemented!("only support scalar base")
                }
+
+                let base = c.raw_values[0];
+
+                SupportedOp::Nonlinear(LookupOp::Exp {
+                    scale: scale_to_multiplier(input_scales[1]).into(),
+                    base: base.into(),
+                })
+            } else {
+                unimplemented!("only support constant base or pow for now")
            }
        }
        "Div" => {
--- a/src/tensor/mod.rs
+++ b/src/tensor/mod.rs
@@ -638,42 +638,44 @@ impl<T: Clone + TensorType> Tensor<T> {
    where
        T: Send + Sync,
    {
-        if indices.is_empty() {
+        // Fast path: empty indices or full tensor slice
+        if indices.is_empty()
+            || indices.iter().map(|x| x.end - x.start).collect::<Vec<_>>() == self.dims
+        {
            return Ok(self.clone());
        }
+
+        // Validate dimensions
        if self.dims.len() < indices.len() {
            return Err(TensorError::DimError(format!(
                "The dimensionality of the slice {:?} is greater than the tensor's {:?}",
                indices, self.dims
            )));
-        } else if indices.iter().map(|x| x.end - x.start).collect::<Vec<_>>() == self.dims {
-            // else if slice is the same as dims, return self
-            return Ok(self.clone());
        }

-        // if indices weren't specified we fill them in as required
-        let mut full_indices = indices.to_vec();
+        // Pre-allocate the full indices vector with capacity
+        let mut full_indices = Vec::with_capacity(self.dims.len());
+        full_indices.extend_from_slice(indices);

-        for i in 0..(self.dims.len() - indices.len()) {
-            full_indices.push(0..self.dims()[indices.len() + i])
-        }
+        // Fill remaining dimensions
+        full_indices.extend((indices.len()..self.dims.len()).map(|i| 0..self.dims[i]));

-        let cartesian_coord: Vec<Vec<usize>> = full_indices
+        // Pre-calculate total size and allocate result vector
+        let total_size: usize = full_indices
            .iter()
-            .cloned()
-            .multi_cartesian_product()
-            .collect();
-
-        let res: Vec<T> = cartesian_coord
-            .par_iter()
-            .map(|e| {
-                let index = self.get_index(e);
-                self[index].clone()
-            })
-            .collect();
+            .map(|range| range.end - range.start)
+            .product();
+        let mut res = Vec::with_capacity(total_size);

+        // Calculate new dimensions once
        let dims: Vec<usize> = full_indices.iter().map(|e| e.end - e.start).collect();

+        // Use iterator directly without collecting into intermediate Vec
+        for coord in full_indices.iter().cloned().multi_cartesian_product() {
+            let index = self.get_index(&coord);
+            res.push(self[index].clone());
+        }
+
        Tensor::new(Some(&res), &dims)
    }

@@ -831,7 +833,7 @@ impl<T: Clone + TensorType> Tensor<T> {
        num_repeats: usize,
        initial_offset: usize,
    ) -> Result<Tensor<T>, TensorError> {
-        let mut inner: Vec<T> = vec![];
+        let mut inner: Vec<T> = Vec::with_capacity(self.inner.len());
        let mut offset = initial_offset;
        for (i, elem) in self.inner.clone().into_iter().enumerate() {
            if (i + offset + 1) % n == 0 {
@@ -860,20 +862,22 @@ impl<T: Clone + TensorType> Tensor<T> {
        num_repeats: usize,
        initial_offset: usize,
    ) -> Result<Tensor<T>, TensorError> {
-        let mut inner: Vec<T> = vec![];
-        let mut indices_to_remove = std::collections::HashSet::new();
-        for i in 0..self.inner.len() {
-            if (i + initial_offset + 1) % n == 0 {
-                for j in 1..(1 + num_repeats) {
-                    indices_to_remove.insert(i + j);
-                }
-            }
-        }
+        // Pre-calculate capacity to avoid reallocations
+        let estimated_size = self.inner.len() - (self.inner.len() / n) * num_repeats;
+        let mut inner = Vec::with_capacity(estimated_size);

-        let old_inner = self.inner.clone();
-        for (i, elem) in old_inner.into_iter().enumerate() {
-            if !indices_to_remove.contains(&i) {
-                inner.push(elem.clone());
+        // Use iterator directly instead of creating intermediate collections
+        let mut i = 0;
+        while i < self.inner.len() {
+            // Add the current element
+            inner.push(self.inner[i].clone());
+
+            // If this is an nth position (accounting for offset)
+            if (i + initial_offset + 1) % n == 0 {
+                // Skip the next num_repeats elements
+                i += num_repeats + 1;
+            } else {
+                i += 1;
            }
        }

--- a/src/tensor/val.rs
+++ b/src/tensor/val.rs
@@ -1,12 +1,12 @@
 use crate::{circuit::region::ConstantsMap, fieldutils::felt_to_integer_rep};
-use maybe_rayon::slice::Iter;
+use maybe_rayon::slice::{Iter, ParallelSlice};

 use super::{
    ops::{intercalate_values, pad, resize},
    *,
 };
 use halo2_proofs::{arithmetic::Field, circuit::Cell, plonk::Instance};
-use maybe_rayon::iter::{FilterMap, IntoParallelIterator, ParallelIterator};
+use maybe_rayon::iter::{FilterMap, ParallelIterator};

 pub(crate) fn create_constant_tensor<
    F: PrimeField + TensorType + std::marker::Send + std::marker::Sync + PartialOrd,
@@ -455,7 +455,7 @@ impl<F: PrimeField + TensorType + PartialOrd + std::hash::Hash> ValTensor<F> {
        }
    }

-    /// Returns the number of constants in the [ValTensor].
+    /// Returns an iterator over the [ValTensor]'s constants.
    pub fn create_constants_map_iterator(
        &self,
    ) -> FilterMap<Iter<'_, ValType<F>>, fn(&ValType<F>) -> Option<(F, ValType<F>)>> {
@@ -473,20 +473,48 @@ impl<F: PrimeField + TensorType + PartialOrd + std::hash::Hash> ValTensor<F> {
        }
    }

-    /// Returns the number of constants in the [ValTensor].
+    /// Returns a map of the constants in the [ValTensor].
    pub fn create_constants_map(&self) -> ConstantsMap<F> {
-        match self {
-            ValTensor::Value { inner, .. } => inner
-                .par_iter()
-                .filter_map(|x| {
-                    if let ValType::Constant(v) = x {
-                        Some((*v, x.clone()))
-                    } else {
-                        None
-                    }
-                })
-                .collect(),
-            ValTensor::Instance { .. } => ConstantsMap::new(),
+        let threshold = 1_000_000; // Tuned using the benchmarks
+
+        if self.len() < threshold {
+            match self {
+                ValTensor::Value { inner, .. } => inner
+                    .par_iter()
+                    .filter_map(|x| {
+                        if let ValType::Constant(v) = x {
+                            Some((*v, x.clone()))
+                        } else {
+                            None
+                        }
+                    })
+                    .collect(),
+                ValTensor::Instance { .. } => ConstantsMap::new(),
+            }
+        } else {
+            // Use parallel for larger arrays
+            let num_cores = std::thread::available_parallelism()
+                .map(|n| n.get())
+                .unwrap_or(1);
+            let chunk_size = (self.len() / num_cores).max(100_000);
+
+            match self {
+                ValTensor::Value { inner, .. } => inner
+                    .par_chunks(chunk_size)
+                    .flat_map(|chunk| {
+                        chunk
+                            .par_iter() // Make sure we use par_iter() here
+                            .filter_map(|x| {
+                                if let ValType::Constant(v) = x {
+                                    Some((*v, x.clone()))
+                                } else {
+                                    None
+                                }
+                            })
+                    })
+                    .collect(),
+                ValTensor::Instance { .. } => ConstantsMap::new(),
+            }
        }
    }

@@ -878,70 +906,161 @@ impl<F: PrimeField + TensorType + PartialOrd + std::hash::Hash> ValTensor<F> {

    /// remove constant zero values constants
    pub fn remove_const_zero_values(&mut self) {
-        match self {
-            ValTensor::Value { inner: v, dims, .. } => {
-                *v = v
-                    .clone()
-                    .into_par_iter()
-                    .filter_map(|e| {
-                        if let ValType::Constant(r) = e {
-                            if r == F::ZERO {
-                                return None;
+        let size_threshold = 1_000_000; // Tuned using the benchmarks
+
+        if self.len() < size_threshold {
+            match self {
+                ValTensor::Value { inner: v, dims, .. } => {
+                    *v = v
+                        .clone()
+                        .into_iter()
+                        .filter_map(|e| {
+                            if let ValType::Constant(r) = e {
+                                if r == F::ZERO {
+                                    return None;
+                                }
+                            } else if let ValType::AssignedConstant(_, r) = e {
+                                if r == F::ZERO {
+                                    return None;
+                                }
                            }
-                        } else if let ValType::AssignedConstant(_, r) = e {
-                            if r == F::ZERO {
-                                return None;
-                            }
-                        }
-                        Some(e)
-                    })
-                    .collect();
-                *dims = v.dims().to_vec();
+                            Some(e)
+                        })
+                        .collect();
+                    *dims = v.dims().to_vec();
+                }
+                ValTensor::Instance { .. } => {}
+            }
+        } else {
+            // Use parallel for larger arrays
+            let num_cores = std::thread::available_parallelism()
+                .map(|n| n.get())
+                .unwrap_or(1);
+            let chunk_size = (self.len() / num_cores).max(100_000);
+
+            match self {
+                ValTensor::Value { inner: v, dims, .. } => {
+                    *v = v
+                        .par_chunks_mut(chunk_size)
+                        .flat_map(|chunk| {
+                            chunk
+                                .par_iter_mut() // Make sure we use par_iter() here
+                                .filter_map(|e| {
+                                    if let ValType::Constant(r) = e {
+                                        if *r == F::ZERO {
+                                            return None;
+                                        }
+                                    } else if let ValType::AssignedConstant(_, r) = e {
+                                        if *r == F::ZERO {
+                                            return None;
+                                        }
+                                    }
+                                    Some(e.clone())
+                                })
+                        })
+                        .collect();
+                    *dims = v.dims().to_vec();
+                }
+                ValTensor::Instance { .. } => {}
            }
-            ValTensor::Instance { .. } => {}
        }
    }

-    /// gets constants
+    /// gets the indices of the constant values that are equal to zero
    pub fn get_const_zero_indices(&self) -> Vec<usize> {
-        match self {
-            ValTensor::Value { inner: v, .. } => v
-                .par_iter()
-                .enumerate()
-                .filter_map(|(i, e)| {
-                    if let ValType::Constant(r) = e {
-                        if *r == F::ZERO {
-                            return Some(i);
+        let size_threshold = 1_000_000; // Tuned using the benchmarks
+
+        if self.len() < size_threshold {
+            // Use single-threaded for smaller arrays
+            match &self {
+                ValTensor::Value { inner: v, .. } => v
+                    .iter()
+                    .enumerate()
+                    .filter_map(|(i, e)| {
+                        match e {
+                            // Combine both match arms to reduce branching
+                            ValType::Constant(r) | ValType::AssignedConstant(_, r) => {
+                                (*r == F::ZERO).then_some(i)
+                            }
+                            _ => None,
                        }
-                    } else if let ValType::AssignedConstant(_, r) = e {
-                        if *r == F::ZERO {
-                            return Some(i);
-                        }
-                    }
-                    None
-                })
-                .collect(),
-            ValTensor::Instance { .. } => vec![],
+                    })
+                    .collect(),
+                ValTensor::Instance { .. } => vec![],
+            }
+        } else {
+            // Use parallel for larger arrays
+            let num_cores = std::thread::available_parallelism()
+                .map(|n| n.get())
+                .unwrap_or(1);
+            let chunk_size = (self.len() / num_cores).max(100_000);
+
+            match &self {
+                ValTensor::Value { inner: v, .. } => v
+                    .par_chunks(chunk_size)
+                    .enumerate()
+                    .flat_map(|(chunk_idx, chunk)| {
+                        chunk
+                            .par_iter() // Make sure we use par_iter() here
+                            .enumerate()
+                            .filter_map(move |(i, e)| match e {
+                                ValType::Constant(r) | ValType::AssignedConstant(_, r) => {
+                                    (*r == F::ZERO).then_some(chunk_idx * chunk_size + i)
+                                }
+                                _ => None,
+                            })
+                    })
+                    .collect::<Vec<_>>(),
+                ValTensor::Instance { .. } => vec![],
+            }
        }
    }

-    /// gets constants
+    /// gets constant indices
    pub fn get_const_indices(&self) -> Vec<usize> {
-        match self {
-            ValTensor::Value { inner: v, .. } => v
-                .par_iter()
-                .enumerate()
-                .filter_map(|(i, e)| {
-                    if let ValType::Constant(_) = e {
-                        Some(i)
-                    } else if let ValType::AssignedConstant(_, _) = e {
-                        Some(i)
-                    } else {
-                        None
-                    }
-                })
-                .collect(),
-            ValTensor::Instance { .. } => vec![],
+        let size_threshold = 1_000_000; // Tuned using the benchmarks
+
+        if self.len() < size_threshold {
+            // Use single-threaded for smaller arrays
+            match &self {
+                ValTensor::Value { inner: v, .. } => v
+                    .iter()
+                    .enumerate()
+                    .filter_map(|(i, e)| {
+                        match e {
+                            // Combine both match arms to reduce branching
+                            ValType::Constant(_) | ValType::AssignedConstant(_, _) => Some(i),
+                            _ => None,
+                        }
+                    })
+                    .collect(),
+                ValTensor::Instance { .. } => vec![],
+            }
+        } else {
+            // Use parallel for larger arrays
+            let num_cores = std::thread::available_parallelism()
+                .map(|n| n.get())
+                .unwrap_or(1);
+            let chunk_size = (self.len() / num_cores).max(100_000);
+
+            match &self {
+                ValTensor::Value { inner: v, .. } => v
+                    .par_chunks(chunk_size)
+                    .enumerate()
+                    .flat_map(|(chunk_idx, chunk)| {
+                        chunk
+                            .par_iter() // Make sure we use par_iter() here
+                            .enumerate()
+                            .filter_map(move |(i, e)| match e {
+                                ValType::Constant(_) | ValType::AssignedConstant(_, _) => {
+                                    Some(chunk_idx * chunk_size + i)
+                                }
+                                _ => None,
+                            })
+                    })
+                    .collect::<Vec<_>>(),
+                ValTensor::Instance { .. } => vec![],
+            }
        }
    }

--- a/src/tensor/var.rs
+++ b/src/tensor/var.rs
@@ -494,16 +494,56 @@ impl VarTensor {
        }
    }

+    /// Assigns specific values (`ValTensor`) to the columns of the inner tensor but allows for column wrapping for accumulated operations; unlike the constrained variant, no copy constraint is added between the duplicated cells here.
+    pub fn assign_with_duplication_unconstrained<
+        F: PrimeField + TensorType + PartialOrd + std::hash::Hash,
+    >(
+        &self,
+        region: &mut Region<F>,
+        offset: usize,
+        values: &ValTensor<F>,
+        constants: &mut ConstantsMap<F>,
+    ) -> Result<(ValTensor<F>, usize), halo2_proofs::plonk::Error> {
+        match values {
+            ValTensor::Instance { .. } => unimplemented!("duplication is not supported on instance columns. increase K if you require more rows."),
+            ValTensor::Value { inner: v, dims , ..} => {
+
+                let duplication_freq = self.block_size();
+
+                let num_repeats = self.num_inner_cols();
+
+                let duplication_offset = offset;
+
+                // duplicates every nth element to adjust for column overflow
+                let v = v.duplicate_every_n(duplication_freq, num_repeats, duplication_offset).unwrap();
+                let mut res: ValTensor<F> = {
+                    v.enum_map(|coord, k| {
+                    let cell = self.assign_value(region, offset, k.clone(), coord, constants)?;
+                    Ok::<_, halo2_proofs::plonk::Error>(cell)
+
+                })?.into()};
+                let total_used_len = res.len();
+                res.remove_every_n(duplication_freq, num_repeats, duplication_offset).unwrap();
+
+                res.reshape(dims).unwrap();
+                res.set_scale(values.scale());
+
+                Ok((res, total_used_len))
+            }
+        }
+    }
+
    /// Assigns specific values (`ValTensor`) to the columns of the inner tensor but allows for column wrapping for accumulated operations.
    /// Duplication occurs by copying the last cell of the column to the first cell next column and creating a copy constraint between the two.
-    pub fn assign_with_duplication<F: PrimeField + TensorType + PartialOrd + std::hash::Hash>(
+    pub fn assign_with_duplication_constrained<
+        F: PrimeField + TensorType + PartialOrd + std::hash::Hash,
+    >(
        &self,
        region: &mut Region<F>,
        row: usize,
        offset: usize,
        values: &ValTensor<F>,
        check_mode: &CheckMode,
-        single_inner_col: bool,
        constants: &mut ConstantsMap<F>,
    ) -> Result<(ValTensor<F>, usize), halo2_proofs::plonk::Error> {
        let mut prev_cell = None;
@@ -512,34 +552,16 @@ impl VarTensor {
            ValTensor::Instance { .. } => unimplemented!("duplication is not supported on instance columns. increase K if you require more rows."),
            ValTensor::Value { inner: v, dims , ..} => {

-                let duplication_freq = if single_inner_col {
-                    self.col_size()
-                } else {
-                    self.block_size()
-                };
-
-                let num_repeats = if single_inner_col {
-                    1
-                } else {
-                    self.num_inner_cols()
-                };
-
-                let duplication_offset = if single_inner_col {
-                    row
-                } else {
-                    offset
-                };
+                let duplication_freq = self.col_size();
+                let num_repeats = 1;
+                let duplication_offset = row;

                // duplicates every nth element to adjust for column overflow
                let v = v.duplicate_every_n(duplication_freq, num_repeats, duplication_offset).unwrap();
                let mut res: ValTensor<F> = {
                    v.enum_map(|coord, k| {

-                    let step = if !single_inner_col {
-                        1
-                    } else {
-                        self.num_inner_cols()
-                    };
+                    let step = self.num_inner_cols();

                    let (x, y, z) = self.cartesian_coord(offset + coord * step);
                    if matches!(check_mode, CheckMode::SAFE) && coord > 0 && z == 0 && y == 0 {
@@ -549,11 +571,13 @@ impl VarTensor {

                    let cell = self.assign_value(region, offset, k.clone(), coord * step, constants)?;

-                    if single_inner_col {
-                    if z == 0 {
+                    let at_end_of_column = z == duplication_freq - 1;
+                    let at_beginning_of_column = z == 0;
+
+                    if at_end_of_column {
                        // if we are at the end of the column, we need to copy the cell to the next column
                        prev_cell = Some(cell.clone());
-                    } else if coord > 0 && z == 0 && single_inner_col {
+                    } else if coord > 0 && at_beginning_of_column  {
                        if let Some(prev_cell) = prev_cell.as_ref() {
                            let cell = cell.cell().ok_or({
                                error!("Error getting cell: {:?}", (x,y));
@@ -563,10 +587,10 @@ impl VarTensor {
                                halo2_proofs::plonk::Error::Synthesis})?;
                            region.constrain_equal(prev_cell,cell)?;
                        } else {
-                            error!("Error copy-constraining previous value: {:?}", (x,y));
+                            error!("Previous cell was not set");
                            return Err(halo2_proofs::plonk::Error::Synthesis);
                        }
-                    }}
+                    }

                    Ok(cell)

@@ -577,20 +601,6 @@ impl VarTensor {
                res.reshape(dims).unwrap();
                res.set_scale(values.scale());

-                if matches!(check_mode, CheckMode::SAFE) {
-                     // during key generation this will be 0 so we use this as a flag to check
-                     // TODO: this isn't very safe and would be better to get the phase directly
-                    let res_evals = res.int_evals().unwrap();
-                    let is_assigned = res_evals
-                    .iter()
-                    .all(|&x| x == 0);
-                    if !is_assigned {
-                        assert_eq!(
-                           values.int_evals().unwrap(),
-                           res_evals
-                    )};
-                }
-
                Ok((res, total_used_len))
            }
        }
--- a/tests/py_integration_tests.rs
+++ b/tests/py_integration_tests.rs
@@ -68,6 +68,8 @@ mod py_tests {
                    "install",
                    "torch-geometric==2.5.2",
                    "torch==2.2.2",
+                    "datasets==3.2.0",
+                    "torchtext==0.17.2",
                    "torchvision==0.17.2",
                    "pandas==2.2.1",
                    "numpy==1.26.4",
@@ -190,6 +192,16 @@ mod py_tests {
            }
            });

+            #[test]
+            fn neural_bag_of_words_notebook() {
+                crate::py_tests::init_binary();
+                let test_dir: TempDir = TempDir::new("neural_bow").unwrap();
+                let path = test_dir.path().to_str().unwrap();
+                crate::py_tests::mv_test_(path, "neural_bow.ipynb");
+                run_notebook(path, "neural_bow.ipynb");
+                test_dir.close().unwrap();
+            }
+
            #[test]
            fn felt_conversion_test_notebook() {
                crate::py_tests::init_binary();
Author	SHA1	Message	Date
github-actions[bot]	edd478dd8c	ci: update version string in docs	2024-12-31 12:28:24 +00:00
dante	c839a30ae6	fix: clearer duplication functions (#895 )	2024-12-31 07:28:02 -05:00
dante	352812b9ac	refactor!: simplified decompose op (#892 )	2024-12-30 13:44:03 -05:00
dante	d48d0b0b3e	fix: `get_slice` should not use intermediate `Vec` (#894 )	2024-12-27 23:26:22 -05:00
Jseam	8b223354cc	fix: add version string and sed (#893 )	2024-12-27 14:24:28 -05:00
dante	caa6ef8e16	fix: const filtering strat is size dependent (#891 )	2024-12-27 09:43:59 -05:00
Artem	c4354c10a5	fix: ios bindings update action (#886 )	2024-12-16 10:49:13 -05:00
dante	c1ce8c88d0	chore: rm wasm serialization checks (#890 )	2024-12-12 22:20:29 -05:00
dante	876a9584a1	chore: optimize wasm bundle for speed over size (#889 )	2024-12-12 15:35:17 -05:00
dante	7d7f049cc4	chore: neural bag of words example (#888 )	2024-12-12 14:20:21 -05:00