fix(gpu): vector find lut

fix(gpu): vector find lut number
fix(gpu): multiplication fix lut
2026-04-28 03:01:21 -04:00 · 2026-02-25 17:34:22 +01:00 · 2026-02-25 11:39:04 +01:00 · 2026-02-24 22:12:34 +01:00 · 2026-02-24 18:15:55 +01:00 · 2026-02-24 17:23:30 +01:00
151 changed files with 5987 additions and 11266 deletions
--- a/.github/actions/gpu_setup/action.yml
+++ b/.github/actions/gpu_setup/action.yml
@@ -68,12 +68,6 @@ runs:
        echo "${CUDA_KEYRING_SHA} ${CUDA_KEYRING_PACKAGE}" > checksum
        sha256sum -c checksum
        sudo dpkg -i "${CUDA_KEYRING_PACKAGE}"
-
-        # Disable unattended-upgrades to avoid lock issues
-        sudo systemctl disable --now unattended-upgrades
-
-        sudo apt-get clean
-        sudo rm -rf /var/lib/apt/lists/*
        sudo apt update
        sudo apt -y install cuda-toolkit-"${TOOLKIT_VERSION}"

--- a/.github/workflows/benchmark_cpu.yml
+++ b/.github/workflows/benchmark_cpu.yml
@@ -14,7 +14,6 @@ on:
          - signed_integer
          - integer_compression
          - integer_zk
-          - msm_zk
          - shortint
          - shortint_oprf
          - hlapi_unsigned
--- a/.github/workflows/benchmark_documentation.yml
+++ b/.github/workflows/benchmark_documentation.yml
@@ -36,7 +36,7 @@ jobs:
    uses: ./.github/workflows/benchmark_cpu_common.yml
    if: inputs.run-cpu-benchmarks
    with:
-      command: integer,hlapi_erc20
+      command: integer
      op_flavor: fast_default
      bench_type: both
      precisions_set: documentation
@@ -50,40 +50,6 @@ jobs:
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

-  run-benchmarks-cpu-zk-server:
-    name: benchmark_documentation/run-benchmarks-cpu-zk-server
-    uses: ./.github/workflows/benchmark_cpu_common.yml
-    if: inputs.run-cpu-benchmarks
-    with:
-      command: integer_zk
-      op_flavor: default
-      bench_type: both
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
-  run-benchmarks-cpu-zk-client:
-    name: benchmark_documentation/run-benchmarks-cpu-zk-client
-    uses: ./.github/workflows/benchmark_wasm_client_common.yml
-    if: inputs.run-cpu-benchmarks
-    with:
-      browser: chrome
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
  run-benchmarks-gpu-integer:
    name: benchmark_documentation/run-benchmarks-gpu-integer
    uses: ./.github/workflows/benchmark_gpu_common.yml
@@ -91,7 +57,7 @@ jobs:
    with:
      profile: multi-h100-sxm5
      hardware_name: n3-H100-SXM5x8
-      command: integer_multi_bit,hlapi_erc20
+      command: integer_multi_bit
      op_flavor: fast_default
      bench_type: both
      precisions_set: documentation
@@ -110,7 +76,7 @@ jobs:
    uses: ./.github/workflows/benchmark_hpu_common.yml
    if: inputs.run-hpu-benchmarks
    with:
-      command: integer,hlapi_erc20
+      command: integer
      op_flavor: default
      bench_type: both
      precisions_set: documentation
@@ -172,7 +138,6 @@ jobs:
      inputs.generate-svgs }}
    needs: [
      run-benchmarks-cpu-integer, run-benchmarks-gpu-integer, run-benchmarks-hpu-integer,
-      run-benchmarks-cpu-zk-server, run-benchmarks-cpu-zk-client,
      run-benchmarks-cpu-core-crypto, run-benchmarks-gpu-core-crypto
    ]
    uses: ./.github/workflows/generate_svgs.yml
--- a/.github/workflows/benchmark_gpu.yml
+++ b/.github/workflows/benchmark_gpu.yml
@@ -31,8 +31,6 @@ on:
          - pbs128
          - ks
          - ks_pbs
-          - tfhe_zk_pok
-          - msm_zk
          - integer_zk
          - integer_aes
          - integer_aes256
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -58,19 +58,171 @@ jobs:
              - tfhe/web_wasm_parallel_tests/**
              - .github/workflows/wasm_client_benchmark.yml

-  run-benchmarks-cpu-zk-client:
-    name: benchmark_documentation/run-benchmarks-cpu-zk-client
-    uses: ./.github/workflows/benchmark_wasm_client_common.yml
-    needs: should-run
+  setup-instance:
+    name: benchmark_wasm_client/setup-instance
    if: github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.wasm_bench)
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+    needs: should-run
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-small
+
+  wasm-client-benchmarks:
+    name: benchmark_wasm_client/wasm-client-benchmarks
+    needs: setup-instance
+    if: needs.setup-instance.result != 'skipped'
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      max-parallel: 1
+      matrix:
+        browser: [ chrome, firefox ]
+    steps:
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+
+      - name: Get benchmark details
+        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
+        with:
+          toolchain: nightly
+
+      - name: Get Node version
+        run: |
+          echo "NODE_VERSION=$(make node_version)" >> "${GITHUB_ENV}"
+
+      - name: Node cache restoration
+        id: node-cache
+        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        with:
+          path: |
+            ~/.nvm
+            ~/.npm
+          key: node-${{ env.NODE_VERSION }}
+
+      - name: Install Node
+        if: steps.node-cache.outputs.cache-hit != 'true'
+        run: |
+          make install_node
+
+      - name: Node cache save
+        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
+        if: steps.node-cache.outputs.cache-hit != 'true'
+        with:
+          path: |
+            ~/.nvm
+            ~/.npm
+          key: node-${{ env.NODE_VERSION }}
+
+      - name: Install web resources
+        run: |
+          make install_"${BROWSER}"_browser
+          make install_"${BROWSER}"_web_driver
+        env:
+          BROWSER: ${{ matrix.browser }}
+
+      - name: Run benchmarks
+        run: |
+          make bench_web_js_api_parallel_"${BROWSER}"_ci
+        env:
+          BROWSER: ${{ matrix.browser }}
+
+      - name: Run benchmarks (unsafe coop)
+        run: |
+          make bench_web_js_api_unsafe_coop_"${BROWSER}"_ci
+        env:
+          BROWSER: ${{ matrix.browser }}
+
+      - name: Parse results
+        run: |
+          make parse_wasm_benchmarks
+          python3 ./ci/benchmark_parser.py tfhe-benchmark/wasm_pk_gen.csv "${RESULTS_FILENAME}" \
+          --database tfhe_rs \
+          --hardware "m6i.4xlarge" \
+          --project-version "${COMMIT_HASH}" \
+          --branch "${REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
+          --key-gen
+          rm tfhe-benchmark/wasm_pk_gen.csv
+        env:
+          REF_NAME: ${{ github.ref_name }}
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        with:
+          name: ${{ github.sha }}_wasm_${{ matrix.browser }}
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          repository: zama-ai/slab
+          path: slab
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}
+
+      - name: Slack Notification
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "WASM benchmarks (${{ matrix.browser }}) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: benchmark_wasm_client/teardown-instance
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    needs: [ setup-instance, wasm-client-benchmarks ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (wasm-client-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_wasm_client_common.yml
+++ b/.github/workflows/benchmark_wasm_client_common.yml
@@ -1,234 +0,0 @@
-# Run WASM client benchmarks on an instance and return parsed results to Slab CI bot.
-name: benchmark_wasm_client_common
-
-on:
-  workflow_call:
-    inputs:
-      browser:
-        type: string # Use comma separated values to generate an array
-        default: chrome,firefox
-    secrets:
-      REPO_CHECKOUT_TOKEN:
-        required: true
-      SLAB_ACTION_TOKEN:
-        required: true
-      SLAB_BASE_URL:
-        required: true
-      SLAB_URL:
-        required: true
-      JOB_SECRET:
-        required: true
-      SLACK_CHANNEL:
-        required: true
-      BOT_USERNAME:
-        required: true
-      SLACK_WEBHOOK:
-        required: true
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-
-permissions: {}
-
-# zizmor: ignore[concurrency-limits] only Zama organization members and GitHub can trigger this workflow
-
-jobs:
-  prepare-matrix:
-    name: benchmark_wasm_client_common/prepare-matrix
-    runs-on: ubuntu-latest
-    outputs:
-      browser: ${{ steps.set_matrix_arg.outputs.browser }}
-    steps:
-      - name: Parse user inputs
-        shell: python
-        env:
-          INPUTS_BROWSER: ${{ inputs.browser }}
-        run: |
-          import os
-
-          inputs_browser = os.environ["INPUTS_BROWSER"]
-          env_file = os.environ["GITHUB_ENV"]
-
-          split_browser = inputs_browser.replace(" ", "").split(",")
-
-          with open(env_file, "a") as f:
-            f.write(f"""BROWSER=["{'", "'.join(split_browser)}"]\n""")
-
-      - name: Set martix arguments output
-        id: set_matrix_arg
-        run: | # zizmor: ignore[template-injection] this env variable is safe
-          echo "browser=${{ toJSON(env.BROWSER) }}" >> "${GITHUB_OUTPUT}"
-
-  setup-instance:
-    name: benchmark_wasm_client_common/setup-instance
-    needs: prepare-matrix
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-small
-
-  wasm-client-benchmarks:
-    name: benchmark_wasm_client_common/wasm-client-benchmarks
-    needs: [ prepare-matrix, setup-instance ]
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    strategy:
-      max-parallel: 1
-      matrix:
-        browser: ${{ fromJSON(needs.prepare-matrix.outputs.browser) }}
-    steps:
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
-        with:
-          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-      - name: Get benchmark details
-        run: |
-          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=${COMMIT_DATE}";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-        env:
-          SHA: ${{ github.sha }}
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
-        with:
-          toolchain: nightly
-
-      - name: Get Node version
-        run: |
-          echo "NODE_VERSION=$(make node_version)" >> "${GITHUB_ENV}"
-
-      - name: Node cache restoration
-        id: node-cache
-        uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
-        with:
-          path: |
-            ~/.nvm
-            ~/.npm
-          key: node-${{ env.NODE_VERSION }}
-
-      - name: Install Node
-        if: steps.node-cache.outputs.cache-hit != 'true'
-        run: |
-          make install_node
-
-      - name: Node cache save
-        uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
-        if: steps.node-cache.outputs.cache-hit != 'true'
-        with:
-          path: |
-            ~/.nvm
-            ~/.npm
-          key: node-${{ env.NODE_VERSION }}
-
-      - name: Install web resources
-        run: |
-          make install_"${BROWSER}"_browser
-          make install_"${BROWSER}"_web_driver
-        env:
-          BROWSER: ${{ matrix.browser }}
-
-      - name: Run benchmarks
-        run: |
-          make bench_web_js_api_parallel_"${BROWSER}"_ci
-        env:
-          BROWSER: ${{ matrix.browser }}
-
-      - name: Run benchmarks (unsafe coop)
-        run: |
-          make bench_web_js_api_unsafe_coop_"${BROWSER}"_ci
-        env:
-          BROWSER: ${{ matrix.browser }}
-
-      - name: Parse results
-        run: |
-          make parse_wasm_benchmarks
-          python3 ./ci/benchmark_parser.py tfhe-benchmark/wasm_pk_gen.csv "${RESULTS_FILENAME}" \
-          --database tfhe_rs \
-          --hardware "m6i.4xlarge" \
-          --project-version "${COMMIT_HASH}" \
-          --branch "${REF_NAME}" \
-          --commit-date "${COMMIT_DATE}" \
-          --bench-date "${BENCH_DATE}" \
-          --key-gen
-          rm tfhe-benchmark/wasm_pk_gen.csv
-        env:
-          REF_NAME: ${{ github.ref_name }}
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
-        with:
-          name: ${{ github.sha }}_wasm_${{ matrix.browser }}
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
-        with:
-          repository: zama-ai/slab
-          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
-          --slab-url "${SLAB_URL}"
-        env:
-          JOB_SECRET: ${{ secrets.JOB_SECRET }}
-          SLAB_URL: ${{ secrets.SLAB_URL }}
-
-      - name: Slack Notification
-        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "WASM benchmarks (${{ matrix.browser }}) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-instance:
-    name: benchmark_wasm_client_common/teardown-instance
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
-    needs: [ setup-instance, wasm-client-benchmarks ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (wasm-client-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/check_commit.yml
+++ b/.github/workflows/check_commit.yml
@@ -18,7 +18,7 @@ jobs:
      - name: Check first line
        uses: gsactions/commit-message-checker@16fa2d5de096ae0d35626443bcd24f1e756cafee
        with:
-          pattern: '^((feat|fix|chore|refactor|style|test|docs|doc|perf)(\([\w\-_]+\))?\!?\:) .+$'
+          pattern: '^((feat|fix|chore|refactor|style|test|docs|doc)(\([\w\-_]+\))?\!?\:) .+$'
          flags: "gs"
          error: 'Your first line has to contain a commit type and scope like "feat(my_feature): msg".'
          excludeDescription: "true" # optional: this excludes the description body of a pull request
--- a/.github/workflows/generate_svg_common.yml
+++ b/.github/workflows/generate_svg_common.yml
@@ -9,9 +9,6 @@ on:
        type: string
      layer:
        type: string
-      bench_subset:
-        type: string
-        default: all
      pbs_kind: # Valid values are 'classical', 'multi_bit' or 'any'
        type: string
      grouping_factor: # Valid values are 2, 3, or 4
@@ -19,9 +16,6 @@ on:
        default: 4
      bench_type: # Valid values are 'latency', 'throughput'
        type: string
-      name_suffix:
-        type: string
-        default: _mean_avx512
      backend_comparison:
        type: boolean
        default: false
@@ -66,8 +60,6 @@ jobs:
          --pbs-kind "${PBS_KIND}" \
          --grouping-factor "${GROUPING_FACTOR}" \
          --bench-type "${BENCH_TYPE}" \
-          --bench-subset "${BENCH_SUBSET}" \
-          --name-suffix "${NAME_SUFFIX}" \
          --time-span-days "${TIME_SPAN}"
        env:
          OUTPUT_FILENAME: ${{ inputs.output_filename }}
@@ -78,8 +70,6 @@ jobs:
          PBS_KIND: ${{ inputs.pbs_kind }}
          GROUPING_FACTOR: ${{ inputs.grouping_factor }}
          BENCH_TYPE: ${{ inputs.bench_type }}
-          BENCH_SUBSET: ${{ inputs.bench_subset }}
-          NAME_SUFFIX: ${{ inputs.name_suffix }}
          TIME_SPAN: ${{ inputs.time_span_days }}
          DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
          DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
@@ -89,7 +79,7 @@ jobs:
        if: inputs.backend_comparison == false
        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
        with:
-          name: ${{ github.sha }}_${{ inputs.backend }}_${{ inputs.layer }}_subset_${{inputs.bench_subset}}_${{ inputs.pbs_kind }}_${{ inputs.bench_type }}_tables
+          name: ${{ github.sha }}_${{ inputs.backend }}_${{ inputs.layer }}_${{ inputs.pbs_kind }}_${{ inputs.bench_type }}_tables
          # This will upload all the file generated
          path: ${{ inputs.output_filename }}*.svg
          retention-days: 60
--- a/.github/workflows/generate_svgs.yml
+++ b/.github/workflows/generate_svgs.yml
@@ -51,7 +51,7 @@ jobs:
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

  cpu-integer-throughput-table:
-    name: generate_documentation_svgs/cpu-integer-throughput-table
+    name: generate_documentation_svgs/cpu-integer-latency-table
    uses: ./.github/workflows/generate_svg_common.yml
    if: inputs.generate-cpu-svgs
    with:
@@ -150,124 +150,6 @@ jobs:
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

-  # -----------------------------------------------------------
-  # ZK benchmarks tables
-  # -----------------------------------------------------------
-
-  cpu-zk-server-latency-table:
-    name: generate_documentation_svgs/cpu-zk-server-latency-table
-    uses: ./.github/workflows/generate_svg_common.yml
-    if: inputs.generate-cpu-svgs
-    with:
-      backend: cpu
-      hardware_name: hpc7a.96xlarge
-      layer: integer
-      bench_subset: zk
-      pbs_kind: classical
-      bench_type: latency
-      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: cpu-zk-benchmark-latency
-    secrets:
-      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
-      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
-      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
-
-  cpu-zk-server-throughput-table:
-    name: generate_documentation_svgs/cpu-zk-server-throughput-table
-    uses: ./.github/workflows/generate_svg_common.yml
-    if: inputs.generate-cpu-svgs
-    with:
-      backend: cpu
-      hardware_name: hpc7a.96xlarge
-      layer: integer
-      bench_subset: zk
-      pbs_kind: classical
-      bench_type: throughput
-      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: cpu-zk-benchmark-throughput
-    secrets:
-      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
-      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
-      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
-
-  cpu-zk-client-latency-table:
-    name: generate_documentation_svgs/cpu-zk-client-latency-table
-    uses: ./.github/workflows/generate_svg_common.yml
-    if: inputs.generate-cpu-svgs
-    with:
-      backend: cpu
-      hardware_name: m6i.4xlarge
-      layer: wasm
-      bench_subset: zk
-      pbs_kind: classical
-      bench_type: latency
-      name_suffix: _chrome_mean
-      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: cpu-zk-wasm-benchmark-latency
-    secrets:
-      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
-      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
-      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
-
-  # -----------------------------------------------------------
-  # ERC20 benchmarks tables
-  # -----------------------------------------------------------
-
-  cpu-erc20-latency-throughput-table:
-    name: generate_documentation_svgs/cpu-erc20-latency-throughput-table
-    uses: ./.github/workflows/generate_svg_common.yml
-    if: inputs.generate-cpu-svgs
-    with:
-      backend: cpu
-      hardware_name: hpc7a.96xlarge
-      layer: hlapi
-      bench_subset: erc20
-      pbs_kind: classical
-      bench_type: both
-      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: cpu-hlapi-erc20-benchmark-latency-throughput
-    secrets:
-      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
-      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
-      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
-
-  gpu-erc20-latency-throughput-table:
-    name: generate_documentation_svgs/gpu-erc20-latency-throughput-table
-    uses: ./.github/workflows/generate_svg_common.yml
-    if: inputs.generate-gpu-svgs
-    with:
-      backend: gpu
-      hardware_name: n3-H100-SXM5x8
-      layer: hlapi
-      bench_subset: erc20
-      pbs_kind: multi_bit
-      grouping_factor: 4
-      bench_type: both
-      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: gpu-hlapi-erc20-benchmark-h100x8-sxm5-latency-throughput
-    secrets:
-      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
-      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
-      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
-
-  hpu-erc20-latency-throughput-table:
-    name: generate_documentation_svgs/hpu-erc20-latency-throughput-table
-    uses: ./.github/workflows/generate_svg_common.yml
-    if: inputs.generate-hpu-svgs
-    with:
-      backend: hpu
-      hardware_name: hpu_x1
-      layer: hlapi
-      bench_subset: erc20
-      pbs_kind: classical
-      bench_type: both
-      time_span_days: ${{ inputs.time_span_days }}
-      output_filename: hpu-hlapi-erc20-benchmark-hpux1-latency-throughput.svg
-    secrets:
-      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
-      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
-      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
-
  # -----------------------------------------------------------
  # PBS benchmarks tables
  # -----------------------------------------------------------
--- a/.github/workflows/gpu_code_validation_tests.yml
+++ b/.github/workflows/gpu_code_validation_tests.yml
@@ -93,11 +93,6 @@ jobs:

      - name: Find tools
        run: |
-          # Disable unattended-upgrades to avoid lock issues
-          sudo systemctl disable --now unattended-upgrades
-
-          sudo apt-get clean
-          sudo rm -rf /var/lib/apt/lists/*
          sudo apt update && sudo apt install -y valgrind 
          find /usr -executable -name "compute-sanitizer"
          which valgrind
@@ -111,10 +106,6 @@ jobs:
        run: |
          make test_high_level_api_gpu_valgrind

-      - name: Run CUDA backend racecheck tests
-        run: |
-          make test_cuda_backend_race_check
-
  slack-notify:
    name: gpu_code_validation_tests/slack-notify
    needs: [ setup-instance, cuda-tests-linux ]
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -90,12 +90,6 @@ jobs:
          echo "${CUDA_KEYRING_SHA} ${CUDA_KEYRING_PACKAGE}" > checksum
          sha256sum -c checksum
          sudo dpkg -i "${CUDA_KEYRING_PACKAGE}"
-
-          # Disable unattended-upgrades to avoid lock issues
-          sudo systemctl disable --now unattended-upgrades
-
-          sudo apt-get clean
-          sudo rm -rf /var/lib/apt/lists/*
          sudo apt update
          sudo apt -y install "cuda-toolkit-${TOOLKIT_VERSION}" cmake-format
        env:
@@ -138,13 +132,7 @@ jobs:

      - name: Run semgrep and lint checks on CUDA code
        run: |
-          # Disable unattended-upgrades to avoid lock issues
-          sudo systemctl disable --now unattended-upgrades
-
-          sudo apt-get clean
-          sudo rm -rf /var/lib/apt/lists/*
-          sudo apt update
-          sudo apt -y install python3-venv
+          sudo apt update && sudo apt -y install python3-venv
          make semgrep_and_lint_gpu_code

      - name: Check build with hpu enabled
--- a/.github/workflows/gpu_zk_tests.yml
+++ b/.github/workflows/gpu_zk_tests.yml
@@ -51,12 +51,7 @@ jobs:
        with:
          files_yaml: |
            gpu:
-              - tfhe/Cargo.toml
-              - tfhe/build.rs
              - backends/zk-cuda-backend/**
-              - tfhe/src/integer/gpu/zk/**
-              - tfhe-zk-pok/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_zk_tests.yml'
              - ci/slab.toml

@@ -131,9 +126,6 @@ jobs:
      - name: Run zk-cuda-backend integration tests
        run: |
          make test_zk_cuda_backend
-          make test_zk_pok_gpu
-          make test_integer_zk_gpu
-          make test_integer_zk_experimental_gpu

  slack-notify:
    name: gpu_zk_tests/slack-notify
--- a/.gitignore
+++ b/.gitignore
@@ -25,7 +25,6 @@ dieharder_run.log

 # Cuda local build
 backends/tfhe-cuda-backend/cuda/cmake-build-debug/
-backends/tfhe-cuda-backend/cuda/build/

 # WASM tests
 tfhe/web_wasm_parallel_tests/server.PID
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -17,7 +17,7 @@ Start by [forking](https://docs.github.com/en/pull-requests/collaborating-with-p
 - **Performance**: For optimal performance, it is highly recommended to run **TFHE-rs** code in release mode with cargo's `--release` flag.
 {% endhint %}

-To get more details about the library, please refer to the [documentation](https://docs.zama.org/tfhe-rs).
+To get more details about the library, please refer to the [documentation](https://docs.zama.ai/tfhe-rs).

 ## 2. Creating a new branch

--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,7 +14,6 @@ members = [
    "utils/tfhe-versionable",
    "utils/tfhe-versionable-derive",
    "utils/tfhe-backward-compat-data",
-    "utils/tfhe-backward-compat-data/crates/add_new_version",
    "utils/param_dedup",
    "tests",
    "mockups/tfhe-hpu-mockup",
@@ -45,7 +44,6 @@ bindgen = "0.71"
 bincode = "=1.3.3"
 cmake = "0.1"
 pkg-config = "0.3"
-clap = { version = "4.5", features = ["derive"] }

 [profile.bench]
 lto = "fat"
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,4 @@
 SHELL:=$(shell /usr/bin/env which bash)
-# Enable stop on error, no undefined variables
-# the c flag is to run the script inline
-.SHELLFLAGS := -eu -c
 OS:=$(shell uname)
 RS_CHECK_TOOLCHAIN:=$(shell cat nightly-toolchain.txt | tr -d '\n')
 CARGO_RS_CHECK_TOOLCHAIN:=+$(RS_CHECK_TOOLCHAIN)
@@ -270,23 +267,12 @@ install_mlc:
 	cargo install mlc --locked || \
 	( echo "Unable to install mlc, unknown error." && exit 1 )

-fmt: FMT_CHECK =
 .PHONY: fmt # Format rust code
-fmt: fmt_internal
-
-check_fmt: FMT_CHECK = --check
-.PHONY: check_fmt # Check rust code format
-check_fmt: fmt_internal
-
-.PHONY: fmt_internal # internal recipe for fmt
-fmt_internal: install_rs_check_toolchain
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt $(FMT_CHECK)
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt $(FMT_CHECK)
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt $(FMT_CHECK)
-	for crate in `ls -1 $(BACKWARD_COMPAT_DATA_DIR)/crates/ | grep generate_`; do \
-		echo "fmt $$crate"; \
-		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR)/crates/$$crate fmt $(FMT_CHECK); \
-	done
+fmt: install_rs_check_toolchain
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR) fmt
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt

 .PHONY: fmt_js # Format javascript code
 fmt_js: check_nvm_installed
@@ -323,6 +309,13 @@ fmt_c_tests:
 fmt_toml: install_taplo
 	taplo fmt

+.PHONY: check_fmt # Check rust code format
+check_fmt: install_rs_check_toolchain
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR) fmt --check
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt --check
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt --check
+
 .PHONY: check_fmt_c_tests  # Check C tests format
 check_fmt_c_tests:
 	find tfhe/c_api_tests/ -regex '.*\.\(cpp\|hpp\|cu\|c\|h\)' -exec clang-format --dry-run --Werror -style=file {} \;
@@ -353,14 +346,14 @@ check_typos: install_typos_checker
 .PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
 clippy_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats,extended-types,zk-pok \
+		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats,extended-types,zk-pok \
 		--all-targets \
 		-p tfhe -- --no-deps -D warnings

 .PHONY: check_gpu # Run check on tfhe with "gpu" enabled
 check_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
-		--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats \
+		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
 		--all-targets \
 		-p tfhe

@@ -374,7 +367,7 @@ clippy_hpu: install_rs_check_toolchain
 .PHONY: clippy_gpu_hpu # Run clippy lints on tfhe with "gpu" and "hpu" enabled
 clippy_gpu_hpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,hpu,pbs-stats,extended-types,zk-pok \
+		--features=boolean,shortint,integer,internal-keycache,gpu,hpu,pbs-stats,extended-types,zk-pok \
 		--all-targets \
 		-p tfhe -- --no-deps -D warnings

@@ -467,7 +460,7 @@ clippy_rustdoc_gpu: install_rs_check_toolchain
 	fi && \
 	CARGO_TERM_QUIET=true CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
 		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
-		--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu,gpu-experimental-zk \
+		--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu \
 		-p tfhe -- --nocapture

 .PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
@@ -543,10 +536,11 @@ clippy_param_dedup: install_rs_check_toolchain

 .PHONY: clippy_backward_compat_data # Run clippy lints on tfhe-backward-compat-data
 clippy_backward_compat_data: install_rs_check_toolchain # the toolchain is selected with toolchain.toml
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		-p tfhe-backward-compat-data -- --no-deps -D warnings
 	@# Some old crates are x86 specific, only run in that case
 	@if uname -a | grep -q x86; then \
+		RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options \
+			-C $(BACKWARD_COMPAT_DATA_DIR) clippy --all --all-targets \
+			-- --no-deps -D warnings; \
 		for crate in `ls -1 $(BACKWARD_COMPAT_DATA_DIR)/crates/ | grep generate_`; do \
 			echo "checking $$crate"; \
 			RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options \
@@ -670,7 +664,7 @@ build_c_api: install_rs_check_toolchain
 .PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
 build_c_api_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu,gpu-experimental-zk \
+		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu \
 		-p tfhe

 .PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
@@ -748,15 +742,6 @@ test_cuda_backend:
 		"$(MAKE)" -j "$(CPU_COUNT)" && \
 		"$(MAKE)" test

-.PHONY: test_cuda_backend_race_check # Build and run selected CUDA backend tests with Compute Sanitizer racecheck
-test_cuda_backend_race_check:
-	mkdir -p "$(TFHECUDA_BUILD)" && \
-		cd "$(TFHECUDA_BUILD)" && \
-		cmake .. -DCMAKE_BUILD_TYPE=Release -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON && \
-		"$(MAKE)" -j "$(CPU_COUNT)" test_tfhe_cuda_backend && \
-		compute-sanitizer --tool racecheck --target-processes all ./tests_and_benchmarks/tests/test_tfhe_cuda_backend \
-			--gtest_filter="*ClassicalProgrammableBootstrap*:*MultiBitProgrammableBootstrap*"
-
 .PHONY: test_zk_cuda_backend # Run the internal tests of the CUDA ZK backend
 test_zk_cuda_backend:
 	mkdir -p "$(ZKCUDA_BUILD)" && \
@@ -769,7 +754,7 @@ test_zk_cuda_backend:


 .PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
-test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend test_zk_cuda_backend
+test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend

 .PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
 test_core_crypto_gpu:
@@ -809,7 +794,7 @@ test_integer_hl_test_gpu_check_warnings:
 		--features=integer,internal-keycache,gpu-debug,zk-pok -vv -p tfhe &> /tmp/gpu_compile_output
 	WARNINGS=$$(cat /tmp/gpu_compile_output | grep ": warning #" | grep "\[tfhe-cuda-backend" | grep -v "inline qualifier" || true) && \
 	if [[ "$${WARNINGS}" != "" ]]; then \
-		echo "FAILING BECAUSE CUDA COMPILATION WARNINGS WERE DETECTED: " && \
+	    echo "FAILING BECAUSE CUDA COMPILATION WARNINGS WERE DETECTED: " && \
 		echo "$${WARNINGS}" && exit 1; \
 	fi

@@ -1205,31 +1190,12 @@ test_tfhe_csprng_big_endian: install_cargo_cross
 	RUSTFLAGS="" cross test --profile $(CARGO_PROFILE) \
 		-p tfhe-csprng --target=powerpc64-unknown-linux-gnu

+
 .PHONY: test_zk_pok # Run tfhe-zk-pok tests
 test_zk_pok:
 	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
 		-p tfhe-zk-pok --features experimental

-.PHONY: test_zk_pok_gpu # Run tfhe-zk-pok GPU-accelerated tests
-test_zk_pok_gpu:
-	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
-		-p tfhe-zk-pok --features experimental,gpu-experimental -- gpu
-
-.PHONY: test_integer_zk_gpu # Run tfhe-zk-pok tests
-test_integer_zk_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile release \
-		--features=integer,zk-pok,gpu -p tfhe -- \
-		integer::gpu::zk::
-
-.PHONY: test_integer_zk_experimental_gpu # Run tfhe-zk-pok tests
-test_integer_zk_experimental_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile release \
-		--features=integer,zk-pok,gpu,gpu-experimental-zk -p tfhe -- \
-		integer::gpu::zk::
-
-.PHONY: test_zk_cuda # Run all GPU MSM integration tests (CPU vs GPU comparison + integration test)
-test_zk_cuda: install_rs_check_toolchain test_zk_cuda_backend test_zk_pok_gpu test_integer_zk_gpu test_integer_zk_experimental_gpu
-
 .PHONY: test_zk_wasm_x86_compat_ci
 test_zk_wasm_x86_compat_ci: check_nvm_installed
 	source ~/.nvm/nvm.sh && \
@@ -1522,47 +1488,27 @@ bench_integer_compression_128b_gpu: install_rs_check_toolchain
 	--bench	glwe_packing_compression_128b-integer-bench \
 	--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --

-.PHONY: bench_msm_zk
-bench_msm_zk: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench zk-msm \
-	--features=zk-pok -p tfhe-benchmark --profile release --
-
-.PHONY: bench_msm_zk_gpu
-bench_msm_zk_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench zk-msm \
-	--features=gpu,gpu-experimental-zk,zk-pok -p tfhe-benchmark --profile release --
-
 .PHONY: bench_integer_zk_gpu
 bench_integer_zk_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-zk-pke \
-	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release --
-
-.PHONY: bench_integer_zk_experimental_gpu
-bench_integer_zk_experimental_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench integer-zk-pke \
-	--features=integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats,zk-pok -p tfhe-benchmark --profile release --
+	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_integer_aes_gpu # Run benchmarks for AES on GPU backend
 bench_integer_aes_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-aes \
-	--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --
+	--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_integer_aes256_gpu # Run benchmarks for AES256 on GPU backend
 bench_integer_aes256_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-aes256 \
-	--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --
+	--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_integer_trivium_gpu # Run benchmarks for trivium on GPU backend
 bench_integer_trivium_gpu: install_rs_check_toolchain
@@ -1799,14 +1745,14 @@ bench_hlapi_erc20: install_rs_check_toolchain
 .PHONY: bench_hlapi_erc20_gpu # Run benchmarks for ERC20 operations on GPU
 bench_hlapi_erc20_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+    cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench hlapi-erc20 \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_hlapi_erc20_gpu_classical # Run benchmarks for ERC20 operations on GPU with classical parameters
 bench_hlapi_erc20_gpu_classical: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=classical \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+    cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench hlapi-erc20 \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

@@ -1845,13 +1791,6 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --

-.PHONY: bench_tfhe_zk_pok_gpu # Run benchmarks for the tfhe_zk_pok crate using GPU acceleration
-bench_tfhe_zk_pok_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--package tfhe-zk-pok \
-	--features=gpu-experimental --profile release
-
 .PHONY: bench_hlapi_noise_squash # Run benchmarks for noise squash operation
 bench_hlapi_noise_squash: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@
 <hr/>

 <p align="center">
-  <a href="https://github.com/zama-ai/tfhe-rs-handbook/blob/main/tfhe-rs-handbook.pdf"> 📃 Read Handbook</a> |<a href="https://docs.zama.org/tfhe-rs"> 📒 Documentation</a> | <a href="https://www.zama.org/community-channels"> 💛 Community support</a> | <a href="https://github.com/zama-ai/awesome-zama"> 📚 FHE resources by Zama</a>
+  <a href="https://github.com/zama-ai/tfhe-rs-handbook/blob/main/tfhe-rs-handbook.pdf"> 📃 Read Handbook</a> |<a href="https://docs.zama.ai/tfhe-rs"> 📒 Documentation</a> | <a href="https://zama.ai/community"> 💛 Community support</a> | <a href="https://github.com/zama-ai/awesome-zama"> 📚 FHE resources by Zama</a>
 </p>


@@ -47,7 +47,7 @@ production-ready library for all the advanced features of TFHE.
 - **Ciphertext and server key compression** for efficient data transfer
 - **Full Rust API, C bindings to the Rust High-Level API, and client-side JavaScript API using WASM**.

-*Learn more about TFHE-rs features in the [documentation](https://docs.zama.org/tfhe-rs).*
+*Learn more about TFHE-rs features in the [documentation](https://docs.zama.ai/tfhe-rs/readme).*
 <br></br>

 ## Table of Contents
@@ -149,7 +149,7 @@ To run this code, use the following command:
 > Note that when running code that uses `TFHE-rs`, it is highly recommended
 to run in release mode with cargo's `--release` flag to have the best performance possible.

-*Find an example with more explanations in [this part of the documentation](https://docs.zama.org/tfhe-rs/get-started/quick-start)*
+*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick-start)*

 <p align="right">
  <a href="#about" > ↑ Back to top </a>
@@ -163,25 +163,25 @@ to run in release mode with cargo's `--release` flag to have the best performanc
 A document containing scientific and technical details about algorithms implemented into the library is available here: [TFHE-rs: A (Practical) Handbook](https://github.com/zama-ai/tfhe-rs-handbook/blob/main/tfhe-rs-handbook.pdf).

 ### TFHE deep dive
- [TFHE Deep Dive - Part I - Ciphertext types](https://www.zama.org/post/tfhe-deep-dive-part-1)
- [TFHE Deep Dive - Part II - Encodings and linear leveled operations](https://www.zama.org/post/tfhe-deep-dive-part-2)
- [TFHE Deep Dive - Part III - Key switching and leveled multiplications](https://www.zama.org/post/tfhe-deep-dive-part-3)
- [TFHE Deep Dive - Part IV - Programmable Bootstrapping](https://www.zama.org/post/tfhe-deep-dive-part-4)
+- [TFHE Deep Dive - Part I - Ciphertext types](https://www.zama.ai/post/tfhe-deep-dive-part-1)
+- [TFHE Deep Dive - Part II - Encodings and linear leveled operations](https://www.zama.ai/post/tfhe-deep-dive-part-2)
+- [TFHE Deep Dive - Part III - Key switching and leveled multiplications](https://www.zama.ai/post/tfhe-deep-dive-part-3)
+- [TFHE Deep Dive - Part IV - Programmable Bootstrapping](https://www.zama.ai/post/tfhe-deep-dive-part-4)
 <br></br>

 ### Tutorials
- [Video tutorial: Implement signed integers using TFHE-rs](https://www.zama.org/post/video-tutorial-implement-signed-integers-sing-tfhe-rs)
- [Homomorphic parity bit](https://docs.zama.org/tfhe-rs/tutorials/parity-bit)
- [Homomorphic case changing on Ascii string](https://docs.zama.org/tfhe-rs/tutorials/ascii-fhe-string)
- [Boolean SHA256 with TFHE-rs](https://www.zama.org/post/boolean-sha256-tfhe-rs)
- [Dark market with TFHE-rs](https://www.zama.org/post/dark-market-tfhe-rs)
- [Regular expression engine with TFHE-rs](https://www.zama.org/post/regex-engine-tfhe-rs)
+- [Video tutorial: Implement signed integers using TFHE-rs](https://www.zama.ai/post/video-tutorial-implement-signed-integers-sing-tfhe-rs)
+- [Homomorphic parity bit](https://docs.zama.ai/tfhe-rs/tutorials/parity-bit)
+- [Homomorphic case changing on Ascii string](https://docs.zama.ai/tfhe-rs/tutorials/ascii-fhe-string)
+- [Boolean SHA256 with TFHE-rs](https://www.zama.ai/post/boolean-sha256-tfhe-rs)
+- [Dark market with TFHE-rs](https://www.zama.ai/post/dark-market-tfhe-rs)
+- [Regular expression engine with TFHE-rs](https://www.zama.ai/post/regex-engine-tfhe-rs)

-*Explore more useful resources in [TFHE-rs tutorials](https://docs.zama.org/tfhe-rs/tutorials) and [Awesome Zama repo](https://github.com/zama-ai/awesome-zama)*
+*Explore more useful resources in [TFHE-rs tutorials](https://docs.zama.ai/tfhe-rs/tutorials) and [Awesome Zama repo](https://github.com/zama-ai/awesome-zama)*
 <br></br>
 ### Documentation

-Full, comprehensive documentation is available here: [https://docs.zama.org/tfhe-rs](https://docs.zama.org/tfhe-rs).
+Full, comprehensive documentation is available here: [https://docs.zama.ai/tfhe-rs](https://docs.zama.ai/tfhe-rs).
 <p align="right">
  <a href="#about" > ↑ Back to top </a>
 </p>
@@ -202,7 +202,7 @@ When a new update is published in the Lattice Estimator, we update parameters ac
 ### Security model

 By default, the parameter sets used in the High-Level API have a failure probability $\le 2^{-128}$ to securely work in the IND-CPA^D model using the algorithmic techniques provided in our code base [1].
-If you want to work within the IND-CPA security model, which is less strict than the IND-CPA-D model, the parameter sets can easily be changed and would have slightly better performance. More details can be found in the [TFHE-rs documentation](https://docs.zama.org/tfhe-rs).
+If you want to work within the IND-CPA security model, which is less strict than the IND-CPA-D model, the parameter sets can easily be changed and would have slightly better performance. More details can be found in the [TFHE-rs documentation](https://docs.zama.ai/tfhe-rs).

 [1] Bernard, Olivier, et al. "Drifting Towards Better Error Probabilities in Fully Homomorphic Encryption Schemes". https://eprint.iacr.org/2024/1718.pdf

@@ -231,7 +231,7 @@ To cite TFHE-rs in academic papers, please use the following entry:
 There are two ways to contribute to TFHE-rs:

 - [Open issues](https://github.com/zama-ai/tfhe-rs/issues/new/choose) to report bugs and typos, or to suggest new ideas
- Request to become an official contributor by emailing [hello@zama.org](mailto:hello@zama.org).
+- Request to become an official contributor by emailing [hello@zama.ai](mailto:hello@zama.ai).

 Becoming an approved contributor involves signing our Contributor License Agreement (CLA). Only approved contributors can send pull requests, so please make sure to get in touch before you do!
 <br></br>
@@ -243,16 +243,16 @@ This software is distributed under the **BSD-3-Clause-Clear** license. Read [thi
 **Is Zama’s technology free to use?**
 >Zama’s libraries are free to use under the BSD 3-Clause Clear license only for development, research, prototyping, and experimentation purposes. However, for any commercial use of Zama's open source code, companies must purchase Zama’s commercial patent license.
 >
->Everything we do is open source and we are very transparent on what it means for our users, you can read more about how we monetize our open source products at Zama in [this blogpost](https://www.zama.org/post/open-source).
+>Everything we do is open source and we are very transparent on what it means for our users, you can read more about how we monetize our open source products at Zama in [this blogpost](https://www.zama.ai/post/open-source).

 **What do I need to do if I want to use Zama’s technology for commercial purposes?**
->To commercially use Zama’s technology you need to be granted Zama’s patent license. Please contact us hello@zama.org for more information.
+>To commercially use Zama’s technology you need to be granted Zama’s patent license. Please contact us hello@zama.ai for more information.

 **Do you file IP on your technology?**
 >Yes, all Zama’s technologies are patented.

 **Can you customize a solution for my specific use case?**
->We are open to collaborating and advancing the FHE space with our partners. If you have specific needs, please email us at hello@zama.org.
+>We are open to collaborating and advancing the FHE space with our partners. If you have specific needs, please email us at hello@zama.ai.

 <p align="right">
  <a href="#about" > ↑ Back to top </a>
@@ -261,7 +261,7 @@ This software is distributed under the **BSD-3-Clause-Clear** license. Read [thi

 ## Support

-<a target="_blank" href="https://community.zama.org">
+<a target="_blank" href="https://community.zama.ai">
 <picture>
  <source media="(prefers-color-scheme: dark)" srcset="https://github.com/zama-ai/tfhe-rs/assets/157474013/08656d0a-3f44-4126-b8b6-8c601dff5380">
  <source media="(prefers-color-scheme: light)" srcset="https://github.com/zama-ai/tfhe-rs/assets/157474013/1c9c9308-50ac-4aab-a4b9-469bb8c536a4">
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -5,8 +5,8 @@ edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
 description = "Cuda implementation of TFHE-rs primitives."
-homepage = "https://www.zama.org/"
-documentation = "https://docs.zama.org/tfhe-rs"
+homepage = "https://www.zama.ai/"
+documentation = "https://docs.zama.ai/tfhe-rs"
 repository = "https://github.com/zama-ai/tfhe-rs"
 readme = "README.md"
 keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
--- a/backends/tfhe-cuda-backend/README.md
+++ b/backends/tfhe-cuda-backend/README.md
@@ -51,4 +51,4 @@ If your machine does not have an available Nvidia GPU, the compilation will work
 ## License

 This software is distributed under the BSD-3-Clause-Clear license. If you have any questions,
-please contact us at `hello@zama.org`.
+please contact us at `hello@zama.ai`.
--- a/backends/tfhe-cuda-backend/cuda/include/integer/comparison.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/comparison.h
@@ -14,7 +14,6 @@ template <typename Torus> struct int_are_all_block_true_buffer {
  // of interest in are_all_block_true(), as with max_value (the maximum message
  // value).
  int_radix_lut<Torus> *is_max_value;
-  Torus *preallocated_h_lut;
  bool gpu_memory_allocated;

  int_are_all_block_true_buffer(CudaStreams streams, COMPARISON_TYPE op,
@@ -40,10 +39,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
        max_chunks, params.big_lwe_dimension, size_tracker,
        allocate_gpu_memory);

-    preallocated_h_lut = (Torus *)malloc(safe_mul_sizeof<Torus>(
-        params.glwe_dimension + 1, params.polynomial_size));
-
-    is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
+    is_max_value = new int_radix_lut<Torus>(streams, params, 1, max_chunks,
                                            allocate_gpu_memory, size_tracker);

    auto active_streams =
@@ -67,7 +63,6 @@ template <typename Torus> struct int_are_all_block_true_buffer {
    delete tmp_out;
    delete tmp_block_accumulated;
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
-    free(preallocated_h_lut);
  }
 };

--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -823,6 +823,10 @@ private:
    generate_lut_indexes<InputTorus>(streams, generator, get_lut_indexes(0, 0),
                                     num_indexes, num_luts, index_buffer,
                                     gpu_memory_allocated);
+
+    if (h_buffer != nullptr) {
+      memcpy(h_lut_indexes, h_buffer, num_indexes * sizeof(h_lut_indexes[0]));
+    }
  }

  /// Sets all LUT indexes to a constant value on both CPU and GPU.
@@ -881,6 +885,39 @@ public:
    broadcast_lut(streams, false);
  }

+  // Checks h_lut_indexes against `generator`, then records the subset size.
+  template <typename IndexGenerator>
+  void prepare_to_apply_to_block_subset(uint32_t num_radix_blocks_subset,
+                                        IndexGenerator generator) {
+    // The subset may not be larger than num_blocks.
+    GPU_ASSERT(num_radix_blocks_subset <= num_blocks,
+               "num_radix_blocks_subset (%u) must not exceed num_blocks (%u)",
+               num_radix_blocks_subset, num_blocks);
+
+    if constexpr (!std::is_same_v<IndexGenerator, std::nullptr_t>) {
+      // Regenerate the expected index of every block with `generator`.
+      std::vector<InputTorus> expected(num_blocks);
+      generator(expected.data(), num_blocks);
+      for (uint32_t i = 0; i < num_blocks; i++) {
+        // The stored index (h_lut_indexes) must equal the regenerated one.
+        GPU_ASSERT(expected[i] == h_lut_indexes[i],
+                   "LUT index mismatch at block %u: expected %llu, stored %llu",
+                   i, (unsigned long long)expected[i],
+                   (unsigned long long)h_lut_indexes[i]);
+      }
+    } else {
+      // nullptr generator: every stored index is expected to be 0.
+      for (uint32_t i = 0; i < num_blocks; i++) {
+        GPU_ASSERT(h_lut_indexes[i] == 0,
+                   "LUT index mismatch at block %u: expected 0, stored %llu", i,
+                   (unsigned long long)h_lut_indexes[i]);
+      }
+    }
+
+    // Only the recorded block count changes; no LUT data is written here.
+    last_broadcast_num_radix_blocks = num_radix_blocks_subset;
+  }
+
  // Broadcast luts from device gpu_indexes[0] to all active gpus
  void broadcast_lut(CudaStreams new_active_streams,
                     bool broadcast_lut_values = true) {
@@ -1268,6 +1305,8 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
  int_radix_params params;
  int_radix_lut<Torus> *lut;
  bool gpu_memory_allocated;
+  uint32_t bits_per_block = 0;
+  uint32_t num_radix_blocks = 0;

  // With offset
  int_bit_extract_luts_buffer(CudaStreams streams, int_radix_params params,
@@ -1277,6 +1316,8 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
                              uint64_t &size_tracker) {
    this->params = params;
    gpu_memory_allocated = allocate_gpu_memory;
+    this->bits_per_block = bits_per_block;
+    this->num_radix_blocks = num_radix_blocks;

    lut = new int_radix_lut<Torus>(streams, params, bits_per_block,
                                   bits_per_block * num_radix_blocks,
@@ -1303,10 +1344,8 @@ template <typename Torus> struct int_bit_extract_luts_buffer {

    auto lut_index_generator =
        [num_radix_blocks, bits_per_block](Torus *h_lut_indexes, uint32_t) {
-          for (int j = 0; j < num_radix_blocks; j++) {
-            for (int i = 0; i < bits_per_block; i++)
-              h_lut_indexes[i + j * bits_per_block] = i;
-          }
+          compute_bit_extract_lut_indexes(h_lut_indexes, num_radix_blocks,
+                                          bits_per_block);
        };

    lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funs,
@@ -1354,11 +1393,28 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
                                    num_radix_blocks, allocate_gpu_memory,
                                    size_tracker) {}

+  // Forwards to `lut` with the bit-extract index generator for this buffer.
+  void prepare_lut_for_blocks(uint32_t effective_num_blocks) {
+    auto generator = [this](Torus *h, uint32_t) {
+      compute_bit_extract_lut_indexes(h, num_radix_blocks, bits_per_block);
+    };
+    lut->prepare_to_apply_to_block_subset(effective_num_blocks, generator);
+  }
+
  void release(CudaStreams streams) {
    lut->release(streams);
    delete (lut);
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  }
+
+private:
+  // Writes index i at h[j * bits_per_block + i] for every block j and bit i.
+  static void compute_bit_extract_lut_indexes(
+      Torus *h, uint32_t num_radix_blocks, uint32_t bits_per_block) {
+    for (uint32_t j = 0; j < num_radix_blocks; j++)
+      for (uint32_t i = 0; i < bits_per_block; i++)
+        h[i + j * bits_per_block] = i;
+  }
 };

 template <typename Torus> struct int_fullprop_buffer {
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
@@ -25,24 +25,39 @@ void cuda_convert_lwe_programmable_bootstrap_key_128_async(
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size);

+uint64_t scratch_cuda_programmable_bootstrap_amortized_64_async(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
+
+void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32_async(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lut_vector_indexes, void const *lwe_array_in,
+    void const *lwe_input_indexes, void const *bootstrapping_key,
+    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples);
+
+void cuda_programmable_bootstrap_amortized_64_async(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lut_vector_indexes, void const *lwe_array_in,
+    void const *lwe_input_indexes, void const *bootstrapping_key,
+    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples);
+
+void cleanup_cuda_programmable_bootstrap_amortized_64(void *stream,
+                                                      uint32_t gpu_index,
+                                                      int8_t **pbs_buffer);
+
 uint64_t scratch_cuda_programmable_bootstrap_64_async(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

-uint64_t scratch_cuda_programmable_bootstrap_tbc_generic_64_async(
-    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
-
-uint64_t scratch_cuda_programmable_bootstrap_tbc_2_2_64_async(
-    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
-
 uint64_t scratch_cuda_programmable_bootstrap_128_async(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -67,24 +82,6 @@ void cuda_programmable_bootstrap_64_async(
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);

-void cuda_programmable_bootstrap_tbc_64_generic_async(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
-
-void cuda_programmable_bootstrap_tbc_64_2_2_async(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
-
 void cuda_programmable_bootstrap_128_async(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lut_vector, void const *lwe_array_in,
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
@@ -25,16 +25,6 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64_async(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

-uint64_t scratch_cuda_multi_bit_programmable_bootstrap_tbc_generic_64_async(
-    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
-
-uint64_t scratch_cuda_multi_bit_programmable_bootstrap_tbc_2_2_64_async(
-    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
-
 void cuda_multi_bit_programmable_bootstrap_64_async(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
@@ -45,26 +35,6 @@ void cuda_multi_bit_programmable_bootstrap_64_async(
    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
    uint32_t lut_stride);

-void cuda_multi_bit_programmable_bootstrap_tbc_64_generic_async(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
-    uint32_t lut_stride);
-
-void cuda_multi_bit_programmable_bootstrap_tbc_64_2_2_async(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
-    uint32_t lut_stride);
-
 void cleanup_cuda_multi_bit_programmable_bootstrap_64(void *stream,
                                                      uint32_t gpu_index,
                                                      int8_t **pbs_buffer);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -128,32 +128,22 @@ __host__ void are_all_comparisons_block_true(
      // is_non_zero_lut_buffer LUT
      lut = mem_ptr->eq_buffer->is_non_zero_lut;
    } else {
-      if (chunk_lengths[num_chunks - 1] != max_value) {
-        // LUT needs to be computed
-        uint32_t chunk_length = chunk_lengths[num_chunks - 1];
-        auto is_equal_to_num_blocks_lut_f = [chunk_length](Torus x) -> Torus {
-          return x == chunk_length;
-        };
-
-        auto num_blocks = is_max_value_lut->num_blocks;
-        auto active_streams =
-            streams.active_gpu_subset(num_chunks, params.pbs_type);
-
-        // Index generator: last chunk uses LUT 1, others use LUT 0
-        auto index_gen = [num_chunks, num_blocks](Torus *h_lut_indexes,
-                                                  uint32_t) {
-          for (uint32_t index = 0; index < num_blocks; index++) {
-            if (index == num_chunks - 1) {
-              h_lut_indexes[index] = 1;
-            } else if (index < num_chunks - 1 || index >= num_chunks) {
-              h_lut_indexes[index] = 0;
-            }
-          }
-        };
-
-        is_max_value_lut->generate_and_broadcast_lut(
-            active_streams, {1}, {is_equal_to_num_blocks_lut_f}, index_gen,
-            true, {are_all_block_true_buffer->preallocated_h_lut});
+      // Pad the last chunk's accumulator block up to max_value by adding
+      // trivial plaintext 1s, so that all chunks can use the same
+      // is_max_value LUT uniformly.
+      uint32_t last_chunk_length = chunk_lengths[num_chunks - 1];
+      if (last_chunk_length != max_value) {
+        uint32_t pad = max_value - last_chunk_length;
+        GPU_ASSERT(pad < max_value,
+                   "pad (%u) must be strictly less than max_value (%u)", pad,
+                   max_value);
+        Torus delta = (Torus(1) << (sizeof(Torus) * 8 - 1)) /
+                      (message_modulus * carry_modulus);
+        Torus *last_block_ptr = (Torus *)accumulator->ptr +
+                                (num_chunks - 1) * (big_lwe_dimension + 1);
+        device_add_scalar_one_inplace<<<1, 1, 0, streams.stream(0)>>>(
+            last_block_ptr, 1, big_lwe_dimension, pad * delta);
+        check_cuda_error(cudaGetLastError());
      }
      lut = is_max_value_lut;
    }
@@ -163,13 +153,6 @@ __host__ void are_all_comparisons_block_true(
      // In the last iteration we copy the output to the final address
      integer_radix_apply_univariate_lookup_table<Torus>(
          streams, lwe_array_out, accumulator, bsks, ksks, lut, 1);
-      // Reset max_value_lut_indexes before returning, otherwise if the lut is
-      // reused the lut indexes will be wrong
-      auto active_gpu_count_is_max = streams.active_gpu_subset(
-          is_max_value_lut->num_blocks, params.pbs_type);
-      is_max_value_lut->set_lut_indexes_and_broadcast_constant(
-          active_gpu_count_is_max, 0);
-
      reset_radix_ciphertext_blocks(lwe_array_out, 1);
      return;
    } else {
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -546,7 +546,9 @@ __host__ void integer_radix_apply_univariate_lookup_table(

  // Verify consistency between set_lut_indexes and apply_lookup_table
  GPU_ASSERT(
-      num_radix_blocks <= lut->last_broadcast_num_radix_blocks,
+      lut->num_luts == 1
+          ? num_radix_blocks <= lut->last_broadcast_num_radix_blocks
+          : num_radix_blocks == lut->last_broadcast_num_radix_blocks,
      "num_radix_blocks (%u) must match last_broadcast_num_radix_blocks (%u)",
      num_radix_blocks, lut->last_broadcast_num_radix_blocks);
  GPU_ASSERT(active_streams.count() <= lut->last_broadcast_streams.count(),
@@ -655,6 +657,13 @@ __host__ void integer_radix_apply_many_univariate_lookup_table(
  if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
    PANIC("Cuda error: input and output radix ciphertexts should have the same "
          "lwe dimension")
+  GPU_ASSERT(
+      lut->num_luts == 1 ? lwe_array_in->num_radix_blocks <=
+                               lut->last_broadcast_num_radix_blocks
+                         : lwe_array_in->num_radix_blocks ==
+                               lut->last_broadcast_num_radix_blocks,
+      "num_radix_blocks (%u) must match last_broadcast_num_radix_blocks (%u)",
+      lwe_array_in->num_radix_blocks, lut->last_broadcast_num_radix_blocks);

  auto num_radix_blocks = lwe_array_in->num_radix_blocks;
  /// For multi GPU execution we create vectors of pointers for inputs and
@@ -747,6 +756,12 @@ __host__ void integer_radix_apply_bivariate_lookup_table(
  if (num_radix_blocks > lut->num_blocks)
    PANIC("Cuda error: num radix blocks on which lut is applied should be "
          "smaller or equal to the number of lut radix blocks")
+  GPU_ASSERT(
+      lut->num_luts == 1
+          ? num_radix_blocks <= lut->last_broadcast_num_radix_blocks
+          : num_radix_blocks == lut->last_broadcast_num_radix_blocks,
+      "num_radix_blocks (%u) must match last_broadcast_num_radix_blocks (%u)",
+      num_radix_blocks, lut->last_broadcast_num_radix_blocks);
  if (num_radix_blocks > lwe_array_out->num_radix_blocks ||
      num_radix_blocks > lwe_array_1->num_radix_blocks ||
      num_radix_blocks > lwe_array_2->num_radix_blocks)
@@ -1376,10 +1391,18 @@ void host_resolve_group_carries_sequentially(

      // Apply the lut
      auto luts_sequential = mem->lut_sequential_algorithm;
+      auto lut_index_generator = [](Torus *h_lut_indexes,
+                                    uint32_t num_indexes) {
+        for (uint32_t i = 0; i < num_indexes; i++)
+          h_lut_indexes[i] = i;
+      };
+      luts_sequential->prepare_to_apply_to_block_subset(blocks_to_solve,
+                                                        lut_index_generator);
      CudaRadixCiphertextFFI shifted_group_resolved_carries;
      as_radix_ciphertext_slice<Torus>(&shifted_group_resolved_carries,
                                       group_resolved_carries, 1,
                                       blocks_to_solve + 1);
+
      integer_radix_apply_univariate_lookup_table<Torus>(
          streams, &shifted_group_resolved_carries,
          &shifted_group_resolved_carries, bsks, ksks, luts_sequential,
@@ -1716,6 +1739,7 @@ extract_n_bits(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
          num_radix_blocks);
    }
  }
+  bit_extract->prepare_lut_for_blocks(effective_num_radix_blocks);
  integer_radix_apply_univariate_lookup_table<Torus>(
      streams, lwe_array_out, lwe_array_out, bsks, ksks, bit_extract->lut,
      effective_num_radix_blocks);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -414,6 +414,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(

      // we just need to broadcast the indexes
      luts_message_carry->broadcast_lut(active_streams, false);
+      luts_message_carry->prepare_to_apply_to_block_subset(
+          total_ciphertexts, LUT_0_FOR_ALL_BLOCKS);
      luts_message_carry->using_trivial_lwe_indexes = false;

      integer_radix_apply_univariate_lookup_table<Torus>(
@@ -467,6 +469,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(
      uint32_t num_blocks_in_apply_lut = 2 * num_radix_blocks;
      // we just need to broadcast the indexes
      luts_message_carry->broadcast_lut(active_streams, false);
+      luts_message_carry->prepare_to_apply_to_block_subset(
+          num_blocks_in_apply_lut, LUT_0_FOR_ALL_BLOCKS);
      luts_message_carry->using_trivial_lwe_indexes = false;

      integer_radix_apply_univariate_lookup_table<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
@@ -0,0 +1,317 @@
+#include "programmable_bootstrap_amortized.cuh"
+
+/*
+ * Scratch entry point of the amortized PBS for 64-bit inputs. Selects the
+ * scratch_programmable_bootstrap_amortized instantiation that matches
+ * `polynomial_size`, hands it `pbs_buffer` together with the remaining
+ * arguments, and returns whatever size that instantiation reports.
+ */
+uint64_t scratch_cuda_programmable_bootstrap_amortized_64_async(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+
+  // The opaque stream handle is converted once and shared by every branch.
+  auto cuda_stream = static_cast<cudaStream_t>(stream);
+  uint64_t tracked_size = 0;
+
+  // The degree is a template argument, so each supported size needs its own
+  // instantiation.
+  switch (polynomial_size) {
+  case 256:
+    tracked_size =
+        scratch_programmable_bootstrap_amortized<uint64_t,
+                                                 AmortizedDegree<256>>(
+            cuda_stream, gpu_index, pbs_buffer, glwe_dimension,
+            polynomial_size, input_lwe_ciphertext_count, allocate_gpu_memory);
+    break;
+  case 512:
+    tracked_size =
+        scratch_programmable_bootstrap_amortized<uint64_t,
+                                                 AmortizedDegree<512>>(
+            cuda_stream, gpu_index, pbs_buffer, glwe_dimension,
+            polynomial_size, input_lwe_ciphertext_count, allocate_gpu_memory);
+    break;
+  case 1024:
+    tracked_size =
+        scratch_programmable_bootstrap_amortized<uint64_t,
+                                                 AmortizedDegree<1024>>(
+            cuda_stream, gpu_index, pbs_buffer, glwe_dimension,
+            polynomial_size, input_lwe_ciphertext_count, allocate_gpu_memory);
+    break;
+  case 2048:
+    tracked_size =
+        scratch_programmable_bootstrap_amortized<uint64_t,
+                                                 AmortizedDegree<2048>>(
+            cuda_stream, gpu_index, pbs_buffer, glwe_dimension,
+            polynomial_size, input_lwe_ciphertext_count, allocate_gpu_memory);
+    break;
+  case 4096:
+    tracked_size =
+        scratch_programmable_bootstrap_amortized<uint64_t,
+                                                 AmortizedDegree<4096>>(
+            cuda_stream, gpu_index, pbs_buffer, glwe_dimension,
+            polynomial_size, input_lwe_ciphertext_count, allocate_gpu_memory);
+    break;
+  case 8192:
+    tracked_size =
+        scratch_programmable_bootstrap_amortized<uint64_t,
+                                                 AmortizedDegree<8192>>(
+            cuda_stream, gpu_index, pbs_buffer, glwe_dimension,
+            polynomial_size, input_lwe_ciphertext_count, allocate_gpu_memory);
+    break;
+  case 16384:
+    tracked_size =
+        scratch_programmable_bootstrap_amortized<uint64_t,
+                                                 AmortizedDegree<16384>>(
+            cuda_stream, gpu_index, pbs_buffer, glwe_dimension,
+            polynomial_size, input_lwe_ciphertext_count, allocate_gpu_memory);
+    break;
+  default:
+    PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
+          "N's are powers of two"
+          " in the interval [256..16384].")
+  }
+
+  return tracked_size;
+}
+
+/* Amortized PBS on a batch of u32 LWE ciphertexts (see the 64-bit variant for
+ * parameter details). PANICs if base_log > 32 or if polynomial_size is not a
+ * power of two in [256..16384]; otherwise dispatches on polynomial_size. */
+void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32_async(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lut_vector_indexes, void const *lwe_array_in,
+    void const *lwe_input_indexes, void const *bootstrapping_key,
+    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples) {
+  // Reject a base log larger than the 32-bit ciphertext representation.
+  if (base_log > 32)
+    PANIC("Cuda error (amortized PBS): base log should be <= number of bits "
+          "in the ciphertext representation (32)");
+  // The degree is a template argument: one instantiation per supported size.
+  switch (polynomial_size) {
+  case 256:
+    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<256>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
+        (uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
+        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
+        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
+        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
+        num_samples);
+    break;
+  case 512:
+    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<512>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
+        (uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
+        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
+        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
+        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
+        num_samples);
+    break;
+  case 1024:
+    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<1024>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
+        (uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
+        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
+        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
+        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
+        num_samples);
+    break;
+  case 2048:
+    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<2048>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
+        (uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
+        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
+        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
+        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
+        num_samples);
+    break;
+  case 4096:
+    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<4096>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
+        (uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
+        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
+        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
+        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
+        num_samples);
+    break;
+  case 8192:
+    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<8192>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
+        (uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
+        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
+        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
+        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
+        num_samples);
+    break;
+  case 16384:
+    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<16384>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint32_t *)lwe_array_out,
+        (uint32_t *)lwe_output_indexes, (uint32_t *)lut_vector,
+        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
+        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
+        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
+        num_samples);
+    break;
+  default:
+    PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
+          "N's are powers of two"
+          " in the interval [256..16384].")
+  }
+}
+
+/* Perform the programmable bootstrapping on a batch of input u64 LWE
+ * ciphertexts. This function performs best for large numbers of inputs (> 10).
+ * - `stream` is a void pointer to the Cuda stream to be used in the kernel
+ * launch
+ * - `gpu_index` is the index of the GPU to be used in the kernel launch
+ *  - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
+ * (a0,..an-1,b) where n is the LWE dimension
+ *  - lut_vector: should hold as many luts of size polynomial_size
+ * as there are input ciphertexts, but actually holds
+ * num_luts vectors to reduce memory usage
+ *  - lut_vector_indexes: stores the index corresponding to
+ * which lut of lut_vector to use for each LWE input in
+ * lwe_array_in
+ *  - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
+ * mask values + 1 body value
+ *  - bootstrapping_key: GGSW encryption of the LWE secret key sk1
+ * under secret key sk2
+ * bsk = Z + sk1 H
+ * where H is the gadget matrix and Z is a matrix (k+1).l
+ * containing GLWE encryptions of 0 under sk2.
+ * bsk is thus a tensor of size (k+1)^2.l.N.n
+ * where l is the number of decomposition levels and
+ * k is the GLWE dimension, N is the polynomial size for
+ * GLWE. The polynomial size for GLWE and the lut
+ * are the same because they have to be in the same ring
+ * to be multiplied.
+ * - lwe_dimension: size of the Torus vector used to encrypt the input
+ * LWE ciphertexts - referred to as n above (~ 600)
+ * - polynomial_size: size of the test polynomial (lut) and size of the
+ * GLWE polynomials (~1024) (where `size` refers to the polynomial degree + 1).
+ * - base_log: log of the base used for the gadget matrix - B = 2^base_log (~8)
+ * - level_count: number of decomposition levels in the gadget matrix (~4)
+ * - num_samples: number of encrypted input messages
+ * - PANICs if base_log > 64 or polynomial_size is not a supported power of two
+ *
+ * This function calls a wrapper to a device kernel that performs the
+ * bootstrapping:
+ * 	- the kernel is templatized based on integer discretization and
+ * polynomial degree
+ * 	- num_samples blocks of threads are launched, where each thread is going
+ * to handle one or more polynomial coefficients at each stage:
+ * 		- perform the blind rotation
+ * 		- round the result
+ * 		- decompose into level_count levels, then for each level:
+ * 		  - switch to the FFT domain
+ * 		  - multiply with the bootstrapping key
+ * 		  - come back to the coefficients representation
+ * 	- between each stage a synchronization of the threads is necessary
+ * 	- in case the device has enough shared memory, temporary arrays used for
+ * the different stages (accumulators) are stored into the shared memory
+ * 	- the accumulators serve to combine the results for all decomposition
+ * levels
+ * 	- the constant memory (64K) is used for storing the roots of unity
+ * values for the FFT
+ */
+void cuda_programmable_bootstrap_amortized_64_async(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lut_vector_indexes, void const *lwe_array_in,
+    void const *lwe_input_indexes, void const *bootstrapping_key,
+    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples) {
+  // Reject a base log larger than the 64-bit ciphertext representation.
+  if (base_log > 64)
+    PANIC("Cuda error (amortized PBS): base log should be <= number of bits "
+          "in the ciphertext representation (64)");
+  // The degree is a template argument: one instantiation per supported size.
+  switch (polynomial_size) {
+  case 256:
+    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<256>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
+        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
+        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
+        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
+        num_samples);
+    break;
+  case 512:
+    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<512>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
+        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
+        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
+        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
+        num_samples);
+    break;
+  case 1024:
+    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<1024>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
+        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
+        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
+        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
+        num_samples);
+    break;
+  case 2048:
+    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<2048>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
+        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
+        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
+        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
+        num_samples);
+    break;
+  case 4096:
+    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<4096>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
+        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
+        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
+        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
+        num_samples);
+    break;
+  case 8192:
+    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<8192>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
+        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
+        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
+        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
+        num_samples);
+    break;
+  case 16384:
+    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<16384>>(
+        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)lwe_output_indexes, (uint64_t *)lut_vector,
+        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
+        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
+        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
+        num_samples);
+    break;
+  default:
+    PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
+          "N's are powers of two"
+          " in the interval [256..16384].")
+  }
+}
+
+/*
+ * Cleanup for the amortized PBS: passes `*pbs_buffer` to cuda_drop_async on
+ * `stream`, nulls the caller's pointer, then synchronizes that stream.
+ */
+void cleanup_cuda_programmable_bootstrap_amortized_64(void *stream,
+                                                      uint32_t gpu_index,
+                                                      int8_t **pbs_buffer) {
+  auto cuda_stream = static_cast<cudaStream_t>(stream);
+  // Drop the buffer first, then clear the handle the caller still holds.
+  cuda_drop_async(*pbs_buffer, cuda_stream, gpu_index);
+  *pbs_buffer = nullptr;
+  cuda_synchronize_stream(cuda_stream, gpu_index);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
@@ -0,0 +1,371 @@
+#ifndef CUDA_AMORTIZED_PBS_CUH
+#define CUDA_AMORTIZED_PBS_CUH
+
+#ifdef __CDT_PARSER__
+#undef __CUDA_RUNTIME_H__
+#include <cuda_runtime.h>
+#endif
+
+#include "bootstrapping_key.cuh"
+#include "crypto/gadget.cuh"
+#include "crypto/torus.cuh"
+#include "device.h"
+#include "fft/bnsmfft.cuh"
+#include "fft/twiddles.cuh"
+#include "pbs/programmable_bootstrap.h"
+#include "polynomial/functions.cuh"
+#include "polynomial/parameters.cuh"
+#include "polynomial/polynomial_math.cuh"
+#include "types/complex/operations.cuh"
+
+template <typename Torus, class params, sharedMemDegree SMD>
+/*
+ * Kernel launched by host_programmable_bootstrap_amortized
+ *
+ * Uses shared memory to increase performance
+ *  - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
+ * (a0,..an-1,b) where n is the LWE dimension
+ *  - lut_vector: should hold as many luts of size polynomial_size
+ * as there are input ciphertexts, but actually holds
+ * num_luts vectors to reduce memory usage
+ *  - lut_vector_indexes: stores the index corresponding to which lut
+ * to use for each sample in lut_vector
+ *  - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
+ * mask values + 1 body value
+ *  - bootstrapping_key: RGSW encryption of the LWE secret key sk1 under secret
+ * key sk2
+ *  - device_mem: pointer to the device's global memory in case we use it (SMD
+ * == NOSM or PARTIALSM)
+ *  - lwe_dimension: size of the Torus vector used to encrypt the input
+ * LWE ciphertexts - referred to as n above (~ 600)
+ *  - polynomial_size: size of the test polynomial (lut) and size of the
+ * GLWE polynomial (~1024)
+ *  - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
+ *  - level_count: number of decomposition levels in the gadget matrix (~4)
+ *  - lwe_input_indexes / lwe_output_indexes: ciphertext index per block
+ *  - device_memory_size_per_sample: byte stride between the per-block regions
+ * of device_mem, used when SMD is not FULLSM
+ */
+__global__ void device_programmable_bootstrap_amortized(
+    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+    const Torus *__restrict__ lut_vector,
+    const Torus *__restrict__ lut_vector_indexes,
+    const Torus *__restrict__ lwe_array_in,
+    const Torus *__restrict__ lwe_input_indexes,
+    const double2 *__restrict__ bootstrapping_key, int8_t *device_mem,
+    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
+    uint32_t base_log, uint32_t level_count,
+    size_t device_memory_size_per_sample) {
+  // We use shared memory for the polynomials that are used often during the
+  // bootstrap, since shared memory is kept in L1 cache and accessing it is
+  // much faster than global memory
+  extern __shared__ int8_t sharedmem[];
+  int8_t *selected_memory;
+
+  if constexpr (SMD == FULLSM)
+    selected_memory = sharedmem;
+  else
+    selected_memory = &device_mem[blockIdx.x * device_memory_size_per_sample];
+
+  // Carve the work buffers out of the selected memory: accumulator and
+  // accumulator_rotated each hold (glwe_dimension + 1) polynomials.
+  Torus *accumulator = (Torus *)selected_memory;
+  Torus *accumulator_rotated =
+      (Torus *)accumulator +
+      (ptrdiff_t)((glwe_dimension + 1) * polynomial_size);
+  double2 *res_fft =
+      (double2 *)accumulator_rotated + (glwe_dimension + 1) * polynomial_size /
+                                           (sizeof(double2) / sizeof(Torus));
+  double2 *accumulator_fft = (double2 *)sharedmem;
+  if constexpr (SMD != PARTIALSM)
+    accumulator_fft = (double2 *)res_fft +
+                      (ptrdiff_t)((glwe_dimension + 1) * polynomial_size / 2);
+
+  auto block_lwe_array_in =
+      &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
+  const Torus *block_lut_vector =
+      &lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
+                  (glwe_dimension + 1)];
+
+  // Put "b", the body, in [0, 2N[
+  constexpr auto log_modulus = params::log2_degree + 1;
+  Torus b_hat = 0;
+  auto correction = centered_binary_modulus_switch_body_correction_to_add(
+      block_lwe_array_in, lwe_dimension, log_modulus);
+  modulus_switch(block_lwe_array_in[lwe_dimension] + correction, b_hat,
+                 log_modulus);
+
+  divide_by_monomial_negacyclic_inplace<Torus, params::opt,
+                                        params::degree / params::opt>(
+      accumulator, block_lut_vector, b_hat, false, glwe_dimension + 1);
+
+  // Loop over all the mask elements of the sample to accumulate
+  // (X^a_i-1) multiplication, decomposition of the resulting polynomial
+  // into level_count polynomials, and performing polynomial multiplication
+  // via an FFT with the RGSW encrypted secret key
+  for (int iteration = 0; iteration < lwe_dimension; iteration++) {
+    __syncthreads();
+
+    // Put "a" in [0, 2N[ instead of Zq
+    Torus a_hat = 0;
+    modulus_switch(block_lwe_array_in[iteration], a_hat, log_modulus);
+
+    // Perform ACC * (X^a_hat - 1)
+    multiply_by_monomial_negacyclic_and_sub_polynomial<
+        Torus, params::opt, params::degree / params::opt>(
+        accumulator, accumulator_rotated, a_hat, glwe_dimension + 1);
+
+    __syncthreads();
+
+    // Perform a rounding to increase the accuracy of the
+    // bootstrapped ciphertext
+    init_decomposer_state_inplace<Torus, params::opt,
+                                  params::degree / params::opt>(
+        accumulator_rotated, base_log, level_count, glwe_dimension + 1);
+
+    // Initialize the polynomial multiplication via FFT arrays
+    // The polynomial multiplications happens at the block level
+    // and each thread handles two or more coefficients
+    int pos = threadIdx.x;
+    for (int i = 0; i < (glwe_dimension + 1); i++)
+      for (int j = 0; j < params::opt / 2; j++) {
+        res_fft[pos].x = 0;
+        res_fft[pos].y = 0;
+        pos += params::degree / params::opt;
+      }
+
+    GadgetMatrix<Torus, params> gadget(base_log, level_count,
+                                       accumulator_rotated, glwe_dimension + 1);
+    // Now that the rotation is done, decompose the resulting polynomial
+    // coefficients so as to multiply each decomposed level with the
+    // corresponding part of the bootstrapping key
+    for (int level = level_count - 1; level >= 0; level--) {
+      for (int i = 0; i < (glwe_dimension + 1); i++) {
+        gadget.decompose_and_compress_next_polynomial(accumulator_fft, i);
+
+        // Switch to the FFT space
+        NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
+
+        // Get the bootstrapping key piece necessary for the multiplication
+        // It is already in the Fourier domain
+        auto bsk_slice = get_ith_mask_kth_block(bootstrapping_key, iteration, i,
+                                                level, polynomial_size,
+                                                glwe_dimension, level_count);
+
+        // Perform the coefficient-wise product with the two pieces of
+        // bootstrapping key
+        for (int j = 0; j < (glwe_dimension + 1); j++) {
+          auto bsk_poly = bsk_slice + j * params::degree / 2;
+          auto res_fft_poly = res_fft + j * params::degree / 2;
+          polynomial_product_accumulate_in_fourier_domain<params, double2>(
+              res_fft_poly, accumulator_fft, bsk_poly);
+        }
+      }
+      __syncthreads();
+    }
+
+    // Come back to the coefficient representation
+    if constexpr (SMD == FULLSM || SMD == NOSM) {
+      __syncthreads();
+
+      for (int i = 0; i < (glwe_dimension + 1); i++) {
+        auto res_fft_slice = res_fft + i * params::degree / 2;
+        NSMFFT_inverse<HalfDegree<params>>(res_fft_slice);
+      }
+      __syncthreads();
+
+      for (int i = 0; i < (glwe_dimension + 1); i++) {
+        auto accumulator_slice = accumulator + i * params::degree;
+        auto res_fft_slice = res_fft + i * params::degree / 2;
+        add_to_torus<Torus, params>(res_fft_slice, accumulator_slice);
+      }
+      __syncthreads();
+    } else {
+#pragma unroll
+      for (int i = 0; i < (glwe_dimension + 1); i++) {
+        auto accumulator_slice = accumulator + i * params::degree;
+        auto res_fft_slice = res_fft + i * params::degree / 2;
+        int tid = threadIdx.x;
+        for (int j = 0; j < params::opt / 2; j++) {
+          accumulator_fft[tid] = res_fft_slice[tid];
+          tid = tid + params::degree / params::opt;
+        }
+        __syncthreads();
+
+        NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
+        __syncthreads();
+
+        add_to_torus<Torus, params>(accumulator_fft, accumulator_slice);
+      }
+      __syncthreads();
+    }
+  }
+
+  auto block_lwe_array_out =
+      &lwe_array_out[lwe_output_indexes[blockIdx.x] *
+                     (glwe_dimension * polynomial_size + 1)];
+
+  // The blind rotation for this block is over
+  // Now we can perform the sample extraction: for the body it's just
+  // the resulting constant coefficient of the accumulator
+  // For the mask it's more complicated
+  sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
+                                     glwe_dimension);
+
+  // No need to sync here, it is already synchronized after add_to_torus
+  sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
+                                     glwe_dimension);
+}
+
+template <typename Torus>
+uint64_t get_buffer_size_full_sm_programmable_bootstrap_amortized(
+    uint32_t polynomial_size, uint32_t glwe_dimension) {
+  return safe_mul_sizeof<Torus>((size_t)polynomial_size,
+                                (size_t)(glwe_dimension + 1)) + // accumulator
+         safe_mul_sizeof<Torus>(
+             (size_t)polynomial_size,
+             (size_t)(glwe_dimension + 1)) +             // accumulator_rotated
+         safe_mul_sizeof<double2>(polynomial_size / 2) + // accumulator_fft
+         safe_mul_sizeof<double2>((size_t)(polynomial_size / 2),
+                                  (size_t)(glwe_dimension + 1)); // res_fft
+}
+
+template <typename Torus>
+uint64_t get_buffer_size_partial_sm_programmable_bootstrap_amortized(
+    uint32_t polynomial_size) {
+  return safe_mul_sizeof<double2>(polynomial_size / 2); // accumulator_fft
+}
+
+template <typename Torus>
+uint64_t get_buffer_size_programmable_bootstrap_amortized(
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
+  // Global memory needed for all samples; 0 when shared memory holds it all
+  uint64_t full_sm =
+      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
+          polynomial_size, glwe_dimension);
+  uint64_t partial_sm =
+      get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
+          polynomial_size);
+  uint64_t partial_dm = full_sm - partial_sm;
+  uint64_t device_mem = 0;
+  if (max_shared_memory < partial_sm) {
+    device_mem = full_sm * input_lwe_ciphertext_count;
+  } else if (max_shared_memory < full_sm) {
+    device_mem = partial_dm * input_lwe_ciphertext_count;
+  }
+  // Round up to a multiple of sizeof(double2)
+  return (device_mem + sizeof(double2) - 1) / sizeof(double2) * sizeof(double2);
+}
+
+template <typename Torus, typename params>
+__host__ uint64_t scratch_programmable_bootstrap_amortized(
+    cudaStream_t stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+  // Configures the kernel's shared memory, then allocates the PBS buffer
+  cuda_set_device(gpu_index);
+
+  uint64_t full_sm =
+      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
+          polynomial_size, glwe_dimension);
+  uint64_t partial_sm =
+      get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
+          polynomial_size);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
+    check_cuda_error(cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
+        cudaFuncCachePreferShared));
+  } else if (max_shared_memory >= partial_sm) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_programmable_bootstrap_amortized<Torus, params, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
+    check_cuda_error(cudaFuncSetCacheConfig(
+        device_programmable_bootstrap_amortized<Torus, params, FULLSM>,
+        cudaFuncCachePreferShared));
+  }
+  uint64_t size_tracker = 0;
+  uint64_t buffer_size =
+      get_buffer_size_programmable_bootstrap_amortized<Torus>(
+          glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
+          max_shared_memory);
+  *pbs_buffer = (int8_t *)cuda_malloc_with_size_tracking_async(
+      buffer_size, stream, gpu_index, size_tracker, allocate_gpu_memory);
+  check_cuda_error(cudaGetLastError());
+  return size_tracker;
+}
+
+template <typename Torus, class params>
+__host__ void host_programmable_bootstrap_amortized(
+    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
+    int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count) {
+  PANIC_IF_FALSE(sizeof(Torus) == 8,
+                 "Error: Programmable bootstrap amortized only supports 64-bit "
+                 "Torus type.");
+  uint64_t SM_FULL =
+      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
+          polynomial_size, glwe_dimension);
+
+  uint64_t SM_PART =
+      get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
+          polynomial_size);
+
+  uint64_t DM_PART = SM_FULL - SM_PART;
+
+  uint64_t DM_FULL = SM_FULL;
+
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+  cuda_set_device(gpu_index);
+
+  // Create a 1-dimensional grid of threads
+  // where each block handles 1 sample and each thread
+  // handles opt polynomial coefficients
+  // (actually opt/2 coefficients since we compress the real polynomial into a
+  // complex)
+  dim3 grid(input_lwe_ciphertext_count, 1, 1);
+  dim3 thds(polynomial_size / params::opt, 1, 1);
+
+  // Launch the kernel using polynomial_size/opt threads
+  // where each thread computes opt polynomial coefficients
+  // Depending on the required amount of shared memory, choose
+  // from one of three templates (no use, partial use or full use
+  // of shared memory)
+  if (max_shared_memory < SM_PART) {
+    device_programmable_bootstrap_amortized<Torus, params, NOSM>
+        <<<grid, thds, 0, stream>>>(
+            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
+            lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
+            glwe_dimension, lwe_dimension, polynomial_size, base_log,
+            level_count, DM_FULL);
+  } else if (max_shared_memory < SM_FULL) {
+    device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>
+        <<<grid, thds, SM_PART, stream>>>(
+            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
+            lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
+            glwe_dimension, lwe_dimension, polynomial_size, base_log,
+            level_count, DM_PART);
+  } else {
+    // Everything fits in shared memory: request SM_FULL bytes of dynamic
+    // shared memory and pass 0 as the per-sample device memory stride.
+    // NOTE(review): going above the default dynamic shared memory limit
+    // presumably relies on the cudaFuncSetAttribute call made in
+    // scratch_programmable_bootstrap_amortized - confirm it always runs first
+    device_programmable_bootstrap_amortized<Torus, params, FULLSM>
+        <<<grid, thds, SM_FULL, stream>>>(
+            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
+            lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
+            glwe_dimension, lwe_dimension, polynomial_size, base_log,
+            level_count, 0);
+  }
+  check_cuda_error(cudaGetLastError());
+}
+
+#endif // CUDA_AMORTIZED_PBS_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -404,11 +404,10 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
       lwe_offset += lwe_chunk_size) {

    // Compute a keybundle
-    execute_compute_keybundle_with_mode<Torus, params>(
+    execute_compute_keybundle<Torus, params>(
        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, level_count, lwe_offset,
-        MultiBitKeybundleLaunchMode::GENERIC);
+        grouping_factor, level_count, lwe_offset);

    // Accumulate
    execute_cg_external_product_loop<Torus, params>(
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
@@ -195,81 +195,6 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
          " in the interval [256..16384].")
  }
 }
-
-template <typename Torus>
-void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector_generic(
-    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
-    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
-    uint32_t lut_stride) {
-
-  switch (polynomial_size) {
-  case 256:
-    host_programmable_bootstrap_tbc_generic<Torus, AmortizedDegree<256>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_many_lut, lut_stride);
-    break;
-  case 512:
-    host_programmable_bootstrap_tbc_generic<Torus, AmortizedDegree<512>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_many_lut, lut_stride);
-    break;
-  case 1024:
-    host_programmable_bootstrap_tbc_generic<Torus, AmortizedDegree<1024>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_many_lut, lut_stride);
-    break;
-  case 2048:
-    host_programmable_bootstrap_tbc_generic<Torus, Degree<2048>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_many_lut, lut_stride);
-    break;
-  case 4096:
-    host_programmable_bootstrap_tbc_generic<Torus, AmortizedDegree<4096>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_many_lut, lut_stride);
-    break;
-  case 8192:
-    host_programmable_bootstrap_tbc_generic<Torus, AmortizedDegree<8192>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_many_lut, lut_stride);
-    break;
-  case 16384:
-    host_programmable_bootstrap_tbc_generic<Torus, AmortizedDegree<16384>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_many_lut, lut_stride);
-    break;
-  default:
-    PANIC("Cuda error (classical PBS): unsupported polynomial size. "
-          "Supported N's are powers of two"
-          " in the interval [256..16384].")
-  }
-}
 #endif

 template <typename Torus>
@@ -408,46 +333,6 @@ uint64_t scratch_cuda_programmable_bootstrap_64_async(
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
 }

-uint64_t scratch_cuda_programmable_bootstrap_tbc_generic_64_async(
-    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
-#if (CUDA_ARCH >= 900)
-  return scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
-      stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
-      lwe_dimension, glwe_dimension, polynomial_size, level_count,
-      input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
-#else
-  (void)stream;
-  (void)gpu_index;
-  (void)buffer;
-  (void)lwe_dimension;
-  (void)glwe_dimension;
-  (void)polynomial_size;
-  (void)level_count;
-  (void)input_lwe_ciphertext_count;
-  (void)allocate_gpu_memory;
-  (void)noise_reduction_type;
-  PANIC("Cuda error (classical PBS): TBC pbs is not supported.")
-#endif
-}
-
-uint64_t scratch_cuda_programmable_bootstrap_tbc_2_2_64_async(
-    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
-  PANIC_IF_FALSE(
-      polynomial_size == 2048 && level_count == 1 && glwe_dimension == 1,
-      "Cuda error (classical PBS): specialized TBC 2_2 scratch requires "
-      "(N=2048, level_count=1, glwe_dimension=1).");
-  return scratch_cuda_programmable_bootstrap_tbc_generic_64_async(
-      stream, gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size,
-      level_count, input_lwe_ciphertext_count, allocate_gpu_memory,
-      noise_reduction_type);
-}
-
 template <typename Torus>
 void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
@@ -784,102 +669,6 @@ void cuda_programmable_bootstrap_64_async(
  }
 }

-void cuda_programmable_bootstrap_tbc_64_generic_async(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) {
-  if (base_log > 64)
-    PANIC("Cuda error (classical PBS): base log should be <= 64")
-
-  pbs_buffer<uint64_t, CLASSICAL> *buffer =
-      (pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
-  PANIC_IF_FALSE(buffer->pbs_variant == PBS_VARIANT::TBC,
-                 "Cuda error (classical PBS): expected a TBC buffer.");
-
-#if (CUDA_ARCH >= 900)
-  cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector_generic<uint64_t>(
-      stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
-      static_cast<const uint64_t *>(lwe_output_indexes),
-      static_cast<const uint64_t *>(lut_vector),
-      static_cast<const uint64_t *>(lut_vector_indexes),
-      static_cast<const uint64_t *>(lwe_array_in),
-      static_cast<const uint64_t *>(lwe_input_indexes),
-      static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
-      glwe_dimension, polynomial_size, base_log, level_count, num_samples,
-      num_many_lut, lut_stride);
-#else
-  (void)stream;
-  (void)gpu_index;
-  (void)lwe_array_out;
-  (void)lwe_output_indexes;
-  (void)lut_vector;
-  (void)lut_vector_indexes;
-  (void)lwe_array_in;
-  (void)lwe_input_indexes;
-  (void)bootstrapping_key;
-  (void)lwe_dimension;
-  (void)glwe_dimension;
-  (void)polynomial_size;
-  (void)level_count;
-  (void)num_samples;
-  (void)num_many_lut;
-  (void)lut_stride;
-  PANIC("Cuda error (classical PBS): TBC pbs is not supported.")
-#endif
-}
-
-void cuda_programmable_bootstrap_tbc_64_2_2_async(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) {
-  PANIC_IF_FALSE(polynomial_size == 2048 && level_count == 1 &&
-                     glwe_dimension == 1 && base_log == 23,
-                 "Cuda error (classical PBS): specialized TBC 2_2 requires "
-                 "(N=2048, level_count=1, glwe_dimension=1, base_log=23).");
-
-  pbs_buffer<uint64_t, CLASSICAL> *buffer =
-      (pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
-  PANIC_IF_FALSE(buffer->pbs_variant == PBS_VARIANT::TBC,
-                 "Cuda error (classical PBS): expected a TBC buffer.");
-
-#if (CUDA_ARCH >= 900)
-  host_programmable_bootstrap_tbc_2_2_specialized<uint64_t, Degree<2048>>(
-      static_cast<cudaStream_t>(stream), gpu_index,
-      static_cast<uint64_t *>(lwe_array_out),
-      static_cast<const uint64_t *>(lwe_output_indexes),
-      static_cast<const uint64_t *>(lut_vector),
-      static_cast<const uint64_t *>(lut_vector_indexes),
-      static_cast<const uint64_t *>(lwe_array_in),
-      static_cast<const uint64_t *>(lwe_input_indexes),
-      static_cast<const double2 *>(bootstrapping_key), buffer, glwe_dimension,
-      lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-      num_many_lut, lut_stride);
-#else
-  (void)stream;
-  (void)gpu_index;
-  (void)lwe_array_out;
-  (void)lwe_output_indexes;
-  (void)lut_vector;
-  (void)lut_vector_indexes;
-  (void)lwe_array_in;
-  (void)lwe_input_indexes;
-  (void)bootstrapping_key;
-  (void)lwe_dimension;
-  (void)num_samples;
-  (void)num_many_lut;
-  (void)lut_stride;
-  PANIC("Cuda error (classical PBS): TBC pbs is not supported.")
-#endif
-}
-
 /*
 * This cleanup function frees the data on GPU for the PBS buffer for 32 or 64
 * bits inputs.
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
@@ -3,7 +3,6 @@
 #include "pbs/programmable_bootstrap_multibit.h"
 #include "programmable_bootstrap_cg_multibit.cuh"
 #include "programmable_bootstrap_multibit.cuh"
-#include <type_traits>

 #if (CUDA_ARCH >= 900)
 #include "programmable_bootstrap_tbc_multibit.cuh"
@@ -221,17 +220,6 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
  }
 }

-template <typename Torus>
-void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_generic(
-    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
-    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_many_lut, uint32_t lut_stride);
-
 void cuda_multi_bit_programmable_bootstrap_64_async(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
@@ -295,159 +283,6 @@ void cuda_multi_bit_programmable_bootstrap_64_async(
  }
 }

-void cuda_multi_bit_programmable_bootstrap_64_generic_async(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
-    uint32_t lut_stride) {
-  PANIC_IF_FALSE(base_log <= 64,
-                 "Cuda error (multi-bit PBS): base log (%d) should be <= 64",
-                 base_log);
-
-  pbs_buffer<uint64_t, MULTI_BIT> *buffer =
-      (pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
-  PANIC_IF_FALSE(buffer->pbs_variant == PBS_VARIANT::TBC,
-                 "Cuda error (multi-bit PBS): expected a TBC buffer.");
-
-#if CUDA_ARCH >= 900
-  cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_generic<
-      uint64_t>(stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
-                static_cast<const uint64_t *>(lwe_output_indexes),
-                static_cast<const uint64_t *>(lut_vector),
-                static_cast<const uint64_t *>(lut_vector_indexes),
-                static_cast<const uint64_t *>(lwe_array_in),
-                static_cast<const uint64_t *>(lwe_input_indexes),
-                static_cast<const uint64_t *>(bootstrapping_key), buffer,
-                lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
-                base_log, level_count, num_samples, num_many_lut, lut_stride);
-#else
-  (void)stream;
-  (void)gpu_index;
-  (void)lwe_array_out;
-  (void)lwe_output_indexes;
-  (void)lut_vector;
-  (void)lut_vector_indexes;
-  (void)lwe_array_in;
-  (void)lwe_input_indexes;
-  (void)bootstrapping_key;
-  (void)lwe_dimension;
-  (void)glwe_dimension;
-  (void)polynomial_size;
-  (void)grouping_factor;
-  (void)level_count;
-  (void)num_samples;
-  (void)num_many_lut;
-  (void)lut_stride;
-  PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
-#endif
-}
-
-void cuda_multi_bit_programmable_bootstrap_tbc_64_2_2_async(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
-    uint32_t lut_stride) {
-  PANIC_IF_FALSE(polynomial_size == 2048 && grouping_factor == 4 &&
-                     level_count == 1 && glwe_dimension == 1 && base_log == 22,
-                 "Cuda error (multi-bit PBS): specialized TBC 2_2 requires "
-                 "(N=2048, grouping_factor=4, level_count=1, glwe_dimension=1, "
-                 "base_log=22).");
-
-  pbs_buffer<uint64_t, MULTI_BIT> *buffer =
-      (pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
-  PANIC_IF_FALSE(buffer->pbs_variant == PBS_VARIANT::TBC,
-                 "Cuda error (multi-bit PBS): expected a TBC buffer.");
-
-#if CUDA_ARCH >= 900
-  host_tbc_multi_bit_programmable_bootstrap_2_2_specialized<uint64_t,
-                                                            Degree<2048>>(
-      static_cast<cudaStream_t>(stream), gpu_index,
-      static_cast<uint64_t *>(lwe_array_out),
-      static_cast<const uint64_t *>(lwe_output_indexes),
-      static_cast<const uint64_t *>(lut_vector),
-      static_cast<const uint64_t *>(lut_vector_indexes),
-      static_cast<const uint64_t *>(lwe_array_in),
-      static_cast<const uint64_t *>(lwe_input_indexes),
-      static_cast<const uint64_t *>(bootstrapping_key), buffer, glwe_dimension,
-      lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-      num_samples, num_many_lut, lut_stride);
-#else
-  (void)stream;
-  (void)gpu_index;
-  (void)lwe_array_out;
-  (void)lwe_output_indexes;
-  (void)lut_vector;
-  (void)lut_vector_indexes;
-  (void)lwe_array_in;
-  (void)lwe_input_indexes;
-  (void)bootstrapping_key;
-  (void)lwe_dimension;
-  (void)num_samples;
-  (void)num_many_lut;
-  (void)lut_stride;
-  PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
-#endif
-}
-
-void cuda_multi_bit_programmable_bootstrap_tbc_64_generic_async(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
-    uint32_t lut_stride) {
-  PANIC_IF_FALSE(base_log <= 64,
-                 "Cuda error (multi-bit PBS): base log (%d) should be <= 64",
-                 base_log);
-
-  pbs_buffer<uint64_t, MULTI_BIT> *buffer =
-      (pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
-  PANIC_IF_FALSE(buffer->pbs_variant == PBS_VARIANT::TBC,
-                 "Cuda error (multi-bit PBS): expected a TBC buffer.");
-
-#if CUDA_ARCH >= 900
-  cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_generic<
-      uint64_t>(stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
-                static_cast<const uint64_t *>(lwe_output_indexes),
-                static_cast<const uint64_t *>(lut_vector),
-                static_cast<const uint64_t *>(lut_vector_indexes),
-                static_cast<const uint64_t *>(lwe_array_in),
-                static_cast<const uint64_t *>(lwe_input_indexes),
-                static_cast<const uint64_t *>(bootstrapping_key), buffer,
-                lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
-                base_log, level_count, num_samples, num_many_lut, lut_stride);
-#else
-  (void)stream;
-  (void)gpu_index;
-  (void)lwe_array_out;
-  (void)lwe_output_indexes;
-  (void)lut_vector;
-  (void)lut_vector_indexes;
-  (void)lwe_array_in;
-  (void)lwe_input_indexes;
-  (void)bootstrapping_key;
-  (void)lwe_dimension;
-  (void)glwe_dimension;
-  (void)polynomial_size;
-  (void)grouping_factor;
-  (void)level_count;
-  (void)num_samples;
-  (void)num_many_lut;
-  (void)lut_stride;
-  PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
-#endif
-}
-
 template <typename Torus>
 uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
@@ -601,41 +436,6 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64_async(
        input_lwe_ciphertext_count, allocate_gpu_memory);
 }

-uint64_t scratch_cuda_multi_bit_programmable_bootstrap_tbc_generic_64_async(
-    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
-#if CUDA_ARCH >= 900
-  return scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
-      stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
-      glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count,
-      allocate_gpu_memory);
-#else
-  (void)stream;
-  (void)gpu_index;
-  (void)buffer;
-  (void)glwe_dimension;
-  (void)polynomial_size;
-  (void)level_count;
-  (void)input_lwe_ciphertext_count;
-  (void)allocate_gpu_memory;
-  PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
-#endif
-}
-
-uint64_t scratch_cuda_multi_bit_programmable_bootstrap_tbc_2_2_64_async(
-    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
-  PANIC_IF_FALSE(
-      polynomial_size == 2048 && level_count == 1 && glwe_dimension == 1,
-      "Cuda error (multi-bit PBS): specialized TBC 2_2 scratch requires "
-      "(N=2048, level_count=1, glwe_dimension=1).");
-  return scratch_cuda_multi_bit_programmable_bootstrap_tbc_generic_64_async(
-      stream, gpu_index, buffer, glwe_dimension, polynomial_size, level_count,
-      input_lwe_ciphertext_count, allocate_gpu_memory);
-}
-
 void cleanup_cuda_multi_bit_programmable_bootstrap_64(void *stream,
                                                      uint32_t gpu_index,
                                                      int8_t **buffer) {
@@ -843,9 +643,6 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_many_lut, uint32_t lut_stride) {
-  static_assert(std::is_same_v<Torus, uint64_t>,
-                "Cuda error (multi-bit PBS): TBC path currently supports only "
-                "uint64_t torus.");

  if (base_log > 32)
    PANIC("Cuda error (multi-bit PBS): base log should be <= 32")
@@ -928,108 +725,6 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
  }
 }

-template <typename Torus>
-void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_generic(
-    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
-    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_many_lut, uint32_t lut_stride) {
-  static_assert(std::is_same_v<Torus, uint64_t>,
-                "Cuda error (multi-bit PBS): TBC path currently supports only "
-                "uint64_t torus.");
-
-  if (base_log > 32)
-    PANIC("Cuda error (multi-bit PBS): base log should be <= 32")
-
-  switch (polynomial_size) {
-  case 256:
-    host_tbc_multi_bit_programmable_bootstrap_generic<uint64_t,
-                                                      AmortizedDegree<256>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
-    break;
-  case 512:
-    host_tbc_multi_bit_programmable_bootstrap_generic<Torus,
-                                                      AmortizedDegree<512>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
-    break;
-  case 1024:
-    host_tbc_multi_bit_programmable_bootstrap_generic<Torus,
-                                                      AmortizedDegree<1024>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
-    break;
-  case 2048: {
-    int num_sms = 0;
-    check_cuda_error(cudaDeviceGetAttribute(
-        &num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
-
-    if (4 * num_sms < num_samples * level_count * (glwe_dimension + 1))
-      host_tbc_multi_bit_programmable_bootstrap_generic<Torus,
-                                                        AmortizedDegree<2048>>(
-          static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-          lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-          lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-          lwe_dimension, polynomial_size, grouping_factor, base_log,
-          level_count, num_samples, num_many_lut, lut_stride);
-    else
-      host_tbc_multi_bit_programmable_bootstrap_generic<Torus, Degree<2048>>(
-          static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-          lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-          lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-          lwe_dimension, polynomial_size, grouping_factor, base_log,
-          level_count, num_samples, num_many_lut, lut_stride);
-
-    break;
-  }
-  case 4096:
-    host_tbc_multi_bit_programmable_bootstrap_generic<Torus,
-                                                      AmortizedDegree<4096>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
-    break;
-  case 8192:
-    host_tbc_multi_bit_programmable_bootstrap_generic<Torus,
-                                                      AmortizedDegree<8192>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
-    break;
-  case 16384:
-    host_tbc_multi_bit_programmable_bootstrap_generic<Torus,
-                                                      AmortizedDegree<16384>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
-    break;
-  default:
-    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
-          "N's are powers of two"
-          " in the interval [256..16384].")
-  }
-}
-
 template uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
    void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -658,20 +658,13 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
  return size_tracker;
 }

-enum class MultiBitKeybundleLaunchMode {
-  AUTO,
-  GENERIC,
-  SPECIALIZED_2_2,
-};
-
 template <typename Torus, class params>
-__host__ void execute_compute_keybundle_with_mode(
+__host__ void execute_compute_keybundle(
    cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset,
-    MultiBitKeybundleLaunchMode launch_mode) {
+    uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
  cuda_set_device(gpu_index);
  PANIC_IF_FALSE(sizeof(Torus) == 8,
                 "Error: PBS keybundle only supports 64-bit "
@@ -698,9 +691,6 @@ __host__ void execute_compute_keybundle_with_mode(
  dim3 thds(polynomial_size / params::opt, 1, 1);

  if (max_shared_memory < full_sm_keybundle) {
-    PANIC_IF_FALSE(launch_mode != MultiBitKeybundleLaunchMode::SPECIALIZED_2_2,
-                   "Cuda error (multi-bit PBS): specialized keybundle 2_2 "
-                   "requires FULLSM.");
    device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>
        <<<grid_keybundle, thds, 0, stream>>>(
            lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
@@ -713,21 +703,8 @@ __host__ void execute_compute_keybundle_with_mode(
            num_samples, glwe_dimension, polynomial_size, level_count,
            cuda_get_max_shared_memory(gpu_index));

-    bool can_use_specialized = supports_tbc && polynomial_size == 2048 &&
-                               grouping_factor == 4 && level_count == 1 &&
-                               glwe_dimension == 1;
-    if (launch_mode == MultiBitKeybundleLaunchMode::SPECIALIZED_2_2) {
-      PANIC_IF_FALSE(
-          can_use_specialized,
-          "Cuda error (multi-bit PBS): specialized keybundle 2_2 requires "
-          "(N=2048, grouping_factor=4, level_count=1, glwe_dimension=1).");
-    }
-
-    bool use_specialized =
-        launch_mode == MultiBitKeybundleLaunchMode::SPECIALIZED_2_2 ||
-        (launch_mode == MultiBitKeybundleLaunchMode::AUTO &&
-         can_use_specialized);
-    if (use_specialized) {
+    if (supports_tbc && polynomial_size == 2048 && grouping_factor == 4 &&
+        level_count == 1 && glwe_dimension == 1) {
      dim3 thds_new_keybundle(512, 1, 1);
      check_cuda_error(cudaFuncSetAttribute(
          device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
@@ -755,48 +732,6 @@ __host__ void execute_compute_keybundle_with_mode(
  check_cuda_error(cudaGetLastError());
 }

-template <typename Torus, class params>
-__host__ void execute_compute_keybundle(
-    cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
-    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
-  execute_compute_keybundle_with_mode<Torus, params>(
-      stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-      buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-      grouping_factor, level_count, lwe_offset,
-      MultiBitKeybundleLaunchMode::AUTO);
-}
-
-template <typename Torus, class params>
-__host__ void execute_compute_keybundle_generic(
-    cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
-    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
-  execute_compute_keybundle_with_mode<Torus, params>(
-      stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-      buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-      grouping_factor, level_count, lwe_offset,
-      MultiBitKeybundleLaunchMode::GENERIC);
-}
-
-template <typename Torus, class params>
-__host__ void execute_compute_keybundle_2_2_specialized(
-    cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
-    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
-  execute_compute_keybundle_with_mode<Torus, params>(
-      stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-      buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-      grouping_factor, level_count, lwe_offset,
-      MultiBitKeybundleLaunchMode::SPECIALIZED_2_2);
-}
-
 template <typename Torus, class params, bool is_first_iter>
 __host__ void execute_step_one(
    cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
@@ -916,11 +851,10 @@ __host__ void host_multi_bit_programmable_bootstrap(
       lwe_offset += lwe_chunk_size) {

    // Compute a keybundle
-    execute_compute_keybundle_with_mode<Torus, params>(
+    execute_compute_keybundle<Torus, params>(
        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, level_count, lwe_offset,
-        MultiBitKeybundleLaunchMode::GENERIC);
+        grouping_factor, level_count, lwe_offset);
    // Accumulate
    uint32_t chunk_size =
        std::min((uint32_t)lwe_chunk_size,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
@@ -22,12 +22,6 @@
 using namespace cooperative_groups;
 namespace cg = cooperative_groups;

-enum class ClassicalTbcLaunchMode {
-  AUTO,            // Heuristic-based selection based on parameters
-  GENERIC,         // Force-fallback to the generic implementation
-  SPECIALIZED_2_2, // Force-select the 2.2 specialized variant
-};
-
 /*
 * Kernel that computes the classical PBS using cooperative groups
 *
@@ -461,7 +455,7 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc(
 * Host wrapper
 */
 template <typename Torus, class params>
-__host__ void host_programmable_bootstrap_tbc_with_mode(
+__host__ void host_programmable_bootstrap_tbc(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
@@ -469,8 +463,7 @@ __host__ void host_programmable_bootstrap_tbc_with_mode(
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
-    uint32_t num_many_lut, uint32_t lut_stride,
-    ClassicalTbcLaunchMode launch_mode) {
+    uint32_t num_many_lut, uint32_t lut_stride) {
  cuda_set_device(gpu_index);

  PANIC_IF_FALSE(sizeof(Torus) == 8,
@@ -521,9 +514,6 @@ __host__ void host_programmable_bootstrap_tbc_with_mode(
  config.stream = stream;

  if (max_shared_memory < partial_sm + minimum_sm_tbc) {
-    PANIC_IF_FALSE(
-        launch_mode != ClassicalTbcLaunchMode::SPECIALIZED_2_2,
-        "Cuda error (classical PBS): specialized TBC 2_2 requires FULLSM.");
    config.dynamicSmemBytes = minimum_sm_tbc;

    check_cuda_error(cudaLaunchKernelEx(
@@ -533,9 +523,6 @@ __host__ void host_programmable_bootstrap_tbc_with_mode(
        lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm,
        supports_dsm, num_many_lut, lut_stride, noise_reduction_type));
  } else if (max_shared_memory < full_sm + minimum_sm_tbc) {
-    PANIC_IF_FALSE(
-        launch_mode != ClassicalTbcLaunchMode::SPECIALIZED_2_2,
-        "Cuda error (classical PBS): specialized TBC 2_2 requires FULLSM.");
    config.dynamicSmemBytes = partial_sm + minimum_sm_tbc;

    check_cuda_error(cudaLaunchKernelEx(
@@ -546,19 +533,8 @@ __host__ void host_programmable_bootstrap_tbc_with_mode(
        partial_dm, supports_dsm, num_many_lut, lut_stride,
        noise_reduction_type));
  } else {
-    bool can_use_specialized = polynomial_size == 2048 && level_count == 1 &&
-                               glwe_dimension == 1 && base_log == 23;
-    if (launch_mode == ClassicalTbcLaunchMode::SPECIALIZED_2_2) {
-      PANIC_IF_FALSE(can_use_specialized,
-                     "Cuda error (classical PBS): specialized TBC 2_2 requires "
-                     "(N=2048, level_count=1, glwe_dimension=1, base_log=23).");
-    }
-
-    bool use_specialized =
-        launch_mode == ClassicalTbcLaunchMode::SPECIALIZED_2_2 ||
-        (launch_mode == ClassicalTbcLaunchMode::AUTO && can_use_specialized);
-
-    if (use_specialized) {
+    if (polynomial_size == 2048 && level_count == 1 && glwe_dimension == 1 &&
+        base_log == 23) {
      uint64_t full_sm_2_2 =
          get_buffer_size_full_sm_programmable_bootstrap_tbc_2_2_params<Torus>(
              polynomial_size);
@@ -594,60 +570,6 @@ __host__ void host_programmable_bootstrap_tbc_with_mode(
  }
 }

-template <typename Torus, class params>
-__host__ void host_programmable_bootstrap_tbc(
-    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
-    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
-    uint32_t num_many_lut, uint32_t lut_stride) {
-  host_programmable_bootstrap_tbc_with_mode<Torus, params>(
-      stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
-      lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-      buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
-      level_count, input_lwe_ciphertext_count, num_many_lut, lut_stride,
-      ClassicalTbcLaunchMode::AUTO);
-}
-
-template <typename Torus, class params>
-__host__ void host_programmable_bootstrap_tbc_generic(
-    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
-    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
-    uint32_t num_many_lut, uint32_t lut_stride) {
-  host_programmable_bootstrap_tbc_with_mode<Torus, params>(
-      stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
-      lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-      buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
-      level_count, input_lwe_ciphertext_count, num_many_lut, lut_stride,
-      ClassicalTbcLaunchMode::GENERIC);
-}
-
-template <typename Torus, class params>
-__host__ void host_programmable_bootstrap_tbc_2_2_specialized(
-    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
-    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
-    uint32_t num_many_lut, uint32_t lut_stride) {
-  host_programmable_bootstrap_tbc_with_mode<Torus, params>(
-      stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
-      lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-      buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
-      level_count, input_lwe_ciphertext_count, num_many_lut, lut_stride,
-      ClassicalTbcLaunchMode::SPECIALIZED_2_2);
-}
-
 // Verify if the grid size satisfies the cooperative group constraints
 template <typename Torus, class params>
 __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -18,12 +18,6 @@
 #include "types/complex/operations.cuh"
 #include <vector>

-enum class MultiBitTbcLaunchMode {
-  AUTO,            // Heuristic-based selection based on parameters
-  GENERIC,         // Force-fallback to the generic implementation
-  SPECIALIZED_2_2, // Force-select the 2.2 specialized variant
-};
-
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void __launch_bounds__(params::degree / params::opt)
    device_multi_bit_programmable_bootstrap_tbc_accumulate(
@@ -536,7 +530,7 @@ __host__ void execute_tbc_external_product_loop(
    uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
    uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut,
-    uint32_t lut_stride, MultiBitTbcLaunchMode launch_mode) {
+    uint32_t lut_stride) {

  PANIC_IF_FALSE(
      sizeof(Torus) == 8,
@@ -596,9 +590,6 @@ __host__ void execute_tbc_external_product_loop(
  config.stream = stream;

  if (max_shared_memory < partial_dm + minimum_dm) {
-    PANIC_IF_FALSE(
-        launch_mode != MultiBitTbcLaunchMode::SPECIALIZED_2_2,
-        "Cuda error (multi-bit PBS): specialized TBC 2_2 requires FULLSM.");
    config.dynamicSmemBytes = minimum_dm;
    check_cuda_error(cudaLaunchKernelEx(
        &config,
@@ -611,9 +602,6 @@ __host__ void execute_tbc_external_product_loop(
        keybundle_size_per_input, d_mem, full_dm, supports_dsm, num_many_lut,
        lut_stride));
  } else if (max_shared_memory < full_dm + minimum_dm) {
-    PANIC_IF_FALSE(
-        launch_mode != MultiBitTbcLaunchMode::SPECIALIZED_2_2,
-        "Cuda error (multi-bit PBS): specialized TBC 2_2 requires FULLSM.");
    config.dynamicSmemBytes = partial_dm + minimum_dm;
    check_cuda_error(cudaLaunchKernelEx(
        &config,
@@ -627,21 +615,8 @@ __host__ void execute_tbc_external_product_loop(
        lut_stride));
  } else {
    config.dynamicSmemBytes = full_dm + minimum_dm;
-    bool can_use_specialized = polynomial_size == 2048 &&
-                               grouping_factor == 4 && level_count == 1 &&
-                               glwe_dimension == 1 && base_log == 22;
-    if (launch_mode == MultiBitTbcLaunchMode::SPECIALIZED_2_2) {
-      PANIC_IF_FALSE(
-          can_use_specialized,
-          "Cuda error (multi-bit PBS): specialized TBC 2_2 requires "
-          "(N=2048, grouping_factor=4, level_count=1, glwe_dimension=1, "
-          "base_log=22).");
-    }
-
-    bool use_specialized =
-        launch_mode == MultiBitTbcLaunchMode::SPECIALIZED_2_2 ||
-        (launch_mode == MultiBitTbcLaunchMode::AUTO && can_use_specialized);
-    if (use_specialized) {
+    if (polynomial_size == 2048 && grouping_factor == 4 && level_count == 1 &&
+        glwe_dimension == 1 && base_log == 22) {

      config.dynamicSmemBytes = full_dm + 2 * minimum_dm;
      check_cuda_error(cudaFuncSetAttribute(
@@ -690,8 +665,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_many_lut, uint32_t lut_stride,
-    MultiBitTbcLaunchMode launch_mode) {
+    uint32_t num_many_lut, uint32_t lut_stride) {
  cuda_set_device(gpu_index);

  auto lwe_chunk_size = buffer->lwe_chunk_size;
@@ -699,27 +673,10 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
       lwe_offset += lwe_chunk_size) {

    // Compute a keybundle
-    switch (launch_mode) {
-    case MultiBitTbcLaunchMode::GENERIC:
-      execute_compute_keybundle_generic<Torus, params>(
-          stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-          buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-          grouping_factor, level_count, lwe_offset);
-      break;
-    case MultiBitTbcLaunchMode::SPECIALIZED_2_2:
-      execute_compute_keybundle_2_2_specialized<Torus, params>(
-          stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-          buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-          grouping_factor, level_count, lwe_offset);
-      break;
-    case MultiBitTbcLaunchMode::AUTO:
-    default:
-      execute_compute_keybundle<Torus, params>(
-          stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-          buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-          grouping_factor, level_count, lwe_offset);
-      break;
-    }
+    execute_compute_keybundle<Torus, params>(
+        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
+        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+        grouping_factor, level_count, lwe_offset);

    // Accumulate
    execute_tbc_external_product_loop<Torus, params>(
@@ -727,64 +684,10 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
        grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
-        lut_stride, launch_mode);
+        lut_stride);
  }
 }

-template <typename Torus, class params>
-__host__ void host_tbc_multi_bit_programmable_bootstrap(
-    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
-    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_many_lut, uint32_t lut_stride) {
-  host_tbc_multi_bit_programmable_bootstrap<Torus, params>(
-      stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
-      lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-      buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
-      base_log, level_count, num_samples, num_many_lut, lut_stride,
-      MultiBitTbcLaunchMode::AUTO);
-}
-
-template <typename Torus, class params>
-__host__ void host_tbc_multi_bit_programmable_bootstrap_generic(
-    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
-    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_many_lut, uint32_t lut_stride) {
-  host_tbc_multi_bit_programmable_bootstrap<Torus, params>(
-      stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
-      lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-      buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
-      base_log, level_count, num_samples, num_many_lut, lut_stride,
-      MultiBitTbcLaunchMode::GENERIC);
-}
-
-template <typename Torus, class params>
-__host__ void host_tbc_multi_bit_programmable_bootstrap_2_2_specialized(
-    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
-    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
-    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_many_lut, uint32_t lut_stride) {
-  host_tbc_multi_bit_programmable_bootstrap<Torus, params>(
-      stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
-      lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
-      buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
-      base_log, level_count, num_samples, num_many_lut, lut_stride,
-      MultiBitTbcLaunchMode::SPECIALIZED_2_2);
-}
-
 template <typename Torus>
 bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
    uint32_t polynomial_size, uint32_t max_shared_memory) {
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp
@@ -340,6 +340,28 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, DefaultPBS)
  cleanup_cuda_programmable_bootstrap_64(stream, gpu_index, &buffer);
 }

+BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, AmortizedPBS)
+(benchmark::State &st) {
+
+  scratch_cuda_programmable_bootstrap_amortized_64_async(
+      stream, gpu_index, &buffer, glwe_dimension, polynomial_size,
+      input_lwe_ciphertext_count, true);
+
+  for (auto _ : st) {
+    // Execute PBS
+    cuda_programmable_bootstrap_amortized_64_async(
+        stream, gpu_index, (void *)d_lwe_ct_out_array,
+        (void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
+        (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in_array,
+        (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, buffer,
+        lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
+        input_lwe_ciphertext_count);
+    cuda_synchronize_stream(stream, gpu_index);
+  }
+
+  cleanup_cuda_programmable_bootstrap_amortized_64(stream, gpu_index, &buffer);
+}
+
 static void
 MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
  // Define the parameters to benchmark
@@ -424,3 +446,8 @@ BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, CgPBS)
    ->Apply(BootstrapBenchmarkGenerateParams)
    ->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
                "pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"});
+
+BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, AmortizedPBS)
+    ->Apply(BootstrapBenchmarkGenerateParams)
+    ->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
+                "pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"});
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
@@ -4,7 +4,6 @@
 #include <cstdlib>
 #include <functional>
 #include <gtest/gtest.h>
-#include <pbs/pbs_utilities.h>
 #include <setup_and_teardown.h>
 #include <utils.h>

@@ -57,63 +56,6 @@ protected:
  uint64_t *d_lwe_output_indexes;
  uint64_t *lwe_ct_out_array;

-  void run_and_check_pbs(
-      const std::function<void(uint64_t *d_lwe_ct_in, double *d_fourier_bsk,
-                               int8_t *pbs_buffer)> &run_pbs,
-      int8_t *pbs_buffer) {
-    int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
-                   polynomial_size * (lwe_dimension + 1);
-
-    for (int r = 0; r < repetitions; r++) {
-      double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
-      uint64_t *lwe_sk_out =
-          lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
-      for (int s = 0; s < samples; s++) {
-        uint64_t *d_lwe_ct_in =
-            d_lwe_ct_in_array + (ptrdiff_t)((r * samples * number_of_inputs +
-                                             s * number_of_inputs) *
-                                            (lwe_dimension + 1));
-
-        run_pbs(d_lwe_ct_in, d_fourier_bsk, pbs_buffer);
-
-        cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
-                                 (glwe_dimension * polynomial_size + 1) *
-                                     number_of_inputs * sizeof(uint64_t),
-                                 stream, gpu_index);
-        cuda_synchronize_stream(stream, gpu_index);
-
-        for (int j = 0; j < number_of_inputs; j++) {
-          uint64_t *result =
-              lwe_ct_out_array +
-              (ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
-          uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
-                                          s * number_of_inputs + j];
-          uint64_t decrypted = 0;
-          core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
-                                  glwe_dimension * polynomial_size);
-          ASSERT_NE(decrypted, plaintext);
-
-          uint64_t rounding_bit = delta >> 1;
-          uint64_t rounding = (decrypted & rounding_bit) << 1;
-          uint64_t decoded = (decrypted + rounding) / delta;
-          ASSERT_EQ(decoded, plaintext / delta);
-        }
-      }
-    }
-  }
-
-  bool supports_classical_cg() const {
-    return has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
-        glwe_dimension, polynomial_size, pbs_level, number_of_inputs,
-        cuda_get_max_shared_memory(gpu_index));
-  }
-
-  bool supports_classical_tbc() const {
-    return has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
-        number_of_inputs, glwe_dimension, polynomial_size, pbs_level,
-        cuda_get_max_shared_memory(gpu_index));
-  }
-
 public:
  // Test arithmetic functions
  void SetUp() {
@@ -163,121 +105,132 @@ public:
  }
 };

-TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64,
-       classical_auto_dispatch) {
-  pbs_buffer<uint64_t, CLASSICAL> *typed_buffer = nullptr;
-  scratch_cuda_programmable_bootstrap<uint64_t>(
-      stream, gpu_index, &typed_buffer, lwe_dimension, glwe_dimension,
-      polynomial_size, pbs_level, number_of_inputs, true,
-      PBS_MS_REDUCTION_T::NO_REDUCTION);
-  int8_t *pbs_buffer = reinterpret_cast<int8_t *>(typed_buffer);
+TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, amortized_bootstrap) {
+  int8_t *pbs_buffer;
+  scratch_cuda_programmable_bootstrap_amortized_64_async(
+      stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
+      number_of_inputs, true);

-  uint32_t num_many_lut = 1;
-  uint32_t lut_stride = 0;
-  run_and_check_pbs(
-      [&](uint64_t *d_lwe_ct_in, double *d_fourier_bsk, int8_t *buffer) {
-        cuda_programmable_bootstrap_64_async(
-            stream, gpu_index, (void *)d_lwe_ct_out_array,
-            (void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
-            (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
-            (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, buffer,
-            lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
-            pbs_level, number_of_inputs, num_many_lut, lut_stride);
-      },
-      pbs_buffer);
+  int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
+                 polynomial_size * (lwe_dimension + 1);
+  // Here execute the PBS
+  for (int r = 0; r < repetitions; r++) {
+    double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
+    uint64_t *lwe_sk_out =
+        lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
+    for (int s = 0; s < samples; s++) {
+      uint64_t *d_lwe_ct_in =
+          d_lwe_ct_in_array +
+          (ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
+                      (lwe_dimension + 1));
+      // Execute PBS
+      cuda_programmable_bootstrap_amortized_64_async(
+          stream, gpu_index, (void *)d_lwe_ct_out_array,
+          (void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
+          (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
+          (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer,
+          lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
+          pbs_level, number_of_inputs);
+      // Copy result back
+      cuda_memcpy_async_to_cpu(
+          lwe_ct_out_array, d_lwe_ct_out_array,
+          safe_mul_sizeof<uint64_t>(
+              safe_mul((size_t)glwe_dimension, (size_t)polynomial_size) + 1,
+              (size_t)number_of_inputs),
+          stream, gpu_index);

-  cleanup_cuda_programmable_bootstrap_64(stream, gpu_index, &pbs_buffer);
+      for (int j = 0; j < number_of_inputs; j++) {
+        uint64_t *result =
+            lwe_ct_out_array +
+            (ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
+        uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
+                                        s * number_of_inputs + j];
+        uint64_t decrypted = 0;
+        core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
+                                glwe_dimension * polynomial_size);
+        EXPECT_NE(decrypted, plaintext);
+        // let err = (decrypted >= plaintext) ? decrypted - plaintext :
+        // plaintext
+        // - decrypted;
+        // error_sample_vec.push(err);
+
+        // The bit before the message
+        uint64_t rounding_bit = delta >> 1;
+        // Compute the rounding bit
+        uint64_t rounding = (decrypted & rounding_bit) << 1;
+        uint64_t decoded = (decrypted + rounding) / delta;
+        EXPECT_EQ(decoded, plaintext / delta)
+            << "Repetition: " << r << ", sample: " << s;
+      }
+    }
+  }
+  cleanup_cuda_programmable_bootstrap_amortized_64(stream, gpu_index,
+                                                   &pbs_buffer);
 }

-TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, classical_cg) {
-  if (!supports_classical_cg()) {
-    GTEST_SKIP() << "CG classical PBS is not supported on this architecture.";
-  }
-
-  pbs_buffer<uint64_t, CLASSICAL> *typed_buffer = nullptr;
-  scratch_cuda_programmable_bootstrap_cg<uint64_t>(
-      stream, gpu_index, &typed_buffer, lwe_dimension, glwe_dimension,
-      polynomial_size, pbs_level, number_of_inputs, true,
-      PBS_MS_REDUCTION_T::NO_REDUCTION);
-  int8_t *pbs_buffer = reinterpret_cast<int8_t *>(typed_buffer);
-
-  uint32_t num_many_lut = 1;
-  uint32_t lut_stride = 0;
-  run_and_check_pbs(
-      [&](uint64_t *d_lwe_ct_in, double *d_fourier_bsk, int8_t *buffer) {
-        auto *typed =
-            reinterpret_cast<::pbs_buffer<uint64_t, CLASSICAL> *>(buffer);
-        cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
-            stream, gpu_index, d_lwe_ct_out_array, d_lwe_output_indexes,
-            d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in,
-            d_lwe_input_indexes,
-            reinterpret_cast<const double2 *>(d_fourier_bsk), typed,
-            lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
-            pbs_level, number_of_inputs, num_many_lut, lut_stride);
-      },
-      pbs_buffer);
-
-  cleanup_cuda_programmable_bootstrap_64(stream, gpu_index, &pbs_buffer);
-}
-
-TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, classical_tbc) {
-  if (!supports_classical_tbc()) {
-    GTEST_SKIP() << "TBC classical PBS is not supported on this architecture.";
-  }
-
-  int8_t *pbs_buffer = nullptr;
-  scratch_cuda_programmable_bootstrap_tbc_generic_64_async(
+TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
+  int8_t *pbs_buffer;
+  scratch_cuda_programmable_bootstrap_64_async(
      stream, gpu_index, &pbs_buffer, lwe_dimension, glwe_dimension,
      polynomial_size, pbs_level, number_of_inputs, true,
      PBS_MS_REDUCTION_T::NO_REDUCTION);

+  int number_of_sm = 0;
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
+                 polynomial_size * (lwe_dimension + 1);
  uint32_t num_many_lut = 1;
  uint32_t lut_stride = 0;
-  run_and_check_pbs(
-      [&](uint64_t *d_lwe_ct_in, double *d_fourier_bsk, int8_t *buffer) {
-        cuda_programmable_bootstrap_tbc_64_generic_async(
-            stream, gpu_index, (void *)d_lwe_ct_out_array,
-            (void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
-            (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
-            (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, buffer,
-            lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
-            pbs_level, number_of_inputs, num_many_lut, lut_stride);
-      },
-      pbs_buffer);
+  // Here execute the PBS
+  for (int r = 0; r < repetitions; r++) {
+    double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
+    uint64_t *lwe_sk_out =
+        lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
+    for (int s = 0; s < samples; s++) {
+      uint64_t *d_lwe_ct_in =
+          d_lwe_ct_in_array +
+          (ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
+                      (lwe_dimension + 1));
+      // Execute PBS
+      cuda_programmable_bootstrap_64_async(
+          stream, gpu_index, (void *)d_lwe_ct_out_array,
+          (void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
+          (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
+          (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer,
+          lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
+          pbs_level, number_of_inputs, num_many_lut, lut_stride);
+      // Copy result back
+      cuda_memcpy_async_to_cpu(
+          lwe_ct_out_array, d_lwe_ct_out_array,
+          safe_mul_sizeof<uint64_t>(
+              safe_mul((size_t)glwe_dimension, (size_t)polynomial_size) + 1,
+              (size_t)number_of_inputs),
+          stream, gpu_index);

-  cleanup_cuda_programmable_bootstrap_64(stream, gpu_index, &pbs_buffer);
-}
+      for (int j = 0; j < number_of_inputs; j++) {
+        uint64_t *result =
+            lwe_ct_out_array +
+            (ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
+        uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
+                                        s * number_of_inputs + j];
+        uint64_t decrypted = 0;
+        core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
+                                glwe_dimension * polynomial_size);
+        ASSERT_NE(decrypted, plaintext);
+        // let err = (decrypted >= plaintext) ? decrypted - plaintext :
+        // plaintext
+        // - decrypted;
+        // error_sample_vec.push(err);

-TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, classical_tbc_2_2) {
-  if (!supports_classical_tbc()) {
-    GTEST_SKIP() << "TBC classical PBS is not supported on this architecture.";
+        // The bit before the message
+        uint64_t rounding_bit = delta >> 1;
+        // Compute the rounding bit
+        uint64_t rounding = (decrypted & rounding_bit) << 1;
+        uint64_t decoded = (decrypted + rounding) / delta;
+        ASSERT_EQ(decoded, plaintext / delta);
+      }
+    }
  }
-  if (!(polynomial_size == 2048 && pbs_level == 1 && glwe_dimension == 1 &&
-        pbs_base_log == 23)) {
-    GTEST_SKIP()
-        << "TBC specialized 2_2 requires N=2048, glwe=1, level=1, base_log=23.";
-  }
-
-  int8_t *pbs_buffer = nullptr;
-  scratch_cuda_programmable_bootstrap_tbc_2_2_64_async(
-      stream, gpu_index, &pbs_buffer, lwe_dimension, glwe_dimension,
-      polynomial_size, pbs_level, number_of_inputs, true,
-      PBS_MS_REDUCTION_T::NO_REDUCTION);
-
-  uint32_t num_many_lut = 1;
-  uint32_t lut_stride = 0;
-  run_and_check_pbs(
-      [&](uint64_t *d_lwe_ct_in, double *d_fourier_bsk, int8_t *buffer) {
-        cuda_programmable_bootstrap_tbc_64_2_2_async(
-            stream, gpu_index, (void *)d_lwe_ct_out_array,
-            (void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
-            (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
-            (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, buffer,
-            lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
-            pbs_level, number_of_inputs, num_many_lut, lut_stride);
-      },
-      pbs_buffer);
-
  cleanup_cuda_programmable_bootstrap_64(stream, gpu_index, &pbs_buffer);
 }

@@ -288,19 +241,16 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, classical_tbc_2_2) {
        // n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
        // message_modulus, carry_modulus, number_of_inputs, repetitions,
        // samples
-
-        // V1_4_PARAM_MESSAGE_1_CARRY_1_KS_PBS_TUNIFORM_2M128
+        // V1_1_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
        (ClassicalProgrammableBootstrapTestParams){
-            879, 4, 512, new_t_uniform(46), new_t_uniform(17), 23, 1, 2, 2, 10,
-            1, 1},
-        // V1_6_PARAM_GPU_MESSAGE_1_CARRY_1_KS_PBS_TUNIFORM_2M128
+            918, 1, 2048, new_t_uniform(45), new_t_uniform(17), 23, 1, 4, 4,
+            100, 1, 1},
+        // V1_1_PARAM_MESSAGE_3_CARRY_3_KS_PBS_TUNIFORM_2M128
+        // This test is here only to be sure we don't break support to
+        // 8192-degree polynomials
        (ClassicalProgrammableBootstrapTestParams){
-            759, 1, 2048, new_t_uniform(50), new_t_uniform(17), 23, 1, 2, 2, 10,
-            1, 1},
-        // V1_4_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
-        (ClassicalProgrammableBootstrapTestParams){
-            918, 1, 2048, new_t_uniform(45), new_t_uniform(17), 23, 1, 4, 4, 10,
-            1, 1});
+            1077, 1, 8192, new_t_uniform(41), new_t_uniform(3), 15, 2, 4, 4,
+            100, 1, 1});
 std::string printParamName(
    ::testing::TestParamInfo<ClassicalProgrammableBootstrapTestParams> p) {
  ClassicalProgrammableBootstrapTestParams params = p.param;
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp
@@ -6,8 +6,6 @@
 #include <cstdlib>
 #include <functional>
 #include <gtest/gtest.h>
-#include <pbs/pbs_multibit_utilities.h>
-#include <pbs/programmable_bootstrap_multibit.h>
 #include <setup_and_teardown.h>
 #include <utils.h>

@@ -56,71 +54,11 @@ protected:
  uint64_t *lwe_ct_out_array;
  uint64_t *d_lwe_input_indexes;
  uint64_t *d_lwe_output_indexes;
+  int8_t *pbs_buffer;

  int repetitions;
  int samples;

-  void run_and_check_pbs(
-      const std::function<void(uint64_t *d_lwe_ct_in, uint64_t *d_bsk,
-                               int8_t *pbs_buffer)> &run_pbs,
-      int8_t *pbs_buffer) {
-    int bsk_size = (lwe_dimension / grouping_factor) * pbs_level *
-                   (glwe_dimension + 1) * (glwe_dimension + 1) *
-                   polynomial_size * (1 << grouping_factor);
-
-    for (int r = 0; r < repetitions; r++) {
-      uint64_t *d_bsk = d_bsk_array + (ptrdiff_t)(bsk_size * r);
-      uint64_t *lwe_sk_out =
-          lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
-      for (int s = 0; s < samples; s++) {
-        uint64_t *d_lwe_ct_in =
-            d_lwe_ct_in_array + (ptrdiff_t)((r * samples * number_of_inputs +
-                                             s * number_of_inputs) *
-                                            (lwe_dimension + 1));
-
-        run_pbs(d_lwe_ct_in, d_bsk, pbs_buffer);
-
-        cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
-                                 (glwe_dimension * polynomial_size + 1) *
-                                     number_of_inputs * sizeof(uint64_t),
-                                 stream, gpu_index);
-        cuda_synchronize_stream(stream, gpu_index);
-
-        for (int j = 0; j < number_of_inputs; j++) {
-          uint64_t *result =
-              lwe_ct_out_array +
-              (ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
-          uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
-                                          s * number_of_inputs + j];
-          uint64_t decrypted = 0;
-          core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
-                                  glwe_dimension * polynomial_size);
-
-          EXPECT_NE(decrypted, plaintext)
-              << "Repetition: " << r << ", sample: " << s << ", input: " << j;
-
-          uint64_t rounding_bit = delta >> 1;
-          uint64_t rounding = (decrypted & rounding_bit) << 1;
-          uint64_t decoded = (decrypted + rounding) / delta;
-          EXPECT_EQ(decoded, plaintext / delta)
-              << "Repetition: " << r << ", sample: " << s << ", input: " << j;
-        }
-      }
-    }
-  }
-
-  bool supports_multibit_cg() const {
-    return has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
-        glwe_dimension, polynomial_size, pbs_level, number_of_inputs,
-        cuda_get_max_shared_memory(gpu_index));
-  }
-
-  bool supports_multibit_tbc() const {
-    return has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
-        number_of_inputs, glwe_dimension, polynomial_size, pbs_level,
-        cuda_get_max_shared_memory(gpu_index));
-  }
-
 public:
  void SetUp() {
    stream = cuda_create_stream(gpu_index);
@@ -155,14 +93,20 @@ public:
        pbs_base_log, pbs_level, message_modulus, carry_modulus,
        &payload_modulus, &delta, number_of_inputs, repetitions, samples);

-    lwe_ct_out_array =
-        (uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
-                           number_of_inputs * sizeof(uint64_t));
+    scratch_cuda_multi_bit_programmable_bootstrap_64_async(
+        stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
+        pbs_level, number_of_inputs, true);
+
+    lwe_ct_out_array = (uint64_t *)malloc(safe_mul_sizeof<uint64_t>(
+        safe_mul((size_t)glwe_dimension, (size_t)polynomial_size) + 1,
+        (size_t)number_of_inputs));
  }

  void TearDown() {
    free(lwe_ct_out_array);

+    cleanup_cuda_multi_bit_programmable_bootstrap_64(stream, gpu_index,
+                                                     &pbs_buffer);
    programmable_bootstrap_multibit_teardown(
        stream, gpu_index, lwe_sk_in_array, lwe_sk_out_array, d_bsk_array,
        plaintexts, d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
@@ -170,135 +114,104 @@ public:
  }
 };

-TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64, multi_bit_default) {
-  int8_t *pbs_buffer = nullptr;
-  scratch_cuda_multi_bit_programmable_bootstrap_64_async(
-      stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
-      pbs_level, number_of_inputs, true);
+TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64,
+       multi_bit_programmable_bootstrap) {
+
+  int bsk_size = (lwe_dimension / grouping_factor) * pbs_level *
+                 (glwe_dimension + 1) * (glwe_dimension + 1) * polynomial_size *
+                 (1 << grouping_factor);

  uint32_t num_many_lut = 1;
  uint32_t lut_stride = 0;
-  run_and_check_pbs(
-      [&](uint64_t *d_lwe_ct_in, uint64_t *d_bsk, int8_t *buffer) {
-        auto *typed =
-            reinterpret_cast<::pbs_buffer<uint64_t, MULTI_BIT> *>(buffer);
-        cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
-            stream, gpu_index, d_lwe_ct_out_array, d_lwe_output_indexes,
-            d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in,
-            d_lwe_input_indexes, d_bsk, typed, lwe_dimension, glwe_dimension,
-            polynomial_size, grouping_factor, pbs_base_log, pbs_level,
-            number_of_inputs, num_many_lut, lut_stride);
-      },
-      pbs_buffer);
+  for (int r = 0; r < repetitions; r++) {
+    uint64_t *d_bsk = d_bsk_array + (ptrdiff_t)(bsk_size * r);
+    uint64_t *lwe_sk_out =
+        lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
+    for (int s = 0; s < samples; s++) {
+      uint64_t *d_lwe_ct_in =
+          d_lwe_ct_in_array +
+          (ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
+                      (lwe_dimension + 1));
+      // Execute PBS
+      cuda_multi_bit_programmable_bootstrap_64_async(
+          stream, gpu_index, (void *)d_lwe_ct_out_array,
+          (void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
+          (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
+          (void *)d_lwe_input_indexes, (void *)d_bsk, pbs_buffer, lwe_dimension,
+          glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
+          pbs_level, number_of_inputs, num_many_lut, lut_stride);

-  cleanup_cuda_multi_bit_programmable_bootstrap_64(stream, gpu_index,
-                                                   &pbs_buffer);
-}
-
-TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64, multi_bit_cg) {
-  if (!supports_multibit_cg()) {
-    GTEST_SKIP() << "CG multibit PBS is not supported on this architecture.";
-  }
-
-  pbs_buffer<uint64_t, MULTI_BIT> *typed_buffer = nullptr;
-  scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
-      stream, gpu_index, &typed_buffer, glwe_dimension, polynomial_size,
-      pbs_level, number_of_inputs, true);
-  int8_t *pbs_buffer = reinterpret_cast<int8_t *>(typed_buffer);
-
-  uint32_t num_many_lut = 1;
-  uint32_t lut_stride = 0;
-  run_and_check_pbs(
-      [&](uint64_t *d_lwe_ct_in, uint64_t *d_bsk, int8_t *buffer) {
-        auto *typed =
-            reinterpret_cast<::pbs_buffer<uint64_t, MULTI_BIT> *>(buffer);
-        cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<
-            uint64_t>(stream, gpu_index, d_lwe_ct_out_array,
-                      d_lwe_output_indexes, d_lut_pbs_identity,
-                      d_lut_pbs_indexes, d_lwe_ct_in, d_lwe_input_indexes,
-                      d_bsk, typed, lwe_dimension, glwe_dimension,
-                      polynomial_size, grouping_factor, pbs_base_log, pbs_level,
-                      number_of_inputs, num_many_lut, lut_stride);
-      },
-      pbs_buffer);
-
-  cleanup_cuda_multi_bit_programmable_bootstrap_64(stream, gpu_index,
-                                                   &pbs_buffer);
-}
-
-TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64, multi_bit_tbc) {
-  if (!supports_multibit_tbc()) {
-    GTEST_SKIP() << "TBC multibit PBS is not supported on this architecture.";
-  }
-
-  int8_t *pbs_buffer = nullptr;
-  scratch_cuda_multi_bit_programmable_bootstrap_tbc_generic_64_async(
-      stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
-      pbs_level, number_of_inputs, true);
-
-  uint32_t num_many_lut = 1;
-  uint32_t lut_stride = 0;
-  run_and_check_pbs(
-      [&](uint64_t *d_lwe_ct_in, uint64_t *d_bsk, int8_t *buffer) {
-        cuda_multi_bit_programmable_bootstrap_tbc_64_generic_async(
-            stream, gpu_index, (void *)d_lwe_ct_out_array,
-            (void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
-            (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
-            (void *)d_lwe_input_indexes, (void *)d_bsk, buffer, lwe_dimension,
-            glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
-            pbs_level, number_of_inputs, num_many_lut, lut_stride);
-      },
-      pbs_buffer);
-
-  cleanup_cuda_multi_bit_programmable_bootstrap_64(stream, gpu_index,
-                                                   &pbs_buffer);
-}
-
-TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64, multi_bit_tbc_2_2) {
-  if (!supports_multibit_tbc()) {
-    GTEST_SKIP() << "TBC multibit PBS is not supported on this architecture.";
-  }
-  if (!(polynomial_size == 2048 && grouping_factor == 4 && pbs_level == 1 &&
-        glwe_dimension == 1 && pbs_base_log == 22)) {
-    GTEST_SKIP() << "TBC specialized 2_2 requires N=2048, grouping_factor=4, "
-                    "glwe=1, level=1, base_log=22.";
-  }
-
-  int8_t *pbs_buffer = nullptr;
-  scratch_cuda_multi_bit_programmable_bootstrap_tbc_2_2_64_async(
-      stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
-      pbs_level, number_of_inputs, true);
-
-  uint32_t num_many_lut = 1;
-  uint32_t lut_stride = 0;
-  run_and_check_pbs(
-      [&](uint64_t *d_lwe_ct_in, uint64_t *d_bsk, int8_t *buffer) {
-        cuda_multi_bit_programmable_bootstrap_tbc_64_2_2_async(
-            stream, gpu_index, (void *)d_lwe_ct_out_array,
-            (void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
-            (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
-            (void *)d_lwe_input_indexes, (void *)d_bsk, buffer, lwe_dimension,
-            glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
-            pbs_level, number_of_inputs, num_many_lut, lut_stride);
-      },
-      pbs_buffer);
-
-  cleanup_cuda_multi_bit_programmable_bootstrap_64(stream, gpu_index,
-                                                   &pbs_buffer);
+      // Copy result to the host memory
+      cuda_memcpy_async_to_cpu(
+          lwe_ct_out_array, d_lwe_ct_out_array,
+          safe_mul_sizeof<uint64_t>(
+              safe_mul((size_t)glwe_dimension, (size_t)polynomial_size) + 1,
+              (size_t)number_of_inputs),
+          stream, gpu_index);
+
+      for (int j = 0; j < number_of_inputs; j++) {
+        uint64_t *result =
+            lwe_ct_out_array +
+            (ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
+        uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
+                                        s * number_of_inputs + j];
+        uint64_t decrypted = 0;
+        core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
+                                glwe_dimension * polynomial_size);
+
+        EXPECT_NE(decrypted, plaintext)
+            << "Repetition: " << r << ", sample: " << s << ", input: " << j;
+
+        // The bit before the message
+        uint64_t rounding_bit = delta >> 1;
+
+        // Compute the rounding bit
+        uint64_t rounding = (decrypted & rounding_bit) << 1;
+        uint64_t decoded = (decrypted + rounding) / delta;
+        EXPECT_EQ(decoded, plaintext / delta)
+            << "Repetition: " << r << ", sample: " << s << ", input: " << j;
+      }
+    }
+  }
 }

+/**
+  int lwe_dimension;
+  int glwe_dimension;
+  int polynomial_size;
+  DynamicDistribution lwe_noise_distribution;
+  DynamicDistribution glwe_noise_distribution;
+  int pbs_base_log;
+  int pbs_level;
+  int message_modulus;
+  int carry_modulus;
+  int number_of_inputs;
+  int grouping_factor;
+  int repetitions;
+  int samples;
+ */
 // Defines for which parameters set the PBS will be tested.
 // It executes each src for all pairs on phis X qs (Cartesian product)
 ::testing::internal::ParamGenerator<MultiBitProgrammableBootstrapTestParams>
    multipbs_params_u64 = ::testing::Values(
-        // V1_4_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_1_CARRY_1_KS_PBS_TUNIFORM_2M128
+        // V1_1_PARAM_GPU_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
        (MultiBitProgrammableBootstrapTestParams){
-            760, 1, 2048, new_t_uniform(49), new_t_uniform(17), 22, 1, 2, 2, 10,
-            4, 1, 1},
+            918, 1, 4096, new_t_uniform(45), new_t_uniform(3), 21, 1, 4, 4, 100,
+            2, 1, 1},
+        // V1_1_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
+        (MultiBitProgrammableBootstrapTestParams){
+            879, 1, 2048, new_t_uniform(46), new_t_uniform(17), 14, 2, 4, 4,
+            100, 3, 1, 1},
        // V1_1_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
        (MultiBitProgrammableBootstrapTestParams){
-            920, 1, 2048, new_t_uniform(45), new_t_uniform(17), 22, 1, 4, 4, 10,
-            4, 1, 1});
+            920, 1, 2048, new_t_uniform(45), new_t_uniform(17), 22, 1, 4, 4,
+            100, 4, 1, 1},
+        // V1_1_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_3_CARRY_3_KS_PBS_TUNIFORM_2M128
+        // This test is here only to make sure we don't break support for
+        // 8192-degree polynomials
+        (MultiBitProgrammableBootstrapTestParams){
+            1040, 1, 8192, new_t_uniform(42), new_t_uniform(3), 14, 2, 4, 4,
+            100, 4, 1, 1});

 std::string printParamName(
    ::testing::TestParamInfo<MultiBitProgrammableBootstrapTestParams> p) {
--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -3162,6 +3162,64 @@ unsafe extern "C" {
        polynomial_size: u32,
    );
 }
+unsafe extern "C" {
+    pub fn scratch_cuda_programmable_bootstrap_amortized_64_async(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        pbs_buffer: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        input_lwe_ciphertext_count: u32,
+        allocate_gpu_memory: bool,
+    ) -> u64;
+}
+unsafe extern "C" {
+    pub fn cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32_async(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_output_indexes: *const ffi::c_void,
+        lut_vector: *const ffi::c_void,
+        lut_vector_indexes: *const ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        lwe_input_indexes: *const ffi::c_void,
+        bootstrapping_key: *const ffi::c_void,
+        pbs_buffer: *mut i8,
+        lwe_dimension: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        base_log: u32,
+        level_count: u32,
+        num_samples: u32,
+    );
+}
+unsafe extern "C" {
+    pub fn cuda_programmable_bootstrap_amortized_64_async(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_output_indexes: *const ffi::c_void,
+        lut_vector: *const ffi::c_void,
+        lut_vector_indexes: *const ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        lwe_input_indexes: *const ffi::c_void,
+        bootstrapping_key: *const ffi::c_void,
+        pbs_buffer: *mut i8,
+        lwe_dimension: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        base_log: u32,
+        level_count: u32,
+        num_samples: u32,
+    );
+}
+unsafe extern "C" {
+    pub fn cleanup_cuda_programmable_bootstrap_amortized_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        pbs_buffer: *mut *mut i8,
+    );
+}
 unsafe extern "C" {
    pub fn scratch_cuda_programmable_bootstrap_64_async(
        stream: *mut ffi::c_void,
--- a/backends/tfhe-hpu-backend/Cargo.toml
+++ b/backends/tfhe-hpu-backend/Cargo.toml
@@ -4,8 +4,8 @@ version = "0.4.0"
 edition = "2021"
 license = "BSD-3-Clause-Clear"
 description = "HPU implementation on FPGA of TFHE-rs primitives."
-homepage = "https://www.zama.org/"
-documentation = "https://docs.zama.org/tfhe-rs"
+homepage = "https://www.zama.ai/"
+documentation = "https://docs.zama.ai/tfhe-rs"
 repository = "https://github.com/zama-ai/tfhe-rs"
 readme = "README.md"
 keywords = ["encryption", "fhe", "cryptography", "hardware", "fpga"]
--- a/backends/zk-cuda-backend/Cargo.toml
+++ b/backends/zk-cuda-backend/Cargo.toml
@@ -6,8 +6,8 @@ rust-version.workspace = true
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
 description = "Cuda implementation of TFHE-rs' ZK primitives."
-homepage = "https://www.zama.org/"
-documentation = "https://docs.zama.org/tfhe-rs"
+homepage = "https://www.zama.ai/"
+documentation = "https://docs.zama.ai/tfhe-rs"
 repository = "https://github.com/zama-ai/tfhe-rs"
 readme = "README.md"
 keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
--- a/backends/zk-cuda-backend/NAMING_CONVENTIONS.md
+++ b/backends/zk-cuda-backend/NAMING_CONVENTIONS.md
@@ -1,175 +0,0 @@
-# Naming Conventions
-
-This document defines the naming conventions used throughout the zk-cuda-backend codebase.
-
-## Types and Structs
-
-**Rule: `PascalCase`, no underscores.**
-
-| Category | Pattern | Examples |
-|----------|---------|----------|
-| Field elements | Short math names | `Fp`, `Fp2` (future: `Fp6`, `Fp12`) |
-| Big integers | `BigInt<N>` template | `BigInt<7>`, `BigInt<5>` |
-| Scalars | Alias of BigInt | `Scalar` (= `BigInt<ZP_LIMBS>`) |
-| Curve points (affine) | `G{1,2}Affine` | `G1Affine`, `G2Affine` |
-| Curve points (projective) | `G{1,2}Projective` | `G1Projective`, `G2Projective` |
-| Enums | `PascalCase` | `ComparisonType` |
-
-**FFI boundary types (api.h):** Use `G1Point`/`G2Point` (affine) and `G1ProjectivePoint`/`G2ProjectivePoint` for C compatibility. Type aliases bridge to internal names.
-
-**Rust types** mirror internal CUDA names: `G1Affine`, `G1Projective`, `G2Affine`, `G2Projective`, `Scalar`.
-
-## Template Parameters and Trait Structs
-
-**Template parameters: `PascalCase` with descriptive suffix.**
-
-| Parameter | Used in |
-|-----------|---------|
-| `AffineType` | MSM kernels, launch params |
-| `ProjectiveType` | MSM kernels, launch params |
-| `PointType` | Generic point operations |
-| `FieldType` | Trait associated type |
-| `N` | `BigInt<N>` |
-
-**Trait structs:**
-
-| Struct | Purpose | Location |
-|--------|---------|----------|
-| `Affine<T>` | Affine point ops dispatch | `point_traits.h` |
-| `Projective<T>` | Projective point ops dispatch | `point_traits.h` |
-| `SelectorChooser<T>` | Maps point type -> trait struct | `point_traits.h` |
-| `MSMTraits<T>` | Maps projective -> affine type | `curve.h` |
-| `MSMWindowSize<T>` | Window size constant per type | `msm.h` |
-| `Phase1KernelLaunchParams<T>` | Kernel config for accumulation | `msm_pippenger.cu` |
-| `Phase2KernelLaunchParams<T>` | Kernel config for reduction | `msm_pippenger.cu` |
-
-## Functions
-
-### Field Arithmetic
-
-**Rule: `<field>_<operation>` — lowercase snake_case, field prefix.**
-
-| Pattern | Meaning | Examples |
-|---------|---------|---------|
-| `fp_<op>` | Basic operation | `fp_add`, `fp_sub`, `fp_neg`, `fp_copy`, `fp_cmp` |
-| `fp_is_<pred>` | Predicate | `fp_is_zero`, `fp_is_one`, `fp_is_quadratic_residue` |
-| `fp_mont_<op>` | Montgomery-domain operation | `fp_mont_mul`, `fp_mont_inv`, `fp_mont_reduce` |
-| `fp_<op>_raw` | No modular reduction | `fp_add_raw`, `fp_sub_raw`, `fp_mul_schoolbook_raw` |
-| `fp_to_montgomery` / `fp_from_montgomery` | Form conversion | |
-| `fp_<constant>` | Return constant (normal form) | `fp_zero`, `fp_one`, `fp_modulus` |
-| `fp_<constant>_montgomery` | Return constant (Montgomery) | `fp_one_montgomery`, `fp_two_montgomery` |
-
-Fp2 follows identical patterns with `fp2_` prefix.
-
-### Point Operations (Generic Template)
-
-**Rule: `point_<operation>` for G1/G2-generic operations.**
-
-```
-point_add, point_double, point_neg, point_scalar_mul
-point_at_infinity, point_to_montgomery, point_from_montgomery
-point_to_montgomery_batch
-```
-
-### Point Operations (Group-Specific)
-
-**Rule:** Group **leads** when it "owns" the concept; group **trails** when the operation is primary.
-
-| Group leads | Group trails |
-|-------------|-------------|
-| `g1_point_at_infinity` | `projective_to_affine_g1` |
-| `g1_is_infinity` | `normalize_projective_g1` |
-| `g1_generator` | `is_on_curve_g1` |
-| `g1_projective_point_at_infinity` | `curve_b_g1` |
-
-Overloaded functions omit the group entirely.
-
-### Projective Point Operations
-
-**Rule: `projective_<operation>` prefix.**
-
-`projective_point_add`, `projective_point_double`, `projective_mixed_add`, `projective_scalar_mul` — all overloaded for G1/G2.
-
-### In-Place Host Operations
-
-**Rule: `_inplace` suffix for host-only in-place modifications.**
-
-`point_to_montgomery_inplace`, `point_from_montgomery_inplace`
-
-The CUDA template batch functions (`point_to_montgomery_batch`) are also in-place but omit `_inplace` — this is intentional. The `_inplace` suffix distinguishes the host-only path from the CUDA template path.
-
-### MSM Functions
-
-Internal: `point_msm_g1_async`, `point_msm_g1`, `pippenger_scratch_size_g1` (group suffix).
-
-### CUDA Kernels
-
-**Rule: `kernel_<descriptive_name>` prefix.**
-
-`kernel_accumulate_all_windows`, `kernel_reduce_all_windows`, `kernel_compute_window_sums`, `kernel_clear_buckets`, `kernel_point_add`, `kernel_point_to_montgomery_batch`, etc.
-
-### FFI Wrappers
-
-**Rule: `*_wrapper` suffix.** Group position follows the underlying function's convention:
-
- Group prefix: `g1_msm_managed_wrapper`, `g1_msm_unmanaged_wrapper_async`, `g1_from_montgomery_wrapper`
- Group suffix: `affine_to_projective_g1_wrapper`, `is_on_curve_g1_wrapper`, `pippenger_scratch_size_g1_wrapper`
- No group: `fp_to_montgomery_wrapper`, `scalar_modulus_limbs_wrapper`
-
-### Rust API
-
-Standard Rust `snake_case`: `to_projective()`, `from_montgomery_normalized()`, `is_infinity()`, `msm()`.
-
-Module-level conversions: `g1_affine_from_montgomery()`, `g1_affine_from_arkworks()`.
-
-## Variables
-
-**Rule: `snake_case` everywhere.**
-
-| Convention | Examples |
-|------------|---------|
-| Device pointers: `d_` prefix | `d_result`, `d_points`, `d_scratch` |
-| Host pointers: no prefix | `result`, `points` |
-| Counts: `num_*` | `num_points`, `num_blocks`, `num_windows` |
-| Indices: `*_idx` | `window_idx`, `bucket_idx`, `point_idx` |
-| Memory sizes: `*_bytes` | `points_bytes`, `scratch_bytes` |
-| Booleans: descriptive | `valid`, `overflow`, `points_in_montgomery` |
-| Shared memory: `shared_*` | `shared_mem`, `shared_points`, `shared_sums` |
-| CUDA params | `stream`, `gpu_index`, `size_tracker` |
-
-## Constants and Macros
-
-**Rule: `UPPER_SNAKE_CASE`.**
-
-| Prefix | Category | Examples |
-|--------|----------|---------|
-| `FP_` | Field parameters | `FP_LIMBS`, `FP_BITS` |
-| `ZP_` | Scalar field | `ZP_LIMBS` |
-| `LIMB_` | Limb config | `LIMB_BITS`, `LIMB_MAX` |
-| `MSM_G1_` / `MSM_G2_` | MSM per-group | `MSM_G1_WINDOW_SIZE`, `MSM_G2_BUCKET_COUNT` |
-| `MSM_` | MSM shared | `MSM_WINDOW_SIZE`, `MSM_SIGNED_BUCKET_COUNT` |
-| `KERNEL_` | Kernel config | `KERNEL_THREADS_MAX` |
-| `CUDA_` | CUDA arch | `CUDA_WARP_SIZE` |
-| `BLS12_446_` | Curve constants | `BLS12_446_MODULUS_LIMBS` |
-| `DEVICE_` | `__constant__` memory | `DEVICE_MODULUS`, `DEVICE_R2`, `DEVICE_G1_GENERATOR` |
-
-## Files
-
-| Category | Location | Naming |
-|----------|----------|--------|
-| CUDA public headers | `cuda/include/*.h` | `fp.h`, `curve.h`, `msm.h`, `point_traits.h` |
-| CUDA internal headers | `cuda/src/**/*.cuh` | `common.cuh` |
-| CUDA source | `cuda/src/**/*.cu` | `fp.cu`, `curve.cu`, `msm_pippenger.cu` |
-| Rust modules | `src/` | `snake_case`: `types`, `conversions`, `bindings`, `g1`, `g2`, `scalar` |
-
-## Async/Sync Pair Convention
-
-```
-<operation>_async   — launch kernel(s), return immediately
-<operation>         — call _async, then synchronize
-```
-
-**`_async` suffix** for non-blocking; **no suffix** for synchronizing.
-
-**Rule: `_async` is always the last component of the name**, even on `_wrapper` functions.
-For example: `point_msm_g1_async` (not `point_msm_async_g1`), `g1_msm_unmanaged_wrapper_async` (not `g1_msm_unmanaged_async_wrapper`).
--- a/backends/zk-cuda-backend/README.md
+++ b/backends/zk-cuda-backend/README.md
@@ -1,167 +1,491 @@
 # ZK CUDA Backend

-A CUDA implementation of BLS12-446 elliptic curve operations for zero-knowledge proof systems.
-It provides GPU-accelerated finite field arithmetic, elliptic curve point operations, and
-multi-scalar multiplication (MSM) targeting NVIDIA GPUs.
+A high-performance CUDA implementation of BLS12-446 elliptic curve operations for zero-knowledge proof systems. This library provides GPU-accelerated finite field arithmetic, elliptic curve point operations, and multi-scalar multiplication (MSM) optimized for NVIDIA GPUs.

-The cryptographic operations it provides are:
+## Overview
+
+This project implements a CUDA backend for BLS12-446 elliptic curve operations, which are fundamental to zero-knowledge proof systems. The implementation focuses on performance and correctness, providing both host and device-side APIs for maximum flexibility.
+
+**Key Features:**
 - Multi-precision finite field arithmetic (Fp) with Montgomery reduction
 - Quadratic extension field (Fp2) operations
- Elliptic curve point operations for G1 (over Fp) and G2 (over Fp2) groups
+- Elliptic curve operations for G1 and G2 groups
 - High-performance Multi-Scalar Multiplication (MSM) using Pippenger's algorithm
+- Comprehensive test suite with 100+ tests
+- Performance benchmarks
 - Rust API bindings

+## Project Structure
+
+```
+zk-cuda-backend/
+├── include/              # Header files
+│   ├── fp.h              # Fp (finite field) declarations
+│   ├── fp2.h             # Fp2 (quadratic extension) declarations
+│   ├── curve.h           # Elliptic curve point operations
+│   └── msm.h             # Multi-scalar multiplication API
+│   # Note: device.h comes from tfhe-cuda-backend
+├── src/                  # CUDA source files
+│   ├── primitives/
+│   │   ├── fp.cu         # Fp implementation
+│   │   └── fp2.cu        # Fp2 implementation
+│   ├── curve.cu          # Curve operations
+│   └── msm/              # MSM implementation
+│       └── pippenger/    # Pippenger's algorithm
+├── tests/                # Test suite
+│   ├── primitives/       # Fp and Fp2 tests
+│   ├── test_msm.cu       # MSM tests
+│   ├── test_point_ops.cu # Point operation tests
+│   └── test_scalar_mul.cu # Scalar multiplication tests
+├── benchmarks/          # Performance benchmarks
+│   ├── benchmark_fp.cu   # Fp benchmarks
+│   ├── benchmark_fp2.cu  # Fp2 benchmarks
+│   └── benchmark_msm.cu  # MSM benchmarks
+├── src/                  # Rust bindings
+│   ├── src/             # Rust source code
+│   └── include/         # C wrapper headers
+└── utils/               # Utility scripts
+```
+
 ## BLS12-446 Curve

-This implementation targets the **BLS12-446** curve:
- **446-bit prime field** (Fp): 7 limbs of 64 bits (448 bits total, 2 bits headroom)
+This implementation targets the **BLS12-446** curve, which uses:
+- **446-bit prime field** (Fp): Requires 7 limbs of 64 bits each
 - **Two groups**: G1 (over Fp) and G2 (over Fp2)
 - **Modulus**: Hardcoded from tfhe-rs reference implementation

-## API
+The modulus and all curve constants are initialized at compile time and available as device constants for optimal performance.
+
+## Components

 ### Finite Field Arithmetic (Fp and Fp2)

-**Fp** — multi-precision arithmetic for the 446-bit prime field:
- Operations: `fp_add()`, `fp_sub()`, `fp_mul()`, `fp_neg()`, `fp_inv()`, `fp_div()`, `fp_pow()`, `fp_sqrt()`, Montgomery conversions
- Operator overloads: `+`, `-`, `*`, `/`, unary `-`, `+=`, `-=`, `*=`, `/=`, `==`, `!=`
- Montgomery form: `fp_to_montgomery()` / `fp_from_montgomery()` for conversion; `fp_one_montgomery()` etc. for constants
+**Fp** - Multi-precision arithmetic for the 446-bit prime field:
+- **Structure**: 7 limbs of 64 bits each (448 bits total, 2 bits headroom)
+- **Montgomery Reduction**: R = 2^448, matching tfhe-rs implementation
+- **Format Tracking**: `mont` field tracks whether values are in Montgomery form
+- **Operations**: `fp_add()`, `fp_sub()`, `fp_mul()`, `fp_neg()`, `fp_inv()`, `fp_div()`, `fp_pow()`, `fp_sqrt()`, Montgomery conversions, etc.

-**Fp2** — quadratic extension field (Fp2 = Fp[i], i² = −1):
- Operations: `fp2_add()`, `fp2_sub()`, `fp2_mul()`, `fp2_neg()`, `fp2_inv()`, `fp2_div()`, `fp2_square()`, `fp2_conjugate()`, `fp2_frobenius()`
+**Fp2** - Quadratic extension field (Fp2 = Fp[i] where i² = -1):
+- **Structure**: Two Fp elements (c0, c1) representing a + b*i
+- **Operations**: `fp2_add()`, `fp2_sub()`, `fp2_mul()`, `fp2_neg()`, `fp2_inv()`, `fp2_div()`, `fp2_square()`
+- **Special**: `fp2_conjugate()`, `fp2_frobenius()`, `fp2_mul_by_i()`
+
+**Operator Overloads** (both Fp and Fp2):
+- Arithmetic: `+`, `-`, `*`, `/`, unary `-`
+- Compound assignment: `+=`, `-=`, `*=`, `/=`
+- Comparison: `==`, `!=`
+- Assignment: `=` (replaces `fp_copy()` / `fp2_copy()`)
+
+**CUDA Kernels**: Batch operations for GPU execution

 ### Elliptic Curve Operations

-Point representations:
- **Affine**: `G1Affine`, `G2Affine` — (x, y) with infinity flag
- **Projective**: `G1Projective`, `G2Projective` — (X, Y, Z) homogeneous coordinates
+Complete implementation for both G1 and G2 groups:

-Operations (template functions work for both G1 and G2):
- `point_add()`, `point_double()`, `point_neg()`, `point_scalar_mul()`
- `affine_to_projective()`, `projective_to_affine_g1()`, `projective_to_affine_g2()`
- `point_to_montgomery_inplace()`, `normalize_from_montgomery_g1()` / `normalize_from_montgomery_g2()`
- Operator overloads on projective points: `+`, unary `-`, `*` (scalar), `+=`, `==`, `!=`
- Generator access: `g1_generator()`, `g2_generator()`
+- **Point Representations**:
+  - **Affine**: (x, y) coordinates with infinity flag (`G1Affine`, `G2Affine`)
+  - **Projective**: (X, Y, Z) homogeneous coordinates (`G1Projective`, `G2Projective`)
+
+- **Operations**:
+  - Point addition: `point_add()`
+  - Point doubling: `point_double()`
+  - Point negation: `point_neg()`
+  - Scalar multiplication: `point_scalar_mul()`, `projective_scalar_mul()`
+  - Coordinate conversion: `affine_to_projective()`, `projective_to_affine()`
+
+- **Operator Overloads** (Projective points):
+  - Addition: `+` (point addition)
+  - Negation: unary `-` (point negation)
+  - Scalar multiplication: `*` (with `Scalar` type)
+  - Compound assignment: `+=`
+  - Comparison: `==`, `!=`
+  - Assignment: `=` (replaces `point_copy()`)
+
+- **Template API**: Generic functions that work for both G1 and G2 points
+- **Generator Points**: Hardcoded G1 and G2 generators for BLS12-446

 ### Multi-Scalar Multiplication (MSM)

-Implements Pippenger's bucket method. Window sizes are selected dynamically:
- **G1**: 4-bit windows for n ≤ 256, 5-bit for n ≤ 4096, larger for bigger inputs
- **G2**: fixed 5-bit windows (Fp2 operations are 2× more expensive)
+High-performance MSM implementation:

-**Unmanaged API** — caller manages all device memory:
-```c
-// Query required scratch space, then run MSM.
-size_t scratch_bytes = pippenger_scratch_size_g1(n, gpu_index);
-G1Projective *d_scratch = (G1Projective *)cuda_malloc(scratch_bytes, gpu_index);
-point_msm_g1(stream, gpu_index, d_result, d_points, d_scalars, n,
-             d_scratch, size_tracker, /*gpu_memory_allocated=*/true);
-```
+- **Algorithm**: Pippenger's bucket method with configurable window sizes
+- **Window Sizes**:
+  - **G1**: 4-bit windows (16 buckets: 0-15)
+  - **G2**: 5-bit windows (32 buckets: 0-31) - larger windows reduce Horner doublings for more expensive Fp2 operations
+- **Features**:
+  - Supports both G1 and G2 groups
+  - Uses projective coordinates internally (no inversions)
+  - Optimized for large batch sizes
+  - Register-based bucket accumulation for optimal performance

-**Managed API** — Rust bindings handle memory allocation and transfers internally:
-```rust
-let (result, size_tracker) = G1Projective::msm(&points, &scalars, stream, gpu_index, false)?;
-```
+- **API**:
+  - BigInt scalars (320-bit, 5 limbs): `point_msm_g1()`, `point_msm_g2()`
+  - Async/Sync variants: `point_msm_async_*()` and `point_msm_*()`
+  - **Managed API**: Handles memory allocation and transfers internally (convenient for Rust bindings)
+  - **Unmanaged API**: Assumes data already on device, caller manages memory (better performance for pure-GPU workflows)

-See the [basic examples](cuda/tests_and_benchmarks/tests/basic/) for complete working programs.
-## Dependencies
+- **Memory**: Device pointer-based API (caller manages memory allocation for unmanaged API)

-**Disclaimer**: Compilation on Windows/Mac is not supported. Only Nvidia GPUs are supported.
+## Building

- nvidia driver — GPU with Compute Capability ≥ 3.0 (e.g. Ubuntu 20.04: [installation guide](https://linuxconfig.org/how-to-install-the-nvidia-drivers-on-ubuntu-20-04-focal-fossa-linux))
- [nvcc](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) ≥ 10.0
- [gcc](https://gcc.gnu.org/) ≥ 8.0 — see [nvcc/gcc compatibility](https://gist.github.com/ax3l/9489132)
- [cmake](https://cmake.org/) ≥ 3.24
- libclang ≥ 9.0 — for Rust [bindgen requirements](https://rust-lang.github.io/rust-bindgen/requirements.html)
+### Dependencies

-Dependencies fetched automatically by CMake: Google Test, Google Benchmark.
+**Disclaimer**: Compilation on Windows/Mac is not supported yet. Only Nvidia GPUs are supported. 

-## Build
+- nvidia driver - for example, if you're running Ubuntu 20.04 check this [page](https://linuxconfig.org/how-to-install-the-nvidia-drivers-on-ubuntu-20-04-focal-fossa-linux) for installation. You need an Nvidia GPU with Compute Capability >= 3.0
+- [nvcc](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) >= 10.0
+- [gcc](https://gcc.gnu.org/) >= 8.0 - check this [page](https://gist.github.com/ax3l/9489132) for more details about nvcc/gcc compatible versions
+- [cmake](https://cmake.org/) >= 3.24
+- libclang >= 9.0, to match Rust bindgen [requirements](https://rust-lang.github.io/rust-bindgen/requirements.html)
+
+Dependencies (automatically fetched by CMake):
+- Google Test (for testing)
+- Google Benchmark (for benchmarks)
+
+### Build Instructions

 ```bash
-cd cuda
-cmake -B build
-cmake --build build
+# Create build directory
+mkdir -p build
+cd build
+
+# Configure
+cmake .. 
+
+# Build
+cmake --build .
+
+# Or use make
+make
 ```

-The compute capability is detected automatically from the first available GPU.
-If no GPU is present, the build targets sm_70 (Volta).
+### Building Rust API
+
+The Rust API build automatically compiles the CUDA library via `build.rs`. Simply run:
+
+```bash
+# From the zk-cuda-backend directory (backends/zk-cuda-backend/)
+cargo build --release
+```
+
+This will:
+1. Automatically configure and build the CUDA library in `cuda/build/` if needed
+2. Compile the Rust bindings
+3. Link everything together
+
+**Manual CUDA build** (if you need to build the CUDA library separately):
+
+```bash
+# Build the C++/CUDA library manually
+cd cuda
+mkdir -p build
+cd build
+cmake ..
+make
+```
+
+## Usage
+
+### C++/CUDA API
+
+#### Basic Fp Operations
+
+```cpp
+#include "fp.h"
+
+// Initialize values
+Fp a, b, c;
+fp_one(a);  // a = 1
+fp_one(b);  // b = 1
+
+// Using operator syntax (preferred)
+c = a + b;  // c = 2
+c = a - b;  // c = 0
+c = a * b;  // c = 1
+c = -a;     // c = -1 (mod p)
+
+// Compound assignment
+c += a;     // c = c + a
+c *= b;     // c = c * b
+
+// Assignment (copies value)
+Fp d = a;   // d is a copy of a
+
+// Named functions still available
+fp_add(c, a, b);  // c = a + b = 2
+
+// Convert to Montgomery form
+fp_to_montgomery(a, a);
+
+// Montgomery multiplication
+fp_mont_mul(c, a, b);  // c = a * b (all in Montgomery form)
+```
+
+#### Elliptic Curve Operations
+
+```cpp
+#include "curve.h"
+
+// Create points
+G1Projective p1, p2, result;
+// ... initialize point coordinates ...
+
+// Using operator syntax (projective points)
+result = p1 + p2;      // Point addition
+result = -p1;          // Point negation
+result += p2;          // Compound addition
+
+// Scalar multiplication with Scalar type
+Scalar s;
+// ... initialize scalar ...
+result = p1 * s;       // result = scalar * point
+result = s * p1;       // Same as above
+
+// Assignment (copies point)
+G1Projective copy = p1;
+
+// Named functions still available for affine points
+G1Affine affine_point, affine_result;
+uint64_t scalar[5] = {0x1234, 0, 0, 0, 0};
+point_scalar_mul(affine_result, affine_point, scalar, 5);
+```
+
+#### Multi-Scalar Multiplication
+
+```cpp
+#include "msm.h"
+#include "device.h"  // From tfhe-cuda-backend
+
+// Allocate device memory
+G1Affine* d_points;
+Scalar* d_scalars;  // BigInt (320-bit scalars, 5 limbs)
+G1Projective* d_result;
+G1Projective* d_scratch;
+
+// Calculate scratch space size
+uint32_t n = 1000;  // number of points
+uint32_t num_blocks = (n + 255) / 256;
+size_t scratch_size = (num_blocks + 1) * MSM_G1_BUCKET_COUNT * sizeof(G1Projective);
+
+// Allocate memory using device wrappers
+uint32_t gpu_index = 0;
+d_points = (G1Affine*)cuda_malloc(n * sizeof(G1Affine), gpu_index);
+d_scalars = (Scalar*)cuda_malloc(n * sizeof(Scalar), gpu_index);
+d_result = (G1Projective*)cuda_malloc(sizeof(G1Projective), gpu_index);
+d_scratch = (G1Projective*)cuda_malloc(scratch_size, gpu_index);
+
+// Create stream and copy data to device
+cudaStream_t stream = cuda_create_stream(gpu_index);
+cuda_memcpy_async_to_gpu(d_points, h_points, n * sizeof(G1Affine), stream, gpu_index);
+cuda_memcpy_async_to_gpu(d_scalars, h_scalars, n * sizeof(Scalar), stream, gpu_index);
+
+// Perform MSM
+point_msm_g1(stream, gpu_index, d_result, d_points, d_scalars, d_scratch, n);
+
+// Copy result back and synchronize
+G1Projective result;
+cuda_memcpy_async_to_cpu(&result, d_result, sizeof(G1Projective), stream, gpu_index);
+cuda_synchronize_stream(stream, gpu_index);
+
+// Cleanup
+cuda_drop(d_points, gpu_index);
+cuda_drop(d_scalars, gpu_index);
+cuda_drop(d_result, gpu_index);
+cuda_drop(d_scratch, gpu_index);
+cuda_destroy_stream(stream, gpu_index);
+```

 ### Rust API

-The Rust build compiles the CUDA library automatically via `build.rs`:
+See the [Rust API README](src/README.md) for detailed usage examples.

-```bash
-# From backends/zk-cuda-backend/
-cargo build --release
+```rust
+use zk_cuda_backend::{G1Affine, G1Projective, Scalar};
+use tfhe_cuda_backend::cuda_create_stream;
+
+// Create points and scalars
+let points: Vec<G1Affine> = vec![...];
+let scalars: Vec<Scalar> = vec![...];
+
+// Create a CUDA stream (required for MSM)
+let gpu_index = 0;
+let stream = cuda_create_stream(gpu_index);
+
+// Perform MSM using managed API
+// The managed API handles memory allocation and transfers internally
+let (result, size_tracker) = G1Projective::msm(
+    &points,
+    &scalars,
+    stream,
+    gpu_index,
+    false, // points_in_montgomery: false means points will be converted
+)?;
+
+// For G2 points:
+use zk_cuda_backend::{G2Affine, G2Projective};
+let (g2_result, _) = G2Projective::msm(
+    &g2_points,
+    &scalars,
+    stream,
+    gpu_index,
+    true, // points_in_montgomery: true for better performance if already converted
+)?;
 ```

 ## Testing

+The project includes a comprehensive test suite using Google Test.
+
+### Running Tests
+
 ```bash
-cd cuda/build
-ctest --output-on-failure       # run all tests
-./test_fp                        # individual test executables
+# Run all tests
+cd build
+ctest --output-on-failure
+
+# Run with verbose output
+ctest --verbose
+
+# Run specific test executables
+./test_fp
 ./test_fp2
 ./test_msm
 ./test_point_ops
-./test_fp --gtest_filter="*Montgomery*"  # filter by name
+
+# Run specific test cases
+./test_fp --gtest_filter="*Montgomery*"
+./test_msm --gtest_filter="*G1*"
 ```

-Test coverage: Fp operations (22+ tests), Fp2 operations, G1/G2 point operations,
-projective arithmetic, MSM correctness for various batch sizes.
+### Test Coverage
+
+- **Fp Tests** (`test_fp`): 22+ tests covering:
+  - Basic operations (addition, subtraction, multiplication)
+  - Montgomery form conversions
+  - Edge cases (zero, one, large values)
+  - Property-based tests (commutativity, associativity)
+
+- **Fp2 Tests** (`test_fp2`): Complete coverage of:
+  - All Fp2 operations
+  - Montgomery form operations
+  - Special functions (Frobenius, conjugation)
+
+- **Point Operation Tests** (`test_point_ops`): Verification of:
+  - Point addition and doubling
+  - Scalar multiplication
+  - Coordinate conversions
+  - Infinity point handling
+
+- **MSM Tests** (`test_msm`): End-to-end verification:
+  - G1 and G2 MSM correctness
+  - Various batch sizes
+  - Comparison with reference implementations

 ## Benchmarks

+Performance benchmarks are available using Google Benchmark:
+
 ```bash
-cd cuda/build
+cd build
 ./benchmark_fp
 ./benchmark_fp2
 ./benchmark_msm
 ```

-## Technical Notes
+Benchmarks measure:
+- Fp arithmetic operation throughput
+- Fp2 operation performance
+- MSM performance for various batch sizes
+- GPU utilization and memory bandwidth
+
+## Technical Details

 ### Montgomery Reduction

-All internal multiplications use Montgomery form (R = 2^448, matching tfhe-rs).
-Precomputed constants: R² mod p, R_INV mod p, p' = −p⁻¹ mod 2⁶⁴.
-The `mont` convention: functions documented "MONTGOMERY" expect inputs already in
-Montgomery form; "NORMAL" functions handle conversion internally.
+- **R value**: 2^448 (matching tfhe-rs)
+- **Precomputed constants**: R² mod p, R_INV mod p, p' = -p⁻¹ mod 2⁶⁴
+- **Format tracking**: Fp struct includes `mont` field to track representation
+- **Efficiency**: All multiplications use Montgomery form internally
+
+### MSM Algorithm
+
+- **Pippenger's algorithm**: Bucket method with configurable window sizes
+  - **G1**: 4-bit windows (16 buckets)
+  - **G2**: 5-bit windows (32 buckets) - larger windows reduce expensive Fp2 field operations
+- **Projective coordinates**: Avoids expensive field inversions
+- **Memory layout**: Optimized for coalesced memory access
+- **Thread configuration**: 128 threads/block for both G1 and G2 (optimized for H100 SM occupancy)
+- **Register-based accumulation**: Uses register-based bucket accumulation instead of shared memory for better performance

 ### Memory Management

- **Unmanaged API** (`point_msm_g1`, `point_msm_g2`): all data must be on device;
-  caller manages allocation and transfers. Use `pippenger_scratch_size_g1/g2()` to
-  query the required scratch buffer size.
- **Managed API** (Rust `G1Projective::msm()`, `G2Projective::msm()`): handles
-  allocation, host-to-device copies, and scratch space automatically.
+The library provides two MSM API variants:
+
+- **Unmanaged API** (`point_msm_*_unmanaged_wrapper`):
+  - Assumes all data (points, scalars, scratch space) is already on device
+  - Caller manages all memory allocation and transfers
+  - Best for performance-critical applications where data is already on GPU
+  - Supports `points_in_montgomery` flag to avoid redundant conversions
+
+- **Managed API** (`point_msm_*_managed_wrapper`):
+  - Handles memory allocation and transfers internally
+  - Copies data from host to device, runs MSM, copies result back
+  - Convenient for Rust bindings and host-side code
+  - Automatically manages scratch space allocation
+
+- **Scratch space**: Required size is `(num_blocks + 1) * BUCKET_COUNT * sizeof(ProjectivePoint)`
+  - G1: `(num_blocks + 1) * 16 * sizeof(G1Projective)`
+  - G2: `(num_blocks + 1) * 32 * sizeof(G2Projective)`
+- **Stream support**: Async operations with CUDA streams (all operations are async internally)
+
+### CUDA Optimizations
+
+- **Constant memory**: Modulus and curve constants in `__constant__` memory
+- **Shared memory**: Used by the MSM kernels (bucket accumulation itself is register-based, see "MSM Algorithm")
+- **Coalesced access**: Memory access patterns optimized for GPU
+- **Separable compilation**: Enabled for better optimization
+
+## Template Functions
+
+Many functions are templated to work with both G1 and G2 points:
+```cpp
+template<typename PointType>
+void point_add(PointType& result, const PointType& p1, const PointType& p2);
+```

 ## Security

 ### Side-Channel Resistance

-This implementation assumes **scalars are public** and is **not** constant-time.
-Do not use it for operations where scalars must remain secret.
-For ZK proof generation this is acceptable when scalars are derived from public
-parameters or are witness values revealed in the proof.
+This implementation assumes **scalars are public** and is NOT constant-time.
+The MSM and scalar multiplication operations have timing variations that depend
+on scalar values (bit length, Hamming weight, specific bit patterns).
+
+For ZK proof generation, this is acceptable if:
+- Scalars are derived from public parameters
+- Or are witness values that are revealed in the proof anyway
+
+**Do not use this implementation for operations where scalars must remain secret.**

 ### Input Validation

- **Point validation**: off by default; enable with the `validate_points` feature:
+- **Point validation**: Point on-curve validation is optional and controlled by the
+  `validate_points` feature flag. When disabled (default), malformed points may cause
+  undefined behavior in curve operations. Enable this feature for untrusted inputs:
  ```toml
  zk-cuda-backend = { version = "...", features = ["validate_points"] }
  ```
- **Scalar validation**: `Scalar::is_valid()` and `Scalar::reduce_once()` available in the Rust API.
+- **Scalar validation**: `Scalar::is_valid()` and `Scalar::reduce_once()` methods available
+- **Input size limits**: MSM operations are limited to 100,000 points maximum
+- **Division by zero**: The caller must ensure that division by zero does not occur; this check must be performed on the host side

-## Naming Conventions
-
-See [NAMING_CONVENTIONS.md](NAMING_CONVENTIONS.md) for the full reference.
+For detailed security information, see [SECURITY.md](SECURITY.md).

 ## References

- [Pairing-Friendly Curves (BLS12)](https://eprint.iacr.org/2006/372.pdf)
- [Montgomery Reduction — Handbook of Applied Cryptography](https://cacr.uwaterloo.ca/hac/)
- [Pippenger's Algorithm](https://eprint.iacr.org/2012/549.pdf)
- [NVIDIA CUDA Best Practices Guide](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/)
- [tfhe-rs BLS12-446 reference](https://github.com/zama-ai/tfhe-rs/blob/main/tfhe-zk-pok/src/curve_446/mod.rs)
+- **BLS12 Curves**: [Pairing-Friendly Curves](https://eprint.iacr.org/2006/372.pdf)
+- **Montgomery Reduction**: [Handbook of Applied Cryptography](https://cacr.uwaterloo.ca/hac/)
+- **Pippenger's Algorithm**: [On the Evaluation of Powers and Monomials](https://eprint.iacr.org/2012/549.pdf)
+- **CUDA Best Practices**: [NVIDIA CUDA Best Practices Guide](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/)
+- **TFHE-rs Reference**: [tfhe-rs/tfhe-zk-pok/src/curve_446/mod.rs](https://github.com/zama-ai/tfhe-rs/blob/main/tfhe-zk-pok/src/curve_446/mod.rs)
--- a/backends/zk-cuda-backend/build.rs
+++ b/backends/zk-cuda-backend/build.rs
@@ -46,10 +46,7 @@ fn main() {
        }

        // Build CUDA library using cmake crate
-        let limb_bits = std::env::var("ZK_CUDA_LIMB_BITS").unwrap_or_else(|_| "64".to_string());
-        println!("cargo::rerun-if-env-changed=ZK_CUDA_LIMB_BITS");
        let mut cmake_config = cmake::Config::new("cuda");
-        cmake_config.define("ZK_CUDA_LIMB_BITS", &limb_bits);
        let dest = cmake_config.build();

        // cmake crate installs to dest/lib subdirectory
@@ -109,7 +106,7 @@ fn main() {
            let bindings = bindgen::Builder::default()
                .header(header_path.to_str().unwrap())
                // Allow only the wrapper functions (C FFI interface)
-                .allowlist_function(".*_wrapper(_async)?")
+                .allowlist_function(".*_wrapper")
                // Allow the core types needed for FFI
                .allowlist_type("G1Point")
                .allowlist_type("G2Point")
--- a/backends/zk-cuda-backend/cuda/CMakeLists.txt
+++ b/backends/zk-cuda-backend/cuda/CMakeLists.txt
@@ -51,16 +51,6 @@ else()
  set(CMAKE_CUDA_ARCHITECTURES 70)
 endif()

-# Limb size configuration: 32 or 64 (default: 64)
-# 32-bit limbs enable PTX carry-chain optimizations on GPU
-set(ZK_CUDA_LIMB_BITS "64" CACHE STRING "Limb size in bits for Fp arithmetic (32 or 64)")
-set_property(CACHE ZK_CUDA_LIMB_BITS PROPERTY STRINGS "32" "64")
-if(NOT ZK_CUDA_LIMB_BITS STREQUAL "32" AND NOT ZK_CUDA_LIMB_BITS STREQUAL "64")
-  message(FATAL_ERROR "ZK_CUDA_LIMB_BITS must be 32 or 64, got: ${ZK_CUDA_LIMB_BITS}")
-endif()
-add_compile_definitions(LIMB_BITS_CONFIG=${ZK_CUDA_LIMB_BITS})
-message(STATUS "Limb size: ${ZK_CUDA_LIMB_BITS}-bit")
-
 # Enable CUDA separable compilation for better optimization
 set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)

--- a/backends/zk-cuda-backend/cuda/include/curve.h
+++ b/backends/zk-cuda-backend/cuda/include/curve.h
@@ -17,15 +17,7 @@ __host__ __device__ void fp2_zero(Fp2 &a);

 // G1 point: (x, y) coordinates in Fp
 // Curve equation: y^2 = x^3 + b (short Weierstrass form with a = 0)
-// alignas(8) ensures identical struct layout (size 120) in both 32-bit and
-// 64-bit limb modes, matching the Rust FFI bindings generated from 64-bit.
-// Without this, 32-bit mode produces 116-byte structs (4-byte alignment from
-// uint32_t limbs) vs 120 bytes in Rust FFI, causing array stride mismatches
-// that corrupt point data for n>1.
-// The 4-byte padding overhead is negligible: MSM is compute-bound (Montgomery
-// multiplications dominate), and point access patterns in Pippenger-style MSM
-// are non-coalescing regardless of struct size.
-struct alignas(8) G1Affine {
+struct G1Affine {
  Fp x;
  Fp y;
  bool infinity; // true if point at infinity (identity element)
@@ -44,9 +36,7 @@ struct alignas(8) G1Affine {

 // G2 point: (x, y) coordinates in Fp2
 // Curve equation: y^2 = x^3 + b' (twisted curve over Fp2)
-// alignas(8): same rationale as G1Affine above — ensures FFI layout
-// compatibility (size 232) between 32-bit and 64-bit limb modes.
-struct alignas(8) G2Affine {
+struct G2Affine {
  Fp2 x;
  Fp2 y;
  bool infinity; // true if point at infinity (identity element)
@@ -241,6 +231,13 @@ __host__ __device__ const G2Affine &g2_generator();
 // points, significantly reducing the number of point operations compared to
 // naive methods

+// Pippenger algorithm constants
+#define MSM_WINDOW_SIZE 4 // 4-bit windows
+#define MSM_G1_BUCKET_COUNT                                                    \
+  16 // 2^MSM_WINDOW_SIZE buckets (0-15) - legacy, kept for compatibility
+#define MSM_SIGNED_BUCKET_COUNT                                                \
+  8 // With signed recoding: buckets 1-8 (half the buckets)
+
 // ============================================================================
 // Template Async/Sync API for curve operations
 // ============================================================================
@@ -335,9 +332,22 @@ void point_to_montgomery_batch(cudaStream_t stream, uint32_t gpu_index,
                               PointType *d_points, uint32_t n);

 // ============================================================================
-// MSM Traits (maps projective to affine point types, used by msm.h)
+// Refactored MSM API (device pointers only, no allocations/copies/frees)
 // ============================================================================
+// All pointers are device pointers (already allocated by caller)
+// Temporary buffer must be provided by caller:
+//   - d_scratch: buffer of size (num_blocks + 1) * BUCKET_COUNT *
+//     sizeof(G1Projective) for G1, sizeof(G2Projective) for G2,
+//     where num_blocks = CEIL_DIV(n, threadsPerBlock) (128 threads per
+//     block, see MSM_G1/G2_THREADS_PER_BLOCK in msm.h). This provides:
+//       * num_blocks * BUCKET_COUNT points for per-block bucket
+//         accumulations
+//       * BUCKET_COUNT points for final buckets
+//     BUCKET_COUNT is 16 for G1 (4-bit windows), 32 for G2 (5-bit windows)
+// Uses Pippenger algorithm (bucket method) with sppark-style single-pass
+// accumulation

+// Simple traits for MSM template (maps projective to affine point types)
 template <typename ProjectivePointType> struct MSMTraits;

 template <> struct MSMTraits<G1Projective> {
@@ -348,4 +358,40 @@ template <> struct MSMTraits<G2Projective> {
  using AffinePointType = G2Affine;
 };

-// MSM function declarations are in msm.h
+// ============================================================================
+// MSM with BigInt5 scalars (default MSM implementation)
+// ============================================================================
+// These functions accept BigInt5* scalars (320-bit scalars, 5 limbs)
+// BigInt5 represents a scalar as 5 limbs of 64 bits (320 bits total)
+// Uses projective coordinates internally (no inversions!)
+
+// MSM with BigInt scalars for G1 (projective result)
+void point_msm_async_g1(cudaStream_t stream, uint32_t gpu_index,
+                        G1Projective *d_result, const G1Affine *d_points,
+                        const Scalar *d_scalars, G1Projective *d_scratch,
+                        uint32_t n);
+void point_msm_g1(cudaStream_t stream, uint32_t gpu_index,
+                  G1Projective *d_result, const G1Affine *d_points,
+                  const Scalar *d_scalars, G1Projective *d_scratch, uint32_t n);
+
+// MSM with BigInt scalars for G2 (projective result)
+void point_msm_async_g2(cudaStream_t stream, uint32_t gpu_index,
+                        G2Projective *d_result, const G2Affine *d_points,
+                        const Scalar *d_scalars, G2Projective *d_scratch,
+                        uint32_t n);
+void point_msm_g2(cudaStream_t stream, uint32_t gpu_index,
+                  G2Projective *d_result, const G2Affine *d_points,
+                  const Scalar *d_scalars, G2Projective *d_scratch, uint32_t n);
+
+// Template MSM with BigInt scalars (works for both G1 and G2)
+template <typename ProjectivePointType>
+void point_msm_async(
+    cudaStream_t stream, uint32_t gpu_index, ProjectivePointType *d_result,
+    const typename MSMTraits<ProjectivePointType>::AffinePointType *d_points,
+    const Scalar *d_scalars, ProjectivePointType *d_scratch, uint32_t n);
+
+template <typename ProjectivePointType>
+void point_msm(
+    cudaStream_t stream, uint32_t gpu_index, ProjectivePointType *d_result,
+    const typename MSMTraits<ProjectivePointType>::AffinePointType *d_points,
+    const Scalar *d_scalars, ProjectivePointType *d_scratch, uint32_t n);
--- a/backends/zk-cuda-backend/cuda/include/fp.h
+++ b/backends/zk-cuda-backend/cuda/include/fp.h
@@ -124,11 +124,8 @@ static_assert(sizeof(Fp) == FP_LIMBS * sizeof(UNSIGNED_LIMB),
 // Binary arithmetic operators
 __host__ __device__ Fp operator+(const Fp &a, const Fp &b);
 __host__ __device__ Fp operator-(const Fp &a, const Fp &b);
-// MONTGOMERY: Both inputs must be in Montgomery form, result is in Montgomery
-// form.
+// Binary multiplication: returns result in Montgomery form
 __host__ __device__ Fp operator*(const Fp &a, const Fp &b);
-// MONTGOMERY: Both inputs must be in Montgomery form, result is in Montgomery
-// form. Computes a * b^{-1} in Montgomery representation.
 __host__ __device__ Fp operator/(const Fp &a, const Fp &b);

 // Unary negation operator
@@ -142,7 +139,6 @@ __host__ __device__ bool operator!=(const Fp &a, const Fp &b);
 __host__ __device__ Fp &operator+=(Fp &a, const Fp &b);
 __host__ __device__ Fp &operator-=(Fp &a, const Fp &b);
 __host__ __device__ Fp &operator*=(Fp &a, const Fp &b);
-// MONTGOMERY: Both inputs must be in Montgomery form.
 __host__ __device__ Fp &operator/=(Fp &a, const Fp &b);

 // Prime modulus p for BLS12-446
@@ -269,21 +265,12 @@ __host__ __device__ bool fp_sqrt(Fp &c, const Fp &a);
 // Uses Euler's criterion: a is a quadratic residue if a^((p-1)/2) = 1 mod p
 __host__ __device__ bool fp_is_quadratic_residue(const Fp &a);

-// Small-constant multiplication via addition chains (much cheaper than
-// fp_mont_mul). MONTGOMERY: input and output must be in Montgomery form.
-__host__ __device__ void fp_double(Fp &c, const Fp &a);
-__host__ __device__ void fp_mul3(Fp &c, const Fp &a);
-__host__ __device__ void fp_mul4(Fp &c, const Fp &a);
-__host__ __device__ void fp_mul8(Fp &c, const Fp &a);
-
 // Conditional assignment: if condition, dst = src, else dst unchanged
 __host__ __device__ void fp_cmov(Fp &dst, const Fp &src, uint64_t condition);

 // Helper functions to access constants
 // Get modulus reference (device: from constant memory, host: static copy)
 __host__ __device__ const Fp &fp_modulus();
-// Get Montgomery reduction constant p' = -p^(-1) mod 2^LIMB_BITS
-__host__ __device__ UNSIGNED_LIMB fp_p_prime();

 // ============================================================================
 // Async/Sync API for device memory operations
--- a/backends/zk-cuda-backend/cuda/include/fp2.h
+++ b/backends/zk-cuda-backend/cuda/include/fp2.h
@@ -81,12 +81,6 @@ __host__ __device__ void fp2_mul(Fp2 &c, const Fp2 &a, const Fp2 &b);
 // NOTE: All inputs and outputs are in Montgomery form (no conversions)
 __host__ __device__ void fp2_mont_mul(Fp2 &c, const Fp2 &a, const Fp2 &b);

-// Montgomery squaring: c = a^2 (all in Montgomery form)
-// Uses the complex-squaring identity: c0 = (a0+a1)(a0-a1), c1 = 2*a0*a1
-// Only 2 Fp multiplications vs 3 for fp2_mont_mul(c, a, a).
-// NOTE: All inputs and outputs are in Montgomery form (no conversions)
-__host__ __device__ void fp2_mont_square(Fp2 &c, const Fp2 &a);
-
 // Squaring: c = a^2
 // (a0 + a1*i)^2 = (a0^2 - a1^2) + 2*a0*a1*i
 // Optimized version that uses fewer multiplications
@@ -109,13 +103,6 @@ __host__ __device__ void fp2_mont_inv(Fp2 &c, const Fp2 &a);
 // Division: c = a / b = a * b^(-1)
 __host__ __device__ void fp2_div(Fp2 &c, const Fp2 &a, const Fp2 &b);

-// Small-constant multiplication via addition chains (much cheaper than
-// fp2_mont_mul). MONTGOMERY: input and output must be in Montgomery form.
-__host__ __device__ void fp2_double(Fp2 &c, const Fp2 &a);
-__host__ __device__ void fp2_mul3(Fp2 &c, const Fp2 &a);
-__host__ __device__ void fp2_mul4(Fp2 &c, const Fp2 &a);
-__host__ __device__ void fp2_mul8(Fp2 &c, const Fp2 &a);
-
 __host__ __device__ void fp2_cmov(Fp2 &dst, const Fp2 &src, uint64_t condition);

 // Frobenius map: c = a^p
--- a/backends/zk-cuda-backend/cuda/include/msm.h
+++ b/backends/zk-cuda-backend/cuda/include/msm.h
@@ -14,33 +14,33 @@
 // ============================================================================

 // Kernel thread configuration
-constexpr uint32_t KERNEL_THREADS_MAX = 256;
+#define KERNEL_THREADS_MAX 256 // Maximum threads per block for general kernels

 // G1 dynamic window selection thresholds
-constexpr uint32_t MSM_G1_SMALL_THRESHOLD = 256; // n <= 256: use 4-bit windows
-constexpr uint32_t MSM_G1_MEDIUM_THRESHOLD =
-    4096; // n <= 4096: use 5-bit windows
+#define MSM_G1_SMALL_THRESHOLD 256   // n <= 256: use 4-bit windows
+#define MSM_G1_MEDIUM_THRESHOLD 4096 // n <= 4096: use 5-bit windows

 // Pippenger algorithm parameters
-constexpr uint32_t MSM_G1_WINDOW_SIZE = 4;   // 4-bit windows for G1
-constexpr uint32_t MSM_G1_BUCKET_COUNT = 16; // 2^MSM_G1_WINDOW_SIZE buckets
+#define MSM_G1_WINDOW_SIZE 4   // 4-bit windows for G1
+#define MSM_G1_BUCKET_COUNT 16 // 2^MSM_G1_WINDOW_SIZE buckets (0-15)

 // G2-specific parameters: larger window = fewer Horner doublings
 // G2 benefits from larger windows because its field ops are 2x more expensive
-constexpr uint32_t MSM_G2_WINDOW_SIZE = 5;   // 5-bit windows for G2
-constexpr uint32_t MSM_G2_BUCKET_COUNT = 32; // 2^MSM_G2_WINDOW_SIZE buckets
+#define MSM_G2_WINDOW_SIZE 5   // 5-bit windows for G2
+#define MSM_G2_BUCKET_COUNT 32 // 2^MSM_G2_WINDOW_SIZE buckets (0-31)

 // Threads per block for MSM kernels (must match implementation)
 // These values are used for scratch space calculation in wrappers
-constexpr uint32_t MSM_G1_THREADS_PER_BLOCK = 128;
-constexpr uint32_t MSM_G2_THREADS_PER_BLOCK = 128;
+#define MSM_G1_THREADS_PER_BLOCK 128 // G1 uses 128 threads per block
+#define MSM_G2_THREADS_PER_BLOCK                                               \
+  128 // G2 uses 128 threads per block (register-based bucket accumulation)

 // Helper function to get optimal threads per block for MSM based on point type.
 // Uses 128 threads for both G1 and G2 for optimal SM occupancy on H100:
 // - G1 with 128 threads: 15.6KB shared mem, allows 3 blocks per SM
 // - G2 with 128 threads: 29.8KB shared mem, allows 1 block per SM
 // Testing showed 64 threads is worse (25% slower for G2/4096).
-template <typename PointType> uint32_t msm_threads_per_block(uint32_t n) {
+template <typename PointType> int get_msm_threads_per_block(uint32_t n) {
  (void)n;
  return 128;
 }
@@ -65,60 +65,39 @@ template <> struct MSMWindowSize<G2ProjectivePoint> {
  static constexpr uint32_t value = MSM_G2_WINDOW_SIZE;
 };

-// ============================================================================
-// Scratch Size Helpers
-// ============================================================================
-// Compute the exact scratch buffer size (in bytes) needed by the Pippenger MSM
-// implementation for a given input count. These match the internal scratch
-// partitioning exactly: all_block_buckets + all_final_buckets + window_sums.
-// The gpu_index is needed to query device shared memory limits, which affect
-// the per-window block count.
-
-size_t pippenger_scratch_size_g1(uint32_t n, uint32_t gpu_index);
-size_t pippenger_scratch_size_g2(uint32_t n, uint32_t gpu_index);
-
 // ============================================================================
 // MSM with BigInt Scalars (320-bit scalars, default implementation)
 // ============================================================================

 // MSM for G1 points with BigInt scalars (projective result)
 // Computes: result = sum(scalars[i] * points[i])
-// Result is written directly to a host pointer (no device allocation needed for
-// the result). Scratch space must be pre-allocated by the caller and passed via
-// d_scratch as a typed projective pointer (G1Projective* for G1,
-// G2ProjectivePoint* for G2). Use the scratch size helpers to query the
-// required allocation size in bytes, then cast the allocation to the
-// appropriate projective type.
 // Arguments:
 //   stream: CUDA stream for async execution
 //   gpu_index: GPU device index
-//   h_result: Host pointer to output (projective G1 point)
+//   d_result: Device pointer to output (projective G1 point)
 //   d_points: Device pointer to input affine G1 points (array of n points)
 //   d_scalars: Device pointer to input BigInt scalars (array of n scalars)
+//   d_scratch: Device pointer to scratch buffer for intermediate results
+//              Required size: (num_blocks + 1) * MSM_G1_BUCKET_COUNT *
+//              sizeof(G1Projective)
 //   n: Number of points/scalars
-//   d_scratch: Caller-provided device scratch buffer for intermediate results
-//   size_tracker: Reference for tracking GPU memory allocation sizes
-void point_msm_g1_async(cudaStream_t stream, uint32_t gpu_index,
-                        G1Projective *h_result, const G1Affine *d_points,
-                        const Scalar *d_scalars, uint32_t n,
-                        G1Projective *d_scratch, uint64_t &size_tracker,
-                        bool gpu_memory_allocated);
+void point_msm_async_g1(cudaStream_t stream, uint32_t gpu_index,
+                        G1Projective *d_result, const G1Affine *d_points,
+                        const Scalar *d_scalars, G1Projective *d_scratch,
+                        uint32_t n, uint64_t &size_tracker);

 void point_msm_g1(cudaStream_t stream, uint32_t gpu_index,
-                  G1Projective *h_result, const G1Affine *d_points,
-                  const Scalar *d_scalars, uint32_t n, G1Projective *d_scratch,
-                  uint64_t &size_tracker, bool gpu_memory_allocated);
+                  G1Projective *d_result, const G1Affine *d_points,
+                  const Scalar *d_scalars, G1Projective *d_scratch, uint32_t n,
+                  uint64_t &size_tracker);

 // MSM for G2 points with BigInt scalars (projective result)
-// Result is written directly to a host pointer.
-void point_msm_g2_async(cudaStream_t stream, uint32_t gpu_index,
-                        G2ProjectivePoint *h_result, const G2Point *d_points,
-                        const Scalar *d_scalars, uint32_t n,
-                        G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
-                        bool gpu_memory_allocated);
+void point_msm_async_g2(cudaStream_t stream, uint32_t gpu_index,
+                        G2ProjectivePoint *d_result, const G2Point *d_points,
+                        const Scalar *d_scalars, G2ProjectivePoint *d_scratch,
+                        uint32_t n, uint64_t &size_tracker);

 void point_msm_g2(cudaStream_t stream, uint32_t gpu_index,
-                  G2ProjectivePoint *h_result, const G2Point *d_points,
-                  const Scalar *d_scalars, uint32_t n,
-                  G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
-                  bool gpu_memory_allocated);
+                  G2ProjectivePoint *d_result, const G2Point *d_points,
+                  const Scalar *d_scalars, G2ProjectivePoint *d_scratch,
+                  uint32_t n, uint64_t &size_tracker);
--- a/backends/zk-cuda-backend/cuda/include/point_traits.h
+++ b/backends/zk-cuda-backend/cuda/include/point_traits.h
@@ -1,278 +0,0 @@
-#pragma once
-
-#include "curve.h"
-#include "fp.h"
-#include "fp2.h"
-
-// ============================================================================
-// Unified Trait System for Elliptic Curve Points
-// ============================================================================
-// Provides compile-time dispatch for field and point operations across G1/G2.
-// Both affine (curve.cu) and MSM (msm/) code use these traits instead of
-// maintaining separate copies.
-
-// Forward declarations for projective point operations (implemented in
-// curve.cu)
-__host__ __device__ void projective_point_add(G1Projective &result,
-                                              const G1Projective &p1,
-                                              const G1Projective &p2);
-__host__ __device__ void projective_point_add(G2Projective &result,
-                                              const G2Projective &p1,
-                                              const G2Projective &p2);
-__host__ __device__ void projective_point_double(G1Projective &result,
-                                                 const G1Projective &p);
-__host__ __device__ void projective_point_double(G2Projective &result,
-                                                 const G2Projective &p);
-__host__ __device__ void projective_mixed_add(G1Projective &result,
-                                              const G1Projective &p1,
-                                              const G1Affine &p2);
-__host__ __device__ void projective_mixed_add(G2Projective &result,
-                                              const G2Projective &p1,
-                                              const G2Affine &p2);
-
-// ============================================================================
-// Affine<T>: trait for affine point operations
-// ============================================================================
-
-template <typename PointType> struct Affine;
-
-template <> struct Affine<G1Affine> {
-  using FieldType = Fp;
-
-  __host__ __device__ static void field_zero(FieldType &a) { fp_zero(a); }
-  __host__ __device__ static void field_copy(FieldType &dst,
-                                             const FieldType &src) {
-    dst = src;
-  }
-  __host__ __device__ static void field_neg(FieldType &c, const FieldType &a) {
-    c = -a;
-  }
-  __host__ __device__ static void field_add(FieldType &c, const FieldType &a,
-                                            const FieldType &b) {
-    c = a + b;
-  }
-  __host__ __device__ static void field_sub(FieldType &c, const FieldType &a,
-                                            const FieldType &b) {
-    c = a - b;
-  }
-  __host__ __device__ static void field_mul(FieldType &c, const FieldType &a,
-                                            const FieldType &b) {
-    fp_mont_mul(c, a, b);
-  }
-  __host__ __device__ static void field_inv(FieldType &c, const FieldType &a) {
-    fp_mont_inv(c, a);
-  }
-  __host__ __device__ static ComparisonType field_cmp(const FieldType &a,
-                                                      const FieldType &b) {
-    return fp_cmp(a, b);
-  }
-  __host__ __device__ static bool field_is_zero(const FieldType &a) {
-    return fp_is_zero(a);
-  }
-  __host__ __device__ static void field_to_montgomery(FieldType &c,
-                                                      const FieldType &a) {
-    fp_to_montgomery(c, a);
-  }
-  __host__ __device__ static void field_from_montgomery(FieldType &c,
-                                                        const FieldType &a) {
-    fp_from_montgomery(c, a);
-  }
-
-  __host__ __device__ static void point_at_infinity(G1Affine &point) {
-    g1_point_at_infinity(point);
-  }
-  __host__ __device__ static bool is_infinity(const G1Affine &point) {
-    return g1_is_infinity(point);
-  }
-  __host__ __device__ static const FieldType &curve_b() { return curve_b_g1(); }
-  __host__ __device__ static void point_copy(G1Affine &dst,
-                                             const G1Affine &src) {
-    dst = src;
-  }
-};
-
-template <> struct Affine<G2Affine> {
-  using FieldType = Fp2;
-
-  __host__ __device__ static void field_zero(FieldType &a) { fp2_zero(a); }
-  __host__ __device__ static void field_copy(FieldType &dst,
-                                             const FieldType &src) {
-    dst = src;
-  }
-  __host__ __device__ static void field_neg(FieldType &c, const FieldType &a) {
-    c = -a;
-  }
-  __host__ __device__ static void field_add(FieldType &c, const FieldType &a,
-                                            const FieldType &b) {
-    c = a + b;
-  }
-  __host__ __device__ static void field_sub(FieldType &c, const FieldType &a,
-                                            const FieldType &b) {
-    c = a - b;
-  }
-  __host__ __device__ static void field_mul(FieldType &c, const FieldType &a,
-                                            const FieldType &b) {
-    fp2_mont_mul(c, a, b);
-  }
-  __host__ __device__ static void field_inv(FieldType &c, const FieldType &a) {
-    fp2_mont_inv(c, a);
-  }
-  __host__ __device__ static ComparisonType field_cmp(const FieldType &a,
-                                                      const FieldType &b) {
-    return fp2_cmp(a, b);
-  }
-  __host__ __device__ static bool field_is_zero(const FieldType &a) {
-    return fp2_is_zero(a);
-  }
-  __host__ __device__ static void field_to_montgomery(FieldType &c,
-                                                      const FieldType &a) {
-    fp_to_montgomery(c.c0, a.c0);
-    fp_to_montgomery(c.c1, a.c1);
-  }
-  __host__ __device__ static void field_from_montgomery(FieldType &c,
-                                                        const FieldType &a) {
-    fp_from_montgomery(c.c0, a.c0);
-    fp_from_montgomery(c.c1, a.c1);
-  }
-
-  __host__ __device__ static void point_at_infinity(G2Affine &point) {
-    g2_point_at_infinity(point);
-  }
-  __host__ __device__ static bool is_infinity(const G2Affine &point) {
-    return g2_is_infinity(point);
-  }
-  __host__ __device__ static const FieldType &curve_b() { return curve_b_g2(); }
-  __host__ __device__ static void point_copy(G2Affine &dst,
-                                             const G2Affine &src) {
-    dst = src;
-  }
-};
-
-// ============================================================================
-// Projective<T>: trait for projective point operations
-// ============================================================================
-// Includes mixed_add() for efficient projective + affine addition used by MSM.
-
-template <typename PointType> struct Projective;
-
-template <> struct Projective<G1Projective> {
-  using FieldType = Fp;
-  using AffineType = G1Affine;
-
-  __host__ __device__ static void field_zero(FieldType &a) { fp_zero(a); }
-  __host__ __device__ static void field_copy(FieldType &dst,
-                                             const FieldType &src) {
-    dst = src;
-  }
-  __host__ __device__ static bool field_is_zero(const FieldType &a) {
-    return fp_is_zero(a);
-  }
-  __host__ __device__ static void field_mul(FieldType &c, const FieldType &a,
-                                            const FieldType &b) {
-    fp_mont_mul(c, a, b);
-  }
-  __host__ __device__ static void field_sub(FieldType &c, const FieldType &a,
-                                            const FieldType &b) {
-    c = a - b;
-  }
-
-  __host__ __device__ static void point_at_infinity(G1Projective &point) {
-    g1_projective_point_at_infinity(point);
-  }
-  __host__ __device__ static bool is_infinity(const G1Projective &point) {
-    return fp_is_zero(point.Z);
-  }
-  __host__ __device__ static void affine_to_projective(G1Projective &proj,
-                                                       const G1Affine &affine) {
-    ::affine_to_projective(proj, affine);
-  }
-  __host__ __device__ static void projective_add(G1Projective &result,
-                                                 const G1Projective &p1,
-                                                 const G1Projective &p2) {
-    projective_point_add(result, p1, p2);
-  }
-  __host__ __device__ static void projective_double(G1Projective &result,
-                                                    const G1Projective &p) {
-    projective_point_double(result, p);
-  }
-  __host__ __device__ static void
-  mixed_add(G1Projective &result, const G1Projective &p1, const G1Affine &p2) {
-    projective_mixed_add(result, p1, p2);
-  }
-  __host__ __device__ static void point_copy(G1Projective &dst,
-                                             const G1Projective &src) {
-    dst = src;
-  }
-};
-
-template <> struct Projective<G2Projective> {
-  using FieldType = Fp2;
-  using AffineType = G2Affine;
-
-  __host__ __device__ static void field_zero(FieldType &a) { fp2_zero(a); }
-  __host__ __device__ static void field_copy(FieldType &dst,
-                                             const FieldType &src) {
-    dst = src;
-  }
-  __host__ __device__ static bool field_is_zero(const FieldType &a) {
-    return fp2_is_zero(a);
-  }
-  __host__ __device__ static void field_mul(FieldType &c, const FieldType &a,
-                                            const FieldType &b) {
-    fp2_mont_mul(c, a, b);
-  }
-  __host__ __device__ static void field_sub(FieldType &c, const FieldType &a,
-                                            const FieldType &b) {
-    c = a - b;
-  }
-
-  __host__ __device__ static void point_at_infinity(G2Projective &point) {
-    g2_projective_point_at_infinity(point);
-  }
-  __host__ __device__ static bool is_infinity(const G2Projective &point) {
-    return fp2_is_zero(point.Z);
-  }
-  __host__ __device__ static void affine_to_projective(G2Projective &proj,
-                                                       const G2Affine &affine) {
-    ::affine_to_projective(proj, affine);
-  }
-  __host__ __device__ static void projective_add(G2Projective &result,
-                                                 const G2Projective &p1,
-                                                 const G2Projective &p2) {
-    projective_point_add(result, p1, p2);
-  }
-  __host__ __device__ static void projective_double(G2Projective &result,
-                                                    const G2Projective &p) {
-    projective_point_double(result, p);
-  }
-  __host__ __device__ static void
-  mixed_add(G2Projective &result, const G2Projective &p1, const G2Affine &p2) {
-    projective_mixed_add(result, p1, p2);
-  }
-  __host__ __device__ static void point_copy(G2Projective &dst,
-                                             const G2Projective &src) {
-    dst = src;
-  }
-};
-
-// ============================================================================
-// SelectorChooser<T>: maps any point type to its trait struct
-// ============================================================================
-
-template <typename PointType> struct SelectorChooser;
-
-template <> struct SelectorChooser<G1Affine> {
-  using Selection = Affine<G1Affine>;
-};
-
-template <> struct SelectorChooser<G2Affine> {
-  using Selection = Affine<G2Affine>;
-};
-
-template <> struct SelectorChooser<G1Projective> {
-  using Selection = Projective<G1Projective>;
-};
-
-template <> struct SelectorChooser<G2Projective> {
-  using Selection = Projective<G2Projective>;
-};
--- a/backends/zk-cuda-backend/cuda/src/curve.cu
+++ b/backends/zk-cuda-backend/cuda/src/curve.cu
@@ -3,10 +3,239 @@
 #include "fp.h"
 #include "fp2.h"
 #include "msm.h"
-#include "point_traits.h"
 #include <cstdio>
 #include <cstring>

+// ============================================================================
+// Template Traits System for Affine Operations
+// ============================================================================
+// This traits system allows us to write generic point operations that work
+// for both G1 (Fp) and G2 (Fp2) points using the same algorithm.
+
+template <typename PointType> struct Affine;
+
+// Specialization for G1Affine (uses Fp)
+template <> struct Affine<G1Affine> {
+  using Field = Fp;
+
+  __host__ __device__ static void field_zero(Field &a) { fp_zero(a); }
+  __host__ __device__ static void field_copy(Field &dst, const Field &src) {
+    dst = src;
+  }
+  __host__ __device__ static void field_neg(Field &c, const Field &a) {
+    c = -a;
+  }
+  __host__ __device__ static void field_add(Field &c, const Field &a,
+                                            const Field &b) {
+    c = a + b;
+  }
+  __host__ __device__ static void field_sub(Field &c, const Field &a,
+                                            const Field &b) {
+    c = a - b;
+  }
+  __host__ __device__ static void field_mul(Field &c, const Field &a,
+                                            const Field &b) {
+    fp_mont_mul(c, a, b);
+  }
+  __host__ __device__ static void field_inv(Field &c, const Field &a) {
+    fp_mont_inv(c, a);
+  }
+  __host__ __device__ static ComparisonType field_cmp(const Field &a,
+                                                      const Field &b) {
+    return fp_cmp(a, b);
+  }
+  __host__ __device__ static bool field_is_zero(const Field &a) {
+    return fp_is_zero(a);
+  }
+  __host__ __device__ static void field_to_montgomery(Field &c,
+                                                      const Field &a) {
+    fp_to_montgomery(c, a);
+  }
+  __host__ __device__ static void field_from_montgomery(Field &c,
+                                                        const Field &a) {
+    fp_from_montgomery(c, a);
+  }
+
+  __host__ __device__ static void point_at_infinity(G1Affine &point) {
+    g1_point_at_infinity(point);
+  }
+  __host__ __device__ static bool is_infinity(const G1Affine &point) {
+    return g1_is_infinity(point);
+  }
+  __host__ __device__ static const Field &curve_b() { return curve_b_g1(); }
+  __host__ __device__ static void point_copy(G1Affine &dst,
+                                             const G1Affine &src) {
+    dst = src;
+  }
+};
+
+// Specialization for G2Affine (uses Fp2)
+template <> struct Affine<G2Affine> {
+  using Field = Fp2;
+
+  __host__ __device__ static void field_zero(Field &a) { fp2_zero(a); }
+  __host__ __device__ static void field_copy(Field &dst, const Field &src) {
+    dst = src;
+  }
+  __host__ __device__ static void field_neg(Field &c, const Field &a) {
+    c = -a;
+  }
+  __host__ __device__ static void field_add(Field &c, const Field &a,
+                                            const Field &b) {
+    c = a + b;
+  }
+  __host__ __device__ static void field_sub(Field &c, const Field &a,
+                                            const Field &b) {
+    c = a - b;
+  }
+  __host__ __device__ static void field_mul(Field &c, const Field &a,
+                                            const Field &b) {
+    fp2_mont_mul(c, a, b);
+  }
+  __host__ __device__ static void field_inv(Field &c, const Field &a) {
+    fp2_mont_inv(c, a);
+  }
+  __host__ __device__ static ComparisonType field_cmp(const Field &a,
+                                                      const Field &b) {
+    return fp2_cmp(a, b);
+  }
+  __host__ __device__ static bool field_is_zero(const Field &a) {
+    return fp2_is_zero(a);
+  }
+  __host__ __device__ static void field_to_montgomery(Field &c,
+                                                      const Field &a) {
+    fp_to_montgomery(c.c0, a.c0);
+    fp_to_montgomery(c.c1, a.c1);
+  }
+  __host__ __device__ static void field_from_montgomery(Field &c,
+                                                        const Field &a) {
+    fp_from_montgomery(c.c0, a.c0);
+    fp_from_montgomery(c.c1, a.c1);
+  }
+
+  __host__ __device__ static void point_at_infinity(G2Affine &point) {
+    g2_point_at_infinity(point);
+  }
+  __host__ __device__ static bool is_infinity(const G2Affine &point) {
+    return g2_is_infinity(point);
+  }
+  __host__ __device__ static const Field &curve_b() { return curve_b_g2(); }
+  __host__ __device__ static void point_copy(G2Affine &dst,
+                                             const G2Affine &src) {
+    dst = src;
+  }
+};
+
+// Forward declarations for projective point operations (needed by Projective)
+__host__ __device__ void projective_point_add(G1Projective &result,
+                                              const G1Projective &p1,
+                                              const G1Projective &p2);
+__host__ __device__ void projective_point_add(G2Projective &result,
+                                              const G2Projective &p1,
+                                              const G2Projective &p2);
+__host__ __device__ void projective_point_double(G1Projective &result,
+                                                 const G1Projective &p);
+__host__ __device__ void projective_point_double(G2Projective &result,
+                                                 const G2Projective &p);
+
+// ============================================================================
+// Template Traits System for Projective Points
+// ============================================================================
+
+template <typename PointType> struct Projective;
+
+// Specialization for G1Projective (uses Fp)
+template <> struct Projective<G1Projective> {
+  using Field = Fp;
+  using Affine = G1Affine;
+
+  __host__ __device__ static void field_zero(Field &a) { fp_zero(a); }
+  __host__ __device__ static void field_copy(Field &dst, const Field &src) {
+    dst = src;
+  }
+  __host__ __device__ static bool field_is_zero(const Field &a) {
+    return fp_is_zero(a);
+  }
+  __host__ __device__ static void field_mul(Field &c, const Field &a,
+                                            const Field &b) {
+    fp_mont_mul(c, a, b);
+  }
+  __host__ __device__ static void field_sub(Field &c, const Field &a,
+                                            const Field &b) {
+    c = a - b;
+  }
+
+  __host__ __device__ static void point_at_infinity(G1Projective &point) {
+    g1_projective_point_at_infinity(point);
+  }
+  __host__ __device__ static bool is_infinity(const G1Projective &point) {
+    return fp_is_zero(point.Z);
+  }
+  __host__ __device__ static void affine_to_projective(G1Projective &proj,
+                                                       const G1Affine &affine) {
+    ::affine_to_projective(proj, affine); // free function, not this member
+  }
+  __host__ __device__ static void projective_add(G1Projective &result,
+                                                 const G1Projective &p1,
+                                                 const G1Projective &p2) {
+    projective_point_add(result, p1, p2);
+  }
+  __host__ __device__ static void projective_double(G1Projective &result,
+                                                    const G1Projective &p) {
+    projective_point_double(result, p);
+  }
+  __host__ __device__ static void point_copy(G1Projective &dst,
+                                             const G1Projective &src) {
+    dst = src;
+  }
+};
+
+// Specialization for G2Projective (uses Fp2)
+template <> struct Projective<G2Projective> {
+  using Field = Fp2;
+  using Affine = G2Affine;
+
+  __host__ __device__ static void field_zero(Field &a) { fp2_zero(a); }
+  __host__ __device__ static void field_copy(Field &dst, const Field &src) {
+    dst = src;
+  }
+  __host__ __device__ static bool field_is_zero(const Field &a) {
+    return fp2_is_zero(a);
+  }
+  __host__ __device__ static void field_mul(Field &c, const Field &a,
+                                            const Field &b) {
+    fp2_mont_mul(c, a, b);
+  }
+  __host__ __device__ static void field_sub(Field &c, const Field &a,
+                                            const Field &b) {
+    c = a - b;
+  }
+
+  __host__ __device__ static void point_at_infinity(G2Projective &point) {
+    g2_projective_point_at_infinity(point);
+  }
+  __host__ __device__ static bool is_infinity(const G2Projective &point) {
+    return fp2_is_zero(point.Z);
+  }
+  __host__ __device__ static void affine_to_projective(G2Projective &proj,
+                                                       const G2Affine &affine) {
+    ::affine_to_projective(proj, affine); // free function, not this member
+  }
+  __host__ __device__ static void projective_add(G2Projective &result,
+                                                 const G2Projective &p1,
+                                                 const G2Projective &p2) {
+    projective_point_add(result, p1, p2);
+  }
+  __host__ __device__ static void projective_double(G2Projective &result,
+                                                    const G2Projective &p) {
+    projective_point_double(result, p);
+  }
+  __host__ __device__ static void point_copy(G2Projective &dst,
+                                             const G2Projective &src) {
+    dst = src;
+  }
+};
+
 // ============================================================================
 // Template Scalar Multiplication for Projective Points
 // ============================================================================
@@ -294,7 +523,7 @@ __host__ __device__ void point_neg(PointType &result, const PointType &p) {
 template <typename PointType>
 __host__ __device__ void point_double(PointType &result, const PointType &p) {
  using AffinePoint = Affine<PointType>;
-  using FieldType = typename AffinePoint::FieldType;
+  using FieldType = typename AffinePoint::Field;

  if (AffinePoint::is_infinity(p) || AffinePoint::field_is_zero(p.y)) {
    AffinePoint::point_at_infinity(result);
@@ -333,7 +562,7 @@ template <typename PointType>
 __host__ __device__ void point_add(PointType &result, const PointType &p1,
                                   const PointType &p2) {
  using AffinePoint = Affine<PointType>;
-  using FieldType = typename AffinePoint::FieldType;
+  using Field = typename AffinePoint::Field;

  // Handle infinity cases
  if (AffinePoint::is_infinity(p1)) {
@@ -346,7 +575,7 @@ __host__ __device__ void point_add(PointType &result, const PointType &p1,
  }

  // Check if p1 == -p2 (same x, opposite y)
-  FieldType neg_y2;
+  Field neg_y2;
  AffinePoint::field_neg(neg_y2, p2.y);
  if (AffinePoint::field_cmp(p1.x, p2.x) == ComparisonType::Equal &&
      AffinePoint::field_cmp(p1.y, neg_y2) == ComparisonType::Equal) {
@@ -362,7 +591,7 @@ __host__ __device__ void point_add(PointType &result, const PointType &p1,
  }

  // Standard addition: lambda = (y2 - y1) / (x2 - x1)
-  FieldType dx, dy, lambda, lambda_squared, x_result;
+  Field dx, dy, lambda, lambda_squared, x_result;
  AffinePoint::field_sub(dx, p2.x, p1.x);
  AffinePoint::field_sub(dy, p2.y, p1.y);
  AffinePoint::field_inv(lambda, dx);         // 1 / (x2 - x1)
@@ -374,7 +603,7 @@ __host__ __device__ void point_add(PointType &result, const PointType &p1,
  AffinePoint::field_sub(x_result, x_result, p2.x);

  // y_result = lambda * (x1 - x_result) - y1
-  FieldType x1_minus_xr, y_result;
+  Field x1_minus_xr, y_result;
  AffinePoint::field_sub(x1_minus_xr, p1.x, x_result);
  AffinePoint::field_mul(y_result, lambda, x1_minus_xr);
  AffinePoint::field_sub(y_result, y_result, p1.y);
@@ -494,6 +723,76 @@ __host__ __device__ const Fp2 &curve_b_g2() {
 #endif
 }

+// ============================================================================
+// Cached Montgomery Form Constants for Curve Operations
+// ============================================================================
+// These functions return references to cached Montgomery form constants
+// to avoid recomputing them on every projective point operation call.
+// For host code: uses static locals (thread-safe in C++11)
+// For device code: recomputed on each call and returned via output parameters
+
+// Helper struct to hold cached Fp Montgomery constants
+struct FpMontConstants {
+  Fp two;
+  Fp three;
+  Fp four;
+  Fp eight;
+};
+
+// Helper struct to hold cached Fp2 Montgomery constants
+struct Fp2MontConstants {
+  Fp2 two;
+  Fp2 three;
+  Fp2 four;
+  Fp2 eight;
+};
+
+// Get cached Fp Montgomery constants (for host code)
+__host__ const FpMontConstants &get_fp_mont_constants_host() {
+  static FpMontConstants constants = []() {
+    FpMontConstants c;
+    fp_two_montgomery(c.two);
+    fp_three_montgomery(c.three);
+    fp_four_montgomery(c.four);
+    fp_eight_montgomery(c.eight);
+    return c;
+  }();
+  return constants;
+}
+
+// Get cached Fp2 Montgomery constants (for host code)
+__host__ const Fp2MontConstants &get_fp2_mont_constants_host() {
+  static Fp2MontConstants constants = []() {
+    Fp2MontConstants c;
+    fp2_two_montgomery(c.two);
+    fp2_three_montgomery(c.three);
+    fp2_four_montgomery(c.four);
+    fp2_eight_montgomery(c.eight);
+    return c;
+  }();
+  return constants;
+}
+
+// Initialize Fp Montgomery constants (for device code, called once per
+// function)
+__device__ void init_fp_mont_constants(Fp &two, Fp &three, Fp &four,
+                                       Fp &eight) {
+  fp_two_montgomery(two);
+  fp_three_montgomery(three);
+  fp_four_montgomery(four);
+  fp_eight_montgomery(eight);
+}
+
+// Initialize Fp2 Montgomery constants (for device code, called once per
+// function)
+__device__ void init_fp2_mont_constants(Fp2 &two, Fp2 &three, Fp2 &four,
+                                        Fp2 &eight) {
+  fp2_two_montgomery(two);
+  fp2_three_montgomery(three);
+  fp2_four_montgomery(four);
+  fp2_eight_montgomery(eight);
+}
+
 // Check if a G1 point is on the curve: y^2 = x^3 + b
 // Uses Montgomery form internally for efficiency
 __host__ __device__ bool is_on_curve_g1(const G1Affine &point) {
@@ -544,11 +843,11 @@ __host__ __device__ bool is_on_curve_g2(const G2Affine &point) {

  // Compute y^2 in Montgomery form
  Fp2 y_squared_mont;
-  fp2_mont_square(y_squared_mont, y_mont);
+  fp2_mont_mul(y_squared_mont, y_mont, y_mont);

  // Compute x^3 in Montgomery form
  Fp2 x_squared_mont, x_cubed_mont;
-  fp2_mont_square(x_squared_mont, x_mont);
+  fp2_mont_mul(x_squared_mont, x_mont, x_mont);
  fp2_mont_mul(x_cubed_mont, x_squared_mont, x_mont);

  // Compute x^3 + b' in Montgomery form
@@ -1439,7 +1738,14 @@ __host__ __device__ void projective_point_add(G1Projective &result,
  Fp temp1, two_R;
  fp_mont_mul(temp1, uu, Z1Z2);
  Fp temp2 = temp1 - vvv;
-  fp_double(two_R, R);
+  // Compute 2*R using cached Montgomery constant
+  Fp two_mont;
+#ifdef __CUDA_ARCH__
+  fp_two_montgomery(two_mont);
+#else
+  two_mont = get_fp_mont_constants_host().two;
+#endif
+  fp_mont_mul(two_R, two_mont, R);
  A = temp2 - two_R;

  // X3 = v * A
@@ -1483,7 +1789,7 @@ __host__ __device__ void projective_point_add(G2Projective &result,
  fp2_mont_mul(Y2Z1, p2.Y, p1.Z);
  u = Y2Z1 - Y1Z2;

-  fp2_mont_square(uu, u);
+  fp2_mont_mul(uu, u, u);

  Fp2 X2Z1;
  fp2_mont_mul(X2Z1, p2.X, p1.Z);
@@ -1495,7 +1801,7 @@ __host__ __device__ void projective_point_add(G2Projective &result,
    return;
  }

-  fp2_mont_square(vv, v);
+  fp2_mont_mul(vv, v, v);
  fp2_mont_mul(vvv, v, vv);

  fp2_mont_mul(R, vv, X1Z2);
@@ -1504,7 +1810,14 @@ __host__ __device__ void projective_point_add(G2Projective &result,
  Fp2 temp1, two_R;
  fp2_mont_mul(temp1, uu, Z1Z2);
  Fp2 temp2 = temp1 - vvv;
-  fp2_double(two_R, R);
+  // Compute 2*R using cached Montgomery constant
+  Fp2 two_mont;
+#ifdef __CUDA_ARCH__
+  fp2_two_montgomery(two_mont);
+#else
+  two_mont = get_fp2_mont_constants_host().two;
+#endif
+  fp2_mont_mul(two_R, two_mont, R);
  A = temp2 - two_R;

  fp2_mont_mul(result.X, v, A);
@@ -1581,7 +1894,14 @@ __host__ __device__ void projective_mixed_add(G1Projective &result,
  Fp temp1, two_R;
  fp_mont_mul(temp1, uu, p1.Z);
  Fp temp2 = temp1 - vvv;
-  fp_double(two_R, R);
+  // Compute 2*R
+  Fp two_mont;
+#ifdef __CUDA_ARCH__
+  fp_two_montgomery(two_mont);
+#else
+  two_mont = get_fp_mont_constants_host().two;
+#endif
+  fp_mont_mul(two_R, two_mont, R);
  A = temp2 - two_R;

  // X3 = v * A
@@ -1647,8 +1967,8 @@ __host__ __device__ void projective_mixed_add(G2Projective &result,
  }

  // uu = u^2, vv = v^2, vvv = v * vv
-  fp2_mont_square(uu, u);
-  fp2_mont_square(vv, v);
+  fp2_mont_mul(uu, u, u);
+  fp2_mont_mul(vv, v, v);
  fp2_mont_mul(vvv, v, vv);

  // R = vv * X1
@@ -1658,7 +1978,13 @@ __host__ __device__ void projective_mixed_add(G2Projective &result,
  Fp2 temp1, two_R;
  fp2_mont_mul(temp1, uu, p1.Z);
  Fp2 temp2 = temp1 - vvv;
-  fp2_double(two_R, R);
+  Fp2 two_mont;
+#ifdef __CUDA_ARCH__
+  fp2_two_montgomery(two_mont);
+#else
+  two_mont = get_fp2_mont_constants_host().two;
+#endif
+  fp2_mont_mul(two_R, two_mont, R);
  A = temp2 - two_R;

  // X3 = v * A
@@ -1690,10 +2016,22 @@ __host__ __device__ void projective_point_double(G1Projective &result,
  // G1 projective doubling using hyperelliptic.org formula
  // For curves y^2 = x^3 + a_4*x + b with a_4 = 0

+  // Get Montgomery constants (cached for host, computed once for device)
+  Fp two_mont, three_mont, four_mont, eight_mont;
+#ifdef __CUDA_ARCH__
+  init_fp_mont_constants(two_mont, three_mont, four_mont, eight_mont);
+#else
+  const FpMontConstants &c = get_fp_mont_constants_host();
+  two_mont = c.two;
+  three_mont = c.three;
+  four_mont = c.four;
+  eight_mont = c.eight;
+#endif
+
  // A = 3 * X^2
  Fp X_sq, A;
  fp_mont_mul(X_sq, p.X, p.X);
-  fp_mul3(A, X_sq);
+  fp_mont_mul(A, three_mont, X_sq);

  // B = Y * Z
  Fp B;
@@ -1707,17 +2045,17 @@ __host__ __device__ void projective_point_double(G1Projective &result,
  // D = A^2 - 8*C
  Fp A_sq, eight_C;
  fp_mont_mul(A_sq, A, A);
-  fp_mul8(eight_C, C);
+  fp_mont_mul(eight_C, eight_mont, C);
  Fp D = A_sq - eight_C;

-  // X3 = 2 * B * D
+  // X₃ = 2 * B * D
  Fp BD;
  fp_mont_mul(BD, B, D);
-  fp_double(result.X, BD);
+  fp_mont_mul(result.X, two_mont, BD);

-  // Y3 = A * (4*C - D) - 8 * Y^2 * B^2
+  // Y₃ = A * (4*C - D) - 8 * Y^2 * B^2
  Fp four_C, A_times_diff;
-  fp_mul4(four_C, C);
+  fp_mont_mul(four_C, four_mont, C);
  Fp four_C_minus_D = four_C - D;
  fp_mont_mul(A_times_diff, A, four_C_minus_D);

@@ -1725,13 +2063,13 @@ __host__ __device__ void projective_point_double(G1Projective &result,
  fp_mont_mul(Y_sq, p.Y, p.Y);
  fp_mont_mul(B_sq, B, B);
  fp_mont_mul(Y_sq_B_sq, Y_sq, B_sq);
-  fp_mul8(eight_Y_sq_B_sq, Y_sq_B_sq);
+  fp_mont_mul(eight_Y_sq_B_sq, eight_mont, Y_sq_B_sq);
  result.Y = A_times_diff - eight_Y_sq_B_sq;

-  // Z3 = 8 * B^3
+  // Z₃ = 8 * B^3
  Fp B_cu;
  fp_mont_mul(B_cu, B_sq, B);
-  fp_mul8(result.Z, B_cu);
+  fp_mont_mul(result.Z, eight_mont, B_cu);
 }

 // Projective point doubling: result = 2 * p (no inversions!) - G2
@@ -1747,10 +2085,22 @@ __host__ __device__ void projective_point_double(G2Projective &result,

  // G2 projective doubling (same as G1 but with Fp2)

+  // Get Montgomery constants (cached for host, computed once for device)
+  Fp2 two_mont, three_mont, four_mont, eight_mont;
+#ifdef __CUDA_ARCH__
+  init_fp2_mont_constants(two_mont, three_mont, four_mont, eight_mont);
+#else
+  const Fp2MontConstants &c = get_fp2_mont_constants_host();
+  two_mont = c.two;
+  three_mont = c.three;
+  four_mont = c.four;
+  eight_mont = c.eight;
+#endif
+
  // A = 3 * X^2
  Fp2 X_sq, A;
-  fp2_mont_square(X_sq, p.X);
-  fp2_mul3(A, X_sq);
+  fp2_mont_mul(X_sq, p.X, p.X);
+  fp2_mont_mul(A, three_mont, X_sq);

  // B = Y * Z
  Fp2 B;
@@ -1763,32 +2113,32 @@ __host__ __device__ void projective_point_double(G2Projective &result,

  // D = A^2 - 8*C
  Fp2 A_sq, eight_C;
-  fp2_mont_square(A_sq, A);
-  fp2_mul8(eight_C, C);
+  fp2_mont_mul(A_sq, A, A);
+  fp2_mont_mul(eight_C, eight_mont, C);
  Fp2 D = A_sq - eight_C;

-  // X3 = 2 * B * D
+  // X₃ = 2 * B * D
  Fp2 BD;
  fp2_mont_mul(BD, B, D);
-  fp2_double(result.X, BD);
+  fp2_mont_mul(result.X, two_mont, BD);

-  // Y3 = A * (4*C - D) - 8 * Y^2 * B^2
+  // Y₃ = A * (4*C - D) - 8 * Y^2 * B^2
  Fp2 four_C, A_times_diff;
-  fp2_mul4(four_C, C);
+  fp2_mont_mul(four_C, four_mont, C);
  Fp2 four_C_minus_D = four_C - D;
  fp2_mont_mul(A_times_diff, A, four_C_minus_D);

  Fp2 Y_sq, B_sq, Y_sq_B_sq, eight_Y_sq_B_sq;
-  fp2_mont_square(Y_sq, p.Y);
-  fp2_mont_square(B_sq, B);
+  fp2_mont_mul(Y_sq, p.Y, p.Y);
+  fp2_mont_mul(B_sq, B, B);
  fp2_mont_mul(Y_sq_B_sq, Y_sq, B_sq);
-  fp2_mul8(eight_Y_sq_B_sq, Y_sq_B_sq);
+  fp2_mont_mul(eight_Y_sq_B_sq, eight_mont, Y_sq_B_sq);
  result.Y = A_times_diff - eight_Y_sq_B_sq;

-  // Z3 = 8 * B^3
+  // Z₃ = 8 * B^3
  Fp2 B_cu;
  fp2_mont_mul(B_cu, B_sq, B);
-  fp2_mul8(result.Z, B_cu);
+  fp2_mont_mul(result.Z, eight_mont, B_cu);
 }

 // Explicit template instantiations for projective_scalar_mul (needed by MSM)
--- a/backends/zk-cuda-backend/cuda/src/msm/common.cuh
+++ b/backends/zk-cuda-backend/cuda/src/msm/common.cuh
@@ -1,15 +1,286 @@
-#pragma once
+#include "curve.h"
+#include "fp.h"
+#include "fp2.h"

-#include "point_traits.h"
+// Forward declarations for projective point operations (implemented in
+// curve.cu)
+__host__ __device__ void projective_point_add(G1Projective &result,
+                                              const G1Projective &p1,
+                                              const G1Projective &p2);
+__host__ __device__ void projective_point_add(G2Projective &result,
+                                              const G2Projective &p1,
+                                              const G2Projective &p2);
+__host__ __device__ void projective_point_double(G1Projective &result,
+                                                 const G1Projective &p);
+__host__ __device__ void projective_point_double(G2Projective &result,
+                                                 const G2Projective &p);
+// Mixed addition: projective + affine (saves 3 field muls vs
+// projective+projective)
+__host__ __device__ void projective_mixed_add(G1Projective &result,
+                                              const G1Projective &p1,
+                                              const G1Affine &p2);
+__host__ __device__ void projective_mixed_add(G2Projective &result,
+                                              const G2Projective &p1,
+                                              const G2Affine &p2);
+
+// Multi-Scalar Multiplication (MSM) common code
+// Template traits used by MSM algorithms
+// Note: projective_point_add / projective_point_double are forward-declared
+// above and implemented in curve.cu
+
+// ============================================================================
+// Template Traits (needed by MSM kernels)
+// ============================================================================
+
+template <typename PointType> struct PointSelector;
+
+// Specialization for G1Affine (uses Fp)
+template <> struct PointSelector<G1Affine> {
+  using FieldType = Fp;
+
+  __host__ __device__ static void field_zero(FieldType &a) { fp_zero(a); }
+  __host__ __device__ static void field_copy(FieldType &dst,
+                                             const FieldType &src) {
+    dst = src;
+  }
+  __host__ __device__ static void field_neg(FieldType &c, const FieldType &a) {
+    c = -a;
+  }
+  __host__ __device__ static void field_add(FieldType &c, const FieldType &a,
+                                            const FieldType &b) {
+    c = a + b;
+  }
+  __host__ __device__ static void field_sub(FieldType &c, const FieldType &a,
+                                            const FieldType &b) {
+    c = a - b;
+  }
+  __host__ __device__ static void field_mul(FieldType &c, const FieldType &a,
+                                            const FieldType &b) {
+    fp_mont_mul(c, a, b);
+  }
+  __host__ __device__ static void field_inv(FieldType &c, const FieldType &a) {
+    fp_mont_inv(c, a);
+  }
+  __host__ __device__ static ComparisonType field_cmp(const FieldType &a,
+                                                      const FieldType &b) {
+    return fp_cmp(a, b);
+  }
+  __host__ __device__ static bool field_is_zero(const FieldType &a) {
+    return fp_is_zero(a);
+  }
+  __host__ __device__ static void field_to_montgomery(FieldType &c,
+                                                      const FieldType &a) {
+    fp_to_montgomery(c, a);
+  }
+  __host__ __device__ static void field_from_montgomery(FieldType &c,
+                                                        const FieldType &a) {
+    fp_from_montgomery(c, a);
+  }
+
+  __host__ __device__ static void point_at_infinity(G1Affine &point) {
+    g1_point_at_infinity(point);
+  }
+  __host__ __device__ static bool is_infinity(const G1Affine &point) {
+    return g1_is_infinity(point);
+  }
+  __host__ __device__ static const FieldType &curve_b() { return curve_b_g1(); }
+  __host__ __device__ static void point_copy(G1Affine &dst,
+                                             const G1Affine &src) {
+    dst = src;
+  }
+};
+
+// Specialization for G2Point (uses Fp2)
+template <> struct PointSelector<G2Point> {
+  using FieldType = Fp2;
+
+  __host__ __device__ static void field_zero(FieldType &a) { fp2_zero(a); }
+  __host__ __device__ static void field_copy(FieldType &dst,
+                                             const FieldType &src) {
+    dst = src;
+  }
+  __host__ __device__ static void field_neg(FieldType &c, const FieldType &a) {
+    c = -a;
+  }
+  __host__ __device__ static void field_add(FieldType &c, const FieldType &a,
+                                            const FieldType &b) {
+    c = a + b;
+  }
+  __host__ __device__ static void field_sub(FieldType &c, const FieldType &a,
+                                            const FieldType &b) {
+    c = a - b;
+  }
+  __host__ __device__ static void field_mul(FieldType &c, const FieldType &a,
+                                            const FieldType &b) {
+    fp2_mont_mul(c, a, b);
+  }
+  __host__ __device__ static void field_inv(FieldType &c, const FieldType &a) {
+    fp2_mont_inv(c, a);
+  }
+  __host__ __device__ static ComparisonType field_cmp(const FieldType &a,
+                                                      const FieldType &b) {
+    return fp2_cmp(a, b);
+  }
+  __host__ __device__ static bool field_is_zero(const FieldType &a) {
+    return fp2_is_zero(a);
+  }
+  __host__ __device__ static void field_to_montgomery(FieldType &c,
+                                                      const FieldType &a) {
+    fp_to_montgomery(c.c0, a.c0);
+    fp_to_montgomery(c.c1, a.c1);
+  }
+  __host__ __device__ static void field_from_montgomery(FieldType &c,
+                                                        const FieldType &a) {
+    fp_from_montgomery(c.c0, a.c0);
+    fp_from_montgomery(c.c1, a.c1);
+  }
+
+  __host__ __device__ static void point_at_infinity(G2Point &point) {
+    g2_point_at_infinity(point);
+  }
+  __host__ __device__ static bool is_infinity(const G2Point &point) {
+    return g2_is_infinity(point);
+  }
+  __host__ __device__ static const FieldType &curve_b() { return curve_b_g2(); }
+  __host__ __device__ static void point_copy(G2Point &dst, const G2Point &src) {
+    dst = src;
+  }
+};
+
+template <typename ProjectiveType> struct ProjectiveSelector;
+
+// Specialization for G1Projective (uses Fp)
+template <> struct ProjectiveSelector<G1Projective> {
+  using FieldType = Fp;
+  using AffineType = G1Affine;
+
+  __host__ __device__ static void field_zero(FieldType &a) { fp_zero(a); }
+  __host__ __device__ static void field_copy(FieldType &dst,
+                                             const FieldType &src) {
+    dst = src;
+  }
+  __host__ __device__ static bool field_is_zero(const FieldType &a) {
+    return fp_is_zero(a);
+  }
+  __host__ __device__ static void field_mul(FieldType &c, const FieldType &a,
+                                            const FieldType &b) {
+    fp_mont_mul(c, a, b);
+  }
+  __host__ __device__ static void field_sub(FieldType &c, const FieldType &a,
+                                            const FieldType &b) {
+    c = a - b;
+  }
+
+  __host__ __device__ static void point_at_infinity(G1Projective &point) {
+    g1_projective_point_at_infinity(point);
+  }
+  __host__ __device__ static bool is_infinity(const G1Projective &point) {
+    return fp_is_zero(point.Z);
+  }
+  __host__ __device__ static void affine_to_projective(G1Projective &proj,
+                                                       const G1Affine &affine) {
+    ::affine_to_projective(proj, affine);
+  }
+  __host__ __device__ static void projective_add(G1Projective &result,
+                                                 const G1Projective &p1,
+                                                 const G1Projective &p2) {
+    projective_point_add(result, p1, p2);
+  }
+  __host__ __device__ static void projective_double(G1Projective &result,
+                                                    const G1Projective &p) {
+    projective_point_double(result, p);
+  }
+  // Mixed addition: adds affine point to projective (saves 3 field muls)
+  __host__ __device__ static void
+  mixed_add(G1Projective &result, const G1Projective &p1, const G1Affine &p2) {
+    projective_mixed_add(result, p1, p2);
+  }
+  __host__ __device__ static void point_copy(G1Projective &dst,
+                                             const G1Projective &src) {
+    dst = src;
+  }
+};
+
+// Specialization for G2ProjectivePoint (uses Fp2)
+// Note: G2ProjectivePoint is a type alias for G2Projective
+template <> struct ProjectiveSelector<G2ProjectivePoint> {
+  using FieldType = Fp2;
+  using AffineType = G2Point;
+
+  __host__ __device__ static void field_zero(FieldType &a) { fp2_zero(a); }
+  __host__ __device__ static void field_copy(FieldType &dst,
+                                             const FieldType &src) {
+    dst = src;
+  }
+  __host__ __device__ static bool field_is_zero(const FieldType &a) {
+    return fp2_is_zero(a);
+  }
+  __host__ __device__ static void field_mul(FieldType &c, const FieldType &a,
+                                            const FieldType &b) {
+    fp2_mont_mul(c, a, b);
+  }
+  __host__ __device__ static void field_sub(FieldType &c, const FieldType &a,
+                                            const FieldType &b) {
+    c = a - b;
+  }
+
+  __host__ __device__ static void point_at_infinity(G2ProjectivePoint &point) {
+    g2_projective_point_at_infinity(point);
+  }
+  __host__ __device__ static bool is_infinity(const G2ProjectivePoint &point) {
+    return fp2_is_zero(point.Z);
+  }
+  __host__ __device__ static void affine_to_projective(G2ProjectivePoint &proj,
+                                                       const G2Point &affine) {
+    ::affine_to_projective(proj, affine);
+  }
+  __host__ __device__ static void projective_add(G2ProjectivePoint &result,
+                                                 const G2ProjectivePoint &p1,
+                                                 const G2ProjectivePoint &p2) {
+    projective_point_add(result, p1, p2);
+  }
+  __host__ __device__ static void
+  projective_double(G2ProjectivePoint &result, const G2ProjectivePoint &p) {
+    projective_point_double(result, p);
+  }
+  // Mixed addition: adds affine point to projective (saves 3 field muls)
+  __host__ __device__ static void mixed_add(G2ProjectivePoint &result,
+                                            const G2ProjectivePoint &p1,
+                                            const G2Point &p2) {
+    projective_mixed_add(result, p1, p2);
+  }
+  __host__ __device__ static void point_copy(G2ProjectivePoint &dst,
+                                             const G2ProjectivePoint &src) {
+    dst = src;
+  }
+};

 // ============================================================================
 // MSM Kernel Templates (defined here so they're visible when instantiated)
 // ============================================================================

+// Helper to select appropriate selector for a point type (affine or projective)
+template <typename PointType> struct SelectorChooser;
+
+template <> struct SelectorChooser<G1Affine> {
+  using Selection = PointSelector<G1Affine>;
+};
+
+template <> struct SelectorChooser<G2Point> {
+  using Selection = PointSelector<G2Point>;
+};
+
+template <> struct SelectorChooser<G1Projective> {
+  using Selection = ProjectiveSelector<G1Projective>;
+};
+
+template <> struct SelectorChooser<G2ProjectivePoint> {
+  using Selection = ProjectiveSelector<G2ProjectivePoint>;
+};
+
 // Pippenger kernel: Clear buckets (works for both affine and projective points)
 template <typename PointType>
-__global__ void kernel_clear_buckets(PointType *__restrict__ buckets,
-                                     uint32_t num_buckets) {
+__global__ void kernel_clear_buckets(PointType *buckets, uint32_t num_buckets) {
  using AffinePoint = typename SelectorChooser<PointType>::Selection;

  uint32_t idx = threadIdx.x + blockIdx.x * blockDim.x;
@@ -22,11 +293,11 @@ __global__ void kernel_clear_buckets(PointType *__restrict__ buckets,
 // blocks OPTIMIZED: Uses parallel tree reduction instead of sequential loop
 // Launch config: <<<num_buckets, min(num_blocks, 256), shared_mem>>>
 template <typename ProjectiveType>
-__global__ void
-kernel_reduce_buckets(ProjectiveType *__restrict__ final_buckets,
-                      const ProjectiveType *__restrict__ block_buckets,
-                      uint32_t num_blocks, uint32_t num_buckets) {
-  using ProjectivePoint = Projective<ProjectiveType>;
+__global__ void kernel_reduce_buckets(ProjectiveType *final_buckets,
+                                      const ProjectiveType *block_buckets,
+                                      uint32_t num_blocks,
+                                      uint32_t num_buckets) {
+  using ProjectivePoint = ProjectiveSelector<ProjectiveType>;

  // Each block handles one bucket, threads cooperate to reduce all block
  // contributions
--- a/backends/zk-cuda-backend/cuda/src/msm/msm.cu
+++ b/backends/zk-cuda-backend/cuda/src/msm/msm.cu
@@ -8,64 +8,55 @@
 // Multi-Scalar Multiplication (MSM) using Pippenger algorithm for BLS12-446

 // Forward declarations for Pippenger implementations
-void point_msm_g1_pippenger_async(
-    cudaStream_t stream, uint32_t gpu_index, G1Projective *h_result,
-    const G1Affine *d_points, const Scalar *d_scalars, uint32_t n,
-    G1Projective *d_scratch, uint64_t &size_tracker, bool gpu_memory_allocated);
-void point_msm_g2_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
-                                  G2ProjectivePoint *h_result,
+void point_msm_async_g1_pippenger(cudaStream_t stream, uint32_t gpu_index,
+                                  G1Projective *d_result,
+                                  const G1Affine *d_points,
+                                  const Scalar *d_scalars,
+                                  G1Projective *d_scratch, uint32_t n,
+                                  uint64_t &size_tracker);
+void point_msm_async_g2_pippenger(cudaStream_t stream, uint32_t gpu_index,
+                                  G2ProjectivePoint *d_result,
                                  const G2Point *d_points,
-                                  const Scalar *d_scalars, uint32_t n,
-                                  G2ProjectivePoint *d_scratch,
-                                  uint64_t &size_tracker,
-                                  bool gpu_memory_allocated);
+                                  const Scalar *d_scalars,
+                                  G2ProjectivePoint *d_scratch, uint32_t n,
+                                  uint64_t &size_tracker);

 // ============================================================================
 // Public MSM API for BigInt scalars
 // ============================================================================

 // MSM with BigInt scalars for G1 (projective coordinates internally)
-// Result is written directly to the host pointer h_result.
-void point_msm_g1_async(cudaStream_t stream, uint32_t gpu_index,
-                        G1Projective *h_result, const G1Affine *d_points,
-                        const Scalar *d_scalars, uint32_t n,
-                        G1Projective *d_scratch, uint64_t &size_tracker,
-                        bool gpu_memory_allocated) {
-  point_msm_g1_pippenger_async(stream, gpu_index, h_result, d_points, d_scalars,
-                               n, d_scratch, size_tracker,
-                               gpu_memory_allocated);
+void point_msm_async_g1(cudaStream_t stream, uint32_t gpu_index,
+                        G1Projective *d_result, const G1Affine *d_points,
+                        const Scalar *d_scalars, G1Projective *d_scratch,
+                        uint32_t n, uint64_t &size_tracker) {
+  point_msm_async_g1_pippenger(stream, gpu_index, d_result, d_points, d_scalars,
+                               d_scratch, n, size_tracker);
 }

 // MSM with BigInt scalars for G2 (projective coordinates internally)
-// Result is written directly to the host pointer h_result.
-void point_msm_g2_async(cudaStream_t stream, uint32_t gpu_index,
-                        G2ProjectivePoint *h_result, const G2Point *d_points,
-                        const Scalar *d_scalars, uint32_t n,
-                        G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
-                        bool gpu_memory_allocated) {
-  point_msm_g2_pippenger_async(stream, gpu_index, h_result, d_points, d_scalars,
-                               n, d_scratch, size_tracker,
-                               gpu_memory_allocated);
+void point_msm_async_g2(cudaStream_t stream, uint32_t gpu_index,
+                        G2ProjectivePoint *d_result, const G2Point *d_points,
+                        const Scalar *d_scalars, G2ProjectivePoint *d_scratch,
+                        uint32_t n, uint64_t &size_tracker) {
+  point_msm_async_g2_pippenger(stream, gpu_index, d_result, d_points, d_scalars,
+                               d_scratch, n, size_tracker);
 }

 void point_msm_g1(cudaStream_t stream, uint32_t gpu_index,
-                  G1Projective *h_result, const G1Affine *d_points,
-                  const Scalar *d_scalars, uint32_t n, G1Projective *d_scratch,
-                  uint64_t &size_tracker, bool gpu_memory_allocated) {
-  point_msm_g1_async(stream, gpu_index, h_result, d_points, d_scalars, n,
-                     d_scratch, size_tracker, gpu_memory_allocated);
-  // The async impl already syncs internally before the CPU-side Horner phase,
-  // so the stream is idle here. This sync is kept for defensive correctness.
+                  G1Projective *d_result, const G1Affine *d_points,
+                  const Scalar *d_scalars, G1Projective *d_scratch, uint32_t n,
+                  uint64_t &size_tracker) {
+  point_msm_async_g1(stream, gpu_index, d_result, d_points, d_scalars,
+                     d_scratch, n, size_tracker);
  cuda_synchronize_stream(stream, gpu_index);
 }

 void point_msm_g2(cudaStream_t stream, uint32_t gpu_index,
-                  G2ProjectivePoint *h_result, const G2Point *d_points,
-                  const Scalar *d_scalars, uint32_t n,
-                  G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
-                  bool gpu_memory_allocated) {
-  point_msm_g2_async(stream, gpu_index, h_result, d_points, d_scalars, n,
-                     d_scratch, size_tracker, gpu_memory_allocated);
-  // See comment in point_msm_g1 above.
+                  G2ProjectivePoint *d_result, const G2Point *d_points,
+                  const Scalar *d_scalars, G2ProjectivePoint *d_scratch,
+                  uint32_t n, uint64_t &size_tracker) {
+  point_msm_async_g2(stream, gpu_index, d_result, d_points, d_scalars,
+                     d_scratch, n, size_tracker);
  cuda_synchronize_stream(stream, gpu_index);
 }
--- a/backends/zk-cuda-backend/cuda/src/msm/pippenger/msm_pippenger.cu
+++ b/backends/zk-cuda-backend/cuda/src/msm/pippenger/msm_pippenger.cu
@@ -1,12 +1,11 @@
 #include "../common.cuh"
-#include "checked_arithmetic.h"
 #include "curve.h"
 #include "device.h"
 #include "fp.h"
 #include "fp2.h"
 #include "msm.h"
 #include <algorithm>
-#include <type_traits>
+#include <cstring>
 #include <vector>

 // ============================================================================
@@ -47,11 +46,6 @@ template <typename AffineType> struct Phase1KernelLaunchParams {
    adjusted_threads_per_block =
        std::min(requested_threads_per_block, max_threads_for_shared_mem);

-    PANIC_IF_FALSE(adjusted_threads_per_block > 0,
-                   "Phase1KernelLaunchParams: insufficient shared memory for "
-                   "kernel launch (max_shared=%u, fixed=%zu)",
-                   max_shared_mem_per_block, fixed_shared_mem);
-
    // Calculate number of blocks per window
    num_blocks_per_window = CEIL_DIV(n, adjusted_threads_per_block);

@@ -77,24 +71,16 @@ template <typename ProjectiveType> struct Phase2KernelLaunchParams {

    // Cap threads to respect shared memory limit
    uint32_t threads = std::min(requested_threads, max_threads_for_shared);
-    threads = std::min(threads, static_cast<uint32_t>(KERNEL_THREADS_MAX));
+    threads = std::min(threads, (uint32_t)KERNEL_THREADS_MAX);

    // Round up to nearest power of 2 (required for tree reduction)
    uint32_t pow2_threads = 1;
    while (pow2_threads < threads)
      pow2_threads *= 2;
-
-    // After rounding to power of 2, verify shared memory doesn't exceed device
-    // limit
-    if (safe_mul_sizeof<ProjectiveType>(static_cast<size_t>(pow2_threads)) >
-        max_shared_mem_per_block) {
-      pow2_threads /= 2;
-    }
    adjusted_threads = pow2_threads;

    // Calculate actual shared memory requirement
-    shared_mem =
-        safe_mul_sizeof<ProjectiveType>(static_cast<size_t>(adjusted_threads));
+    shared_mem = adjusted_threads * sizeof(ProjectiveType);
  }
 };

@@ -153,6 +139,18 @@ __device__ __forceinline__ uint32_t extract_window_bigint(
                                       window_size);
 }

+// Forward declarations for projective point operations (needed by kernels)
+__host__ __device__ void projective_point_add(G1Projective &result,
+                                              const G1Projective &p1,
+                                              const G1Projective &p2);
+__host__ __device__ void projective_point_add(G2ProjectivePoint &result,
+                                              const G2ProjectivePoint &p1,
+                                              const G2ProjectivePoint &p2);
+__host__ __device__ void projective_point_double(G1Projective &result,
+                                                 const G1Projective &p);
+__host__ __device__ void projective_point_double(G2ProjectivePoint &result,
+                                                 const G2ProjectivePoint &p);
+
 // Kernel: Accumulate ALL windows in parallel using SORT-THEN-REDUCE
 // Grid: (num_windows * num_blocks_per_window) blocks
 // Each block processes points for ONE window
@@ -160,12 +158,12 @@ __device__ __forceinline__ uint32_t extract_window_bigint(
 // Uses mixed addition (affine + projective) to save 3 field muls per add
 template <typename AffineType, typename ProjectiveType>
 __global__ void kernel_accumulate_all_windows(
-    ProjectiveType *__restrict__ all_block_buckets, // [num_windows * num_blocks
-                                                    // * bucket_count]
-    const AffineType *__restrict__ points, const Scalar *__restrict__ scalars,
-    uint32_t num_points, uint32_t num_windows, uint32_t num_blocks_per_window,
-    uint32_t window_size, uint32_t bucket_count) {
-  using ProjectivePoint = Projective<ProjectiveType>;
+    ProjectiveType
+        *all_block_buckets, // [num_windows * num_blocks * bucket_count]
+    const AffineType *points, const Scalar *scalars, uint32_t num_points,
+    uint32_t num_windows, uint32_t num_blocks_per_window, uint32_t window_size,
+    uint32_t bucket_count) {
+  using ProjectivePoint = ProjectiveSelector<ProjectiveType>;

  const uint32_t window_idx = blockIdx.x / num_blocks_per_window;
  const uint32_t block_within_window = blockIdx.x % num_blocks_per_window;
@@ -290,14 +288,12 @@ __global__ void kernel_accumulate_all_windows(
 // Each block reduces one (window, bucket) pair across all block contributions
 template <typename ProjectiveType>
 __global__ void kernel_reduce_all_windows(
-    ProjectiveType
-        *__restrict__ all_final_buckets, // [num_windows * NUM_BUCKETS]
+    ProjectiveType *all_final_buckets, // [num_windows * NUM_BUCKETS]
    const ProjectiveType
-        *__restrict__ all_block_buckets, // [num_windows * num_blocks *
-                                         // NUM_BUCKETS]
+        *all_block_buckets, // [num_windows * num_blocks * NUM_BUCKETS]
    uint32_t num_windows, uint32_t num_blocks_per_window,
    uint32_t num_buckets) {
-  using ProjectivePoint = Projective<ProjectiveType>;
+  using ProjectivePoint = ProjectiveSelector<ProjectiveType>;

  const uint32_t flat_idx = blockIdx.x;
  const uint32_t window_idx = flat_idx / num_buckets;
@@ -361,11 +357,10 @@ __global__ void kernel_reduce_all_windows(
 // Each block computes the window sum: sum(i * bucket[i]) for i=1..15
 template <typename ProjectiveType>
 __global__ void kernel_compute_window_sums(
-    ProjectiveType *__restrict__ window_sums, // [num_windows]
-    const ProjectiveType
-        *__restrict__ all_final_buckets, // [num_windows * NUM_BUCKETS]
+    ProjectiveType *window_sums,             // [num_windows]
+    const ProjectiveType *all_final_buckets, // [num_windows * NUM_BUCKETS]
    uint32_t num_windows, uint32_t num_buckets) {
-  using ProjectivePoint = Projective<ProjectiveType>;
+  using ProjectivePoint = ProjectiveSelector<ProjectiveType>;

  const uint32_t window_idx = blockIdx.x;
  if (window_idx >= num_windows)
@@ -434,22 +429,18 @@ __global__ void kernel_compute_window_sums(
 }

 // ============================================================================
-// CPU Horner Combination
+// CPU-side Horner Combination (faster than single-thread GPU)
 // ============================================================================

-// Combines window sums using Horner's method on the CPU. A single CPU core
-// native 64-bit multiply is much faster than a single GPU thread for this
-// workload. The CPU path takes ~0.1 ms; a <<<1,1>>> GPU kernel takes ~10-12 ms.
-//
-// Horner evaluation (MSB-first):
-//   acc = window_sums[0]
-//   for w = 1 .. num_windows-1:
-//     acc = acc * 2^window_size + window_sums[w]
+// CPU Horner: combine window sums using Horner's method on host
+// Single-threaded CPU execution is faster than single-threaded GPU for this
+// sequential operation. The memcpy overhead is smaller than the GPU's memory
+// latency penalty for sequential access patterns.
 template <typename ProjectiveType>
 void horner_combine_cpu(ProjectiveType &result,
                        const ProjectiveType *window_sums, uint32_t num_windows,
                        uint32_t window_size) {
-  using ProjectivePoint = Projective<ProjectiveType>;
+  using ProjectivePoint = ProjectiveSelector<ProjectiveType>;

  ProjectiveType acc;
  ProjectivePoint::point_at_infinity(acc);
@@ -472,7 +463,7 @@ void horner_combine_cpu(ProjectiveType &result,
        ProjectivePoint::point_copy(acc, temp);
      }
    } else if (!ProjectivePoint::is_infinity(acc)) {
-      // Window sum is infinity but accumulator is not -- still shift left
+      // Window sum is the point at infinity; acc must still be shifted
      for (uint32_t i = 0; i < window_size; i++) {
        ProjectivePoint::projective_double(temp, acc);
        ProjectivePoint::point_copy(acc, temp);
@@ -488,27 +479,25 @@ void horner_combine_cpu(ProjectiveType &result,
 // ============================================================================

 // Template MSM with BigInt scalars - ALL WINDOWS PARALLEL
-// Result is written directly to a host pointer -- no device round-trip needed.
-// d_scratch: caller-provided device buffer for intermediate bucket arrays and
-// window sums. The caller is responsible for allocating and freeing this
-// buffer.
 template <typename AffineType, typename ProjectiveType>
-void point_msm_pippenger_impl_async(
-    cudaStream_t stream, uint32_t gpu_index, ProjectiveType *h_result,
-    const AffineType *d_points, const Scalar *d_scalars, uint32_t n,
-    uint32_t threads_per_block, uint32_t window_size, uint32_t bucket_count,
-    ProjectiveType *d_scratch, uint64_t &size_tracker,
-    bool gpu_memory_allocated) {
-  using ProjectivePoint = Projective<ProjectiveType>;
+void point_msm_async_pippenger_impl(
+    cudaStream_t stream, uint32_t gpu_index, ProjectiveType *d_result,
+    const AffineType *d_points, const Scalar *d_scalars,
+    ProjectiveType *d_scratch, uint32_t n, uint32_t threads_per_block,
+    uint32_t window_size, uint32_t bucket_count, uint64_t &size_tracker) {
+  using ProjectivePoint = ProjectiveSelector<ProjectiveType>;

  if (n == 0) {
-    ProjectivePoint::point_at_infinity(*h_result);
+    cuda_set_device(gpu_index);
+    kernel_clear_buckets<ProjectiveType><<<1, 1, 0, stream>>>(d_result, 1);
+    check_cuda_error(cudaGetLastError());
    return;
  }

-  PANIC_IF_FALSE(h_result != nullptr && d_points != nullptr &&
+  PANIC_IF_FALSE(n > 0, "point_msm_async_pippenger_impl: invalid size n=%u", n);
+  PANIC_IF_FALSE(d_result != nullptr && d_points != nullptr &&
                     d_scalars != nullptr && d_scratch != nullptr,
-                 "point_msm_pippenger_impl_async: null pointer argument");
+                 "point_msm_async_pippenger_impl: null pointer argument");

  cuda_set_device(gpu_index);

@@ -535,20 +524,34 @@ void point_msm_pippenger_impl_async(
  const size_t total_scratch =
      all_block_buckets_size + all_final_buckets_size + num_windows;

-  // Partition the caller-provided scratch buffer into sub-regions
-  ProjectiveType *d_all_block_buckets = d_scratch;
-  ProjectiveType *d_all_final_buckets = d_scratch + all_block_buckets_size;
+  // Check for overflow before allocating scratch space
+  size_t scratch_bytes = 0;
+  bool scratch_overflow = __builtin_mul_overflow(
+      total_scratch, sizeof(ProjectiveType), &scratch_bytes);
+  PANIC_IF_FALSE(!scratch_overflow,
+                 "point_msm_async_pippenger_impl: scratch allocation overflow "
+                 "(total_scratch=%zu, element_size=%zu)",
+                 total_scratch, sizeof(ProjectiveType));
+
+  // Allocate internal scratch space (user-provided scratch is too small for
+  // all-windows-parallel)
+  ProjectiveType *d_internal_scratch =
+      (ProjectiveType *)cuda_malloc_with_size_tracking_async(
+          scratch_bytes, stream, gpu_index, size_tracker, true);
+
+  ProjectiveType *d_all_block_buckets = d_internal_scratch;
+  ProjectiveType *d_all_final_buckets =
+      d_internal_scratch + all_block_buckets_size;
  ProjectiveType *d_window_sums = d_all_final_buckets + all_final_buckets_size;

  // Clear all scratch space
  const uint32_t clear_blocks = CEIL_DIV(total_scratch, KERNEL_THREADS_MAX);
  PANIC_IF_FALSE(clear_blocks * KERNEL_THREADS_MAX >= total_scratch,
-                 "kernel_clear_buckets: insufficient threads (%zu) to clear "
-                 "buffer (%zu elements)",
-                 static_cast<size_t>(clear_blocks) * KERNEL_THREADS_MAX,
-                 total_scratch);
+                 "kernel_clear_buckets: insufficient threads (%zu) to clear "
+                 "buffer (%zu elements)",
+                 (size_t)clear_blocks * KERNEL_THREADS_MAX, total_scratch);
  kernel_clear_buckets<ProjectiveType>
-      <<<clear_blocks, KERNEL_THREADS_MAX, 0, stream>>>(d_scratch,
+      <<<clear_blocks, KERNEL_THREADS_MAX, 0, stream>>>(d_internal_scratch,
                                                        total_scratch);
  check_cuda_error(cudaGetLastError());

@@ -557,10 +560,8 @@ void point_msm_pippenger_impl_async(
      num_windows * launch_params.num_blocks_per_window;
  PANIC_IF_FALSE(
      total_accum_blocks * bucket_count <= all_block_buckets_size,
-      "kernel_accumulate_all_windows: max write index (%zu) exceeds buffer "
-      "(%zu)",
-      static_cast<size_t>(total_accum_blocks) * bucket_count,
-      all_block_buckets_size);
+      "kernel_accumulate_all_windows: max write index (%zu) exceeds buffer (%zu)",
+      (size_t)total_accum_blocks * bucket_count, all_block_buckets_size);
  kernel_accumulate_all_windows<AffineType, ProjectiveType>
      <<<total_accum_blocks, launch_params.adjusted_threads_per_block,
         launch_params.accum_shared_mem, stream>>>(
@@ -574,7 +575,7 @@ void point_msm_pippenger_impl_async(
      launch_params.num_blocks_per_window, gpu_index);
  PANIC_IF_FALSE(
      total_reduce_blocks <= all_final_buckets_size,
-      "kernel_reduce_all_windows: blocks (%u) exceeds output buffer (%zu)",
+      "kernel_reduce_all_windows: blocks (%u) exceeds output buffer (%zu)",
      total_reduce_blocks, all_final_buckets_size);
  kernel_reduce_all_windows<ProjectiveType>
      <<<total_reduce_blocks, reduce_params.adjusted_threads,
@@ -587,31 +588,36 @@ void point_msm_pippenger_impl_async(
  // Round up to next multiple of 32 (warp size) for efficient scheduling.
  // The kernel already has `if (tid < n)` bounds checks for the excess threads.
  const uint32_t combine_threads = ((bucket_count - 1) + 31) & ~31u;
-  const size_t combine_shared_mem =
-      safe_mul_sizeof<ProjectiveType>(static_cast<size_t>(combine_threads));
+  const size_t combine_shared_mem = combine_threads * sizeof(ProjectiveType);
  PANIC_IF_FALSE(num_windows * bucket_count <= all_final_buckets_size,
-                 "kernel_compute_window_sums: max read index (%zu) exceeds "
-                 "input buffer (%zu)",
-                 static_cast<size_t>(num_windows) * bucket_count,
-                 all_final_buckets_size);
+                 "kernel_compute_window_sums: max read index (%zu) exceeds "
+                 "input buffer (%zu)",
+                 (size_t)num_windows * bucket_count, all_final_buckets_size);
  kernel_compute_window_sums<ProjectiveType>
      <<<num_windows, combine_threads, combine_shared_mem, stream>>>(
          d_window_sums, d_all_final_buckets, num_windows, bucket_count);
  check_cuda_error(cudaGetLastError());

-  // Phase 4: CPU Horner combine, result written directly to host pointer
-  //
-  // The Horner loop is inherently sequential. A single CPU core is much faster
-  // than a single GPU thread for this workload, so we run Horner on the CPU
-  // and write the result directly to the caller's host pointer.
+  // Phase 4: CPU-side Horner combine (faster than single GPU thread!)
+  // Download window sums to host
  std::vector<ProjectiveType> h_window_sums(num_windows);
-  cuda_memcpy_async_to_cpu(
-      h_window_sums.data(), d_window_sums,
-      safe_mul_sizeof<ProjectiveType>(static_cast<size_t>(num_windows)), stream,
-      gpu_index);
+  cuda_memcpy_async_to_cpu(h_window_sums.data(), d_window_sums,
+                           num_windows * sizeof(ProjectiveType), stream,
+                           gpu_index);
  cuda_synchronize_stream(stream, gpu_index);

-  horner_combine_cpu(*h_result, h_window_sums.data(), num_windows, window_size);
+  // Perform Horner combination on CPU
+  ProjectiveType h_result;
+  horner_combine_cpu(h_result, h_window_sums.data(), num_windows, window_size);
+
+  // Upload result back to device
+  cuda_memcpy_async_to_gpu(d_result, &h_result, sizeof(ProjectiveType), stream,
+                           gpu_index);
+
+  // Cleanup - must sync before returning since h_result is a local variable
+  cuda_synchronize_stream(stream, gpu_index);
+  cuda_drop_with_size_tracking_async(d_internal_scratch, stream, gpu_index,
+                                     true);
 }

 // ============================================================================
@@ -646,92 +652,36 @@ inline void get_g2_window_params(uint32_t n, uint32_t &window_size,
  bucket_count = MSM_G2_BUCKET_COUNT; // 32 buckets
 }

-// ============================================================================
-// Scratch Size Computation
-// ============================================================================
-// Computes the exact scratch buffer size (in bytes) needed by
-// point_msm_pippenger_impl_async for a given input count n. The formula must
-// stay in sync with the scratch partitioning inside that function:
-//   all_block_buckets: num_windows * num_blocks_per_window * bucket_count
-//   all_final_buckets: num_windows * bucket_count
-//   window_sums:       num_windows
-// Factoring this into a helper avoids duplicating the formula in every caller
-// and prevents the buffer-underallocation bug that occurs when callers use
-// ad-hoc estimates.
-template <typename AffineType, typename ProjectiveType>
-size_t pippenger_scratch_size(uint32_t n, uint32_t gpu_index) {
-  if (n == 0)
-    return 0;
-
-  uint32_t window_size, bucket_count;
-  // Use the same window parameter selection as the MSM entry points
-  if constexpr (std::is_same_v<AffineType, G1Affine>) {
-    get_g1_window_params(n, window_size, bucket_count);
-  } else {
-    get_g2_window_params(n, window_size, bucket_count);
-  }
-
-  const uint32_t threads_per_block = msm_threads_per_block<AffineType>(n);
-  const uint32_t num_windows = CEIL_DIV(Scalar::NUM_BITS, window_size);
-
-  // Phase1KernelLaunchParams computes the adjusted threads per block
-  // respecting shared memory limits, which determines num_blocks_per_window
-  Phase1KernelLaunchParams<AffineType> launch_params(n, threads_per_block,
-                                                     bucket_count, gpu_index);
-
-  const size_t all_block_buckets_elems = static_cast<size_t>(num_windows) *
-                                         launch_params.num_blocks_per_window *
-                                         bucket_count;
-  const size_t all_final_buckets_elems =
-      static_cast<size_t>(num_windows) * bucket_count;
-  const size_t total_elems =
-      all_block_buckets_elems + all_final_buckets_elems + num_windows;
-
-  return safe_mul_sizeof<ProjectiveType>(total_elems);
-}
-
-// Non-template wrappers so callers outside this TU (c_wrapper.cu, tests, etc.)
-// can compute the correct scratch size without access to template internals.
-size_t pippenger_scratch_size_g1(uint32_t n, uint32_t gpu_index) {
-  return pippenger_scratch_size<G1Affine, G1Projective>(n, gpu_index);
-}
-
-size_t pippenger_scratch_size_g2(uint32_t n, uint32_t gpu_index) {
-  return pippenger_scratch_size<G2Point, G2ProjectivePoint>(n, gpu_index);
-}
-
 // MSM with BigInt scalars for G1 (projective coordinates internally)
-void point_msm_g1_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
-                                  G1Projective *h_result,
+void point_msm_async_g1_pippenger(cudaStream_t stream, uint32_t gpu_index,
+                                  G1Projective *d_result,
                                  const G1Affine *d_points,
-                                  const Scalar *d_scalars, uint32_t n,
-                                  G1Projective *d_scratch,
-                                  uint64_t &size_tracker,
-                                  bool gpu_memory_allocated) {
+                                  const Scalar *d_scalars,
+                                  G1Projective *d_scratch, uint32_t n,
+                                  uint64_t &size_tracker) {
  uint32_t window_size, bucket_count;
  get_g1_window_params(n, window_size, bucket_count);

-  point_msm_pippenger_impl_async<G1Affine, G1Projective>(
-      stream, gpu_index, h_result, d_points, d_scalars, n,
-      msm_threads_per_block<G1Affine>(n), window_size, bucket_count, d_scratch,
-      size_tracker, gpu_memory_allocated);
+  point_msm_async_pippenger_impl<G1Affine, G1Projective>(
+      stream, gpu_index, d_result, d_points, d_scalars, d_scratch, n,
+      get_msm_threads_per_block<G1Affine>(n), window_size, bucket_count,
+      size_tracker);
 }

 // MSM with BigInt scalars for G2 (projective coordinates internally)
 // Uses larger window size to reduce Horner doublings (G2 ops are 2x more
 // expensive)
-void point_msm_g2_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
-                                  G2ProjectivePoint *h_result,
+void point_msm_async_g2_pippenger(cudaStream_t stream, uint32_t gpu_index,
+                                  G2ProjectivePoint *d_result,
                                  const G2Point *d_points,
-                                  const Scalar *d_scalars, uint32_t n,
-                                  G2ProjectivePoint *d_scratch,
-                                  uint64_t &size_tracker,
-                                  bool gpu_memory_allocated) {
+                                  const Scalar *d_scalars,
+                                  G2ProjectivePoint *d_scratch, uint32_t n,
+                                  uint64_t &size_tracker) {
  uint32_t window_size, bucket_count;
  get_g2_window_params(n, window_size, bucket_count);

-  point_msm_pippenger_impl_async<G2Point, G2ProjectivePoint>(
-      stream, gpu_index, h_result, d_points, d_scalars, n,
-      msm_threads_per_block<G2Point>(n), window_size, bucket_count, d_scratch,
-      size_tracker, gpu_memory_allocated);
+  point_msm_async_pippenger_impl<G2Point, G2ProjectivePoint>(
+      stream, gpu_index, d_result, d_points, d_scalars, d_scratch, n,
+      get_msm_threads_per_block<G2Point>(n), window_size, bucket_count,
+      size_tracker);
 }
--- a/backends/zk-cuda-backend/cuda/src/primitives/fp.cu
+++ b/backends/zk-cuda-backend/cuda/src/primitives/fp.cu
@@ -1,7 +1,6 @@
 #include "bls12_446_params.h"
 #include "device.h"
 #include "fp.h"
-#include "fp_ptx32.cuh"
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
@@ -188,9 +187,6 @@ __host__ __device__ void fp_copy(Fp &dst, const Fp &src) {
 // "Raw" means without modular reduction - performs a + b and returns carry.
 // This is an internal helper used by fp_add() which handles reduction.
 __host__ __device__ UNSIGNED_LIMB fp_add_raw(Fp &c, const Fp &a, const Fp &b) {
-#if LIMB_BITS_CONFIG == 32 && defined(__CUDA_ARCH__)
-  return fp_add_raw_ptx32(c, a, b);
-#else
  UNSIGNED_LIMB carry = 0;

  for (int i = 0; i < FP_LIMBS; i++) {
@@ -203,16 +199,12 @@ __host__ __device__ UNSIGNED_LIMB fp_add_raw(Fp &c, const Fp &a, const Fp &b) {
  }

  return carry;
-#endif
 }

 // Subtraction with borrow propagation
 // "Raw" means without modular reduction - performs a - b and returns borrow.
 // This is an internal helper used by fp_sub() which handles reduction.
 __host__ __device__ UNSIGNED_LIMB fp_sub_raw(Fp &c, const Fp &a, const Fp &b) {
-#if LIMB_BITS_CONFIG == 32 && defined(__CUDA_ARCH__)
-  return fp_sub_raw_ptx32(c, a, b);
-#else
  UNSIGNED_LIMB borrow = 0;

  for (int i = 0; i < FP_LIMBS; i++) {
@@ -226,15 +218,11 @@ __host__ __device__ UNSIGNED_LIMB fp_sub_raw(Fp &c, const Fp &a, const Fp &b) {
  }

  return borrow;
-#endif
 }

 // Addition with modular reduction: c = (a + b) mod p
 // MONTGOMERY: Both inputs and output must be in Montgomery form
 __host__ __device__ void fp_add(Fp &c, const Fp &a, const Fp &b) {
-#if LIMB_BITS_CONFIG == 32 && defined(__CUDA_ARCH__)
-  fp_add_ptx32(c, a, b);
-#else
  Fp sum;
  UNSIGNED_LIMB carry = fp_add_raw(sum, a, b);

@@ -247,15 +235,11 @@ __host__ __device__ void fp_add(Fp &c, const Fp &a, const Fp &b) {
  } else {
    fp_copy(c, sum);
  }
-#endif
 }

 // Subtraction with modular reduction: c = (a - b) mod p
 // MONTGOMERY: Both inputs and output must be in Montgomery form
 __host__ __device__ void fp_sub(Fp &c, const Fp &a, const Fp &b) {
-#if LIMB_BITS_CONFIG == 32 && defined(__CUDA_ARCH__)
-  fp_sub_ptx32(c, a, b);
-#else
  Fp diff;
  UNSIGNED_LIMB borrow = fp_sub_raw(diff, a, b);

@@ -266,31 +250,6 @@ __host__ __device__ void fp_sub(Fp &c, const Fp &a, const Fp &b) {
  } else {
    fp_copy(c, diff);
  }
-#endif
-}
-
-// Small-constant multiplication via addition chains.
-// These replace full Montgomery multiplications by 2, 3, 4, 8 with a few
-// modular additions, each ~25 instructions vs ~200+ for CIOS Montgomery mul.
-
-__host__ __device__ void fp_double(Fp &c, const Fp &a) { fp_add(c, a, a); }
-
-__host__ __device__ void fp_mul3(Fp &c, const Fp &a) {
-  Fp t;
-  fp_add(t, a, a);
-  fp_add(c, t, a);
-}
-
-__host__ __device__ void fp_mul4(Fp &c, const Fp &a) {
-  Fp t;
-  fp_add(t, a, a);
-  fp_add(c, t, t);
-}
-
-__host__ __device__ void fp_mul8(Fp &c, const Fp &a) {
-  Fp t;
-  fp_mul4(t, a);
-  fp_add(c, t, t);
 }

 // Helper function for limb multiplication: LIMB_BITS x LIMB_BITS -> 2*LIMB_BITS
@@ -475,9 +434,6 @@ __host__ __device__ void fp_mont_reduce(Fp &c, const UNSIGNED_LIMB *a) {
 // Uses only FP_LIMBS+1 limbs of working space instead of 2*FP_LIMBS.
 // Both a and b are in Montgomery form, result is in Montgomery form.
 __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
-#if LIMB_BITS_CONFIG == 32 && defined(__CUDA_ARCH__)
-  fp_mont_mul_cios_ptx32(c, a, b);
-#else
  const Fp &p = fp_modulus();
  UNSIGNED_LIMB p_prime = fp_p_prime();

@@ -565,7 +521,6 @@ __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
    fp_copy(c, reduced);
  }
  // Result is in Montgomery form
-#endif
 }

 // Montgomery multiplication: c = (a * b * R_INV) mod p
@@ -885,24 +840,32 @@ __host__ __device__ Fp operator-(const Fp &a, const Fp &b) {
  return c;
 }

+// TODO: This operator returns Montgomery form while operator+ and operator-
+// preserve the input form. With normal-form inputs, `a + (b * c)` therefore
+// adds a normal-form value to a Montgomery-form product, which is incorrect.
+// Verify all call sites, then return normal form or remove this operator.
+//
 // Binary multiplication: a * b
-// MONTGOMERY: Both inputs must be in Montgomery form, result is in Montgomery
-// form. This is consistent with operator+ and operator- which also require
-// Montgomery-form inputs.
+// EXTERNAL API: Accepts normal form inputs, converts to Montgomery, and returns
+// Montgomery form result. For internal operations where inputs are already in
+// Montgomery form, use fp_mont_mul() directly.
 __host__ __device__ Fp operator*(const Fp &a, const Fp &b) {
-  Fp result;
-  fp_mont_mul(result, a, b);
+  Fp a_mont, b_mont, result;
+
+  // Convert from normal form to Montgomery form for computation
+  fp_to_montgomery(a_mont, a);
+  fp_to_montgomery(b_mont, b);
+
+  // Multiply in Montgomery form - result stays in Montgomery form
+  fp_mont_mul(result, a_mont, b_mont);
+
  return result;
 }

 // Binary division: a / b
-// MONTGOMERY: Both inputs must be in Montgomery form, result is in Montgomery
-// form. Computes a * b^{-1} entirely in Montgomery representation.
 __host__ __device__ Fp operator/(const Fp &a, const Fp &b) {
-  Fp b_inv;
-  fp_mont_inv(b_inv, b);
  Fp c;
-  fp_mont_mul(c, a, b_inv);
+  fp_div(c, a, b);
  return c;
 }

@@ -936,23 +899,16 @@ __host__ __device__ Fp &operator-=(Fp &a, const Fp &b) {
 }

 // Compound multiplication: a *= b
-// MONTGOMERY: Both inputs must be in Montgomery form, result is in Montgomery
-// form.
 __host__ __device__ Fp &operator*=(Fp &a, const Fp &b) {
-  Fp temp;
-  fp_mont_mul(temp, a, b);
+  Fp temp = a * b;
  fp_copy(a, temp);
  return a;
 }

 // Compound division: a /= b
-// MONTGOMERY: Both inputs must be in Montgomery form, result is in Montgomery
-// form.
 __host__ __device__ Fp &operator/=(Fp &a, const Fp &b) {
-  Fp b_inv;
-  fp_mont_inv(b_inv, b);
  Fp temp;
-  fp_mont_mul(temp, a, b_inv);
+  fp_div(temp, a, b);
  fp_copy(a, temp);
  return a;
 }
--- a/backends/zk-cuda-backend/cuda/src/primitives/fp2.cu
+++ b/backends/zk-cuda-backend/cuda/src/primitives/fp2.cu
@@ -74,30 +74,6 @@ __host__ __device__ void fp2_sub(Fp2 &c, const Fp2 &a, const Fp2 &b) {
  fp_sub(c.c1, a.c1, b.c1);
 }

-// Small-constant multiplication via addition chains.
-// These replace full Fp2 Montgomery multiplications by 2, 3, 4, 8 with
-// modular additions on each component.
-
-__host__ __device__ void fp2_double(Fp2 &c, const Fp2 &a) {
-  fp_double(c.c0, a.c0);
-  fp_double(c.c1, a.c1);
-}
-
-__host__ __device__ void fp2_mul3(Fp2 &c, const Fp2 &a) {
-  fp_mul3(c.c0, a.c0);
-  fp_mul3(c.c1, a.c1);
-}
-
-__host__ __device__ void fp2_mul4(Fp2 &c, const Fp2 &a) {
-  fp_mul4(c.c0, a.c0);
-  fp_mul4(c.c1, a.c1);
-}
-
-__host__ __device__ void fp2_mul8(Fp2 &c, const Fp2 &a) {
-  fp_mul8(c.c0, a.c0);
-  fp_mul8(c.c1, a.c1);
-}
-
 // Multiplication: c = a * b
 // (a0 + a1*i) * (b0 + b1*i) = (a0*b0 - a1*b1) + (a0*b1 + a1*b0)*i
 // Optimized: converts to Montgomery once at start, operates, converts back at
@@ -166,40 +142,29 @@ __host__ __device__ void fp2_mont_mul(Fp2 &c, const Fp2 &a, const Fp2 &b) {
  fp_sub(c.c1, c.c1, t1);
 }

-// Montgomery squaring: c = a^2 (all in Montgomery form)
-// Uses the complex-squaring identity for Fp2 = Fp[i]/(i^2+1):
-//   c0 = (a0 + a1)(a0 - a1)   [since a0^2 - a1^2 = (a0+a1)(a0-a1)]
-//   c1 = 2 * a0 * a1
-// This requires only 2 Fp multiplications vs 3 for general fp2_mont_mul.
-// NOTE: All inputs and outputs are in Montgomery form
-// Safe when c aliases a: all reads of a complete before any write to c.
-__host__ __device__ void fp2_mont_square(Fp2 &c, const Fp2 &a) {
-  Fp sum, diff, c0_tmp, prod;
-
-  fp_add(sum, a.c0, a.c1);
-  fp_sub(diff, a.c0, a.c1);
-  fp_mont_mul(c0_tmp, sum, diff);
-
-  fp_mont_mul(prod, a.c0, a.c1);
-  fp_double(c.c1, prod);
-  fp_copy(c.c0, c0_tmp);
-}
-
-// Squaring with Montgomery conversion: c = a^2
-// Converts to Montgomery form, uses the 2-mul complex-squaring formula,
-// and converts back.
+// Squaring: c = a^2. Converts to Montgomery once at start, operates, converts
+// back at end (4 conversions instead of 9)
 __host__ __device__ void fp2_square(Fp2 &c, const Fp2 &a) {
+  // Convert inputs to Montgomery form once
  Fp a0_m, a1_m;
  fp_to_montgomery(a0_m, a.c0);
  fp_to_montgomery(a1_m, a.c1);

-  // Use the 2-mul complex-squaring identity in Montgomery form
-  Fp2 a_m = {a0_m, a1_m};
-  Fp2 c_m;
-  fp2_mont_square(c_m, a_m);
+  // Operate in Montgomery form
+  Fp t0, t1, t2;
+  fp_mont_mul(t0, a0_m, a0_m); // t0 = a0^2
+  fp_mont_mul(t1, a1_m, a1_m); // t1 = a1^2
+  fp_add(t2, a0_m, a1_m);      // t2 = a0 + a1
+  fp_mont_mul(t2, t2, t2);     // t2 = (a0 + a1)^2

-  fp_from_montgomery(c.c0, c_m.c0);
-  fp_from_montgomery(c.c1, c_m.c1);
+  Fp c0_m, c1_m;
+  fp_sub(c0_m, t0, t1);   // c0 = a0^2 - a1^2
+  fp_sub(c1_m, t2, t0);   // c1 = (a0+a1)^2 - a0^2
+  fp_sub(c1_m, c1_m, t1); // c1 = (a0+a1)^2 - a0^2 - a1^2 = 2*a0*a1
+
+  // Convert outputs back from Montgomery form
+  fp_from_montgomery(c.c0, c0_m);
+  fp_from_montgomery(c.c1, c1_m);
 }

 __host__ __device__ void fp2_neg(Fp2 &c, const Fp2 &a) {
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_msm.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_msm.cu
@@ -8,8 +8,6 @@
 #include <cuda_runtime.h>
 #include <random>

-#include "checked_arithmetic.h"
-
 // Helper to get modulus (use fp_modulus() from the library)
 static Fp get_modulus() { return fp_modulus(); }

@@ -94,13 +92,27 @@ static void BM_G1_MSM(benchmark::State &state) {
  const auto n = static_cast<int>(state.range(0));
  std::mt19937_64 rng(42);

+  // Calculate required scratch space
+  const int threadsPerBlock =
+      get_msm_threads_per_block<G1Affine>(n); // Must match MSM implementation
+  const auto num_blocks = CEIL_DIV(n, threadsPerBlock);
+  const auto scratch_size =
+      (num_blocks + 1) * MSM_G1_BUCKET_COUNT * sizeof(G1Projective);
+
  // Allocate device memory
  auto *d_points = static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
-      safe_mul_sizeof<G1Affine>(static_cast<size_t>(n)), g_benchmark_stream,
-      g_gpu_index, size_tracker, true));
+      n * sizeof(G1Affine), g_benchmark_stream, g_gpu_index, size_tracker,
+      true));
  auto *d_scalars = static_cast<Scalar *>(cuda_malloc_with_size_tracking_async(
-      safe_mul_sizeof<Scalar>(static_cast<size_t>(n)), g_benchmark_stream,
-      g_gpu_index, size_tracker, true));
+      n * sizeof(Scalar), g_benchmark_stream, g_gpu_index, size_tracker, true));
+  auto *d_result =
+      static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
+          sizeof(G1Projective), g_benchmark_stream, g_gpu_index, size_tracker,
+          true));
+  auto *d_scratch =
+      static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
+          scratch_size, g_benchmark_stream, g_gpu_index, size_tracker, true));
+
  // Prepare host data
  auto *h_points = new G1Affine[n];
  auto *h_scalars = new Scalar[n];
@@ -113,11 +125,11 @@ static void BM_G1_MSM(benchmark::State &state) {

  // Copy to device (once, before benchmark loop)
  cuda_memcpy_with_size_tracking_async_to_gpu(
-      d_points, h_points, safe_mul_sizeof<G1Affine>(static_cast<size_t>(n)),
-      g_benchmark_stream, g_gpu_index, true);
+      d_points, h_points, n * sizeof(G1Affine), g_benchmark_stream, g_gpu_index,
+      true);
  cuda_memcpy_with_size_tracking_async_to_gpu(
-      d_scalars, h_scalars, safe_mul_sizeof<Scalar>(static_cast<size_t>(n)),
-      g_benchmark_stream, g_gpu_index, true);
+      d_scalars, h_scalars, n * sizeof(Scalar), g_benchmark_stream, g_gpu_index,
+      true);

  // Convert points to Montgomery form (required for performance - all
  // operations use Montgomery)
@@ -125,29 +137,26 @@ static void BM_G1_MSM(benchmark::State &state) {
                                      n);
  check_cuda_error(cudaGetLastError());

-  // Allocate scratch buffer sized to match the pippenger internal partitioning
-  size_t g1_scratch_bytes = pippenger_scratch_size_g1(n, g_gpu_index);
-  auto *d_scratch = static_cast<G1Projective *>(
-      cuda_malloc_with_size_tracking_async(g1_scratch_bytes, g_benchmark_stream,
-                                           g_gpu_index, size_tracker, true));
+  // Initialize result and scratch memory to zero (once, before benchmark loop)
+  cuda_memset_with_size_tracking_async(d_result, 0, sizeof(G1Projective),
+                                       g_benchmark_stream, g_gpu_index, true);
+  cuda_memset_with_size_tracking_async(d_scratch, 0, scratch_size,
+                                       g_benchmark_stream, g_gpu_index, true);

  // Synchronize once before benchmark loop to ensure all setup is complete
  cuda_synchronize_stream(g_benchmark_stream, g_gpu_index);

-  // Result written directly to host -- no device allocation needed
-  G1Projective h_result;
-
  // Warm-up iterations
  for (int i = 0; i < WARMUP_ITERATIONS; i++) {
-    point_msm_g1_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
-                       d_scalars, n, d_scratch, size_tracker, true);
+    point_msm_async_g1(g_benchmark_stream, g_gpu_index, d_result, d_points,
+                       d_scalars, d_scratch, n, size_tracker);
  }
  cuda_synchronize_stream(g_benchmark_stream, g_gpu_index);

  // Benchmark loop: only measure the MSM computation, no memory operations
  for (auto _ : state) {
-    point_msm_g1_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
-                       d_scalars, n, d_scratch, size_tracker, true);
+    point_msm_async_g1(g_benchmark_stream, g_gpu_index, d_result, d_points,
+                       d_scalars, d_scratch, n, size_tracker);
    benchmark::ClobberMemory();
  }

@@ -159,12 +168,14 @@ static void BM_G1_MSM(benchmark::State &state) {

  delete[] h_points;
  delete[] h_scalars;
-  cuda_drop_with_size_tracking_async(d_scratch, g_benchmark_stream, g_gpu_index,
-                                     true);
  cuda_drop_with_size_tracking_async(d_points, g_benchmark_stream, g_gpu_index,
                                     true);
  cuda_drop_with_size_tracking_async(d_scalars, g_benchmark_stream, g_gpu_index,
                                     true);
+  cuda_drop_with_size_tracking_async(d_result, g_benchmark_stream, g_gpu_index,
+                                     true);
+  cuda_drop_with_size_tracking_async(d_scratch, g_benchmark_stream, g_gpu_index,
+                                     true);
 }

 // Benchmark G2 MSM with random points and 320-bit scalars
@@ -175,13 +186,27 @@ static void BM_G2_MSM(benchmark::State &state) {
  const auto n = static_cast<int>(state.range(0));
  std::mt19937_64 rng(42);

+  // Calculate required scratch space
+  const int threadsPerBlock =
+      get_msm_threads_per_block<G2Affine>(n); // Must match MSM implementation
+  const auto num_blocks = CEIL_DIV(n, threadsPerBlock);
+  const auto scratch_size =
+      (num_blocks + 1) * MSM_G2_BUCKET_COUNT * sizeof(G2Projective);
+
  // Allocate device memory
  auto *d_points = static_cast<G2Affine *>(cuda_malloc_with_size_tracking_async(
-      safe_mul_sizeof<G2Affine>(static_cast<size_t>(n)), g_benchmark_stream,
-      g_gpu_index, size_tracker, true));
+      n * sizeof(G2Affine), g_benchmark_stream, g_gpu_index, size_tracker,
+      true));
  auto *d_scalars = static_cast<Scalar *>(cuda_malloc_with_size_tracking_async(
-      safe_mul_sizeof<Scalar>(static_cast<size_t>(n)), g_benchmark_stream,
-      g_gpu_index, size_tracker, true));
+      n * sizeof(Scalar), g_benchmark_stream, g_gpu_index, size_tracker, true));
+  auto *d_result =
+      static_cast<G2Projective *>(cuda_malloc_with_size_tracking_async(
+          sizeof(G2Projective), g_benchmark_stream, g_gpu_index, size_tracker,
+          true));
+  auto *d_scratch =
+      static_cast<G2Projective *>(cuda_malloc_with_size_tracking_async(
+          scratch_size, g_benchmark_stream, g_gpu_index, size_tracker, true));
+
  // Prepare host data
  auto *h_points = new G2Affine[n];
  auto *h_scalars = new Scalar[n];
@@ -194,11 +219,11 @@ static void BM_G2_MSM(benchmark::State &state) {

  // Copy to device (once, before benchmark loop)
  cuda_memcpy_with_size_tracking_async_to_gpu(
-      d_points, h_points, safe_mul_sizeof<G2Affine>(static_cast<size_t>(n)),
-      g_benchmark_stream, g_gpu_index, true);
+      d_points, h_points, n * sizeof(G2Affine), g_benchmark_stream, g_gpu_index,
+      true);
  cuda_memcpy_with_size_tracking_async_to_gpu(
-      d_scalars, h_scalars, safe_mul_sizeof<Scalar>(static_cast<size_t>(n)),
-      g_benchmark_stream, g_gpu_index, true);
+      d_scalars, h_scalars, n * sizeof(Scalar), g_benchmark_stream, g_gpu_index,
+      true);

  // Convert points to Montgomery form (required for performance - all
  // operations use Montgomery)
@@ -206,29 +231,26 @@ static void BM_G2_MSM(benchmark::State &state) {
                                      n);
  check_cuda_error(cudaGetLastError());

-  // Allocate scratch buffer sized to match the pippenger internal partitioning
-  size_t g2_scratch_bytes = pippenger_scratch_size_g2(n, g_gpu_index);
-  auto *d_scratch = static_cast<G2Projective *>(
-      cuda_malloc_with_size_tracking_async(g2_scratch_bytes, g_benchmark_stream,
-                                           g_gpu_index, size_tracker, true));
+  // Initialize result and scratch memory to zero (once, before benchmark loop)
+  cuda_memset_with_size_tracking_async(d_result, 0, sizeof(G2Projective),
+                                       g_benchmark_stream, g_gpu_index, true);
+  cuda_memset_with_size_tracking_async(d_scratch, 0, scratch_size,
+                                       g_benchmark_stream, g_gpu_index, true);

  // Synchronize once before benchmark loop to ensure all setup is complete
  cuda_synchronize_stream(g_benchmark_stream, g_gpu_index);

-  // Result written directly to host -- no device allocation needed
-  G2Projective h_result;
-
  // Warm-up iterations
  for (int i = 0; i < WARMUP_ITERATIONS; i++) {
-    point_msm_g2_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
-                       d_scalars, n, d_scratch, size_tracker, true);
+    point_msm_async_g2(g_benchmark_stream, g_gpu_index, d_result, d_points,
+                       d_scalars, d_scratch, n, size_tracker);
  }
  cuda_synchronize_stream(g_benchmark_stream, g_gpu_index);

  // Benchmark loop: only measure the MSM computation, no memory operations
  for (auto _ : state) {
-    point_msm_g2_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
-                       d_scalars, n, d_scratch, size_tracker, true);
+    point_msm_async_g2(g_benchmark_stream, g_gpu_index, d_result, d_points,
+                       d_scalars, d_scratch, n, size_tracker);
    benchmark::ClobberMemory();
  }

@@ -240,12 +262,14 @@ static void BM_G2_MSM(benchmark::State &state) {

  delete[] h_points;
  delete[] h_scalars;
-  cuda_drop_with_size_tracking_async(d_scratch, g_benchmark_stream, g_gpu_index,
-                                     true);
  cuda_drop_with_size_tracking_async(d_points, g_benchmark_stream, g_gpu_index,
                                     true);
  cuda_drop_with_size_tracking_async(d_scalars, g_benchmark_stream, g_gpu_index,
                                     true);
+  cuda_drop_with_size_tracking_async(d_result, g_benchmark_stream, g_gpu_index,
+                                     true);
+  cuda_drop_with_size_tracking_async(d_scratch, g_benchmark_stream, g_gpu_index,
+                                     true);
 }

 // Register MSM benchmarks with sizes matching the Rust Criterion benchmarks
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/CMakeLists.txt
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/CMakeLists.txt
@@ -77,6 +77,3 @@ gtest_discover_tests(test_fp)
 gtest_discover_tests(test_fp2)
 gtest_discover_tests(test_msm)
 gtest_discover_tests(test_point_ops)
-
-# Basic usage examples (standalone programs, not registered with CTest)
-add_subdirectory(basic)
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/CMakeLists.txt
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/CMakeLists.txt
@@ -1,34 +0,0 @@
-# Basic usage examples for zk-cuda-backend. These are standalone programs for learning purposes, not part of the CTest
-# suite. Run them directly after building to verify the API works end-to-end.
-
-set(ZK_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../include)
-
-# basic_fp_ops: host-side Fp field arithmetic
-add_executable(basic_fp_ops basic_fp_ops.cu)
-target_link_libraries(basic_fp_ops zk_cuda_backend tfhe_device)
-target_include_directories(basic_fp_ops PRIVATE ${ZK_INCLUDE_DIR})
-set_target_properties(
-  basic_fp_ops
-  PROPERTIES CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}
-             CUDA_SEPARABLE_COMPILATION ON
-             CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-
-# basic_curve_ops: host-side G1 elliptic curve operations
-add_executable(basic_curve_ops basic_curve_ops.cu)
-target_link_libraries(basic_curve_ops zk_cuda_backend tfhe_device)
-target_include_directories(basic_curve_ops PRIVATE ${ZK_INCLUDE_DIR})
-set_target_properties(
-  basic_curve_ops
-  PROPERTIES CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}
-             CUDA_SEPARABLE_COMPILATION ON
-             CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-
-# basic_msm: GPU-accelerated multi-scalar multiplication
-add_executable(basic_msm basic_msm.cu)
-target_link_libraries(basic_msm zk_cuda_backend tfhe_device)
-target_include_directories(basic_msm PRIVATE ${ZK_INCLUDE_DIR})
-set_target_properties(
-  basic_msm
-  PROPERTIES CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}
-             CUDA_SEPARABLE_COMPILATION ON
-             CUDA_RESOLVE_DEVICE_SYMBOLS ON)
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/basic_curve_ops.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/basic_curve_ops.cu
@@ -1,90 +0,0 @@
-// Basic elliptic curve operations on BLS12-446 G1.
-//
-// Demonstrates G1 projective point arithmetic on the host.  Projective points
-// support operator overloads (+, -, *, ==) that cover the common use cases.
-// Affine points are used for input/output; coordinates are in Montgomery form
-// during arithmetic and converted back by normalize_from_montgomery_g1().
-//
-// See README.md and include/curve.h for the full API reference.
-//
-// Build (from cuda/):
-//   cmake -B build -DZK_CUDA_BACKEND_BUILD_TESTS=ON
-//   cmake --build build --target basic_curve_ops
-//   ./build/tests_and_benchmarks/tests/basic/basic_curve_ops
-
-#include "curve.h"
-#include "fp.h"
-#include <cassert>
-#include <cstdio>
-#include <cstring>
-
-int main() {
-  // ---- Generator point ----
-  // g1_generator() returns the hardcoded BLS12-446 G1 generator in normal
-  // (non-Montgomery) form. Convert to Montgomery, then lift to projective for
-  // host-side arithmetic.
-  const G1Affine &gen_normal = g1_generator();
-  assert(!g1_is_infinity(gen_normal));
-
-  G1Affine gen_affine = gen_normal;
-  point_to_montgomery_inplace(gen_affine);
-
-  G1Projective G;
-  affine_to_projective(G, gen_affine);
-
-  // ---- Negation: -G ----
-  G1Projective neg_G = -G;
-
-  // G + (-G) = identity (Z = 0 in the projective convention)
-  G1Projective identity = G + neg_G;
-  assert(fp_is_zero(identity.Z));
-  printf("Negation (-G) and G + (-G) = identity: OK\n");
-
-  // ---- Addition: 2*G = G + G, 3*G = 2*G + G ----
-  G1Projective two_G = G + G;
-  assert(!(two_G == G1Projective())); // not the identity
-
-  G1Projective three_G = two_G + G;
-  assert(!(three_G == G1Projective()));
-  printf("Addition (2*G, 3*G): OK\n");
-
-  // ---- Compound assignment: G += G ----
-  G1Projective acc = G;
-  acc += G; // acc = 2*G
-  assert(acc == two_G);
-  printf("Compound assignment (+=): OK\n");
-
-  // ---- Scalar multiplication: 3*G using Scalar type ----
-  // The * operator calls projective_scalar_mul internally.
-  Scalar scalar_3;
-  memset(&scalar_3, 0, sizeof(scalar_3));
-  scalar_3.limb[0] = 3;
-
-  G1Projective three_G_via_scalar = G * scalar_3;
-  assert(!(three_G_via_scalar == G1Projective()));
-
-  // Normalise both to Z = 1 (Montgomery) before comparing coordinates.
-  normalize_projective_g1(three_G);
-  normalize_projective_g1(three_G_via_scalar);
-  assert(three_G == three_G_via_scalar);
-  printf("Scalar multiplication (3*G == G + G + G): OK\n");
-
-  // ---- Projective -> affine conversion ----
-  // projective_to_affine_g1 keeps coordinates in Montgomery form.
-  G1Affine three_G_affine;
-  projective_to_affine_g1(three_G_affine, three_G);
-  assert(!g1_is_infinity(three_G_affine));
-  printf("Projective -> affine conversion: OK\n");
-
-  // ---- Convert to normal-form coordinates ----
-  // normalize_from_montgomery_g1 strips Montgomery form and sets Z = 1 in one
-  // pass.
-  G1Projective result = three_G_via_scalar;
-  normalize_from_montgomery_g1(
-      result); // coordinates now in normal (non-Montgomery) form
-  assert(!fp_is_zero(result.Z)); // Z = 1 (non-zero)
-  printf("Conversion to normal-form projective: OK\n");
-
-  printf("All G1 curve operations passed.\n");
-  return 0;
-}
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/basic_fp_ops.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/basic_fp_ops.cu
@@ -1,107 +0,0 @@
-// Basic finite field (Fp) arithmetic over BLS12-446.
-//
-// Demonstrates host-side Fp operations intended as a learning reference.
-// All arithmetic in the field is modular with respect to the BLS12-446 prime.
-//
-// Internal representation uses Montgomery form for multiplications.
-// See README.md and include/fp.h for the full API reference.
-//
-// Build (from cuda/):
-//   cmake -B build -DZK_CUDA_BACKEND_BUILD_TESTS=ON
-//   cmake --build build --target basic_fp_ops
-//   ./build/tests_and_benchmarks/tests/basic/basic_fp_ops
-
-#include "fp.h"
-#include <cassert>
-#include <cstdio>
-
-int main() {
-  // ---- Addition and subtraction ----
-  // fp_one() and fp_zero() produce values in normal (non-Montgomery) form.
-  // fp_add / fp_sub perform modular addition/subtraction and are form-agnostic
-  // (addition is linear, so the result stays in the same form).
-  Fp a, b, c;
-  fp_one(a); // a = 1
-  fp_one(b); // b = 1
-
-  c = a + b; // c = 2
-  assert(c.limb[0] == 2);
-
-  c = c - a; // c = 1
-  assert(fp_is_one(c));
-
-  // Compound assignment
-  c += a; // c = 2
-  assert(c.limb[0] == 2);
-  c -= b; // c = 1
-  assert(fp_is_one(c));
-
-  printf("Addition/subtraction: OK\n");
-
-  // ---- Negation ----
-  // fp_neg computes p - a (mod p). For consistency use values in Montgomery
-  // form, but for add/sub/neg small normal-form values also work correctly.
-  Fp neg_a = -a; // neg_a = -1 mod p
-  Fp sum = a + neg_a;
-  assert(fp_is_zero(sum)); // 1 + (-1) = 0
-  printf("Negation: OK\n");
-
-  // ---- Multiplication (Montgomery form required) ----
-  // The * operator calls fp_mont_mul, which requires both operands to be in
-  // Montgomery form.  Use fp_to_montgomery() to convert, or the helper
-  // fp_one_montgomery() / fp_two_montgomery() for small constants.
-  Fp one_m, two_m, result_m, result;
-  fp_one_montgomery(one_m); // one_m  = 1 in Montgomery form
-  fp_two_montgomery(two_m); // two_m  = 2 in Montgomery form
-
-  result_m = one_m * two_m; // result_m = 2 in Montgomery form
-  fp_from_montgomery(result, result_m);
-  assert(result.limb[0] == 2);
-
-  result_m = two_m * two_m; // result_m = 4 in Montgomery form
-  fp_from_montgomery(result, result_m);
-  assert(result.limb[0] == 4);
-
-  // Compound multiplication
-  result_m = two_m;
-  result_m *= two_m; // result_m = 4
-  fp_from_montgomery(result, result_m);
-  assert(result.limb[0] == 4);
-
-  // Convert an arbitrary normal-form value to Montgomery before multiplying
-  Fp five_normal, five_m, twenty_five_m, twenty_five;
-  fp_zero(five_normal);
-  five_normal.limb[0] = 5;
-  fp_to_montgomery(five_m, five_normal);
-
-  fp_mont_mul(twenty_five_m, five_m, five_m); // 5 * 5 = 25
-  fp_from_montgomery(twenty_five, twenty_five_m);
-  assert(twenty_five.limb[0] == 25);
-
-  printf("Multiplication: OK\n");
-
-  // ---- Inversion and division (normal-form convenience API) ----
-  // fp_inv and fp_div accept and return values in normal form (they handle
-  // the Montgomery conversion internally).
-  Fp five_inv;
-  fp_inv(five_inv, five_normal); // five_inv = 5^{-1} mod p
-
-  Fp one_check;
-  fp_div(one_check, five_normal, five_normal); // 5 / 5 = 1
-  assert(fp_is_one(one_check));
-
-  // Verify: 5 * 5^{-1} == 1  (using fp_div as a cross-check)
-  Fp product;
-  fp_zero(product);
-  product.limb[0] = 1; // product = 1
-  Fp two_normal;
-  fp_zero(two_normal);
-  two_normal.limb[0] = 2;
-  fp_div(product, two_normal, two_normal); // 2 / 2 = 1
-  assert(fp_is_one(product));
-
-  printf("Inversion/division: OK\n");
-
-  printf("All Fp operations passed.\n");
-  return 0;
-}
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/basic_msm.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/basic_msm.cu
@@ -1,109 +0,0 @@
-// Basic Multi-Scalar Multiplication (MSM) on BLS12-446 G1.
-//
-// Demonstrates the unmanaged GPU MSM API:
-//   - Allocating device memory for points, scalars, result, and scratch space
-//   - Copying data to the GPU and running point_msm_g1()
-//   - Reading the result back and verifying against a naive scalar-mul sum
-//
-// The unmanaged API requires the caller to manage all allocations.  For a
-// higher-level interface that handles memory internally, see the Rust bindings
-// (G1Projective::msm in the Rust API).
-//
-// See README.md and include/msm.h for the full API reference.
-//
-// Build (from cuda/):
-//   cmake -B build -DZK_CUDA_BACKEND_BUILD_TESTS=ON
-//   cmake --build build --target basic_msm
-//   ./build/tests_and_benchmarks/tests/basic/basic_msm
-
-#include "curve.h"
-#include "device.h"
-#include "fp.h"
-#include "msm.h"
-#include <cassert>
-#include <cstdio>
-#include <cstring>
-#include <vector>
-
-int main() {
-  if (!cuda_is_available()) {
-    printf("CUDA not available, skipping.\n");
-    return 0;
-  }
-
-  const uint32_t gpu_index = 0;
-  const uint32_t n = 4; // number of points / scalars
-  uint64_t size_tracker = 0;
-
-  // ---- Prepare host-side points in Montgomery form ----
-  // Use n doublings of the G1 generator: G, 2*G, 4*G, 8*G.
-  const G1Affine &gen_normal = g1_generator();
-  G1Affine gen = gen_normal;
-  point_to_montgomery_inplace(gen);
-
-  std::vector<G1Affine> h_points(n);
-  h_points[0] = gen;
-  for (uint32_t i = 1; i < n; i++) {
-    point_double(h_points[i], h_points[i - 1]);
-  }
-
-  // ---- Prepare host-side scalars ----
-  // Each scalar is a 320-bit little-endian integer (ZP_LIMBS × LIMB_BITS).
-  // Use scalar[i] = i + 1, so MSM = 1*G + 2*(2G) + 3*(4G) + 4*(8G).
-  std::vector<Scalar> h_scalars(n);
-  for (uint32_t i = 0; i < n; i++) {
-    memset(&h_scalars[i], 0, sizeof(Scalar));
-    h_scalars[i].limb[0] = i + 1;
-  }
-
-  // ---- Allocate device memory ----
-  cudaStream_t stream = cuda_create_stream(gpu_index);
-
-  auto *d_points =
-      static_cast<G1Affine *>(cuda_malloc(n * sizeof(G1Affine), gpu_index));
-  auto *d_scalars =
-      static_cast<Scalar *>(cuda_malloc(n * sizeof(Scalar), gpu_index));
-  // Use pippenger_scratch_size_g1() to compute the required scratch allocation.
-  size_t scratch_bytes = pippenger_scratch_size_g1(n, gpu_index);
-  auto *d_scratch =
-      static_cast<G1Projective *>(cuda_malloc(scratch_bytes, gpu_index));
-
-  // ---- Copy inputs to the GPU ----
-  cuda_memcpy_async_to_gpu(d_points, h_points.data(), n * sizeof(G1Affine),
-                           stream, gpu_index);
-  cuda_memcpy_async_to_gpu(d_scalars, h_scalars.data(), n * sizeof(Scalar),
-                           stream, gpu_index);
-
-  // ---- Run MSM (synchronous wrapper; result written directly to host) ----
-  G1Projective h_result;
-  point_msm_g1(stream, gpu_index, &h_result, d_points, d_scalars, n, d_scratch,
-               size_tracker, true);
-
-  // ---- Verify against naive sequential computation on the host ----
-  // Expected = sum over i of (scalar[i] * point[i]).
-  // Use projective * Scalar operator; host-side affine scalar_mul is internal
-  // only.
-  G1Projective expected;
-  g1_projective_point_at_infinity(expected);
-
-  for (uint32_t i = 0; i < n; i++) {
-    G1Projective term_proj;
-    affine_to_projective(term_proj, h_points[i]);
-    expected = expected + term_proj * h_scalars[i];
-  }
-
-  // Normalise to Z = 1 (Montgomery) before comparing projective coordinates.
-  normalize_projective_g1(h_result);
-  normalize_projective_g1(expected);
-  assert(h_result == expected);
-  printf("MSM result matches naive sequential computation.\n");
-
-  // ---- Cleanup ----
-  cuda_drop(d_points, gpu_index);
-  cuda_drop(d_scalars, gpu_index);
-  cuda_drop(d_scratch, gpu_index);
-  cuda_destroy_stream(stream, gpu_index);
-
-  printf("All MSM basic operations passed.\n");
-  return 0;
-}
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/primitives/fp2_helpers.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/primitives/fp2_helpers.cu
@@ -6,8 +6,6 @@
 #include "fp2.h"
 #include <cuda_runtime.h>

-#include "checked_arithmetic.h"
-
 // ============================================================================
 // CUDA Kernels for parallel Fp2 operations (test-only)
 // ============================================================================
@@ -113,21 +111,16 @@ void fp2_add_batch_on_host(cudaStream_t stream, uint32_t gpu_index, Fp2 *c,
  const uint32_t blocksPerGrid = CEIL_DIV(n, threadsPerBlock);

  auto *d_c = static_cast<Fp2 *>(cuda_malloc_with_size_tracking_async(
-      safe_mul_sizeof<Fp2>(static_cast<size_t>(n)), stream, gpu_index,
-      size_tracker, true));
+      n * sizeof(Fp2), stream, gpu_index, size_tracker, true));
  auto *d_a = static_cast<Fp2 *>(cuda_malloc_with_size_tracking_async(
-      safe_mul_sizeof<Fp2>(static_cast<size_t>(n)), stream, gpu_index,
-      size_tracker, true));
+      n * sizeof(Fp2), stream, gpu_index, size_tracker, true));
  auto *d_b = static_cast<Fp2 *>(cuda_malloc_with_size_tracking_async(
-      safe_mul_sizeof<Fp2>(static_cast<size_t>(n)), stream, gpu_index,
-      size_tracker, true));
+      n * sizeof(Fp2), stream, gpu_index, size_tracker, true));

-  cuda_memcpy_with_size_tracking_async_to_gpu(
-      d_a, a, safe_mul_sizeof<Fp2>(static_cast<size_t>(n)), stream, gpu_index,
-      true);
-  cuda_memcpy_with_size_tracking_async_to_gpu(
-      d_b, b, safe_mul_sizeof<Fp2>(static_cast<size_t>(n)), stream, gpu_index,
-      true);
+  cuda_memcpy_with_size_tracking_async_to_gpu(d_a, a, n * sizeof(Fp2), stream,
+                                              gpu_index, true);
+  cuda_memcpy_with_size_tracking_async_to_gpu(d_b, b, n * sizeof(Fp2), stream,
+                                              gpu_index, true);

  kernel_fp2_add_array<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(d_c, d_a,
                                                                      d_b, n);
@@ -136,8 +129,7 @@ void fp2_add_batch_on_host(cudaStream_t stream, uint32_t gpu_index, Fp2 *c,

  cuda_synchronize_stream(stream, gpu_index);

-  cuda_memcpy_async_to_cpu(c, d_c, safe_mul_sizeof<Fp2>(static_cast<size_t>(n)),
-                           stream, gpu_index);
+  cuda_memcpy_async_to_cpu(c, d_c, n * sizeof(Fp2), stream, gpu_index);
  cuda_synchronize_stream(stream, gpu_index);

  if (d_c != nullptr) {
@@ -167,21 +159,16 @@ void fp2_mul_batch_on_host(cudaStream_t stream, uint32_t gpu_index, Fp2 *c,
  const uint32_t blocksPerGrid = CEIL_DIV(n, threadsPerBlock);

  auto *d_c = static_cast<Fp2 *>(cuda_malloc_with_size_tracking_async(
-      safe_mul_sizeof<Fp2>(static_cast<size_t>(n)), stream, gpu_index,
-      size_tracker, true));
+      n * sizeof(Fp2), stream, gpu_index, size_tracker, true));
  auto *d_a = static_cast<Fp2 *>(cuda_malloc_with_size_tracking_async(
-      safe_mul_sizeof<Fp2>(static_cast<size_t>(n)), stream, gpu_index,
-      size_tracker, true));
+      n * sizeof(Fp2), stream, gpu_index, size_tracker, true));
  auto *d_b = static_cast<Fp2 *>(cuda_malloc_with_size_tracking_async(
-      safe_mul_sizeof<Fp2>(static_cast<size_t>(n)), stream, gpu_index,
-      size_tracker, true));
+      n * sizeof(Fp2), stream, gpu_index, size_tracker, true));

-  cuda_memcpy_with_size_tracking_async_to_gpu(
-      d_a, a, safe_mul_sizeof<Fp2>(static_cast<size_t>(n)), stream, gpu_index,
-      true);
-  cuda_memcpy_with_size_tracking_async_to_gpu(
-      d_b, b, safe_mul_sizeof<Fp2>(static_cast<size_t>(n)), stream, gpu_index,
-      true);
+  cuda_memcpy_with_size_tracking_async_to_gpu(d_a, a, n * sizeof(Fp2), stream,
+                                              gpu_index, true);
+  cuda_memcpy_with_size_tracking_async_to_gpu(d_b, b, n * sizeof(Fp2), stream,
+                                              gpu_index, true);

  kernel_fp2_mul_array<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(d_c, d_a,
                                                                      d_b, n);
@@ -190,8 +177,7 @@ void fp2_mul_batch_on_host(cudaStream_t stream, uint32_t gpu_index, Fp2 *c,

  cuda_synchronize_stream(stream, gpu_index);

-  cuda_memcpy_async_to_cpu(c, d_c, safe_mul_sizeof<Fp2>(static_cast<size_t>(n)),
-                           stream, gpu_index);
+  cuda_memcpy_async_to_cpu(c, d_c, n * sizeof(Fp2), stream, gpu_index);
  cuda_synchronize_stream(stream, gpu_index);

  if (d_c != nullptr) {
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/primitives/fp_helpers.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/primitives/fp_helpers.cu
@@ -6,8 +6,6 @@
 #include "fp.h"
 #include <cuda_runtime.h>

-#include "checked_arithmetic.h"
-
 // ============================================================================
 // CUDA Kernels for parallel Fp operations (test-only)
 // ============================================================================
@@ -175,22 +173,17 @@ void fp_add_batch_on_host(cudaStream_t stream, uint32_t gpu_index, Fp *c,

  // Allocate device memory (asynchronous with stream)
  auto *d_c = static_cast<Fp *>(cuda_malloc_with_size_tracking_async(
-      safe_mul_sizeof<Fp>(static_cast<size_t>(n)), stream, gpu_index,
-      size_tracker, true));
+      n * sizeof(Fp), stream, gpu_index, size_tracker, true));
  auto *d_a = static_cast<Fp *>(cuda_malloc_with_size_tracking_async(
-      safe_mul_sizeof<Fp>(static_cast<size_t>(n)), stream, gpu_index,
-      size_tracker, true));
+      n * sizeof(Fp), stream, gpu_index, size_tracker, true));
  auto *d_b = static_cast<Fp *>(cuda_malloc_with_size_tracking_async(
-      safe_mul_sizeof<Fp>(static_cast<size_t>(n)), stream, gpu_index,
-      size_tracker, true));
+      n * sizeof(Fp), stream, gpu_index, size_tracker, true));

  // Copy to device (asynchronous with stream)
-  cuda_memcpy_with_size_tracking_async_to_gpu(
-      d_a, a, safe_mul_sizeof<Fp>(static_cast<size_t>(n)), stream, gpu_index,
-      true);
-  cuda_memcpy_with_size_tracking_async_to_gpu(
-      d_b, b, safe_mul_sizeof<Fp>(static_cast<size_t>(n)), stream, gpu_index,
-      true);
+  cuda_memcpy_with_size_tracking_async_to_gpu(d_a, a, n * sizeof(Fp), stream,
+                                              gpu_index, true);
+  cuda_memcpy_with_size_tracking_async_to_gpu(d_b, b, n * sizeof(Fp), stream,
+                                              gpu_index, true);

  // Launch kernel (with stream)
  kernel_fp_add_array<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(d_c, d_a,
@@ -203,8 +196,7 @@ void fp_add_batch_on_host(cudaStream_t stream, uint32_t gpu_index, Fp *c,
  cuda_synchronize_stream(stream, gpu_index);

  // Copy back (synchronous after stream sync)
-  cuda_memcpy_async_to_cpu(c, d_c, safe_mul_sizeof<Fp>(static_cast<size_t>(n)),
-                           stream, gpu_index);
+  cuda_memcpy_async_to_cpu(c, d_c, n * sizeof(Fp), stream, gpu_index);
  cuda_synchronize_stream(stream, gpu_index);

  // Free device memory (asynchronous with stream)
@@ -240,22 +232,17 @@ void fp_mul_batch_on_host(cudaStream_t stream, uint32_t gpu_index, Fp *c,

  // Allocate device memory (asynchronous with stream)
  auto *d_c = static_cast<Fp *>(cuda_malloc_with_size_tracking_async(
-      safe_mul_sizeof<Fp>(static_cast<size_t>(n)), stream, gpu_index,
-      size_tracker, true));
+      n * sizeof(Fp), stream, gpu_index, size_tracker, true));
  auto *d_a = static_cast<Fp *>(cuda_malloc_with_size_tracking_async(
-      safe_mul_sizeof<Fp>(static_cast<size_t>(n)), stream, gpu_index,
-      size_tracker, true));
+      n * sizeof(Fp), stream, gpu_index, size_tracker, true));
  auto *d_b = static_cast<Fp *>(cuda_malloc_with_size_tracking_async(
-      safe_mul_sizeof<Fp>(static_cast<size_t>(n)), stream, gpu_index,
-      size_tracker, true));
+      n * sizeof(Fp), stream, gpu_index, size_tracker, true));

  // Copy to device (asynchronous with stream)
-  cuda_memcpy_with_size_tracking_async_to_gpu(
-      d_a, a, safe_mul_sizeof<Fp>(static_cast<size_t>(n)), stream, gpu_index,
-      true);
-  cuda_memcpy_with_size_tracking_async_to_gpu(
-      d_b, b, safe_mul_sizeof<Fp>(static_cast<size_t>(n)), stream, gpu_index,
-      true);
+  cuda_memcpy_with_size_tracking_async_to_gpu(d_a, a, n * sizeof(Fp), stream,
+                                              gpu_index, true);
+  cuda_memcpy_with_size_tracking_async_to_gpu(d_b, b, n * sizeof(Fp), stream,
+                                              gpu_index, true);

  // Launch kernel (with stream)
  kernel_fp_mul_array<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(d_c, d_a,
@@ -268,8 +255,7 @@ void fp_mul_batch_on_host(cudaStream_t stream, uint32_t gpu_index, Fp *c,
  cuda_synchronize_stream(stream, gpu_index);

  // Copy back (synchronous after stream sync)
-  cuda_memcpy_async_to_cpu(c, d_c, safe_mul_sizeof<Fp>(static_cast<size_t>(n)),
-                           stream, gpu_index);
+  cuda_memcpy_async_to_cpu(c, d_c, n * sizeof(Fp), stream, gpu_index);
  cuda_synchronize_stream(stream, gpu_index);

  // Free device memory (asynchronous with stream)
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/primitives/test_fp.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/primitives/test_fp.cu
@@ -355,11 +355,9 @@ TEST_F(FpArithmeticTest, Multiplication) {
  fp_mul_gpu(stream, gpu_index, &result, &five, &three);

  // Also test on CPU for comparison
-  // operator* expects Montgomery-form inputs and returns Montgomery form
-  Fp five_m, three_m;
-  fp_to_montgomery(five_m, five);
-  fp_to_montgomery(three_m, three);
-  Fp result_cpu_mont = five_m * three_m;
+  // operator* returns result in Montgomery form, convert to normal for
+  // comparison
+  Fp result_cpu_mont = five * three;
  Fp result_cpu;
  fp_from_montgomery(result_cpu, result_cpu_mont);

@@ -574,11 +572,9 @@ TEST_F(FpArithmeticTest, MultiplicationByZero) {
  fp_mul_gpu(stream, gpu_index, &result, &a, &zero);

  // Also test on CPU for comparison
-  // operator* expects Montgomery-form inputs and returns Montgomery form
-  Fp a_m, zero_m;
-  fp_to_montgomery(a_m, a);
-  fp_to_montgomery(zero_m, zero);
-  Fp result_cpu_mont = a_m * zero_m;
+  // operator* returns result in Montgomery form, convert to normal for
+  // comparison
+  Fp result_cpu_mont = a * zero;
  fp_from_montgomery(result_cpu, result_cpu_mont);

  EXPECT_TRUE(fp_is_zero_gpu(stream, gpu_index, &result));
@@ -602,13 +598,10 @@ TEST_F(FpArithmeticTest, Inversion) {
  fp_mul_gpu(stream, gpu_index, &result, &a, &a_inv);

  // Also test on CPU for comparison
-  // fp_inv returns normal form, convert both operands to Montgomery for
-  // operator*
  fp_inv(a_inv_cpu, a);
-  Fp a_m, a_inv_cpu_m;
-  fp_to_montgomery(a_m, a);
-  fp_to_montgomery(a_inv_cpu_m, a_inv_cpu);
-  Fp result_cpu_mont = a_m * a_inv_cpu_m;
+  // operator* returns result in Montgomery form, convert to normal for
+  // comparison
+  Fp result_cpu_mont = a * a_inv_cpu;
  Fp result_cpu;
  fp_from_montgomery(result_cpu, result_cpu_mont);

@@ -657,13 +650,11 @@ TEST_F(FpArithmeticTest, Division) {
  fp_div_gpu(stream, gpu_index, &quotient, &a, &b);
  fp_mul_gpu(stream, gpu_index, &result, &quotient, &b);

-  // operator/ now expects Montgomery-form inputs and returns Montgomery form
-  Fp a_m, b_m;
-  fp_to_montgomery(a_m, a);
-  fp_to_montgomery(b_m, b);
-  Fp quotient_cpu_m = a_m / b_m;
-  // quotient_cpu_m * b_m should give a_m back
-  Fp result_cpu_mont = quotient_cpu_m * b_m;
+  // Also test on CPU for comparison
+  Fp quotient_cpu = a / b;
+  // operator* returns result in Montgomery form, convert to normal for
+  // comparison
+  Fp result_cpu_mont = quotient_cpu * b;
  Fp result_cpu;
  fp_from_montgomery(result_cpu, result_cpu_mont);

@@ -688,13 +679,8 @@ TEST_F(FpArithmeticTest, DivisionByOne) {
  // Test on GPU
  fp_div_gpu(stream, gpu_index, &result, &a, &one);

-  // operator/ expects Montgomery-form inputs, returns Montgomery form
-  Fp a_m, one_m;
-  fp_to_montgomery(a_m, a);
-  fp_to_montgomery(one_m, one);
-  Fp result_cpu_m = a_m / one_m;
-  Fp result_cpu;
-  fp_from_montgomery(result_cpu, result_cpu_m);
+  // Also test on CPU for comparison
+  Fp result_cpu = a / one;

  EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &result, &a), ComparisonType::Equal)
      << "a / 1 should equal a";
@@ -831,10 +817,9 @@ TEST_F(FpArithmeticTest, SquareRoot) {
  fp_mul_gpu(stream, gpu_index, &square, &a, &a);

  // Also test on CPU for comparison
-  // operator* expects Montgomery-form inputs and returns Montgomery form
-  Fp a_m;
-  fp_to_montgomery(a_m, a);
-  Fp square_cpu_mont = a_m * a_m;
+  // operator* returns result in Montgomery form, convert to normal for
+  // comparison
+  Fp square_cpu_mont = a * a;
  fp_from_montgomery(square_cpu, square_cpu_mont);

  // Verify that square is a quadratic residue (on GPU)
@@ -853,11 +838,10 @@ TEST_F(FpArithmeticTest, SquareRoot) {
    cuda_synchronize_stream(stream, gpu_index);

    // Also test on CPU for comparison
-    // fp_sqrt returns normal form; convert to Montgomery for operator*
    fp_sqrt(sqrt_result_cpu, square_cpu);
-    Fp sqrt_result_cpu_m;
-    fp_to_montgomery(sqrt_result_cpu_m, sqrt_result_cpu);
-    Fp verify_cpu_mont = sqrt_result_cpu_m * sqrt_result_cpu_m;
+    // operator* returns result in Montgomery form, convert to normal for
+    // comparison
+    Fp verify_cpu_mont = sqrt_result_cpu * sqrt_result_cpu;
    fp_from_montgomery(verify_cpu, verify_cpu_mont);

    EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &verify, &square),
@@ -946,10 +930,9 @@ TEST_F(FpArithmeticTest, IsQuadraticResidue) {
  fp_mul_gpu(stream, gpu_index, &square, &a, &a);

  // Also test on CPU for comparison
-  // operator* expects Montgomery-form inputs and returns Montgomery form
-  Fp a_m;
-  fp_to_montgomery(a_m, a);
-  Fp square_cpu_mont = a_m * a_m;
+  // operator* returns result in Montgomery form, convert to normal for
+  // comparison
+  Fp square_cpu_mont = a * a;
  fp_from_montgomery(square_cpu, square_cpu_mont);

  EXPECT_TRUE(fp_is_quadratic_residue_gpu(stream, gpu_index, &square))
@@ -1164,11 +1147,9 @@ TEST_F(FpArithmeticTest, LargeMultiplication1) {
  fp_mul_gpu(stream, gpu_index, &verify, &result, &one);

  // Also test on CPU for comparison
-  // operator* expects Montgomery-form inputs and returns Montgomery form
-  Fp a_m, b_m;
-  fp_to_montgomery(a_m, a);
-  fp_to_montgomery(b_m, b);
-  Fp result_cpu_mont = a_m * b_m;
+  // operator* returns result in Montgomery form, convert to normal for
+  // comparison
+  Fp result_cpu_mont = a * b;
  fp_from_montgomery(result_cpu, result_cpu_mont);

  EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &result, &verify),
@@ -1209,11 +1190,9 @@ TEST_F(FpArithmeticTest, LargeMultiplication2ModulusMinus1) {
  fp_mul_gpu(stream, gpu_index, &result, &a, &b);

  // Also test on CPU for comparison
-  // operator* expects Montgomery-form inputs and returns Montgomery form
-  Fp a_m, b_m;
-  fp_to_montgomery(a_m, a);
-  fp_to_montgomery(b_m, b);
-  Fp result_cpu_mont = a_m * b_m;
+  // operator* returns result in Montgomery form, convert to normal for
+  // comparison
+  Fp result_cpu_mont = a * b;
  fp_from_montgomery(result_cpu, result_cpu_mont);

  EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &result, &expected),
@@ -1248,11 +1227,9 @@ TEST_F(FpArithmeticTest, LargeMultiplication3Half) {
  fp_add_gpu(stream, gpu_index, &expected, &a, &a);

  // Also test on CPU for comparison
-  // operator* expects Montgomery-form inputs and returns Montgomery form
-  Fp a_m, b_m;
-  fp_to_montgomery(a_m, a);
-  fp_to_montgomery(b_m, b);
-  Fp result_cpu_mont = a_m * b_m;
+  // operator* returns result in Montgomery form, convert to normal for
+  // comparison
+  Fp result_cpu_mont = a * b;
  fp_from_montgomery(result_cpu, result_cpu_mont);
  expected_cpu = a + a;

@@ -1282,10 +1259,9 @@ TEST_F(FpArithmeticTest, LargeMultiplication4Square) {
  fp_mul_gpu(stream, gpu_index, &verify, &result, &one);

  // Also test on CPU for comparison
-  // operator* expects Montgomery-form inputs and returns Montgomery form
-  Fp a_m;
-  fp_to_montgomery(a_m, a);
-  Fp result_cpu_mont = a_m * a_m;
+  // operator* returns result in Montgomery form, convert to normal for
+  // comparison
+  Fp result_cpu_mont = a * a;
  fp_from_montgomery(result_cpu, result_cpu_mont);

  EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &result, &verify),
@@ -1354,11 +1330,9 @@ TEST_F(FpArithmeticTest, LargeMultiplication5Complex) {
  fp_mul_gpu(stream, gpu_index, &verify, &result, &one);

  // Also test on CPU for comparison
-  // operator* expects Montgomery-form inputs and returns Montgomery form
-  Fp a_m, b_m;
-  fp_to_montgomery(a_m, a);
-  fp_to_montgomery(b_m, b);
-  Fp result_cpu_mont = a_m * b_m;
+  // operator* returns result in Montgomery form, convert to normal for
+  // comparison
+  Fp result_cpu_mont = a * b;
  fp_from_montgomery(result_cpu, result_cpu_mont);

  EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &result, &verify),
@@ -2138,11 +2112,9 @@ TEST_F(FpCudaKernelTest, CudaKernelArrayMul) {
    h_a[i] = test_utils::random_fp(rng);
    h_b[i] = test_utils::random_fp(rng);
    // Compute expected result on host
-    // operator* expects Montgomery-form inputs and returns Montgomery form
-    Fp a_m, b_m;
-    fp_to_montgomery(a_m, h_a[i]);
-    fp_to_montgomery(b_m, h_b[i]);
-    Fp expected_mont = a_m * b_m;
+    // operator* returns result in Montgomery form, convert to normal for
+    // comparison
+    Fp expected_mont = h_a[i] * h_b[i];
    fp_from_montgomery(h_expected[i], expected_mont);
  }

@@ -2243,11 +2215,9 @@ TEST_F(FpCudaKernelTest, CudaKernelArrayMulEdgeCases) {
      h_b[i] = test_utils::random_fp(rng);
    }
    // Compute expected result on host
-    // operator* expects Montgomery-form inputs and returns Montgomery form
-    Fp a_m, b_m;
-    fp_to_montgomery(a_m, h_a[i]);
-    fp_to_montgomery(b_m, h_b[i]);
-    Fp expected_mont = a_m * b_m;
+    // operator* returns result in Montgomery form, convert to normal for
+    // comparison
+    Fp expected_mont = h_a[i] * h_b[i];
    fp_from_montgomery(h_expected[i], expected_mont);
  }

@@ -2398,11 +2368,9 @@ TEST_F(FpCudaKernelTest, CudaKernelDeviceConstants) {
  for (int i = 0; i < n; i++) {
    h_a[i] = test_utils::random_fp(rng);
    h_b[i] = test_utils::random_fp(rng);
-    // operator* expects Montgomery-form inputs and returns Montgomery form
-    Fp a_m, b_m;
-    fp_to_montgomery(a_m, h_a[i]);
-    fp_to_montgomery(b_m, h_b[i]);
-    Fp expected_mont = a_m * b_m;
+    // operator* returns result in Montgomery form, convert to normal for
+    // comparison
+    Fp expected_mont = h_a[i] * h_b[i];
    fp_from_montgomery(h_expected[i], expected_mont);
  }

--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/primitives/test_fp_device_call.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/primitives/test_fp_device_call.cu
@@ -0,0 +1,148 @@
+#include "device.h"
+#include "fp.h"
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+
+// Kernel: calls fp_one_montgomery(*result) from device code (not from host)
+__global__ void kernel_fp_one_montgomery_device(Fp *result) {
+  fp_one_montgomery(*result);
+}
+
+// Kernel: calls fp_to_montgomery from device code on the input value 1
+__global__ void kernel_fp_to_montgomery_device(Fp *result) {
+  // Input value: limb[0] = 1, every remaining limb = 0
+  Fp plain_one;
+  for (int limb_idx = 0; limb_idx < FP_LIMBS; limb_idx++) {
+    plain_one.limb[limb_idx] = (limb_idx == 0) ? 1 : 0;
+  }
+  fp_to_montgomery(*result, plain_one);
+}
+
+// Kernel: writes a fixed non-zero constant ("Z") into limbs 0..6 of *result
+__global__ void kernel_hardcoded_z(Fp *result) {
+  result->limb[0] = 0x3b8fff65553d5554ULL;
+  result->limb[1] = 0xa446eb5cea3128cfULL;
+  result->limb[2] = 0xf6c648f07714c846ULL;
+  result->limb[3] = 0xc22966d114e3a7f5ULL;
+  result->limb[4] = 0xfda96d21d7f40737ULL;
+  result->limb[5] = 0x7fc0f2da6954a6ffULL;
+  result->limb[6] = 0x0c847c135ce86b2bULL;
+}
+
+TEST(FpDeviceCall, FpOneMontgomeryInKernel) {
+  uint64_t size_tracker = 0;
+  if (!cuda_is_available()) {
+    GTEST_SKIP() << "CUDA not available";
+  }
+
+  uint32_t gpu_index = 0;
+  auto stream = cuda_create_stream(gpu_index);
+
+  auto *d_result = static_cast<Fp *>(cuda_malloc_with_size_tracking_async(
+      sizeof(Fp), stream, gpu_index, size_tracker, true));
+
+  // Single-thread launch: fp_one_montgomery runs in device code on d_result
+  kernel_fp_one_montgomery_device<<<1, 1, 0, stream>>>(d_result);
+  check_cuda_error(cudaGetLastError());
+
+  cuda_synchronize_stream(stream, gpu_index);
+
+  Fp h_result;
+  cuda_memcpy_async_to_cpu(&h_result, d_result, sizeof(Fp), stream, gpu_index);
+  cuda_synchronize_stream(stream, gpu_index);
+
+  // The test fails if the value copied back from the device is all zero
+  bool is_zero = fp_is_zero(h_result);
+  // Dump every limb; bound is FP_LIMBS (not a literal) to follow the Fp layout
+  std::cout << "fp_one_montgomery (called IN device kernel) result:"
+            << std::endl;
+  for (int i = 0; i < FP_LIMBS; i++) {
+    std::cout << "  limb[" << i << "] = 0x" << std::hex << h_result.limb[i]
+              << std::dec << std::endl;
+  }
+  std::cout << "Is zero: " << (is_zero ? "YES - BUG!" : "no") << std::endl;
+
+  EXPECT_FALSE(is_zero)
+      << "fp_one_montgomery should NOT return zero when called from device!";
+
+  cuda_drop_with_size_tracking_async(d_result, stream, gpu_index, true);
+  cuda_destroy_stream(stream, gpu_index);
+}
+
+TEST(FpDeviceCall, FpToMontgomeryInKernel) {
+  uint64_t size_tracker = 0;
+  if (!cuda_is_available()) {
+    GTEST_SKIP() << "CUDA not available";
+  }
+
+  uint32_t gpu_index = 0;
+  auto stream = cuda_create_stream(gpu_index);
+
+  auto *d_result = static_cast<Fp *>(cuda_malloc_with_size_tracking_async(
+      sizeof(Fp), stream, gpu_index, size_tracker, true));
+
+  // Single-thread launch: fp_to_montgomery(1) runs in device code on d_result
+  kernel_fp_to_montgomery_device<<<1, 1, 0, stream>>>(d_result);
+  check_cuda_error(cudaGetLastError());
+
+  cuda_synchronize_stream(stream, gpu_index);
+
+  Fp h_result;
+  cuda_memcpy_async_to_cpu(&h_result, d_result, sizeof(Fp), stream, gpu_index);
+  cuda_synchronize_stream(stream, gpu_index);
+
+  // The test fails if the value copied back from the device is all zero
+  bool is_zero = fp_is_zero(h_result);
+  // Dump every limb; bound is FP_LIMBS (not a literal) to follow the Fp layout
+  std::cout << "fp_to_montgomery(1) (called IN device kernel) result:"
+            << std::endl;
+  for (int i = 0; i < FP_LIMBS; i++) {
+    std::cout << "  limb[" << i << "] = 0x" << std::hex << h_result.limb[i]
+              << std::dec << std::endl;
+  }
+  std::cout << "Is zero: " << (is_zero ? "YES - BUG!" : "no") << std::endl;
+
+  EXPECT_FALSE(is_zero)
+      << "fp_to_montgomery(1) should NOT return zero when called from device!";
+
+  cuda_drop_with_size_tracking_async(d_result, stream, gpu_index, true);
+  cuda_destroy_stream(stream, gpu_index);
+}
+
+TEST(FpDeviceCall, HardcodedZValue) {
+  uint64_t size_tracker = 0;
+  if (!cuda_is_available()) {
+    GTEST_SKIP() << "CUDA not available";
+  }
+
+  uint32_t gpu_index = 0;
+  auto stream = cuda_create_stream(gpu_index);
+
+  auto *d_result = static_cast<Fp *>(cuda_malloc_with_size_tracking_async(
+      sizeof(Fp), stream, gpu_index, size_tracker, true));
+
+  // Single-thread launch: the kernel stores literal limb values into d_result
+  kernel_hardcoded_z<<<1, 1, 0, stream>>>(d_result);
+  check_cuda_error(cudaGetLastError());
+
+  cuda_synchronize_stream(stream, gpu_index);
+
+  Fp h_result;
+  cuda_memcpy_async_to_cpu(&h_result, d_result, sizeof(Fp), stream, gpu_index);
+  cuda_synchronize_stream(stream, gpu_index);
+
+  // The test fails if the value copied back from the device is all zero
+  bool is_zero = fp_is_zero(h_result);
+  // Dump every limb; bound is FP_LIMBS (not a literal) to follow the Fp layout
+  std::cout << "Hardcoded Z (set IN device kernel) result:" << std::endl;
+  for (int i = 0; i < FP_LIMBS; i++) {
+    std::cout << "  limb[" << i << "] = 0x" << std::hex << h_result.limb[i]
+              << std::dec << std::endl;
+  }
+  std::cout << "Is zero: " << (is_zero ? "YES - BUG!" : "no") << std::endl;
+
+  EXPECT_FALSE(is_zero) << "Hardcoded Z value should NOT be zero!";
+
+  cuda_drop_with_size_tracking_async(d_result, stream, gpu_index, true);
+  cuda_destroy_stream(stream, gpu_index);
+}
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/test_msm.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/test_msm.cu
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/test_point_ops.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/test_point_ops.cu
@@ -95,37 +95,40 @@ protected:
  cudaStream_t stream;

  // Helper to check if a point is on the curve y^2 = x^3 + b
-  // All arithmetic stays in Montgomery form (point coordinates are already
-  // Montgomery), converted to normal only for debug output.
  bool is_on_curve(const G1Affine &point) {
    if (point.infinity) {
      return true; // Point at infinity is on the curve
    }

-    // Coordinates are already in Montgomery form
-    const Fp &x_m = point.x;
-    const Fp &y_m = point.y;
+    // Convert from Montgomery form to normal form for verification
+    Fp x_normal, y_normal;
+    fp_from_montgomery(x_normal, point.x);
+    fp_from_montgomery(y_normal, point.y);

-    // Compute y^2 in Montgomery form
-    Fp y_squared_m = y_m * y_m;
+    // Compute y^2 (operator* returns Montgomery form, convert to normal)
+    Fp y_squared_mont = y_normal * y_normal;
+    Fp y_squared;
+    fp_from_montgomery(y_squared, y_squared_mont);

-    // Compute x^3 in Montgomery form
-    Fp x_squared_m = x_m * x_m;
-    Fp x_cubed_m = x_squared_m * x_m;
+    // Compute x^3 (operator* returns Montgomery form, convert to normal)
+    Fp x_squared_mont = x_normal * x_normal;
+    Fp x_squared;
+    fp_from_montgomery(x_squared, x_squared_mont);
+    Fp x_cubed_mont = x_squared * x_normal;
+    Fp x_cubed;
+    fp_from_montgomery(x_cubed, x_cubed_mont);

-    // Compute x^3 + b in Montgomery form (b = 1)
-    Fp b_m;
-    fp_one_montgomery(b_m);
-    Fp x_cubed_plus_b_m = x_cubed_m + b_m;
+    // Compute x^3 + b (b = 1)
+    Fp b;
+    fp_zero(b);
+    b.limb[0] = 1;
+    Fp x_cubed_plus_b = x_cubed + b;

-    // Check if y^2 == x^3 + b (comparison works directly in Montgomery form)
-    bool on_curve = y_squared_m == x_cubed_plus_b_m;
+    // Check if y^2 == x^3 + b
+    bool on_curve = y_squared == x_cubed_plus_b;

-    // Debug output if not on curve (convert to normal form for printing)
+    // Debug output if not on curve
    if (!on_curve) {
-      Fp y_squared, x_cubed_plus_b;
-      fp_from_montgomery(y_squared, y_squared_m);
-      fp_from_montgomery(x_cubed_plus_b, x_cubed_plus_b_m);
      std::cout << "WARNING: Point is NOT on the curve!" << std::endl;
      print_fp("  y^2", y_squared);
      print_fp("  x^3 + b", x_cubed_plus_b);
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/test_projective.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/test_projective.cu
@@ -0,0 +1,145 @@
+#include "curve.h"
+#include "device.h"
+#include <gtest/gtest.h>
+#include <iostream>
+
+// Fixture: owns one CUDA stream on GPU 0 for the projective-coordinate tests
+class ProjectiveTest : public ::testing::Test {
+protected:
+  cudaStream_t stream = nullptr;
+  uint32_t gpu_index = 0;
+
+  void SetUp() override {
+    gpu_index = 0;
+    stream = cuda_create_stream(gpu_index);
+  }
+
+  void TearDown() override { cuda_destroy_stream(stream, gpu_index); }
+};
+
+// Test: affine -> projective -> affine must give back the original generator
+TEST_F(ProjectiveTest, G1RoundTrip) {
+  // No device buffers are allocated in this test, so no size tracker is needed
+  // Get generator in Montgomery form
+  G1Affine G = g1_generator();
+  G1Affine G_mont = G;
+  point_to_montgomery_inplace(G_mont);
+
+  // Convert to projective
+  G1ProjectivePoint G_proj;
+  affine_to_projective(G_proj, G_mont);
+
+  // Convert back to affine
+  G1Affine G_back;
+  projective_to_affine_g1(G_back, G_proj);
+
+  // Leave Montgomery form so the result can be compared with the untouched G
+  G1Affine G_result;
+  fp_from_montgomery(G_result.x, G_back.x);
+  fp_from_montgomery(G_result.y, G_back.y);
+  G_result.infinity = G_back.infinity;
+
+  // Compare both coordinates and the infinity flag against the original G
+  EXPECT_EQ(fp_cmp(G_result.x, G.x), FpComparison::Equal)
+      << "X coordinate mismatch in round-trip";
+  EXPECT_EQ(fp_cmp(G_result.y, G.y), FpComparison::Equal)
+      << "Y coordinate mismatch in round-trip";
+  EXPECT_EQ(G_result.infinity, G.infinity) << "Infinity flag mismatch";
+}
+
+// Test: projective_point_double(G) must agree with device point_add(G, G)
+TEST_F(ProjectiveTest, G1DoublingVsAffine) {
+  uint64_t size_tracker = 0;
+  // Generator copy, converted in place to Montgomery form
+  G1Affine base = g1_generator();
+  G1Affine base_m = base;
+  point_to_montgomery_inplace(base_m);
+  base_m.infinity = false;
+
+  // Affine reference: point_add(G, G) on device buffers
+  auto *d_base = static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
+      sizeof(G1Affine), stream, gpu_index, size_tracker, true));
+  auto *d_dbl = static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
+      sizeof(G1Affine), stream, gpu_index, size_tracker, true));
+  cuda_memcpy_with_size_tracking_async_to_gpu(d_base, &base_m, sizeof(G1Affine),
+                                              stream, gpu_index, true);
+  point_add<G1Affine>(stream, gpu_index, d_dbl, d_base, d_base);
+
+  G1Affine affine_dbl;
+  cuda_memcpy_async_to_cpu(&affine_dbl, d_dbl, sizeof(G1Affine), stream,
+                           gpu_index);
+  cuda_synchronize_stream(stream, gpu_index);
+
+  // Host side: lift G to projective coordinates, then double it
+  G1ProjectivePoint base_proj;
+  affine_to_projective(base_proj, base_m);
+
+  G1ProjectivePoint dbl_proj;
+  projective_point_double(dbl_proj, base_proj);
+
+  // Affine view of the projective result
+  G1Affine proj_dbl;
+  projective_to_affine_g1(proj_dbl, dbl_proj);
+
+  // Both results stay in Montgomery form; compare coordinates as-is
+  EXPECT_EQ(fp_cmp(proj_dbl.x, affine_dbl.x), FpComparison::Equal)
+      << "X coordinate mismatch: projective doubling vs affine doubling";
+  EXPECT_EQ(fp_cmp(proj_dbl.y, affine_dbl.y), FpComparison::Equal)
+      << "Y coordinate mismatch: projective doubling vs affine doubling";
+
+  cuda_drop_with_size_tracking_async(d_base, stream, gpu_index, true);
+  cuda_drop_with_size_tracking_async(d_dbl, stream, gpu_index, true);
+}
+
+// Test: projective_point_add(G, 2G) must agree with device point_add(G, 2G)
+TEST_F(ProjectiveTest, G1AdditionVsAffine) {
+  uint64_t size_tracker = 0;
+  // Generator copy, converted in place to Montgomery form
+  G1Affine base = g1_generator();
+  G1Affine base_m = base;
+  point_to_montgomery_inplace(base_m);
+  base_m.infinity = false;
+
+  // Device: 2G = point_add(G, G)
+  auto *d_base = static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
+      sizeof(G1Affine), stream, gpu_index, size_tracker, true));
+  auto *d_dbl = static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
+      sizeof(G1Affine), stream, gpu_index, size_tracker, true));
+  cuda_memcpy_with_size_tracking_async_to_gpu(d_base, &base_m, sizeof(G1Affine),
+                                              stream, gpu_index, true);
+  point_add<G1Affine>(stream, gpu_index, d_dbl, d_base, d_base);
+
+  G1Affine dbl_m;
+  cuda_memcpy_async_to_cpu(&dbl_m, d_dbl, sizeof(G1Affine), stream, gpu_index);
+  cuda_synchronize_stream(stream, gpu_index);
+
+  // Device: 3G = point_add(G, 2G)
+  auto *d_tpl = static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
+      sizeof(G1Affine), stream, gpu_index, size_tracker, true));
+  point_add<G1Affine>(stream, gpu_index, d_tpl, d_base, d_dbl);
+
+  G1Affine affine_tpl;
+  cuda_memcpy_async_to_cpu(&affine_tpl, d_tpl, sizeof(G1Affine), stream,
+                           gpu_index);
+  cuda_synchronize_stream(stream, gpu_index);
+
+  // Host side: the same G + 2G sum through projective coordinates
+  G1ProjectivePoint base_proj, dbl_proj, tpl_proj;
+  affine_to_projective(base_proj, base_m);
+  affine_to_projective(dbl_proj, dbl_m);
+  projective_point_add(tpl_proj, base_proj, dbl_proj);
+
+  // Affine view of the projective sum
+  G1Affine proj_tpl;
+  projective_to_affine_g1(proj_tpl, tpl_proj);
+
+  // Both results stay in Montgomery form; compare coordinates as-is
+  EXPECT_EQ(fp_cmp(proj_tpl.x, affine_tpl.x), FpComparison::Equal)
+      << "X coordinate mismatch: projective addition vs affine addition";
+  EXPECT_EQ(fp_cmp(proj_tpl.y, affine_tpl.y), FpComparison::Equal)
+      << "Y coordinate mismatch: projective addition vs affine addition";
+
+  cuda_drop_with_size_tracking_async(d_base, stream, gpu_index, true);
+  cuda_drop_with_size_tracking_async(d_dbl, stream, gpu_index, true);
+  cuda_drop_with_size_tracking_async(d_tpl, stream, gpu_index, true);
+}
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/test_scalar_mul.cu
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/test_scalar_mul.cu
@@ -0,0 +1,379 @@
+#include "curve.h"
+#include "device.h"
+#include "fp.h"
+#include "fp2.h"
+#include <cstdint>
+#include <cstring>
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+
+// Test fixture for scalar multiplication tests: owns one CUDA stream on GPU 0
+class ScalarMulTest : public ::testing::Test {
+protected:
+  void SetUp() override {
+    // Skip instead of failing when CUDA is reported unavailable
+    if (!cuda_is_available()) {
+      GTEST_SKIP() << "CUDA not available";
+    }
+
+    gpu_index = 0;
+    stream = cuda_create_stream(gpu_index);
+
+    // Device generators are hardcoded at compile time; nothing to initialize
+  }
+
+  // Runs even after SetUp() skipped, hence the nullptr default on `stream`
+  void TearDown() override {
+    if (stream != nullptr) {
+      cuda_destroy_stream(stream, gpu_index);
+    }
+  }
+
+  uint32_t gpu_index = 0;
+  cudaStream_t stream = nullptr;
+};
+
+// Test scalar multiplication by using MSM with a single point
+// This tests the building block projective_scalar_mul indirectly
+// MSM with n=1 calls projective_scalar_mul internally
+
+// Test G1 scalar multiplication: scalar = 1 (should return point itself).
+// Runs a single-point MSM on the device and compares with the generator.
+TEST_F(ScalarMulTest, G1ScalarMulOne) {
+  uint64_t size_tracker = 0;
+  // Get generator point
+  const G1Affine &G = g1_generator();
+  if (g1_is_infinity(G)) {
+    GTEST_SKIP() << "G1 generator not set";
+  }
+
+  // Convert to Montgomery form
+  G1Affine G_mont = G;
+  point_to_montgomery_inplace(G_mont);
+  G_mont.infinity = false;
+
+  // Create scalar = 1: clear every limb (no hard-coded limb count), then set
+  // the lowest limb
+  Scalar scalar_one;
+  std::memset(scalar_one.limb, 0, sizeof(scalar_one.limb));
+  scalar_one.limb[0] = 1;
+
+  // Allocate device memory
+  auto *d_point = static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
+      sizeof(G1Affine), stream, gpu_index, size_tracker, true));
+  auto *d_scalar = static_cast<Scalar *>(cuda_malloc_with_size_tracking_async(
+      sizeof(Scalar), stream, gpu_index, size_tracker, true));
+  auto *d_result =
+      static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
+          sizeof(G1Projective), stream, gpu_index, size_tracker, true));
+
+  // Copy to device
+  cuda_memcpy_with_size_tracking_async_to_gpu(
+      d_point, &G_mont, sizeof(G1Affine), stream, gpu_index, true);
+  cuda_memcpy_with_size_tracking_async_to_gpu(
+      d_scalar, &scalar_one, sizeof(Scalar), stream, gpu_index, true);
+
+  // Test scalar multiplication using MSM with single point (tests
+  // projective_scalar_mul). Scratch holds (num_blocks + 1) bucket sets of
+  // MSM_G1_BUCKET_COUNT projective points.
+  int num_blocks = 1;
+  size_t scratch_size =
+      (num_blocks + 1) * MSM_G1_BUCKET_COUNT * sizeof(G1Projective);
+  auto *d_scratch =
+      static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
+          scratch_size, stream, gpu_index, size_tracker, true));
+
+  point_msm_g1(stream, gpu_index, d_result, d_point, d_scalar, d_scratch, 1,
+               size_tracker);
+  check_cuda_error(cudaGetLastError());
+
+  cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);
+
+  // Copy result back
+  G1Projective h_result;
+  cuda_memcpy_async_to_cpu(&h_result, d_result, sizeof(G1Projective), stream,
+                           gpu_index);
+  cuda_synchronize_stream(stream, gpu_index);
+
+  // Convert projective to affine
+  G1Affine result_affine;
+  projective_to_affine_g1(result_affine, h_result);
+
+  // Convert from Montgomery to normal form
+  G1Affine result_normal;
+  fp_from_montgomery(result_normal.x, result_affine.x);
+  fp_from_montgomery(result_normal.y, result_affine.y);
+  result_normal.infinity = result_affine.infinity;
+
+  // Check: result should be the same as input (scalar = 1)
+  EXPECT_FALSE(result_normal.infinity)
+      << "Result should not be at infinity for scalar=1";
+  EXPECT_EQ(fp_cmp(result_normal.x, G.x), FpComparison::Equal)
+      << "x-coordinate should match input point";
+  EXPECT_EQ(fp_cmp(result_normal.y, G.y), FpComparison::Equal)
+      << "y-coordinate should match input point";
+
+  // Cleanup
+  cuda_drop_with_size_tracking_async(d_point, stream, gpu_index, true);
+  cuda_drop_with_size_tracking_async(d_scalar, stream, gpu_index, true);
+  cuda_drop_with_size_tracking_async(d_result, stream, gpu_index, true);
+}
+
+// Test G1 scalar multiplication: scalar = 0 (should return infinity)
+TEST_F(ScalarMulTest, G1ScalarMulZero) {
+  uint64_t size_tracker = 0;
+  // Get generator point
+  const G1Affine &G = g1_generator();
+  if (g1_is_infinity(G)) {
+    GTEST_SKIP() << "G1 generator not set";
+  }
+
+  // Convert to Montgomery form
+  G1Affine G_mont = G;
+  point_to_montgomery_inplace(G_mont);
+
+  // Create scalar = 0
+  Scalar scalar_zero;
+  std::memset(scalar_zero.limb, 0, sizeof(scalar_zero.limb));
+
+  // Allocate device memory
+  auto *d_point = static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
+      sizeof(G1Affine), stream, gpu_index, size_tracker, true));
+  auto *d_scalar = static_cast<Scalar *>(cuda_malloc_with_size_tracking_async(
+      sizeof(Scalar), stream, gpu_index, size_tracker, true));
+  auto *d_result =
+      static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
+          sizeof(G1Projective), stream, gpu_index, size_tracker, true));
+
+  // Copy to device
+  cuda_memcpy_with_size_tracking_async_to_gpu(
+      d_point, &G_mont, sizeof(G1Affine), stream, gpu_index, true);
+  cuda_memcpy_with_size_tracking_async_to_gpu(
+      d_scalar, &scalar_zero, sizeof(Scalar), stream, gpu_index, true);
+
+  // Test scalar multiplication using MSM with single point (tests
+  // projective_scalar_mul). Scratch holds (num_blocks + 1) bucket sets of
+  // MSM_G1_BUCKET_COUNT projective points.
+  int num_blocks = 1;
+  size_t scratch_size =
+      (num_blocks + 1) * MSM_G1_BUCKET_COUNT * sizeof(G1Projective);
+  auto *d_scratch =
+      static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
+          scratch_size, stream, gpu_index, size_tracker, true));
+
+  point_msm_g1(stream, gpu_index, d_result, d_point, d_scalar, d_scratch, 1,
+               size_tracker);
+  check_cuda_error(cudaGetLastError());
+
+  cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);
+
+  // Copy result back
+  G1Projective h_result;
+  cuda_memcpy_async_to_cpu(&h_result, d_result, sizeof(G1Projective), stream,
+                           gpu_index);
+  cuda_synchronize_stream(stream, gpu_index);
+
+  // Check: result should be at infinity (Z = 0)
+  EXPECT_TRUE(fp_is_zero(h_result.Z))
+      << "Result should be at infinity for scalar=0";
+
+  // Cleanup
+  cuda_drop_with_size_tracking_async(d_point, stream, gpu_index, true);
+  cuda_drop_with_size_tracking_async(d_scalar, stream, gpu_index, true);
+  cuda_drop_with_size_tracking_async(d_result, stream, gpu_index, true);
+}
+
+// Test G1 scalar multiplication: scalar = 2 (should return 2*point).
+// The MSM result is compared against single_point_scalar_mul on the device.
+TEST_F(ScalarMulTest, G1ScalarMulTwo) {
+  uint64_t size_tracker = 0;
+  // Get generator point
+  const G1Affine &G = g1_generator();
+  if (g1_is_infinity(G)) {
+    GTEST_SKIP() << "G1 generator not set";
+  }
+
+  // Convert to Montgomery form
+  G1Affine G_mont = G;
+  point_to_montgomery_inplace(G_mont);
+
+  // Create scalar = 2: clear every limb (no hard-coded limb count), then set
+  // the lowest limb
+  Scalar scalar_two;
+  std::memset(scalar_two.limb, 0, sizeof(scalar_two.limb));
+  scalar_two.limb[0] = 2;
+
+  // Allocate device memory
+  auto *d_point = static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
+      sizeof(G1Affine), stream, gpu_index, size_tracker, true));
+  auto *d_scalar = static_cast<Scalar *>(cuda_malloc_with_size_tracking_async(
+      sizeof(Scalar), stream, gpu_index, size_tracker, true));
+  auto *d_result =
+      static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
+          sizeof(G1Projective), stream, gpu_index, size_tracker, true));
+  auto *d_expected =
+      static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
+          sizeof(G1Affine), stream, gpu_index, size_tracker, true));
+
+  // Copy to device
+  cuda_memcpy_with_size_tracking_async_to_gpu(
+      d_point, &G_mont, sizeof(G1Affine), stream, gpu_index, true);
+  cuda_memcpy_with_size_tracking_async_to_gpu(
+      d_scalar, &scalar_two, sizeof(Scalar), stream, gpu_index, true);
+
+  // Test scalar multiplication using MSM with single point (tests
+  // projective_scalar_mul). Scratch holds (num_blocks + 1) bucket sets of
+  // MSM_G1_BUCKET_COUNT projective points.
+  int num_blocks = 1;
+  size_t scratch_size =
+      (num_blocks + 1) * MSM_G1_BUCKET_COUNT * sizeof(G1Projective);
+  auto *d_scratch =
+      static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
+          scratch_size, stream, gpu_index, size_tracker, true));
+
+  point_msm_g1(stream, gpu_index, d_result, d_point, d_scalar, d_scratch, 1,
+               size_tracker);
+  check_cuda_error(cudaGetLastError());
+
+  cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);
+
+  // Compute expected result: 2*G via single_point_scalar_mul
+  single_point_scalar_mul<G1Affine>(stream, gpu_index, d_expected, d_point, 2);
+
+  // Synchronize and copy results back
+  cuda_synchronize_stream(stream, gpu_index);
+  G1Projective h_result;
+  cuda_memcpy_async_to_cpu(&h_result, d_result, sizeof(G1Projective), stream,
+                           gpu_index);
+  G1Affine h_expected;
+  cuda_memcpy_async_to_cpu(&h_expected, d_expected, sizeof(G1Affine), stream,
+                           gpu_index);
+  cuda_synchronize_stream(stream, gpu_index);
+
+  // Convert projective result to affine
+  G1Affine result_affine;
+  projective_to_affine_g1(result_affine, h_result);
+
+  // Convert from Montgomery to normal form
+  G1Affine result_normal, expected_normal;
+  fp_from_montgomery(result_normal.x, result_affine.x);
+  fp_from_montgomery(result_normal.y, result_affine.y);
+  result_normal.infinity = result_affine.infinity;
+  fp_from_montgomery(expected_normal.x, h_expected.x);
+  fp_from_montgomery(expected_normal.y, h_expected.y);
+  expected_normal.infinity = h_expected.infinity;
+
+  // Check: result should match expected (2*G)
+  EXPECT_EQ(result_normal.infinity, expected_normal.infinity)
+      << "Infinity flag should match";
+  if (!result_normal.infinity && !expected_normal.infinity) {
+    EXPECT_EQ(fp_cmp(result_normal.x, expected_normal.x), FpComparison::Equal)
+        << "x-coordinate should match 2*G";
+    EXPECT_EQ(fp_cmp(result_normal.y, expected_normal.y), FpComparison::Equal)
+        << "y-coordinate should match 2*G";
+  }
+
+  // Cleanup
+  cuda_drop_with_size_tracking_async(d_point, stream, gpu_index, true);
+  cuda_drop_with_size_tracking_async(d_scalar, stream, gpu_index, true);
+  cuda_drop_with_size_tracking_async(d_result, stream, gpu_index, true);
+  cuda_drop_with_size_tracking_async(d_expected, stream, gpu_index, true);
+}
+
+// Test G1 scalar multiplication: scalar = 3 (should return 3*point = point +
+// 2*point). The MSM result is compared against single_point_scalar_mul on the
+// same device point.
+TEST_F(ScalarMulTest, G1ScalarMulThree) {
+  uint64_t size_tracker = 0;
+  // Get generator point
+  const G1Affine &G = g1_generator();
+  if (g1_is_infinity(G)) {
+    GTEST_SKIP() << "G1 generator not set";
+  }
+
+  // Convert to Montgomery form
+  G1Affine G_mont = G;
+  point_to_montgomery_inplace(G_mont);
+
+  // Create scalar = 3: clear every limb (no hard-coded limb count), then set
+  // the lowest limb
+  Scalar scalar_three;
+  std::memset(scalar_three.limb, 0, sizeof(scalar_three.limb));
+  scalar_three.limb[0] = 3;
+
+  // Allocate device memory
+  auto *d_point = static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
+      sizeof(G1Affine), stream, gpu_index, size_tracker, true));
+  auto *d_scalar = static_cast<Scalar *>(cuda_malloc_with_size_tracking_async(
+      sizeof(Scalar), stream, gpu_index, size_tracker, true));
+  auto *d_result =
+      static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
+          sizeof(G1Projective), stream, gpu_index, size_tracker, true));
+  auto *d_expected =
+      static_cast<G1Affine *>(cuda_malloc_with_size_tracking_async(
+          sizeof(G1Affine), stream, gpu_index, size_tracker, true));
+
+  // Copy to device
+  cuda_memcpy_with_size_tracking_async_to_gpu(
+      d_point, &G_mont, sizeof(G1Affine), stream, gpu_index, true);
+  cuda_memcpy_with_size_tracking_async_to_gpu(
+      d_scalar, &scalar_three, sizeof(Scalar), stream, gpu_index, true);
+
+  // Test scalar multiplication using MSM with single point (tests
+  // projective_scalar_mul). Scratch holds (num_blocks + 1) bucket sets of
+  // MSM_G1_BUCKET_COUNT projective points.
+  int num_blocks = 1;
+  size_t scratch_size =
+      (num_blocks + 1) * MSM_G1_BUCKET_COUNT * sizeof(G1Projective);
+  auto *d_scratch =
+      static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
+          scratch_size, stream, gpu_index, size_tracker, true));
+
+  point_msm_g1(stream, gpu_index, d_result, d_point, d_scalar, d_scratch, 1,
+               size_tracker);
+  check_cuda_error(cudaGetLastError());
+
+  cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);
+
+  // Compute expected result: 3*G using u64 scalar multiplication
+  single_point_scalar_mul<G1Affine>(stream, gpu_index, d_expected, d_point, 3);
+
+  // Synchronize and copy results back
+  cuda_synchronize_stream(stream, gpu_index);
+  G1Projective h_result;
+  cuda_memcpy_async_to_cpu(&h_result, d_result, sizeof(G1Projective), stream,
+                           gpu_index);
+  G1Affine h_expected;
+  cuda_memcpy_async_to_cpu(&h_expected, d_expected, sizeof(G1Affine), stream,
+                           gpu_index);
+  cuda_synchronize_stream(stream, gpu_index);
+
+  // Convert projective result to affine
+  G1Affine result_affine;
+  projective_to_affine_g1(result_affine, h_result);
+
+  // Convert from Montgomery to normal form
+  G1Affine result_normal, expected_normal;
+  fp_from_montgomery(result_normal.x, result_affine.x);
+  fp_from_montgomery(result_normal.y, result_affine.y);
+  result_normal.infinity = result_affine.infinity;
+  fp_from_montgomery(expected_normal.x, h_expected.x);
+  fp_from_montgomery(expected_normal.y, h_expected.y);
+  expected_normal.infinity = h_expected.infinity;
+
+  // Check: result should match expected (3*G)
+  EXPECT_EQ(result_normal.infinity, expected_normal.infinity)
+      << "Infinity flag should match";
+  if (!result_normal.infinity && !expected_normal.infinity) {
+    EXPECT_EQ(fp_cmp(result_normal.x, expected_normal.x), FpComparison::Equal)
+        << "x-coordinate should match 3*G";
+    EXPECT_EQ(fp_cmp(result_normal.y, expected_normal.y), FpComparison::Equal)
+        << "y-coordinate should match 3*G";
+  }
+
+  // Cleanup
+  cuda_drop_with_size_tracking_async(d_point, stream, gpu_index, true);
+  cuda_drop_with_size_tracking_async(d_scalar, stream, gpu_index, true);
+  cuda_drop_with_size_tracking_async(d_result, stream, gpu_index, true);
+  cuda_drop_with_size_tracking_async(d_expected, stream, gpu_index, true);
+}
--- a/backends/zk-cuda-backend/src/bindings.rs
+++ b/backends/zk-cuda-backend/src/bindings.rs
@@ -139,37 +139,31 @@ unsafe extern "C" {
    pub fn g2_is_infinity_wrapper(point: *const G2Point) -> bool;
 }
 unsafe extern "C" {
-    pub fn g1_msm_unmanaged_wrapper_async(
+    pub fn g1_msm_unmanaged_wrapper(
        stream: cudaStream_t,
        gpu_index: u32,
-        h_result: *mut G1ProjectivePoint,
+        d_result: *mut G1ProjectivePoint,
        d_points: *const G1Point,
        d_scalars: *const Scalar,
-        n: u32,
        d_scratch: *mut G1ProjectivePoint,
-        gpu_memory_allocated: bool,
+        n: u32,
+        points_in_montgomery: bool,
        size_tracker: *mut u64,
    );
 }
 unsafe extern "C" {
-    pub fn g2_msm_unmanaged_wrapper_async(
+    pub fn g2_msm_unmanaged_wrapper(
        stream: cudaStream_t,
        gpu_index: u32,
-        h_result: *mut G2ProjectivePoint,
+        d_result: *mut G2ProjectivePoint,
        d_points: *const G2Point,
        d_scalars: *const Scalar,
-        n: u32,
        d_scratch: *mut G2ProjectivePoint,
-        gpu_memory_allocated: bool,
+        n: u32,
+        points_in_montgomery: bool,
        size_tracker: *mut u64,
    );
 }
-unsafe extern "C" {
-    pub fn pippenger_scratch_size_g1_wrapper(n: u32, gpu_index: u32) -> usize;
-}
-unsafe extern "C" {
-    pub fn pippenger_scratch_size_g2_wrapper(n: u32, gpu_index: u32) -> usize;
-}
 unsafe extern "C" {
    pub fn g1_msm_managed_wrapper(
        stream: cudaStream_t,
--- a/backends/zk-cuda-backend/src/c_wrapper.cu
+++ b/backends/zk-cuda-backend/src/c_wrapper.cu
@@ -1,7 +1,6 @@
 // C wrapper functions for Rust FFI
 // These functions provide a C-compatible interface to the C++ functions

-#include "checked_arithmetic.h"
 #include "curve.h"
 #include "device.h"
 #include "msm.h"
@@ -63,77 +62,111 @@ bool g2_is_infinity_wrapper(const G2Affine* point) {
    return g2_is_infinity(*point);
 }

-// Unmanaged MSM wrapper for G1 (points/scalars/scratch on device, result on host)
-// Points MUST be in Montgomery form. Caller provides scratch buffer and
-// controls allocation tracking via gpu_memory_allocated.
-// Zero internal allocations — this is a thin validation + dispatch layer.
-void g1_msm_unmanaged_wrapper_async(
+// Unmanaged MSM wrapper for G1 (assumes all data is already on device)
+// If points_in_montgomery is false, a temporary copy will be made and converted.
+// For best performance, provide points already in Montgomery form to avoid allocation overhead.
+// NOTE: This wrapper synchronizes the stream before returning — callers do not need to sync.
+void g1_msm_unmanaged_wrapper(
    cudaStream_t stream,
    uint32_t gpu_index,
-    G1Projective* h_result,
+    G1Projective* d_result,
    const G1Affine* d_points,
    const Scalar* d_scalars,
-    uint32_t n,
    G1Projective* d_scratch,
-    bool gpu_memory_allocated,
+    uint32_t n,
+    bool points_in_montgomery,
    uint64_t* size_tracker
 ) {
    PANIC_IF_FALSE(size_tracker != nullptr, "G1 MSM error: size_tracker is null");
    uint64_t& size_tracker_ref = *size_tracker;
    PANIC_IF_FALSE(n > 0, "G1 MSM error: n must be positive, got %u", n);
    PANIC_IF_FALSE(stream != nullptr, "G1 MSM error: stream is null");
-    PANIC_IF_FALSE(h_result != nullptr, "G1 MSM error: h_result is null");
+    PANIC_IF_FALSE(d_result != nullptr, "G1 MSM error: d_result is null");
    PANIC_IF_FALSE(d_points != nullptr, "G1 MSM error: d_points is null");
    PANIC_IF_FALSE(d_scalars != nullptr, "G1 MSM error: d_scalars is null");
    PANIC_IF_FALSE(d_scratch != nullptr, "G1 MSM error: d_scratch is null");
-    PANIC_IF_FALSE(gpu_index < static_cast<uint32_t>(cuda_get_number_of_gpus()),
+    PANIC_IF_FALSE(gpu_index < (uint32_t)cuda_get_number_of_gpus(),
                   "G1 MSM error: invalid gpu_index=%u (gpu_count=%d)", gpu_index,
                   cuda_get_number_of_gpus());

-    point_msm_g1_async(stream, gpu_index, h_result, d_points, d_scalars, n,
-                       d_scratch, size_tracker_ref, gpu_memory_allocated);
+    const G1Affine* points_to_use = d_points;
+    G1Affine* d_points_converted = nullptr;
+
+    if (!points_in_montgomery) {
+        size_t points_bytes = 0;
+        bool overflow = __builtin_mul_overflow((size_t)n, sizeof(G1Affine), &points_bytes);
+        PANIC_IF_FALSE(!overflow,
+                       "G1 MSM unmanaged error: points byte size overflow (n=%u)", n);
+        d_points_converted = static_cast<G1Affine*>(cuda_malloc_with_size_tracking_async(points_bytes, stream, gpu_index, size_tracker_ref, true));
+        PANIC_IF_FALSE(d_points_converted != nullptr, "G1 MSM error: failed to allocate memory for Montgomery conversion");
+        cuda_memcpy_with_size_tracking_async_gpu_to_gpu(d_points_converted, d_points, points_bytes, stream, gpu_index, true);
+        convert_g1_points_to_montgomery(stream, gpu_index, d_points_converted, n);
+        check_cuda_error(cudaGetLastError());
+        points_to_use = d_points_converted;
+    }
+
+    point_msm_async_g1(stream, gpu_index, d_result, points_to_use, d_scalars, d_scratch, n, size_tracker_ref);
    check_cuda_error(cudaGetLastError());
+
+    if (d_points_converted != nullptr) {
+        cuda_drop_with_size_tracking_async(d_points_converted, stream, gpu_index, true);
+    }
+
+    cuda_synchronize_stream(stream, gpu_index);
 }

-// Unmanaged MSM wrapper for G2 (points/scalars/scratch on device, result on host)
-// Points MUST be in Montgomery form. Caller provides scratch buffer and
-// controls allocation tracking via gpu_memory_allocated.
-// Zero internal allocations — this is a thin validation + dispatch layer.
-void g2_msm_unmanaged_wrapper_async(
+// Unmanaged MSM wrapper for G2 (assumes all data is already on device)
+// If points_in_montgomery is false, a temporary copy will be made and converted.
+// For best performance, provide points already in Montgomery form to avoid allocation overhead.
+// NOTE: This wrapper synchronizes the stream before returning — callers do not need to sync.
+void g2_msm_unmanaged_wrapper(
    cudaStream_t stream,
    uint32_t gpu_index,
-    G2Projective* h_result,
+    G2Projective* d_result,
    const G2Affine* d_points,
    const Scalar* d_scalars,
-    uint32_t n,
    G2Projective* d_scratch,
-    bool gpu_memory_allocated,
+    uint32_t n,
+    bool points_in_montgomery,
    uint64_t* size_tracker
 ) {
    PANIC_IF_FALSE(size_tracker != nullptr, "G2 MSM error: size_tracker is null");
    uint64_t& size_tracker_ref = *size_tracker;
    PANIC_IF_FALSE(n > 0, "G2 MSM error: n must be positive, got %u", n);
    PANIC_IF_FALSE(stream != nullptr, "G2 MSM error: stream is null");
-    PANIC_IF_FALSE(h_result != nullptr, "G2 MSM error: h_result is null");
+    PANIC_IF_FALSE(d_result != nullptr, "G2 MSM error: d_result is null");
    PANIC_IF_FALSE(d_points != nullptr, "G2 MSM error: d_points is null");
    PANIC_IF_FALSE(d_scalars != nullptr, "G2 MSM error: d_scalars is null");
    PANIC_IF_FALSE(d_scratch != nullptr, "G2 MSM error: d_scratch is null");
-    PANIC_IF_FALSE(gpu_index < static_cast<uint32_t>(cuda_get_number_of_gpus()),
+    PANIC_IF_FALSE(gpu_index < (uint32_t)cuda_get_number_of_gpus(),
                   "G2 MSM error: invalid gpu_index=%u (gpu_count=%d)", gpu_index,
                   cuda_get_number_of_gpus());

-    point_msm_g2_async(stream, gpu_index, h_result, d_points, d_scalars, n,
-                       d_scratch, size_tracker_ref, gpu_memory_allocated);
+    const G2Affine* points_to_use = d_points;
+    G2Affine* d_points_converted = nullptr;
+
+    if (!points_in_montgomery) {
+        size_t points_bytes = 0;
+        bool overflow = __builtin_mul_overflow((size_t)n, sizeof(G2Affine), &points_bytes);
+        PANIC_IF_FALSE(!overflow,
+                       "G2 MSM unmanaged error: points byte size overflow (n=%u)", n);
+        d_points_converted = static_cast<G2Affine*>(cuda_malloc_with_size_tracking_async(points_bytes, stream, gpu_index, size_tracker_ref, true));
+        PANIC_IF_FALSE(d_points_converted != nullptr, "G2 MSM error: failed to allocate memory for Montgomery conversion");
+        cuda_memcpy_with_size_tracking_async_gpu_to_gpu(d_points_converted, d_points, points_bytes, stream, gpu_index, true);
+        convert_g2_points_to_montgomery(stream, gpu_index, d_points_converted, n);
+        check_cuda_error(cudaGetLastError());
+        points_to_use = d_points_converted;
+    }
+
+    point_msm_async_g2(stream, gpu_index, d_result, points_to_use, d_scalars, d_scratch, n, size_tracker_ref);
    check_cuda_error(cudaGetLastError());
-}

-// Scratch size query wrappers (needed for bindgen `.*_wrapper` allowlist)
-size_t pippenger_scratch_size_g1_wrapper(uint32_t n, uint32_t gpu_index) {
-    return pippenger_scratch_size_g1(n, gpu_index);
-}
+    // Free temporary memory if allocated
+    if (d_points_converted != nullptr) {
+        cuda_drop_with_size_tracking_async(d_points_converted, stream, gpu_index, true);
+    }

-size_t pippenger_scratch_size_g2_wrapper(uint32_t n, uint32_t gpu_index) {
-    return pippenger_scratch_size_g2(n, gpu_index);
+    cuda_synchronize_stream(stream, gpu_index);
 }

 // Managed MSM wrapper for G1 (handles memory management internally)
@@ -154,48 +185,74 @@ void g1_msm_managed_wrapper(
    PANIC_IF_FALSE(stream != nullptr, "G1 MSM error: stream is null");
    PANIC_IF_FALSE(points != nullptr, "G1 MSM error: points is null");
    PANIC_IF_FALSE(scalars != nullptr, "G1 MSM error: scalars is null");
-    PANIC_IF_FALSE(gpu_index < static_cast<uint32_t>(cuda_get_number_of_gpus()),
+    PANIC_IF_FALSE(gpu_index < (uint32_t)cuda_get_number_of_gpus(),
                   "G1 MSM error: invalid gpu_index=%u (gpu_count=%d)", gpu_index,
                   cuda_get_number_of_gpus());
-
+    
    cuda_set_device(gpu_index);

+    /////////////////////////////////
+    // TODO: Move this check closer to the kernels
+    const auto threadsPerBlock = get_msm_threads_per_block<G1Affine>(n);
+    const auto num_blocks = CEIL_DIV(n, threadsPerBlock);
+    /////////////////////////////////
+
    // Compute buffer sizes with overflow checking.
-    size_t points_bytes = safe_mul_sizeof<G1Affine>(static_cast<size_t>(n));
-    size_t scalars_bytes = safe_mul_sizeof<Scalar>(static_cast<size_t>(n));
+    size_t scratch_elems = 0;
+    bool scratch_elems_overflow = __builtin_mul_overflow(
+        (size_t)(num_blocks + 1), (size_t)MSM_G1_BUCKET_COUNT, &scratch_elems);
+    PANIC_IF_FALSE(!scratch_elems_overflow,
+                   "G1 MSM error: scratch element count overflow (num_blocks=%u)",
+                   num_blocks);
+
+    size_t scratch_size = 0;
+    bool scratch_size_overflow =
+        __builtin_mul_overflow(scratch_elems, sizeof(G1Projective), &scratch_size);
+    PANIC_IF_FALSE(!scratch_size_overflow,
+                   "G1 MSM error: scratch size overflow (scratch_elems=%zu)",
+                   scratch_elems);
+
+    size_t points_bytes = 0;
+    bool points_bytes_overflow =
+        __builtin_mul_overflow((size_t)n, sizeof(G1Affine), &points_bytes);
+    PANIC_IF_FALSE(!points_bytes_overflow,
+                   "G1 MSM error: points byte size overflow (n=%u)", n);
+
+    size_t scalars_bytes = 0;
+    bool scalars_bytes_overflow =
+        __builtin_mul_overflow((size_t)n, sizeof(Scalar), &scalars_bytes);
+    PANIC_IF_FALSE(!scalars_bytes_overflow,
+                   "G1 MSM error: scalars byte size overflow (n=%u)", n);

    // TODO: We should migrate to _unmanaged_ methods and have scratch/cleanup functions as tfhe-cuda-backend
    auto* d_points = static_cast<G1Affine*>(cuda_malloc_with_size_tracking_async(points_bytes, stream, gpu_index, size_tracker_ref, true));
    auto* d_scalars = static_cast<Scalar*>(cuda_malloc_with_size_tracking_async(scalars_bytes, stream, gpu_index, size_tracker_ref, true));
-
+    auto* d_result = static_cast<G1Projective*>(cuda_malloc_with_size_tracking_async(sizeof(G1Projective), stream, gpu_index, size_tracker_ref, true));
+    auto* d_scratch = static_cast<G1Projective*>(cuda_malloc_with_size_tracking_async(scratch_size, stream, gpu_index, size_tracker_ref, true));
+    
+    PANIC_IF_FALSE(d_points && d_scalars && d_result && d_scratch, 
+                   "G1 MSM error: device memory allocation failed");
+    
    // Always copy points to GPU first
    cuda_memcpy_with_size_tracking_async_to_gpu(d_points, points, points_bytes, stream, gpu_index, true);
    cuda_memcpy_with_size_tracking_async_to_gpu(d_scalars, scalars, scalars_bytes, stream, gpu_index, true);
-
+    
    // Convert to Montgomery form on GPU if not already in Montgomery form
    if (!points_in_montgomery) {
        convert_g1_points_to_montgomery(stream, gpu_index, d_points, n);
        check_cuda_error(cudaGetLastError());
    }

-    // Allocate scratch buffer sized to match the pippenger internal partitioning
-    size_t scratch_bytes = pippenger_scratch_size_g1(n, gpu_index);
-    auto* d_scratch = static_cast<G1Projective*>(cuda_malloc_with_size_tracking_async(
-        scratch_bytes, stream, gpu_index, size_tracker_ref, true));
-
-    PANIC_IF_FALSE(d_points && d_scalars && d_scratch,
-                   "G1 MSM error: device memory allocation failed");
-
-    // Result written directly to host pointer -- no device round-trip needed
-    point_msm_g1_async(stream, gpu_index, result, d_points, d_scalars, n,
-                       d_scratch, size_tracker_ref, true);
+    point_msm_async_g1(stream, gpu_index, d_result, d_points, d_scalars, d_scratch, n, size_tracker_ref);
    check_cuda_error(cudaGetLastError());

-    cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);
+    cuda_memcpy_async_to_cpu(result, d_result, sizeof(G1Projective), stream, gpu_index);
+
    cuda_drop_with_size_tracking_async(d_points, stream, gpu_index, true);
    cuda_drop_with_size_tracking_async(d_scalars, stream, gpu_index, true);
+    cuda_drop_with_size_tracking_async(d_result, stream, gpu_index, true);
+    cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);

-    // Sync for the async frees above.
    cuda_synchronize_stream(stream, gpu_index);
 }

@@ -218,46 +275,69 @@ void g2_msm_managed_wrapper(
    PANIC_IF_FALSE(stream != nullptr, "G2 MSM error: stream is null");
    PANIC_IF_FALSE(points != nullptr, "G2 MSM error: points is null");
    PANIC_IF_FALSE(scalars != nullptr, "G2 MSM error: scalars is null");
-    PANIC_IF_FALSE(gpu_index < static_cast<uint32_t>(cuda_get_number_of_gpus()),
+    PANIC_IF_FALSE(gpu_index < (uint32_t)cuda_get_number_of_gpus(),
                   "G2 MSM error: invalid gpu_index=%u (gpu_count=%d)", gpu_index,
                   cuda_get_number_of_gpus());
-
+    
    cuda_set_device(gpu_index);
+    
+    const auto threadsPerBlock = get_msm_threads_per_block<G2Affine>(n);
+    const auto num_blocks = CEIL_DIV(n, threadsPerBlock);

    // Compute buffer sizes with overflow checking.
-    size_t points_bytes = safe_mul_sizeof<G2Affine>(static_cast<size_t>(n));
-    size_t scalars_bytes = safe_mul_sizeof<Scalar>(static_cast<size_t>(n));
+    size_t scratch_elems = 0;
+    bool scratch_elems_overflow = __builtin_mul_overflow(
+        (size_t)(num_blocks + 1), (size_t)MSM_G2_BUCKET_COUNT, &scratch_elems);
+    PANIC_IF_FALSE(!scratch_elems_overflow,
+                   "G2 MSM error: scratch element count overflow (num_blocks=%u)",
+                   num_blocks);

-    // TODO: We should migrate to _unmanaged_ methods and have scratch/cleanup functions as tfhe-cuda-backend
+    size_t scratch_size = 0;
+    bool scratch_size_overflow =
+        __builtin_mul_overflow(scratch_elems, sizeof(G2Projective), &scratch_size);
+    PANIC_IF_FALSE(!scratch_size_overflow,
+                   "G2 MSM error: scratch size overflow (scratch_elems=%zu)",
+                   scratch_elems);
+
+    size_t points_bytes = 0;
+    bool points_bytes_overflow =
+        __builtin_mul_overflow((size_t)n, sizeof(G2Affine), &points_bytes);
+    PANIC_IF_FALSE(!points_bytes_overflow,
+                   "G2 MSM error: points byte size overflow (n=%u)", n);
+
+    size_t scalars_bytes = 0;
+    bool scalars_bytes_overflow =
+        __builtin_mul_overflow((size_t)n, sizeof(Scalar), &scalars_bytes);
+    PANIC_IF_FALSE(!scalars_bytes_overflow,
+                   "G2 MSM error: scalars byte size overflow (n=%u)", n);
+    
    auto* d_points = static_cast<G2Affine*>(cuda_malloc_with_size_tracking_async(points_bytes, stream, gpu_index, size_tracker_ref, true));
    auto* d_scalars = static_cast<Scalar*>(cuda_malloc_with_size_tracking_async(scalars_bytes, stream, gpu_index, size_tracker_ref, true));
-
+    auto* d_result = static_cast<G2Projective*>(cuda_malloc_with_size_tracking_async(sizeof(G2Projective), stream, gpu_index, size_tracker_ref, true));
+    auto* d_scratch = static_cast<G2Projective*>(cuda_malloc_with_size_tracking_async(scratch_size, stream, gpu_index, size_tracker_ref, true));
+    
+    PANIC_IF_FALSE(d_points && d_scalars && d_result && d_scratch, 
+                   "G2 MSM error: device memory allocation failed");
+    
    cuda_memcpy_with_size_tracking_async_to_gpu(d_points, points, points_bytes, stream, gpu_index, true);
    cuda_memcpy_with_size_tracking_async_to_gpu(d_scalars, scalars, scalars_bytes, stream, gpu_index, true);
-
+    
    if (!points_in_montgomery) {
        convert_g2_points_to_montgomery(stream, gpu_index, d_points, n);
        check_cuda_error(cudaGetLastError());
    }

-    // Allocate scratch buffer sized to match the pippenger internal partitioning
-    size_t scratch_bytes = pippenger_scratch_size_g2(n, gpu_index);
-    auto* d_scratch = static_cast<G2Projective*>(cuda_malloc_with_size_tracking_async(
-        scratch_bytes, stream, gpu_index, size_tracker_ref, true));
-
-    PANIC_IF_FALSE(d_points && d_scalars && d_scratch,
-                   "G2 MSM error: device memory allocation failed");
-
-    // Result written directly to host pointer -- no device round-trip needed
-    point_msm_g2_async(stream, gpu_index, result, d_points, d_scalars, n,
-                       d_scratch, size_tracker_ref, true);
+    point_msm_async_g2(stream, gpu_index, d_result, d_points, d_scalars, d_scratch, n, size_tracker_ref);
    check_cuda_error(cudaGetLastError());

-    cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);
+
+    cuda_memcpy_async_to_cpu(result, d_result, sizeof(G2Projective), stream, gpu_index);
+
    cuda_drop_with_size_tracking_async(d_points, stream, gpu_index, true);
    cuda_drop_with_size_tracking_async(d_scalars, stream, gpu_index, true);
+    cuda_drop_with_size_tracking_async(d_result, stream, gpu_index, true);
+    cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);

-    // Sync for the async frees above.
    cuda_synchronize_stream(stream, gpu_index);
 }

--- a/backends/zk-cuda-backend/src/include/api.h
+++ b/backends/zk-cuda-backend/src/include/api.h
@@ -2,7 +2,6 @@

 #include <stdint.h>
 #include <stdbool.h>
-#include <stddef.h>

 #ifdef __cplusplus
 extern "C" {
@@ -99,38 +98,33 @@ bool g1_is_infinity_wrapper(const G1Point* point);
 // Check if G2 point is at infinity
 bool g2_is_infinity_wrapper(const G2Point* point);

-// Unmanaged MSM wrappers (points/scalars/scratch on device, result on host)
-// Points MUST be in Montgomery form. Caller provides a scratch buffer.
-// Zero internal allocations — all device memory is caller-provided.
-void g1_msm_unmanaged_wrapper_async(
+// Unmanaged MSM wrappers (assumes all data is already on device)
+// If points_in_montgomery is false, a temporary copy will be made and converted.
+// For best performance, provide points already in Montgomery form to avoid allocation overhead.
+void g1_msm_unmanaged_wrapper(
    cudaStream_t stream,
    uint32_t gpu_index,
-    G1ProjectivePoint* h_result,
+    G1ProjectivePoint* d_result,
    const G1Point* d_points,
    const Scalar* d_scalars,
-    uint32_t n,
    G1ProjectivePoint* d_scratch,
-    bool gpu_memory_allocated,
+    uint32_t n,
+    bool points_in_montgomery,
    uint64_t* size_tracker
 );

-void g2_msm_unmanaged_wrapper_async(
+void g2_msm_unmanaged_wrapper(
    cudaStream_t stream,
    uint32_t gpu_index,
-    G2ProjectivePoint* h_result,
+    G2ProjectivePoint* d_result,
    const G2Point* d_points,
    const Scalar* d_scalars,
-    uint32_t n,
    G2ProjectivePoint* d_scratch,
-    bool gpu_memory_allocated,
+    uint32_t n,
+    bool points_in_montgomery,
    uint64_t* size_tracker
 );

-// Scratch size queries for Pippenger MSM
-// Returns the exact scratch buffer size in bytes needed for a given input count.
-size_t pippenger_scratch_size_g1_wrapper(uint32_t n, uint32_t gpu_index);
-size_t pippenger_scratch_size_g2_wrapper(uint32_t n, uint32_t gpu_index);
-
 // Managed MSM wrappers with BigInt scalars (320-bit scalars)
 // Handles memory allocation and transfers internally.
 void g1_msm_managed_wrapper(
--- a/backends/zk-cuda-backend/src/types/g1.rs
+++ b/backends/zk-cuda-backend/src/types/g1.rs
@@ -84,8 +84,6 @@ impl G1Affine {
    }
 }

-/// Displays coordinates in decimal. Assumes the point is in Montgomery form (e.g., from MSM
-/// output).
 impl fmt::Display for G1Affine {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.is_infinity() {
@@ -266,9 +264,8 @@ impl G1Projective {
        let mut size_tracker: u64 = 0;
        // NOTE: This method uses the managed API (g1_msm_managed_wrapper) which handles
        // memory allocation and transfers internally. For a pure-GPU verify/proof implementation
-        // where all data is already on the device and memory is managed externally, use the
-        // unmanaged API (g1_msm_unmanaged_wrapper_async) instead — it performs zero internal
-        // allocations (caller provides d_scratch via pippenger_scratch_size_g1_wrapper).
+        // where all data is already on the device and memory is managed externally, consider
+        // using the unmanaged API (g1_msm_unmanaged_wrapper) instead for better performance.
        //
        // SAFETY:
        // - `stream` was validated as non-null above and must be a valid `cudaStream_t` obtained
@@ -281,10 +278,6 @@ impl G1Projective {
        // - `points_ffi` and `scalars_ffi` are valid Vec slices with matching length `n`
        // - `result` and `size_tracker` are valid stack-allocated outputs
        // - The managed wrapper handles all device memory allocation/deallocation internally
-        // - Failure: The C++ managed wrapper validates all inputs via PANIC_IF_FALSE and checks
-        //   CUDA errors via cudaGetLastError() after each kernel launch.
-        // - Success: The C++ managed wrapper calls cuda_synchronize_stream before returning,
-        //   ensuring `result` contains the final MSM output.
        unsafe {
            crate::bindings::g1_msm_managed_wrapper(
                stream as crate::bindings::cudaStream_t,
@@ -302,7 +295,6 @@ impl G1Projective {
    }
 }

-/// Converts to affine and displays. Assumes coordinates are in Montgomery form.
 impl fmt::Display for G1Projective {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let affine = self.to_affine();
--- a/backends/zk-cuda-backend/src/types/g2.rs
+++ b/backends/zk-cuda-backend/src/types/g2.rs
@@ -84,8 +84,6 @@ impl G2Affine {
    }
 }

-/// Displays coordinates in decimal. Assumes the point is in Montgomery form (e.g., from MSM
-/// output).
 impl fmt::Display for G2Affine {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.is_infinity() {
@@ -274,9 +272,8 @@ impl G2Projective {
        let mut size_tracker: u64 = 0;
        // NOTE: This method uses the managed API (g2_msm_managed_wrapper) which handles
        // memory allocation and transfers internally. For a pure-GPU verify/proof implementation
-        // where all data is already on the device and memory is managed externally, use the
-        // unmanaged API (g2_msm_unmanaged_wrapper_async) instead — it performs zero internal
-        // allocations (caller provides d_scratch via pippenger_scratch_size_g2_wrapper).
+        // where all data is already on the device and memory is managed externally, consider
+        // using the unmanaged API (g2_msm_unmanaged_wrapper) instead for better performance.
        //
        // SAFETY:
        // - `stream` was validated as non-null above and must be a valid `cudaStream_t` obtained
@@ -289,10 +286,6 @@ impl G2Projective {
        // - `points_ffi` and `scalars_ffi` are valid Vec slices with matching length `n`
        // - `result` and `size_tracker` are valid stack-allocated outputs
        // - The managed wrapper handles all device memory allocation/deallocation internally
-        // - Failure: The C++ managed wrapper validates all inputs via PANIC_IF_FALSE and checks
-        //   CUDA errors via cudaGetLastError() after each kernel launch.
-        // - Success: The C++ managed wrapper calls cuda_synchronize_stream before returning,
-        //   ensuring `result` contains the final MSM output.
        unsafe {
            crate::bindings::g2_msm_managed_wrapper(
                stream as crate::bindings::cudaStream_t,
@@ -310,7 +303,6 @@ impl G2Projective {
    }
 }

-/// Converts to affine and displays. Assumes coordinates are in Montgomery form.
 impl fmt::Display for G2Projective {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let affine = self.to_affine();
--- a/ci/data_extractor/src/benchmark_specs.py
+++ b/ci/data_extractor/src/benchmark_specs.py
@@ -34,7 +34,6 @@ class Layer(enum.StrEnum):
    Integer = "integer"
    Shortint = "shortint"
    CoreCrypto = "core_crypto"
-    Wasm = "wasm"

    @staticmethod
    def from_str(layer_name):
@@ -47,8 +46,6 @@ class Layer(enum.StrEnum):
                return Layer.Shortint
            case "core_crypto":
                return Layer.CoreCrypto
-            case "wasm":
-                return Layer.Wasm
            case _:
                raise NotImplementedError(f"layer '{layer_name}' not supported")

@@ -303,7 +300,6 @@ class BenchType(enum.StrEnum):
 class BenchSubset(enum.StrEnum):
    All = "all"
    Erc20 = "erc20"
-    Zk = "zk"

    @staticmethod
    def from_str(bench_subset):
@@ -312,52 +308,10 @@ class BenchSubset(enum.StrEnum):
                return BenchSubset.All
            case "erc20":
                return BenchSubset.Erc20
-            case "zk":
-                return BenchSubset.Zk
            case _:
                raise ValueError(f"BenchSubset '{bench_subset}' not supported")


-class ZKOperation(enum.StrEnum):
-    """
-    Operations names mapped to their display in the public documentation.
-    """
-
-    Proof = "Proving"
-    Verify = "Verifying"
-    VerifyAndExpand = "Verify + expand"
-
-    @staticmethod
-    def from_str(op_name):
-        match op_name.lower().rsplit("pke_zk_")[-1]:
-            case "proof":
-                return ZKOperation.Proof
-            case "verify":
-                return ZKOperation.Verify
-            case "verify_and_expand":
-                return ZKOperation.VerifyAndExpand
-            case _:
-                raise ValueError(f"ZK operation '{op_name}' not supported")
-
-
-class ZKComputeLoad(enum.StrEnum):
-    Proof = "slow proof / fast verify"
-    Verify = "fast proof / slow verify"
-
-    @staticmethod
-    def from_str(load):
-        match load.lower():
-            case "proof":
-                return ZKComputeLoad.Proof
-            case "verify":
-                return ZKComputeLoad.Verify
-            case _:
-                raise ValueError(f"ZK compute load '{load}' not supported")
-
-    def fs_safe_str(self):
-        return self.value.replace(" ", "_").replace("/", "and")
-
-
 class ParamsDefinition:
    """
    Represents a parameter definition for specific cryptographic settings.
@@ -515,7 +469,6 @@ class BenchDetails:
    def __init__(self, layer: Layer, bench_full_name: str, bit_size: int):
        self.layer = layer

-        self.bench_type = BenchType.Latency
        self.operation_name = None
        self.bit_size = bit_size
        self.params = None
@@ -523,12 +476,11 @@ class BenchDetails:
        self.sign_flavor = None
        # Only relevant for HLApi layer
        self.rust_type = None
-        self.case_variation = None

        self.parse_test_name(bench_full_name)

    def __repr__(self):
-        return f"BenchDetails(layer={self.layer.value}, type={self.bench_type}, operation_name={self.operation_name}, bit_size={self.bit_size}, params={self.params}, sign={self.sign_flavor or 'N/A'}, case={self.case_variation or 'N/A'})"
+        return f"BenchDetails(layer={self.layer.value}, operation_name={self.operation_name}, bit_size={self.bit_size}, params={self.params}, sign={self.sign_flavor or 'N/A'})"

    def __str__(self):
        return self.__repr__()
@@ -536,26 +488,22 @@ class BenchDetails:
    def __eq__(self, other):
        return (
            self.layer == other.layer
-            and self.bench_type == other.bench_type
            and self.operation_name == other.operation_name
            and self.bit_size == other.bit_size
            and self.params == other.params
            and self.sign_flavor == other.sign_flavor
            and self.rust_type == other.rust_type
-            and self.case_variation == other.case_variation
        )

    def __hash__(self):
        return hash(
            (
                self.layer,
-                self.bench_type,
                self.operation_name,
                self.bit_size,
                self.params,
                self.rust_type,
                self.sign_flavor,
-                self.case_variation,
            )
        )

@@ -570,9 +518,6 @@ class BenchDetails:
        """
        parts = name.split("::")

-        if "throughput" in parts:
-            self.bench_type = BenchType.Throughput
-
        for part in parts:
            if "PARAM" in part:
                self.params = part.partition("_mean")[0]
@@ -580,11 +525,7 @@ class BenchDetails:

        match self.layer:
            case Layer.Integer:
-                op_name_index = 2 if parts[1] in ["cuda", "hpu", "zk"] else 1
-
-                if self.params and not parts[-1].startswith(self.params):
-                    self.case_variation = parts[-1].partition("_mean")[0]
-
+                op_name_index = 2 if parts[1] in ["cuda", "hpu"] else 1
                if parts[op_name_index] == "signed":
                    op_name_index += 1
                    self.sign_flavor = SignFlavor.Signed
@@ -623,12 +564,6 @@ class BenchDetails:
                self.rust_type = parts[-1].partition("_mean")[0]
            case Layer.Shortint:
                self.operation_name = parts[1]
-            case Layer.Wasm:
-                op_name_index = 2 if parts[1] in ["cuda", "hpu", "zk"] else 1
-                self.operation_name = parts[op_name_index]
-
-                if self.params and not parts[-1].startswith(self.params):
-                    self.case_variation = parts[-1].partition("_mean")[0]
            case _:
                raise NotImplementedError(
                    f"layer '{self.layer}' not supported yet for name parsing"
--- a/ci/data_extractor/src/config.py
+++ b/ci/data_extractor/src/config.py
@@ -35,8 +35,6 @@ class UserConfig:

        self.bench_subset = BenchSubset.from_str(input_args.bench_subset)

-        self.name_suffix = input_args.name_suffix
-
        self.layer = Layer.from_str(input_args.layer.lower())
        self.pbs_kind = PBSKind.from_str(input_args.pbs_kind)
        self.grouping_factor = input_args.grouping_factor
--- a/ci/data_extractor/src/connector.py
+++ b/ci/data_extractor/src/connector.py
@@ -132,7 +132,7 @@ class PostgreConnector:
        operation_filter: list = None,
        layer: Layer = None,
        branch: str = None,
-        name_suffix: str = None,
+        name_suffix: str = "_mean_avx512",
        last_value_only: bool = True,
    ) -> dict[BenchDetails, list[int]]:
        """
@@ -155,7 +155,7 @@ class PostgreConnector:
        :type layer: Layer, optional
        :param branch: Optional branch filter, defaulting to the user's head branch if not specified.
        :type branch: str, optional
-        :param name_suffix: Suffix to match the test names.
+        :param name_suffix: Suffix to match the test names, defaulting to "_mean_avx512".
        :type name_suffix: str, optional
        :param last_value_only: A flag indicating whether to fetch only the most recent metric value for each benchmark.
        :type last_value_only: bool
@@ -169,7 +169,6 @@ class PostgreConnector:
        layer = layer if layer else user_config.layer
        version = user_config.project_version
        pbs_kind = user_config.pbs_kind
-        name_suffix = name_suffix if name_suffix else user_config.name_suffix

        timestamp_range_end = user_config.bench_date
        timestamp = datetime.datetime.fromisoformat(timestamp_range_end)
--- a/ci/data_extractor/src/data_extractor.py
+++ b/ci/data_extractor/src/data_extractor.py
@@ -24,7 +24,6 @@ import connector
 import formatters.core
 import formatters.hlapi
 import formatters.integer
-import formatters.wasm
 import regression
 from benchmark_specs import BenchSubset, BenchType, Layer, OperandType, RustType
 from formatters.common import BenchArray, CSVFormatter, MarkdownFormatter, SVGFormatter
@@ -137,16 +136,13 @@ parser.add_argument(
 parser.add_argument(
    "--bench-subset",
    dest="bench_subset",
-    choices=["all", "erc20", "zk"],
+    choices=[
+        "all",
+        "erc20",
+    ],
    default="all",
    help="Subset of benchmarks to filter against, dedicated formatting will be applied",
 )
-parser.add_argument(
-    "--name-suffix",
-    dest="name_suffix",
-    default="_mean_avx512",
-    help="Suffix to match the test names",
-)
 parser.add_argument(
    "--regression-profiles",
    dest="regression_profiles",
@@ -287,11 +283,6 @@ def get_formatter(layer: Layer, bench_subset: BenchSubset):
    match bench_subset:
        case BenchSubset.Erc20:
            return formatters.hlapi.Erc20Formatter
-        case BenchSubset.Zk:
-            if layer == Layer.Wasm:
-                return formatters.wasm.ZKFormatter
-            else:
-                return formatters.integer.ZKFormatter

    match layer:
        case Layer.Integer:
@@ -434,26 +425,6 @@ def generate_files_from_arrays(
            )


-def get_operands_types(layer: Layer, bench_subset: BenchSubset = None):
-    ciphertext_only = (OperandType.CipherText,)
-    ciphertext_and_plaintext = (OperandType.CipherText, OperandType.PlainText)
-
-    if layer == Layer.CoreCrypto:
-        return ciphertext_only
-    elif bench_subset:
-        match bench_subset:
-            case BenchSubset.Zk | BenchSubset.Erc20:
-                return ciphertext_only
-            case BenchSubset.All:
-                return ciphertext_and_plaintext
-            case _:
-                raise NotImplementedError(
-                    f"operand types cannot be defined for bench subset '{bench_subset}'"
-                )
-    else:
-        return ciphertext_and_plaintext
-
-
 if __name__ == "__main__":
    args = parser.parse_args()
    user_config = config.UserConfig(args)
@@ -501,9 +472,7 @@ if __name__ == "__main__":
        args.hardware_comp.lower().split(",") if args.hardware_comp else None
    )

-    operands_types = get_operands_types(layer, bench_subset)
-
-    for operand_type in operands_types:
+    for operand_type in (OperandType.CipherText, OperandType.PlainText):
        if hardware_list:
            perform_hardware_comparison(user_config, layer)

@@ -511,6 +480,11 @@ if __name__ == "__main__":
                print("Markdown generation is not supported with comparisons")
            continue

+        if (
+            layer == Layer.CoreCrypto or (layer == Layer.HLApi and bench_subset)
+        ) and operand_type == OperandType.PlainText:
+            continue
+
        file_suffix = f"_{operand_type.lower()}"
        arrays = perform_data_extraction(
            user_config,
--- a/ci/data_extractor/src/formatters/common.py
+++ b/ci/data_extractor/src/formatters/common.py
@@ -15,8 +15,6 @@ from benchmark_specs import (
    OperandType,
    PBSKind,
    RustType,
-    ZKComputeLoad,
-    ZKOperation,
 )
 from py_markdown_table.markdown_table import markdown_table

@@ -309,109 +307,6 @@ class MarkdownFormatter(GenericFormatter):
        return md_array


-class ZKGenericFormatter(GenericFormatter):
-    INPUTS_PROOF_COLUMN_HEADERS = f"Inputs ({ZKComputeLoad.Proof.value})"
-    INPUTS_VERIFY_COLUMN_HEADERS = f"Inputs ({ZKComputeLoad.Verify.value})"
-    DEFAULT_CRS_SIZE = 2048
-
-    @staticmethod
-    def _get_default_dict() -> collections.defaultdict:
-        raise NotImplementedError("This method must be implemented by subclasses")
-
-    @staticmethod
-    def _match_case_variation_filter(case_variation: dict):
-        raise NotImplementedError("This method must be implemented by subclasses")
-
-    def _format_data(self, data: dict[BenchDetails : list[int]], conversion_func):
-        formatted = self._get_default_dict()
-
-        for details, timings in data.items():
-            parsed_case_variation = self._parse_benchmarks_case_variation(
-                details.case_variation
-            )
-
-            if not (
-                (parsed_case_variation["crs_size"] == self.DEFAULT_CRS_SIZE)
-                and self._match_case_variation_filter(parsed_case_variation)
-            ):
-                continue
-
-            test_name = "::".join(
-                [
-                    parsed_case_variation["compute_load"],
-                    str(parsed_case_variation["packed_size"]),
-                    str(parsed_case_variation["crs_size"]),
-                ]
-            )
-
-            value = conversion_func(timings[-1])
-            formatted[test_name][ZKOperation.from_str(details.operation_name)] = value
-        return formatted
-
-    @staticmethod
-    def _parse_benchmarks_case_variation(case_variation: str):
-        parts = case_variation.split("_")
-        return {
-            "packed_size": int(parts[0]),
-            "crs_size": int(parts[3]),
-            "compute_load": parts[8],
-        }
-
-    def _generate_arrays(self, data, *args, **kwargs):
-        # Sorted as they appear in the public documentation.
-        input_names = {
-            64: "1xFheUint64 (64 bits)",
-            256: "4xFheUint64 (256 bits) ",
-            2048: "32xFheUint64 (2048 bits)",
-        }
-
-        sorted_with_compute_load = {
-            ZKComputeLoad.Proof: {},
-            ZKComputeLoad.Verify: {},
-        }
-
-        result_lines_compute_load_proof = []
-        result_lines_compute_load_verify = []
-
-        for key in data:
-            compute_load, packed_bits, _ = key.split("::")
-            packed_bits = int(packed_bits)
-
-            if packed_bits not in input_names:
-                continue
-
-            sorted_with_compute_load[ZKComputeLoad.from_str(compute_load)][
-                packed_bits
-            ] = data[key]
-
-        for load, results in sorted_with_compute_load.items():
-            if load == ZKComputeLoad.Proof:
-                table = result_lines_compute_load_proof
-                header = self.INPUTS_PROOF_COLUMN_HEADERS
-            elif load == ZKComputeLoad.Verify:
-                table = result_lines_compute_load_verify
-                header = self.INPUTS_VERIFY_COLUMN_HEADERS
-
-            # The following loop ensures display consistency between inputs
-            for packed_bits, input_name in input_names.items():
-                line = {header: input_name}
-                line.update({op.value: v for op, v in results[packed_bits].items()})
-                table.append(line)
-
-        return [
-            BenchArray(
-                result_lines_compute_load_proof,
-                self.layer,
-                metadata={"compute_load": ZKComputeLoad.Proof.fs_safe_str()},
-            ),
-            BenchArray(
-                result_lines_compute_load_verify,
-                self.layer,
-                metadata={"compute_load": ZKComputeLoad.Verify.fs_safe_str()},
-            ),
-        ]
-
-
 # -------------
 # SVG constants
 # -------------
@@ -512,7 +407,7 @@ class SVGFormatter(GenericFormatter):
                        )
                    else:  # Backends comparison (CPU, GPU, HPU)
                        header_elements.append(header_one_row_span)
-                case Layer.HLApi | Layer.CoreCrypto | Layer.Wasm:
+                case Layer.HLApi | Layer.CoreCrypto:
                    # Core_crypto arrays contains only ciphertext modulus size as headers
                    header_elements.append(header_one_row_span)
                case _:
--- a/ci/data_extractor/src/formatters/integer/integer.py
+++ b/ci/data_extractor/src/formatters/integer/integer.py
@@ -5,17 +5,10 @@ from benchmark_specs import (
    ALL_RUST_INTEGER_TYPES,
    Backend,
    BenchDetails,
-    BenchType,
    OperandType,
    RustType,
-    ZKOperation,
-)
-from formatters.common import (
-    OPERATION_SIZE_COLUMN_HEADER,
-    BenchArray,
-    GenericFormatter,
-    ZKGenericFormatter,
 )
+from formatters.common import OPERATION_SIZE_COLUMN_HEADER, BenchArray, GenericFormatter


 class OperationDisplayName(enum.StrEnum):
@@ -240,21 +233,3 @@ class IntegerFormatter(GenericFormatter):
        return [
            BenchArray(result_lines, self.layer),
        ]
-
-
-class ZKFormatter(ZKGenericFormatter):
-    @staticmethod
-    def _get_default_dict() -> collections.defaultdict:
-        return collections.defaultdict(
-            lambda: {
-                ZKOperation.Proof: "N/A",
-                ZKOperation.Verify: "N/A",
-                ZKOperation.VerifyAndExpand: "N/A",
-            }
-        )
-
-    @staticmethod
-    def _match_case_variation_filter(*args, **kwargs):
-        # At this layer, server-like ZK are performed there are no variations such as browser kind.
-        # Simply match all cases.
-        return True
--- a/ci/data_extractor/src/formatters/wasm/init.py
+++ b/ci/data_extractor/src/formatters/wasm/init.py
@@ -1 +0,0 @@
-from .wasm import *
--- a/ci/data_extractor/src/formatters/wasm/wasm.py
+++ b/ci/data_extractor/src/formatters/wasm/wasm.py
@@ -1,82 +0,0 @@
-import collections
-import enum
-
-from benchmark_specs import ZKOperation
-from formatters.common import ZKGenericFormatter
-
-
-class Browser(enum.StrEnum):
-    Chrome = "chrome"
-    Firefox = "firefox"
-
-    @staticmethod
-    def from_str(browser_name):
-        match browser_name.lower():
-            case "chrome":
-                return Browser.Chrome
-            case "firefox":
-                return Browser.Firefox
-            case _:
-                raise ValueError(f"Browser '{browser_name}' not supported")
-
-
-DEFAULT_BROWSER = Browser.Chrome
-
-
-class ZKFormatter(ZKGenericFormatter):
-    @staticmethod
-    def _get_default_dict() -> collections.defaultdict:
-        return collections.defaultdict(
-            lambda: {
-                ZKOperation.Proof: "N/A",
-            }
-        )
-
-    @staticmethod
-    def _parse_benchmarks_case_variation(case_variation: str):
-        parts = case_variation.split("_")
-        case = {
-            "packed_size": int(parts[0]),
-            "crs_size": int(parts[3]),
-            "compute_load": parts[8],
-            "sub_variation": {},
-        }
-        try:
-            sub_variation_parts = parts[9:]
-        except IndexError:
-            # No sub variation for this case
-            return case
-
-        try:
-            browser = Browser.from_str(sub_variation_parts[-1])
-            sub_variation_parts.pop()
-        except ValueError:
-            browser = None
-
-        version = None
-        if sub_variation_parts[0].lower().startswith("zkv"):
-            version = sub_variation_parts.pop(0)
-
-        details = sub_variation_parts[:]
-
-        case["sub_variation"] = {
-            "version": version,
-            "browser": browser,
-            "details": details,
-        }
-
-        return case
-
-    @staticmethod
-    def _match_case_variation_filter(case_variation: dict):
-        sub_variation = case_variation["sub_variation"]
-        try:
-            # No details must be specified, otherwise it could mean that a ciphertext
-            # size measurement or a non-threaded benchmark case.
-            return (
-                sub_variation["browser"] == DEFAULT_BROWSER
-                and sub_variation["details"] == []
-            )
-        except KeyError:
-            # At least we must have a browser specified.
-            return False
--- a/ci/slab.toml
+++ b/ci/slab.toml
@@ -25,7 +25,7 @@ user = "ubuntu"
 # Profile used to build CUDA code without the need to get p-like instance.
 [backend.aws.gpu-build]
 region = "us-east-1"
-image_id = "ami-093b80553736c78e3"
+image_id = "ami-06a04649d895d10e0"
 instance_type = "m6i.4xlarge"
 user = "ubuntu"

--- a/scripts/check_scratch_cleanup.py
+++ b/scripts/check_scratch_cleanup.py
@@ -54,13 +54,13 @@ RUST_CALL_SITES = [
 # Bindings parsed from bindings.rs
 # Scratch functions: Two more than cleanup functions because of
 #  'scratch_cuda_programmable_bootstrap_32_async' and
-EXPECTED_SCRATCH_COUNT = 70
+EXPECTED_SCRATCH_COUNT = 71

 # Cuda operation functions
-EXPECTED_CUDA_COUNT = 107
+EXPECTED_CUDA_COUNT = 109

 # Cleanup functions
-EXPECTED_CLEANUP_COUNT = 70
+EXPECTED_CLEANUP_COUNT = 71

 # Check 3: Rust call-site scanning
 # Number of functions in ffi.rs files
--- a/setup_hpu.sh
+++ b/setup_hpu.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/env bash
+#!/usr/bin/env bash

 # Find current script directory. This should be PROJECT_DIR
 CUR_SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)
@@ -15,11 +15,7 @@ HPU_CONFIG="sim"
 RUST_LOG="info"

 # Setting PCI device variable: depends on the machine
-if command -v lscpi &> /dev/null; then
-    mapfile -t DEVICE< <(lspci -d 10ee:50b5)
-else
-    DEVICE=()
-fi
+mapfile -t DEVICE< <(lspci -d 10ee:50b5)
 V80_PCIE_DEV="unselected"

 # V80 bitstream refresh rely on XilinxVivado tools
@@ -33,7 +29,7 @@ opt_short="hc:l:p:"
 opt_long="help,config:,rust-log:pcie-dev"
 OPTS=$(getopt -o "$opt_short" -l "$opt_long" -- "$@")

-while [ $# -gt 0 ]
+while true
 do
    case "$1" in
        -h|--help)
@@ -63,7 +59,7 @@ do
            shift 2
            ;;
        -p|--pcie-dev)
-            if [ -n "${2:-}" ] && [[ ! ${2:-} =~ ^- ]]; then
+            if [ -n "${2}" ] && [[ ! ${2} =~ ^- ]]; then
                V80_PCIE_DEV="${2}"
                ((i++))
                shift 1
--- a/tests/backward_compatibility/high_level_api.rs
+++ b/tests/backward_compatibility/high_level_api.rs
@@ -3,7 +3,9 @@ use crate::{load_and_unversionize, TestedModule};
 use std::path::Path;
 #[cfg(feature = "zk-pok")]
 use tfhe::integer::parameters::DynamicDistribution;
-use tfhe::prelude::*;
+use tfhe::prelude::{
+    CiphertextList, FheDecrypt, FheEncrypt, ParameterSetConformant, ReRandomize, SquashNoise,
+};
 #[cfg(feature = "zk-pok")]
 use tfhe::shortint::parameters::{
    CompactCiphertextListExpansionKind, CompactPublicKeyEncryptionParameters,
@@ -11,29 +13,27 @@ use tfhe::shortint::parameters::{
 #[cfg(feature = "zk-pok")]
 use tfhe::shortint::prelude::LweDimension;
 use tfhe::shortint::{CarryModulus, CiphertextModulus, MessageModulus};
-use tfhe::xof_key_set::CompressedXofKeySet;
 #[cfg(feature = "zk-pok")]
 use tfhe::zk::{CompactPkeCrs, CompactPkeCrsConformanceParams};
-#[cfg(feature = "zk-pok")]
-use tfhe::ProvenCompactCiphertextList;
 use tfhe::{
-    set_server_key, ClientKey, CompactCiphertextList, CompactCiphertextListBuilder,
-    CompactPublicKey, CompressedCiphertextList, CompressedCiphertextListBuilder,
+    set_server_key, ClientKey, CompactCiphertextList, CompressedCiphertextList,
    CompressedCompactPublicKey, CompressedFheBool, CompressedFheInt8, CompressedFheUint8,
    CompressedKVStore, CompressedPublicKey, CompressedServerKey,
-    CompressedSquashedNoiseCiphertextList, CompressedSquashedNoiseCiphertextListBuilder, FheBool,
-    FheInt8, FheUint32, FheUint64, FheUint8, ReRandomizationContext, ServerKey,
-    SquashedNoiseFheBool, SquashedNoiseFheInt, SquashedNoiseFheUint,
+    CompressedSquashedNoiseCiphertextList, FheBool, FheInt8, FheUint64, FheUint8,
+    ReRandomizationContext, ServerKey, SquashedNoiseFheBool, SquashedNoiseFheInt,
+    SquashedNoiseFheUint,
 };
+#[cfg(feature = "zk-pok")]
+use tfhe::{CompactPublicKey, ProvenCompactCiphertextList};
 use tfhe_backward_compat_data::load::{
    load_versioned_auxiliary, DataFormat, TestFailure, TestResult, TestSuccess,
 };
 use tfhe_backward_compat_data::{
    DataKind, HlBoolCiphertextTest, HlCiphertextTest, HlClientKeyTest, HlCompressedKVStoreTest,
-    HlCompressedSquashedNoiseCiphertextListTest, HlCompressedXofKeySetTest,
-    HlHeterogeneousCiphertextListTest, HlPublicKeyTest, HlServerKeyTest, HlSignedCiphertextTest,
-    HlSquashedNoiseBoolCiphertextTest, HlSquashedNoiseSignedCiphertextTest,
-    HlSquashedNoiseUnsignedCiphertextTest, TestMetadata, TestType, Testcase, ZkPkePublicParamsTest,
+    HlCompressedSquashedNoiseCiphertextListTest, HlHeterogeneousCiphertextListTest,
+    HlPublicKeyTest, HlServerKeyTest, HlSignedCiphertextTest, HlSquashedNoiseBoolCiphertextTest,
+    HlSquashedNoiseSignedCiphertextTest, HlSquashedNoiseUnsignedCiphertextTest, TestMetadata,
+    TestType, Testcase, ZkPkePublicParamsTest,
 };
 use tfhe_versionable::Unversionize;

@@ -360,155 +360,6 @@ pub fn test_hl_pubkey(
    }
 }

-/// Shared feature-testing logic for server keys: computation, re-randomization, noise squashing,
-/// compression, and compressed noise-squashed lists.
-fn test_hl_key_features(
-    client_key: &ClientKey,
-    server_key: ServerKey,
-    compact_public_key: Option<&CompactPublicKey>,
-    test: &impl TestType,
-    format: DataFormat,
-) -> Result<(), TestFailure> {
-    set_server_key(server_key.clone());
-
-    let clear_a = 278120u32;
-    let clear_b = 839412u32;
-
-    let (mut a, mut b) = match compact_public_key {
-        Some(pk) => {
-            let compact_list = CompactCiphertextListBuilder::new(pk)
-                .push(clear_a)
-                .push(clear_b)
-                .build_packed();
-
-            let expanded = compact_list
-                .expand()
-                .map_err(|e| test.failure(format!("Failed to expand: {e}"), format))?;
-            let a: FheUint32 = expanded.get(0).unwrap().unwrap();
-            let b: FheUint32 = expanded.get(1).unwrap().unwrap();
-            (a, b)
-        }
-        None => {
-            let a = FheUint32::encrypt(clear_a, client_key);
-            let b = FheUint32::encrypt(clear_b, client_key);
-            (a, b)
-        }
-    };
-
-    // Re-randomization
-    if let (Some(pk), true) = (
-        compact_public_key,
-        server_key.supports_ciphertext_re_randomization(),
-    ) {
-        let nonce: [u8; 256 / 8] = core::array::from_fn(|i| i as u8);
-        let mut re_rand_context = ReRandomizationContext::new(
-            *b"TFHE_Rrd",
-            [b"FheUint32 bin ops".as_slice(), nonce.as_slice()],
-            *b"TFHE_Enc",
-        );
-
-        re_rand_context.add_ciphertext(&a);
-        re_rand_context.add_ciphertext(&b);
-
-        let mut seed_gen = re_rand_context.finalize();
-
-        a.re_randomize(pk, seed_gen.next_seed().unwrap())
-            .map_err(|e| test.failure(format!("Failed to re-randomize a: {e}"), format))?;
-        b.re_randomize(pk, seed_gen.next_seed().unwrap())
-            .map_err(|e| test.failure(format!("Failed to re-randomize b: {e}"), format))?;
-    }
-
-    // Computation
-    let c = &a + &b;
-    let d = &a & &b;
-
-    let expected_c = clear_a.wrapping_add(clear_b);
-    let expected_d = clear_a & clear_b;
-
-    for (val, expected) in [&c, &d].iter().zip([expected_c, expected_d]) {
-        let dec: u32 = val.decrypt(client_key);
-        if dec != expected {
-            return Err(test.failure(
-                format!("Invalid decryption: expected {expected}, got {dec}"),
-                format,
-            ));
-        }
-    }
-
-    // Noise squashing
-    if server_key.supports_noise_squashing() {
-        let ns_c = c
-            .squash_noise()
-            .map_err(|e| test.failure(format!("Failed to squash noise: {e}"), format))?;
-        let ns_d = d
-            .squash_noise()
-            .map_err(|e| test.failure(format!("Failed to squash noise: {e}"), format))?;
-
-        for (ns_val, expected) in [&ns_c, &ns_d].iter().zip([expected_c, expected_d]) {
-            let dec: u32 = ns_val.decrypt(client_key);
-            if dec != expected {
-                return Err(test.failure(
-                    format!("Invalid noise-squashed decryption: expected {expected}, got {dec}"),
-                    format,
-                ));
-            }
-        }
-
-        if server_key.supports_noise_squashing_compression() {
-            // Compressed noise-squashed ciphertext list
-            let ns_compressed_list = CompressedSquashedNoiseCiphertextListBuilder::new()
-                .push(ns_c)
-                .push(ns_d)
-                .build()
-                .map_err(|e| {
-                    test.failure(
-                        format!("Failed to build compressed squashed noise list: {e}"),
-                        format,
-                    )
-                })?;
-
-            for (i, expected) in [expected_c, expected_d].iter().enumerate() {
-                let val: SquashedNoiseFheUint = ns_compressed_list.get(i).unwrap().unwrap();
-                let dec: u32 = val.decrypt(client_key);
-                if dec != *expected {
-                    return Err(test.failure(
-                        format!(
-                            "Invalid compressed noise-squashed[{i}]: \
-                             expected {expected}, got {dec}"
-                        ),
-                        format,
-                    ));
-                }
-            }
-        }
-    }
-
-    // Compression / decompression
-    if server_key.supports_compression() {
-        let compressed_list = CompressedCiphertextListBuilder::new()
-            .push(a)
-            .push(b)
-            .push(c)
-            .push(d)
-            .build()
-            .map_err(|e| test.failure(format!("Failed to build compressed list: {e}"), format))?;
-
-        let expected_values = [clear_a, clear_b, expected_c, expected_d];
-        for (i, expected) in expected_values.iter().enumerate() {
-            let val: FheUint32 = compressed_list.get(i).unwrap().unwrap();
-            let dec: u32 = val.decrypt(client_key);
-            if dec != *expected {
-                return Err(test.failure(
-                    format!("Invalid decompressed[{i}]: expected {expected}, got {dec}"),
-                    format,
-                ));
-            }
-        }
-    }
-
-    Ok(())
-}
-
 /// Test HL server key: encrypt two values with a client key, add them using the server key and
 /// check that the decrypted sum is valid.
 pub fn test_hl_serverkey(
@@ -522,6 +373,11 @@ pub fn test_hl_serverkey(
    )
    .map_err(|e| test.failure(e, format))?;

+    let v1 = 73u8;
+    let mut ct1 = FheUint8::encrypt(v1, &client_key);
+    let v2 = 102u8;
+    let ct2 = FheUint8::encrypt(v2, &client_key);
+
    let key = if test.compressed {
        let compressed: CompressedServerKey = load_and_unversionize(dir, test, format)?;
        compressed.decompress()
@@ -529,20 +385,77 @@ pub fn test_hl_serverkey(
        load_and_unversionize(dir, test, format)?
    };

-    let compact_public_key = test
-        .rerand_cpk_filename
-        .as_ref()
-        .map(|filename| {
-            let cpk_file = dir.join(filename.to_string());
-            CompressedCompactPublicKey::unversionize(
-                load_versioned_auxiliary(cpk_file).map_err(|e| test.failure(e, format))?,
-            )
-            .map_err(|e| test.failure(e, format))
-            .map(|cpk| cpk.decompress())
-        })
-        .transpose()?;
+    let has_noise_squashing = key.supports_noise_squashing();
+    let has_rerand = key.supports_ciphertext_re_randomization();
+    set_server_key(key);

-    test_hl_key_features(&client_key, key, compact_public_key.as_ref(), test, format)?;
+    if has_noise_squashing {
+        let ns = ct1.squash_noise().unwrap();
+        let res: u8 = ns.decrypt(&client_key);
+        if res != v1 {
+            return Err(test.failure(
+                format!(
+                    "Invalid result for noise squashing using loaded server key, expected {v1} got {res}",
+                ),
+                format,
+            ));
+        }
+    }
+
+    if let Some(rerand_cpk_filename) = test.rerand_cpk_filename.as_ref() {
+        if has_rerand {
+            let rerand_cpk_file = dir.join(rerand_cpk_filename.to_string());
+            let public_key = CompressedCompactPublicKey::unversionize(
+                load_versioned_auxiliary(rerand_cpk_file).map_err(|e| test.failure(e, format))?,
+            )
+            .map_err(|e| test.failure(e, format))?
+            .decompress();
+
+            let nonce: [u8; 256 / 8] = rand::random();
+            let mut re_rand_context = ReRandomizationContext::new(
+                *b"TFHE_Rrd",
+                [b"FheUint8".as_slice(), nonce.as_slice()],
+                *b"TFHE_Enc",
+            );
+
+            re_rand_context.add_ciphertext(&ct1);
+            let mut seed_gen = re_rand_context.finalize();
+
+            ct1.re_randomize(&public_key, seed_gen.next_seed().unwrap())
+                .unwrap();
+
+            #[allow(clippy::eq_op)]
+            let rrd = &ct1 & &ct1;
+            let res: u8 = rrd.decrypt(&client_key);
+            if res != v1 {
+                return Err(test.failure(
+                    format!(
+                    "Invalid result for rerand using loaded server key, expected {v1} got {res}",
+                ),
+                    format,
+                ));
+            }
+        } else {
+            return Err(test.failure(
+                "Test requires rerand key but server key does not have it".to_string(),
+                format,
+            ));
+        }
+    }
+
+    let ct_sum = ct1 + ct2;
+    let sum: u8 = ct_sum.decrypt(&client_key);
+
+    if sum != v1 + v2 {
+        return Err(test.failure(
+            format!(
+                "Invalid result for addition using loaded server key, expected {} got {}",
+                v1 + v2,
+                sum,
+            ),
+            format,
+        ));
+    }

    Ok(test.success(format))
 }
@@ -746,39 +659,6 @@ fn test_hl_compressed_kv_store_test(
    Ok(test.success(format))
 }

-fn test_hl_compressed_xof_key_set_test(
-    dir: &Path,
-    test: &HlCompressedXofKeySetTest,
-    format: DataFormat,
-) -> Result<TestSuccess, TestFailure> {
-    let client_key_file = dir.join(&*test.client_key_file_name);
-    let client_key = ClientKey::unversionize(
-        load_versioned_auxiliary(client_key_file).map_err(|e| test.failure(e, format))?,
-    )
-    .map_err(|e| test.failure(format!("Failed to load client key file: {e}"), format))?;
-
-    let compressed_xof_key_set_file = dir.join(&*test.compressed_xof_key_set_file_name);
-    let compressed_xof_key_set = CompressedXofKeySet::unversionize(
-        load_versioned_auxiliary(compressed_xof_key_set_file)
-            .map_err(|e| test.failure(e, format))?,
-    )
-    .map_err(|e| {
-        test.failure(
-            format!("Failed to load compressed xof key set file: {e}"),
-            format,
-        )
-    })?;
-
-    let xof_key_set = compressed_xof_key_set
-        .decompress()
-        .map_err(|e| test.failure(format!("Failed to decompress the xof key set: {e}"), format))?;
-
-    let (pk, server_key) = xof_key_set.into_raw_parts();
-
-    test_hl_key_features(&client_key, server_key, Some(&pk), test, format)?;
-
-    Ok(test.success(format))
-}
 pub struct Hl;

 impl TestedModule for Hl {
@@ -831,9 +711,6 @@ impl TestedModule for Hl {
            TestMetadata::HlCompressedKVStoreTest(test) => {
                test_hl_compressed_kv_store_test(test_dir.as_ref(), test, format).into()
            }
-            TestMetadata::HlCompressedXofKeySet(test) => {
-                test_hl_compressed_xof_key_set_test(test_dir.as_ref(), test, format).into()
-            }
            _ => {
                println!("WARNING: missing test: {:?}", testcase.metadata);
                TestResult::Skipped(testcase.skip())
--- a/tfhe-benchmark/Cargo.toml
+++ b/tfhe-benchmark/Cargo.toml
@@ -2,8 +2,8 @@
 name = "tfhe-benchmark"
 version = "0.1.0"
 edition = "2021"
-homepage = "https://zama.org/"
-documentation = "https://docs.zama.org/tfhe-rs"
+homepage = "https://zama.ai/"
+documentation = "https://docs.zama.ai/tfhe-rs"
 repository = "https://github.com/zama-ai/tfhe-rs"
 license = "BSD-3-Clause-Clear"
 description = "tfhe-benchmark: Performances measurements facility for tfhe-rs."
@@ -29,7 +29,6 @@ rand = { workspace = true }
 rayon = { workspace = true }
 tfhe = { path = "../tfhe", default-features = false }
 tfhe-csprng = { path = "../tfhe-csprng" }
-tfhe-zk-pok = { path = "../tfhe-zk-pok", optional = true }
 cpu-time = "1.0"
 num_cpus = "1.17"
 gag = "1.0.0"
@@ -40,14 +39,12 @@ boolean = ["tfhe/boolean"]
 shortint = ["tfhe/shortint"]
 integer = ["shortint", "tfhe/integer"]
 gpu = ["tfhe/gpu"]
-# gpu enables tfhe-cuda-backend which provides CUDA stream management used by tfhe-zk-pok
-gpu-experimental-zk = ["gpu", "zk-pok", "tfhe/gpu-experimental-zk", "tfhe-zk-pok/gpu-experimental"]
 hpu = ["tfhe/hpu"]
 hpu-v80 = ["tfhe/hpu-v80"]
 internal-keycache = ["tfhe/internal-keycache"]
 avx512 = ["tfhe/avx512"]
 pbs-stats = ["tfhe/pbs-stats"]
-zk-pok = ["tfhe/zk-pok", "dep:tfhe-zk-pok"]
+zk-pok = ["tfhe/zk-pok"]

 [[bench]]
 name = "boolean"
@@ -199,12 +196,6 @@ path = "benches/core_crypto/pbs128_bench.rs"
 harness = false
 required-features = ["shortint", "internal-keycache"]

-[[bench]]
-name = "zk-msm"
-path = "benches/zk/msm.rs"
-harness = false
-required-features = ["zk-pok"]
-
 [[bin]]
 name = "boolean_key_sizes"
 path = "src/bin/boolean_key_sizes.rs"
--- a/tfhe-benchmark/benches/high_level_api/bench_common.rs
+++ b/tfhe-benchmark/benches/high_level_api/bench_common.rs
@@ -1,7 +1,7 @@
 use benchmark::high_level_api::bench_wait::*;
 use benchmark::high_level_api::benchmark_op::*;
 use benchmark::utilities::{
-    get_bench_type, will_this_bench_run, write_to_json, BenchmarkType, OperandType, OperatorType,
+    get_bench_type, write_to_json, BenchmarkType, OperandType, OperatorType,
 };
 use criterion::{black_box, Criterion, Throughput};
 use rand::prelude::*;
@@ -18,6 +18,34 @@ pub struct BenchConfig<'a> {
    pub bit_size: usize,
 }

+/// Tells callers whether `bench_id` will actually run, so they can skip costly setup otherwise.
+/// `Gag` is used here to suppress the temporary output noise from Criterion.
+/// We use a minimal Criterion configuration to retrieve information about the current filter setup.
+/// The function returns a boolean indicating whether the current `bench_id` should be executed or
+/// not.
+pub fn will_this_bench_run(bench_group: &str, bench_id: &str) -> bool {
+    let mut c = Criterion::default()
+        .configure_from_args()
+        .sample_size(10)
+        .output_directory(&std::env::temp_dir())
+        .warm_up_time(std::time::Duration::from_nanos(1))
+        .measurement_time(std::time::Duration::from_nanos(1))
+        .without_plots();
+    let mut will_run = false;
+    {
+        use gag::Gag;
+        let _print_gag = Gag::stdout().unwrap();
+        let _err_gag = Gag::stderr().unwrap();
+        c.benchmark_group(bench_group)
+            .bench_function(bench_id, |b| {
+                b.iter(|| {
+                    will_run = true;
+                });
+            });
+    }
+    will_run
+}
+
 #[inline(never)]
 pub fn bench_fhe_type_op<FheType, Op>(
    c: &mut Criterion,
--- a/tfhe-benchmark/benches/high_level_api/noise_squash.rs
+++ b/tfhe-benchmark/benches/high_level_api/noise_squash.rs
@@ -15,7 +15,7 @@ use benchmark::params_aliases::{
 #[cfg(feature = "gpu")]
 use benchmark::utilities::configure_gpu;
 use benchmark::utilities::{
-    get_bench_type, will_this_bench_run, write_to_json, BenchmarkType, BitSizesSet, EnvConfig,
+    get_bench_type, throughput_num_threads, write_to_json, BenchmarkType, BitSizesSet, EnvConfig,
    OperatorType,
 };
 use criterion::{Criterion, Throughput};
@@ -51,7 +51,8 @@ fn bench_sns_only_fhe_type<FheType>(
    type_name: &str,
    num_bits: usize,
 ) where
-    FheType: FheEncrypt<u128, ClientKey> + Send + Sync + SquashNoise,
+    FheType: FheEncrypt<u128, ClientKey> + Send + Sync,
+    FheType: SquashNoise,
 {
    let (param, noise_param, _, _) = params;

@@ -102,47 +103,13 @@ fn bench_sns_only_fhe_type<FheType>(
        }
        BenchmarkType::Throughput => {
            bench_id = format!("{bench_id_prefix}::throughput::{bench_id_suffix}");
-
-            let elements = if will_this_bench_run(type_name, &bench_id) {
-                #[cfg(feature = "gpu")]
-                {
-                    use benchmark::utilities::throughput_num_threads;
-
-                    let params = client_key.computation_parameters();
-                    let num_blocks = num_bits.div_ceil(
-                        (params.message_modulus().0 * params.carry_modulus().0).ilog2() as usize,
-                    );
-
-                    throughput_num_threads(num_blocks, 4)
-                }
-                #[cfg(not(any(feature = "gpu", feature = "hpu")))]
-                {
-                    use benchmark::high_level_api::find_optimal_batch::find_optimal_batch;
-
-                    let _ = num_bits; // Avoid clippy warning since FheType::num_bits() is not available.
-
-                    let setup = |batch_size: usize| {
-                        (0..batch_size)
-                            .map(|_| FheType::encrypt(random(), &client_key))
-                            .collect::<Vec<_>>()
-                    };
-                    let run = |inputs: &Vec<_>, batch_size: usize| {
-                        inputs
-                            .par_iter()
-                            .take(batch_size)
-                            .for_each(|input: &FheType| {
-                                let _ = input.squash_noise();
-                            });
-                    };
-
-                    find_optimal_batch(run, setup) as u64
-                }
-            } else {
-                0
-            };
+            let params = client_key.computation_parameters();
+            let num_blocks = num_bits
+                .div_ceil((params.message_modulus().0 * params.carry_modulus().0).ilog2() as usize);

            #[cfg(feature = "gpu")]
            {
+                let elements = throughput_num_threads(num_blocks, 4);
                bench_group.throughput(Throughput::Elements(elements));
                println!("elements: {elements}");
                let gpu_count = get_number_of_gpus() as usize;
@@ -176,6 +143,7 @@ fn bench_sns_only_fhe_type<FheType>(

            #[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
            {
+                let elements = throughput_num_threads(num_blocks, 1);
                bench_group.throughput(Throughput::Elements(elements));
                println!("elements: {elements}");
                bench_group.bench_function(&bench_id, |b| {
@@ -285,51 +253,13 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
        }
        BenchmarkType::Throughput => {
            bench_id = format!("{bench_id_prefix}::throughput::{bench_id_suffix}");
-
-            let elements = if will_this_bench_run(type_name, &bench_id) {
-                #[cfg(feature = "gpu")]
-                {
-                    use benchmark::utilities::throughput_num_threads;
-
-                    let params = client_key.computation_parameters();
-                    let num_blocks = num_bits.div_ceil(
-                        (params.message_modulus().0 * params.carry_modulus().0).ilog2() as usize,
-                    );
-
-                    throughput_num_threads(num_blocks, 4)
-                }
-                #[cfg(not(any(feature = "gpu", feature = "hpu")))]
-                {
-                    use benchmark::high_level_api::find_optimal_batch::find_optimal_batch;
-
-                    let _ = num_bits; // Avoid clippy warning since FheType::num_bits() is not available.
-
-                    // Noise squashing is the current bottleneck.
-                    // Measuring CPU load with decompression and compression operations alongside
-                    // the noise squash would just increase the batch size. Then benchmark execution
-                    // duration would increase dramatically (from ~1.000 seconds to ~6.000 seconds).
-                    let setup = |batch_size: usize| {
-                        (0..batch_size)
-                            .map(|_| FheType::encrypt(random(), &client_key))
-                            .collect::<Vec<_>>()
-                    };
-                    let run = |inputs: &Vec<_>, batch_size: usize| {
-                        inputs
-                            .par_iter()
-                            .take(batch_size)
-                            .for_each(|input: &FheType| {
-                                let _ = input.squash_noise();
-                            });
-                    };
-
-                    find_optimal_batch(run, setup) as u64
-                }
-            } else {
-                0
-            };
+            let params = client_key.computation_parameters();
+            let num_blocks = num_bits
+                .div_ceil((params.message_modulus().0 * params.carry_modulus().0).ilog2() as usize);

            #[cfg(feature = "gpu")]
            {
+                let elements = throughput_num_threads(num_blocks, 4);
                bench_group.throughput(Throughput::Elements(elements));
                println!("elements: {elements}");
                let gpu_count = get_number_of_gpus() as usize;
@@ -376,6 +306,7 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(

            #[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
            {
+                let elements = throughput_num_threads(num_blocks, 1);
                bench_group.throughput(Throughput::Elements(elements));
                bench_group.bench_function(&bench_id, |b| {
                    let compressed_values = || {
--- a/tfhe-benchmark/benches/integer/zk_pke.rs
+++ b/tfhe-benchmark/benches/integer/zk_pke.rs
@@ -485,24 +485,6 @@ mod cuda {
    use tfhe::integer::gpu::zk::CudaProvenCompactCiphertextList;
    use tfhe::integer::gpu::CudaServerKey;
    use tfhe::integer::CompressedServerKey;
-    use tfhe::GpuIndex;
-
-    /// Compute the number of elements for GPU ZK throughput benchmarks.
-    /// Values are tuned to avoid OOM on H100 GPUs while still saturating the GPU.
-    /// Memory usage scales with both CRS size and bits being proven.
-    fn gpu_zk_throughput_elements(crs_size: usize, bits: usize) -> u64 {
-        match (crs_size, bits) {
-            // 64-bit CRS: smaller proofs, can handle more elements
-            (64, _) => 30,
-            // 2048-bit CRS: moderate memory usage
-            (2048, b) if b <= 256 => 15,
-            (2048, _) => 10,
-            // 4096-bit CRS: largest proofs, most memory intensive
-            (4096, _) => 6,
-            // Default fallback for unknown configurations
-            _ => 10,
-        }
-    }

    fn gpu_pke_zk_verify(c: &mut Criterion, results_file: &Path) {
        let bench_name = "integer::cuda::zk::pke_zk_verify";
@@ -704,8 +686,12 @@ mod cuda {
                            });
                        }
                        BenchmarkType::Throughput => {
-                            let elements = gpu_zk_throughput_elements(crs_size, *bits)
-                                * get_number_of_gpus() as u64;
+                            let mut elements_per_gpu = 100;
+                            if *bits == 4096 {
+                                elements_per_gpu /= 5;
+                            }
+                            // This value, found empirically, ensures saturation of 8XH100 SXM5
+                            let elements = elements_per_gpu * get_number_of_gpus() as u64;
                            bench_group.throughput(Throughput::Elements(elements));

                            bench_id_verify = format!(
@@ -730,38 +716,15 @@ mod cuda {
                                .collect::<Vec<_>>();

                            let local_streams = cuda_local_streams(num_block, elements as usize);
-                            let gpu_count = get_number_of_gpus() as usize;
-
-                            let gpu_sks_vec: Vec<CudaServerKey> = (0..gpu_count)
-                                .map(|gpu_idx| {
-                                    let stream =
-                                        CudaStreams::new_single_gpu(GpuIndex::new(gpu_idx as u32));
-                                    CudaServerKey::decompress_from_cpu(
-                                        &compressed_server_key,
-                                        &stream,
-                                    )
-                                })
-                                .collect();
-
-                            let d_ksk_material_vec: Vec<CudaKeySwitchingKeyMaterial> = (0
-                                ..gpu_count)
-                                .map(|gpu_idx| {
-                                    let stream =
-                                        CudaStreams::new_single_gpu(GpuIndex::new(gpu_idx as u32));
+                            let d_ksk_material_vec = local_streams
+                                .par_iter()
+                                .map(|local_stream| {
                                    CudaKeySwitchingKeyMaterial::from_key_switching_key(
-                                        &ksk, &stream,
+                                        &ksk,
+                                        local_stream,
                                    )
                                })
-                                .collect();
-
-                            let d_ksks: Vec<CudaKeySwitchingKey> = (0..gpu_count)
-                                .map(|gpu_idx| {
-                                    CudaKeySwitchingKey::from_cuda_key_switching_key_material(
-                                        &d_ksk_material_vec[gpu_idx],
-                                        &gpu_sks_vec[gpu_idx],
-                                    )
-                                })
-                                .collect();
+                                .collect::<Vec<_>>();

                            bench_group.bench_function(&bench_id_verify, |b| {
                                b.iter(|| {
@@ -787,16 +750,17 @@ mod cuda {
                                                   |gpu_cts| {
                                                       gpu_cts.par_iter().enumerate().for_each
                                                       (|(i, gpu_ct)| {
-                                                           let stream_idx = i % local_streams.len();
-                                                           let local_stream = &local_streams[stream_idx];
-                                                           let gpu_idx = i % gpu_count;
-                                                           let d_ksk = &d_ksks[gpu_idx];
+                                                           let local_stream = &local_streams[i % local_streams.len()];
+
+                                                           let gpu_sk = CudaServerKey::decompress_from_cpu(&compressed_server_key, local_stream);
+                                                           let d_ksk =
+                                                               CudaKeySwitchingKey::from_cuda_key_switching_key_material(&d_ksk_material_vec[i % local_streams.len()], &gpu_sk);

                                                           gpu_ct
-                                                               .expand_without_verification(d_ksk, local_stream)
+                                                               .expand_without_verification(&d_ksk, local_stream)
                                                               .unwrap();
                                                       });
-                                                   }, BatchSize::PerIteration);
+                                                   }, BatchSize::SmallInput);
                                });

                            bench_group.bench_function(&bench_id_verify_and_expand, |b| {
@@ -814,18 +778,18 @@ mod cuda {
                                                   |gpu_cts| {
                                                       gpu_cts.par_iter().enumerate().for_each
                                                       (|(i, gpu_ct)| {
-                                                           let stream_idx = i % local_streams.len();
-                                                           let local_stream = &local_streams[stream_idx];
-                                                           let gpu_idx = i % gpu_count;
-                                                           let d_ksk = &d_ksks[gpu_idx];
+                                                           let local_stream = &local_streams[i % local_streams.len()];
+                                                           let gpu_sk = CudaServerKey::decompress_from_cpu(&compressed_server_key, local_stream);
+                                                           let d_ksk =
+                                                               CudaKeySwitchingKey::from_cuda_key_switching_key_material(&d_ksk_material_vec[i % local_streams.len()], &gpu_sk);

                                                           gpu_ct
                                                               .verify_and_expand(
-                                                                   &crs, &pk, &metadata, d_ksk, local_stream,
+                                                                   &crs, &pk, &metadata, &d_ksk, local_stream,
                                                               )
                                                               .unwrap();
                                                       });
-                                                   }, BatchSize::PerIteration);
+                                                   }, BatchSize::SmallInput);
                                });
                        }
                    }
@@ -852,154 +816,11 @@ mod cuda {
        bench_group.finish()
    }

-    fn gpu_pke_zk_proof(c: &mut Criterion) {
-        let bench_name = "zk::cuda::pke_zk_proof";
-        let mut bench_group = c.benchmark_group(bench_name);
-        bench_group
-            .sample_size(15)
-            .measurement_time(std::time::Duration::from_secs(60));
-
-        let params: [(
-            CompactPublicKeyEncryptionParameters,
-            ShortintKeySwitchingParameters,
-            PBSParameters,
-        ); 2] = [
-            (
-                PARAM_PKE_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
-                PARAM_GPU_MULTI_BIT_GROUP_4_KEYSWITCH_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
-                PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
-            ),
-            (
-                BENCH_PARAM_PKE_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
-                BENCH_PARAM_KEYSWITCH_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
-                BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
-            ),
-        ];
-
-        for (param_pke, _param_ksk, param_fhe) in params.iter() {
-            let param_name = param_fhe.name();
-            let param_name = param_name.as_str();
-            let cks = ClientKey::new(*param_fhe);
-            let sks = ServerKey::new_radix_server_key(&cks);
-            let compact_private_key = CompactPrivateKey::new(*param_pke);
-            let pk = CompactPublicKey::new(&compact_private_key);
-            // Kept for consistency
-            let _casting_key =
-                KeySwitchingKey::new((&compact_private_key, None), (&cks, &sks), *_param_ksk);
-
-            // We have a use case with 320 bits of metadata
-            let mut metadata = [0u8; (320 / u8::BITS) as usize];
-            let mut rng = rand::thread_rng();
-            metadata.fill_with(|| rng.gen());
-
-            let zk_vers = param_pke.zk_scheme;
-
-            for proof_config in default_proof_config().iter() {
-                let msg_bits =
-                    (param_pke.message_modulus.0 * param_pke.carry_modulus.0).ilog2() as usize;
-                println!("Generating CRS... ");
-                let crs_size = proof_config.crs_size;
-                let crs = CompactPkeCrs::from_shortint_params(
-                    *param_pke,
-                    LweCiphertextCount(crs_size / msg_bits),
-                )
-                .unwrap();
-
-                for bits in proof_config.bits_to_prove.iter() {
-                    assert_eq!(bits % 64, 0);
-                    // Packing, so we take the message and carry modulus to compute our block count
-                    let num_block = 64usize.div_ceil(msg_bits);
-
-                    let fhe_uint_count = bits / 64;
-
-                    for compute_load in [ZkComputeLoad::Proof, ZkComputeLoad::Verify] {
-                        let zk_load = match compute_load {
-                            ZkComputeLoad::Proof => "compute_load_proof",
-                            ZkComputeLoad::Verify => "compute_load_verify",
-                        };
-
-                        let bench_id;
-
-                        match get_bench_type() {
-                            BenchmarkType::Latency => {
-                                bench_id = format!(
-                                    "{bench_name}::{param_name}_{bits}_bits_packed_{crs_size}_bits_crs_{zk_load}_ZK{zk_vers:?}"
-                                );
-                                bench_group.bench_function(&bench_id, |b| {
-                                    let input_msg = rng.gen::<u64>();
-                                    let messages = vec![input_msg; fhe_uint_count];
-
-                                    b.iter(|| {
-                                        let _ct1 =
-                                            tfhe::integer::ProvenCompactCiphertextList::builder(
-                                                &pk,
-                                            )
-                                            .extend(messages.iter().copied())
-                                            .build_with_proof_packed(&crs, &metadata, compute_load)
-                                            .unwrap();
-                                    })
-                                });
-                            }
-                            BenchmarkType::Throughput => {
-                                // The zk proof is currently not pooled, so we simply use the number
-                                // of threads as heuristic for the
-                                // batch size
-                                let elements =
-                                    (rayon::current_num_threads() / num_block).max(1) + 1;
-                                bench_group.throughput(Throughput::Elements(elements as u64));
-
-                                bench_id = format!(
-                                    "{bench_name}::throughput::{param_name}_{bits}_bits_packed_{crs_size}_bits_crs_{zk_load}_ZK{zk_vers:?}"
-                                );
-                                bench_group.bench_function(&bench_id, |b| {
-                                    let messages = (0..elements)
-                                        .map(|_| {
-                                            let input_msg = rng.gen::<u64>();
-                                            vec![input_msg; fhe_uint_count]
-                                        })
-                                        .collect::<Vec<_>>();
-
-                                    b.iter(|| {
-                                        messages.par_iter().for_each(|msg| {
-                                            tfhe::integer::ProvenCompactCiphertextList::builder(
-                                                &pk,
-                                            )
-                                            .extend(msg.iter().copied())
-                                            .build_with_proof_packed(&crs, &metadata, compute_load)
-                                            .unwrap();
-                                        })
-                                    })
-                                });
-                            }
-                        }
-
-                        let shortint_params: PBSParameters = *param_fhe;
-
-                        write_to_json::<u64, _>(
-                            &bench_id,
-                            shortint_params,
-                            param_name,
-                            "pke_zk_proof",
-                            &OperatorType::Atomic,
-                            shortint_params.message_modulus().0 as u32,
-                            vec![shortint_params.message_modulus().0.ilog2(); num_block],
-                        );
-                    }
-                }
-            }
-        }
-    }
-
    pub fn gpu_zk_verify() {
        let results_file = Path::new("gpu_pke_zk_crs_sizes.csv");
        let mut criterion: Criterion<_> = (Criterion::default()).configure_from_args();
        gpu_pke_zk_verify(&mut criterion, results_file);
    }
-
-    pub fn gpu_zk_proof() {
-        let mut criterion: Criterion<_> = (Criterion::default()).configure_from_args();
-        gpu_pke_zk_proof(&mut criterion);
-    }
 }

 pub fn zk_verify_and_proof() {
@@ -1010,14 +831,11 @@ pub fn zk_verify_and_proof() {
 }

 #[cfg(all(feature = "gpu", feature = "zk-pok"))]
-use crate::cuda::{gpu_zk_proof, gpu_zk_verify};
+use crate::cuda::gpu_zk_verify;

 fn main() {
    #[cfg(all(feature = "gpu", feature = "zk-pok"))]
-    {
-        gpu_zk_proof();
-        gpu_zk_verify();
-    }
+    gpu_zk_verify();
    #[cfg(not(feature = "gpu"))]
    zk_verify_and_proof();

--- a/tfhe-benchmark/benches/zk/msm.rs
+++ b/tfhe-benchmark/benches/zk/msm.rs
@@ -1,406 +0,0 @@
-//! Benchmark comparing CPU MSM vs GPU MSM for BLS12-446
-//!
-//! This benchmark measures the performance of multi-scalar multiplication (MSM)
-//! for both G1 and G2 points on the BLS12-446 curve.
-//!
-//! CPU benchmarks use the arkworks-based `G1Affine::multi_mul_scalar` /
-//! `G2Affine::multi_mul_scalar`. GPU benchmarks (gated behind the
-//! `gpu-experimental-zk` feature) call `tfhe_zk_pok::gpu::g1_msm_gpu` /
-//! `tfhe_zk_pok::gpu::g2_msm_gpu` directly, which dispatch to the
-//! zk-cuda-backend.
-//!
-//! ## Running the benchmarks
-//!
-//! ```bash
-//! # CPU only
-//! cargo bench --package tfhe-benchmark --bench zk-msm
-//!
-//! # CPU and GPU
-//! cargo bench --package tfhe-benchmark --bench zk-msm --features gpu-experimental-zk
-//! ```
-
-use benchmark::utilities::{
-    get_bench_type, write_to_json, BenchmarkType, CryptoParametersRecord, OperatorType,
-};
-use criterion::{black_box, criterion_group, criterion_main, BatchSize, Criterion, Throughput};
-use rand::rngs::StdRng;
-use rand::SeedableRng;
-use rayon::prelude::*;
-use std::time::Duration;
-
-use tfhe_zk_pok::curve_api::bls12_446::{G1Affine, G2Affine, Zp, G1, G2};
-use tfhe_zk_pok::curve_api::CurveGroupOps;
-
-/// Compute the number of parallel elements for MSM throughput benchmarks.
-/// Uses aggressive values to maximize throughput testing while keeping setup time reasonable.
-fn msm_throughput_elements(input_size: usize) -> u64 {
-    match input_size {
-        n if n <= 1000 => 64,
-        n if n <= 4096 => 32,
-        _ => 16,
-    }
-}
-
-/// Generate random G1 affine points using tfhe-zk-pok
-fn generate_g1_affine_points(rng: &mut StdRng, n: usize) -> Vec<G1Affine> {
-    (0..n)
-        .map(|_| {
-            let point = G1::GENERATOR.mul_scalar(Zp::rand(rng));
-            point.normalize()
-        })
-        .collect()
-}
-
-/// Generate random G2 affine points using tfhe-zk-pok
-fn generate_g2_affine_points(rng: &mut StdRng, n: usize) -> Vec<G2Affine> {
-    (0..n)
-        .map(|_| {
-            let point = G2::GENERATOR.mul_scalar(Zp::rand(rng));
-            point.normalize()
-        })
-        .collect()
-}
-
-/// Generate random scalars using tfhe-zk-pok
-fn generate_scalars(rng: &mut StdRng, n: usize) -> Vec<Zp> {
-    (0..n).map(|_| Zp::rand(rng)).collect()
-}
-
-/// Benchmark CPU MSM for G1 points using tfhe-zk-pok entry points
-fn bench_cpu_g1_msm(c: &mut Criterion) {
-    let curve_name = "bls12_446";
-    let subgroup_name = "G1";
-    let bench_name = format!("zk::msm::{curve_name}::{subgroup_name}");
-
-    let mut group = c.benchmark_group(&bench_name);
-    group.sample_size(10);
-    group.measurement_time(Duration::from_secs(30));
-
-    for size in [100, 1000, 2048, 4096, 10000].iter() {
-        let n = *size;
-        let bench_id;
-        let bench_shortname = "zk::msm::bls12_446::g1";
-
-        match get_bench_type() {
-            BenchmarkType::Latency => {
-                let mut rng = StdRng::seed_from_u64(42);
-                let bases = generate_g1_affine_points(&mut rng, n);
-                let scalars = generate_scalars(&mut rng, n);
-
-                bench_id = format!("{bench_name}::{n}");
-                group.bench_with_input(&bench_id, &n, |b, _| {
-                    b.iter(|| {
-                        let result =
-                            G1Affine::multi_mul_scalar(black_box(&bases), black_box(&scalars));
-                        black_box(result)
-                    });
-                });
-            }
-            BenchmarkType::Throughput => {
-                let elements = msm_throughput_elements(n);
-                group.throughput(Throughput::Elements(elements));
-
-                bench_id = format!("{bench_name}::throughput::{n}");
-                group.bench_with_input(&bench_id, &n, |b, _| {
-                    // Setup generates test data in parallel, excluded from measurement
-                    let setup = || {
-                        (0..elements)
-                            .into_par_iter()
-                            .map(|i| {
-                                let mut rng = StdRng::seed_from_u64(42 + i);
-                                let bases = generate_g1_affine_points(&mut rng, n);
-                                let scalars = generate_scalars(&mut rng, n);
-                                (bases, scalars)
-                            })
-                            .collect::<Vec<_>>()
-                    };
-
-                    b.iter_batched(
-                        setup,
-                        |test_data| {
-                            test_data.par_iter().for_each(|(bases, scalars)| {
-                                let result = G1Affine::multi_mul_scalar(
-                                    black_box(bases),
-                                    black_box(scalars),
-                                );
-                                black_box(result);
-                            });
-                        },
-                        BatchSize::LargeInput,
-                    );
-                });
-            }
-        }
-
-        // MSM benchmarks are curve operations, use minimal parameters
-        let params: CryptoParametersRecord<u64> = CryptoParametersRecord::default();
-        write_to_json(
-            &bench_id,
-            params,
-            "MSM_BLS12_446_G1",
-            bench_shortname,
-            &OperatorType::Atomic,
-            64,     // bit_size for curve scalar operations
-            vec![], // decomposition_basis not applicable for MSM
-        );
-    }
-    group.finish();
-}
-
-/// Benchmark CPU MSM for G2 points using tfhe-zk-pok entry points
-fn bench_cpu_g2_msm(c: &mut Criterion) {
-    let curve_name = "bls12_446";
-    let subgroup_name = "G2";
-    let bench_name = format!("zk::msm::{curve_name}::{subgroup_name}");
-
-    let mut group = c.benchmark_group(&bench_name);
-    group.sample_size(10);
-    group.measurement_time(Duration::from_secs(30));
-
-    for size in [100, 1000, 2048, 4096, 10000].iter() {
-        let n = *size;
-        let bench_id;
-        let bench_shortname = "zk::msm::bls12_446::g2";
-
-        match get_bench_type() {
-            BenchmarkType::Latency => {
-                let mut rng = StdRng::seed_from_u64(42);
-                let bases = generate_g2_affine_points(&mut rng, n);
-                let scalars = generate_scalars(&mut rng, n);
-
-                bench_id = format!("{bench_name}::{n}");
-                group.bench_with_input(&bench_id, &n, |b, _| {
-                    b.iter(|| {
-                        let result =
-                            G2Affine::multi_mul_scalar(black_box(&bases), black_box(&scalars));
-                        black_box(result)
-                    });
-                });
-            }
-            BenchmarkType::Throughput => {
-                let elements = msm_throughput_elements(n);
-                group.throughput(Throughput::Elements(elements));
-
-                bench_id = format!("{bench_name}::throughput::{n}");
-                group.bench_with_input(&bench_id, &n, |b, _| {
-                    // Setup generates test data in parallel, excluded from measurement
-                    let setup = || {
-                        (0..elements)
-                            .into_par_iter()
-                            .map(|i| {
-                                let mut rng = StdRng::seed_from_u64(42 + i);
-                                let bases = generate_g2_affine_points(&mut rng, n);
-                                let scalars = generate_scalars(&mut rng, n);
-                                (bases, scalars)
-                            })
-                            .collect::<Vec<_>>()
-                    };
-
-                    b.iter_batched(
-                        setup,
-                        |test_data| {
-                            test_data.par_iter().for_each(|(bases, scalars)| {
-                                let result = G2Affine::multi_mul_scalar(
-                                    black_box(bases),
-                                    black_box(scalars),
-                                );
-                                black_box(result);
-                            });
-                        },
-                        BatchSize::LargeInput,
-                    );
-                });
-            }
-        }
-
-        // MSM benchmarks are curve operations, use minimal parameters
-        let params: CryptoParametersRecord<u64> = CryptoParametersRecord::default();
-        write_to_json(
-            &bench_id,
-            params,
-            "MSM_BLS12_446_G2",
-            bench_shortname,
-            &OperatorType::Atomic,
-            64,     // bit_size for curve scalar operations
-            vec![], // decomposition_basis not applicable for MSM
-        );
-    }
-    group.finish();
-}
-
-/// Benchmark GPU MSM for G1 points via `tfhe_zk_pok::gpu::g1_msm_gpu`
-#[cfg(feature = "gpu-experimental-zk")]
-fn bench_gpu_g1_msm(c: &mut Criterion) {
-    use tfhe_zk_pok::gpu::{g1_msm_gpu, select_gpu_for_msm};
-
-    let curve_name = "bls12_446";
-    let subgroup_name = "G1";
-    let bench_name = format!("zk::cuda::msm::{curve_name}::{subgroup_name}");
-
-    let mut group = c.benchmark_group(&bench_name);
-    group.sample_size(10);
-    group.measurement_time(Duration::from_secs(30));
-
-    // Resolve GPU index once — stream creation/destruction is handled inside g1_msm_gpu
-    let gpu_index = select_gpu_for_msm();
-
-    for size in [100, 1000, 2048, 4096, 10000].iter() {
-        let n = *size;
-        let bench_id;
-        let bench_shortname = "zk::cuda::msm::bls12_446::g1";
-
-        match get_bench_type() {
-            BenchmarkType::Latency => {
-                let mut rng = StdRng::seed_from_u64(42);
-                let bases = generate_g1_affine_points(&mut rng, n);
-                let scalars = generate_scalars(&mut rng, n);
-
-                bench_id = format!("{bench_name}::{n}");
-                group.bench_with_input(&bench_id, &n, |b, _| {
-                    b.iter(|| {
-                        let result = g1_msm_gpu(black_box(&bases), black_box(&scalars), gpu_index);
-                        black_box(result)
-                    });
-                });
-            }
-            BenchmarkType::Throughput => {
-                let elements = msm_throughput_elements(n);
-                group.throughput(Throughput::Elements(elements));
-
-                bench_id = format!("{bench_name}::throughput::{n}");
-                group.bench_with_input(&bench_id, &n, |b, _| {
-                    let setup = || {
-                        (0..elements)
-                            .into_par_iter()
-                            .map(|i| {
-                                let mut rng = StdRng::seed_from_u64(42 + i);
-                                let bases = generate_g1_affine_points(&mut rng, n);
-                                let scalars = generate_scalars(&mut rng, n);
-                                (bases, scalars)
-                            })
-                            .collect::<Vec<_>>()
-                    };
-
-                    b.iter_batched(
-                        setup,
-                        |test_data| {
-                            test_data.par_iter().for_each(|(bases, scalars)| {
-                                let result =
-                                    g1_msm_gpu(black_box(bases), black_box(scalars), gpu_index);
-                                black_box(result);
-                            });
-                        },
-                        BatchSize::LargeInput,
-                    );
-                });
-            }
-        }
-
-        let params: CryptoParametersRecord<u64> = CryptoParametersRecord::default();
-        write_to_json(
-            &bench_id,
-            params,
-            "MSM_BLS12_446_G1_CUDA",
-            bench_shortname,
-            &OperatorType::Atomic,
-            64,     // bit_size for curve scalar operations
-            vec![], // decomposition_basis not applicable for MSM
-        );
-    }
-    group.finish();
-}
-
-/// Benchmark GPU MSM for G2 points via `tfhe_zk_pok::gpu::g2_msm_gpu`
-#[cfg(feature = "gpu-experimental-zk")]
-fn bench_gpu_g2_msm(c: &mut Criterion) {
-    use tfhe_zk_pok::gpu::{g2_msm_gpu, select_gpu_for_msm};
-
-    let curve_name = "bls12_446";
-    let subgroup_name = "G2";
-    let bench_name = format!("zk::cuda::msm::{curve_name}::{subgroup_name}");
-
-    let mut group = c.benchmark_group(&bench_name);
-    group.sample_size(10);
-    group.measurement_time(Duration::from_secs(30));
-
-    let gpu_index = select_gpu_for_msm();
-
-    for size in [100, 1000, 2048, 4096, 10000].iter() {
-        let n = *size;
-        let bench_id;
-        let bench_shortname = "zk::cuda::msm::bls12_446::g2";
-
-        match get_bench_type() {
-            BenchmarkType::Latency => {
-                let mut rng = StdRng::seed_from_u64(42);
-                let bases = generate_g2_affine_points(&mut rng, n);
-                let scalars = generate_scalars(&mut rng, n);
-
-                bench_id = format!("{bench_name}::{n}");
-                group.bench_with_input(&bench_id, &n, |b, _| {
-                    b.iter(|| {
-                        let result = g2_msm_gpu(black_box(&bases), black_box(&scalars), gpu_index);
-                        black_box(result)
-                    });
-                });
-            }
-            BenchmarkType::Throughput => {
-                let elements = msm_throughput_elements(n);
-                group.throughput(Throughput::Elements(elements));
-
-                bench_id = format!("{bench_name}::throughput::{n}");
-                group.bench_with_input(&bench_id, &n, |b, _| {
-                    let setup = || {
-                        (0..elements)
-                            .into_par_iter()
-                            .map(|i| {
-                                let mut rng = StdRng::seed_from_u64(42 + i);
-                                let bases = generate_g2_affine_points(&mut rng, n);
-                                let scalars = generate_scalars(&mut rng, n);
-                                (bases, scalars)
-                            })
-                            .collect::<Vec<_>>()
-                    };
-
-                    b.iter_batched(
-                        setup,
-                        |test_data| {
-                            test_data.par_iter().for_each(|(bases, scalars)| {
-                                let result =
-                                    g2_msm_gpu(black_box(bases), black_box(scalars), gpu_index);
-                                black_box(result);
-                            });
-                        },
-                        BatchSize::LargeInput,
-                    );
-                });
-            }
-        }
-
-        let params: CryptoParametersRecord<u64> = CryptoParametersRecord::default();
-        write_to_json(
-            &bench_id,
-            params,
-            "MSM_BLS12_446_G2_CUDA",
-            bench_shortname,
-            &OperatorType::Atomic,
-            64,     // bit_size for curve scalar operations
-            vec![], // decomposition_basis not applicable for MSM
-        );
-    }
-    group.finish();
-}
-
-// CPU benchmarks (always available)
-criterion_group!(benches_cpu, bench_cpu_g1_msm, bench_cpu_g2_msm,);
-
-// GPU benchmarks (only when GPU feature is enabled)
-#[cfg(feature = "gpu-experimental-zk")]
-criterion_group!(benches_gpu, bench_gpu_g1_msm, bench_gpu_g2_msm,);
-
-// Conditionally include GPU benchmarks in main
-#[cfg(feature = "gpu-experimental-zk")]
-criterion_main!(benches_cpu, benches_gpu);
-
-#[cfg(not(feature = "gpu-experimental-zk"))]
-criterion_main!(benches_cpu);
--- a/tfhe-benchmark/src/utilities.rs
+++ b/tfhe-benchmark/src/utilities.rs
@@ -1,4 +1,3 @@
-use criterion::Criterion;
 use serde::Serialize;
 use std::path::PathBuf;
 use std::sync::OnceLock;
@@ -549,34 +548,6 @@ where
    factor as usize
 }

-/// This function aims to prevent the setup function from running.
-/// `Gag` is used here to suppress the temporary output noise from Criterion.
-/// We use a minimal Criterion configuration to retrieve information about the current filter setup.
-/// The function returns a boolean indicating whether the current `bench_id` should be executed or
-/// not.
-pub fn will_this_bench_run(bench_group: &str, bench_id: &str) -> bool {
-    let mut c = Criterion::default()
-        .configure_from_args()
-        .sample_size(10)
-        .output_directory(&std::env::temp_dir())
-        .warm_up_time(std::time::Duration::from_nanos(1))
-        .measurement_time(std::time::Duration::from_nanos(1))
-        .without_plots();
-    let mut will_run = false;
-    {
-        use gag::Gag;
-        let _print_gag = Gag::stdout().unwrap();
-        let _err_gag = Gag::stderr().unwrap();
-        c.benchmark_group(bench_group)
-            .bench_function(bench_id, |b| {
-                b.iter(|| {
-                    will_run = true;
-                });
-            });
-    }
-    will_run
-}
-
 #[cfg(feature = "gpu")]
 mod cuda_utils {
    use tfhe::core_crypto::entities::{
--- a/tfhe-csprng/Cargo.toml
+++ b/tfhe-csprng/Cargo.toml
@@ -4,8 +4,8 @@ version = "0.8.0"
 edition = "2021"
 license = "BSD-3-Clause-Clear"
 description = "Cryptographically Secure PRNG used in the TFHE-rs library."
-homepage = "https://zama.org/"
-documentation = "https://docs.zama.org/tfhe-rs"
+homepage = "https://zama.ai/"
+documentation = "https://docs.zama.ai/tfhe-rs"
 repository = "https://github.com/zama-ai/tfhe-rs"
 readme = "README.md"
 keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
--- a/tfhe-csprng/README.md
+++ b/tfhe-csprng/README.md
@@ -20,4 +20,4 @@ RUSTFLAGS="-Ctarget-cpu=native" cargo bench
 ## License

 This software is distributed under the BSD-3-Clause-Clear license. If you have any questions,
-please contact us at `hello@zama.org`.
+please contact us at `hello@zama.ai`.
--- a/tfhe-fft/Cargo.toml
+++ b/tfhe-fft/Cargo.toml
@@ -6,7 +6,7 @@ description = "tfhe-fft is a pure Rust high performance fast Fourier transform l
 readme = "README.md"
 repository = "https://github.com/zama-ai/tfhe-rs"
 license = "BSD-3-Clause-Clear"
-homepage = "https://zama.org/"
+homepage = "https://zama.ai/"
 keywords = ["fft"]

 [dependencies]
--- a/tfhe-fft/README.md
+++ b/tfhe-fft/README.md
@@ -1,6 +1,6 @@
 tfhe-fft is a pure Rust high performance fast Fourier transform library
 that processes vectors of sizes that are powers of two. It was made to be used
-as a backend in Zama's [TFHE-rs](https://docs.zama.org/tfhe-rs) library.
+as a backend in Zama's [TFHE-rs](https://docs.zama.ai/tfhe-rs) library.

 This library provides two FFT modules:
 - The ordered module FFT applies a forward/inverse FFT that takes its input in standard
@@ -69,7 +69,7 @@ fn main() {

 ## Links

- - [Zama](https://www.zama.org/)
+ - [Zama](https://www.zama.ai/)
 - [TFHE-rs Sources](https://github.com/zama-ai/tfhe-rs)

 ## License
@@ -81,4 +81,4 @@ prototyping purposes, as well as for your personal projects.
 If you want to use tfhe-fft in a commercial product however, you will need to
 purchase a separate commercial licence.

-If you have any questions, please contact us at `hello@zama.org.`
+If you have any questions, please contact us at `hello@zama.ai`.
--- a/tfhe-ntt/Cargo.toml
+++ b/tfhe-ntt/Cargo.toml
@@ -6,7 +6,7 @@ description = "tfhe-ntt is a pure Rust high performance number theoretic transfo
 readme = "README.md"
 repository = "https://github.com/zama-ai/tfhe-rs"
 license = "BSD-3-Clause-Clear"
-homepage = "https://zama.org/"
+homepage = "https://zama.ai/"
 keywords = ["ntt"]
 rust-version.workspace = true

--- a/tfhe-zk-pok/Cargo.toml
+++ b/tfhe-zk-pok/Cargo.toml
@@ -3,8 +3,8 @@ name = "tfhe-zk-pok"
 version = "0.8.0"
 edition = "2021"
 keywords = ["zero", "knowledge", "proof", "vector-commitments"]
-homepage = "https://zama.org/"
-documentation = "https://docs.zama.org/tfhe-rs"
+homepage = "https://zama.ai/"
+documentation = "https://docs.zama.ai/tfhe-rs"
 repository = "https://github.com/zama-ai/tfhe-rs"
 license = "BSD-3-Clause-Clear"
 description = "tfhe-zk-pok: An implementation of zero-knowledge proofs of encryption for TFHE."
@@ -14,8 +14,8 @@ rust-version.workspace = true

 [dependencies]
 ark-bls12-381 = "0.5.0"
-ark-ec = { workspace = true, features = ["parallel"] }
-ark-ff = { workspace = true, features = ["parallel"] }
+ark-ec = { version = "0.5.0", features = ["parallel"] }
+ark-ff = { version = "0.5.0", features = ["parallel"] }
 ark-poly = { version = "0.5.0", features = ["parallel"] }
 rand = { workspace = true }
 rayon = { workspace = true }
@@ -24,13 +24,9 @@ serde = { workspace = true, features = ["default", "derive"] }
 zeroize = "1.7.0"
 num-bigint = "0.4.5"
 tfhe-versionable = { version = "0.7.0", path = "../utils/tfhe-versionable" }
-zk-cuda-backend = { version = "0.1.0", path = "../backends/zk-cuda-backend", optional = true }
-tfhe-cuda-backend = { version = "0.13.0", path = "../backends/tfhe-cuda-backend", optional = true }
-itertools.workspace = true

 [features]
 experimental = []
-gpu-experimental = ["dep:zk-cuda-backend", "dep:tfhe-cuda-backend"]

 [dev-dependencies]
 serde_json = "~1.0"
--- a/tfhe-zk-pok/benches/pke_v1.rs
+++ b/tfhe-zk-pok/benches/pke_v1.rs
@@ -91,110 +91,5 @@ fn bench_pke_v1_verify(c: &mut Criterion) {
    }
 }

-#[cfg(feature = "gpu-experimental")]
-mod gpu {
-    use super::*;
-    use tfhe_zk_pok::proofs::pke;
-
-    pub fn bench_pke_v1_prove_gpu(c: &mut Criterion) {
-        let bench_shortname = "pke_zk_proof_v1";
-        let bench_name = format!("tfhe_zk_pok::cuda::{bench_shortname}");
-        let mut bench_group = c.benchmark_group(&bench_name);
-        bench_group
-            .sample_size(15)
-            .measurement_time(std::time::Duration::from_secs(60));
-
-        let rng = &mut rand::thread_rng();
-
-        for (params, param_name) in [
-            (PKEV1_TEST_PARAMS, "PKEV1_TEST_PARAMS"),
-            (PKEV2_TEST_PARAMS, "PKEV2_TEST_PARAMS"),
-        ] {
-            let (public_param, public_commit, private_commit, metadata) = init_params_v1(params);
-            let effective_t = params.t >> 1;
-            let bits = (params.k as u32) * effective_t.ilog2();
-
-            for load in [ComputeLoad::Proof, ComputeLoad::Verify] {
-                let bench_id = format!("{bench_name}::{param_name}_{bits}_bits_packed_{load}");
-
-                let seed: u128 = rng.gen();
-
-                bench_group.bench_function(&bench_id, |b| {
-                    b.iter(|| {
-                        pke::gpu::prove(
-                            (&public_param, &public_commit),
-                            &private_commit,
-                            &metadata,
-                            load,
-                            &seed.to_le_bytes(),
-                        )
-                    })
-                });
-
-                write_to_json(&bench_id, params, param_name, bench_shortname);
-            }
-        }
-    }
-
-    pub fn bench_pke_v1_verify_gpu(c: &mut Criterion) {
-        let bench_shortname = "pke_zk_verify_v1";
-        let bench_name = format!("tfhe_zk_pok::cuda::{bench_shortname}");
-        let mut bench_group = c.benchmark_group(&bench_name);
-        bench_group
-            .sample_size(15)
-            .measurement_time(std::time::Duration::from_secs(60));
-
-        let rng = &mut rand::thread_rng();
-
-        for (params, param_name) in [
-            (PKEV1_TEST_PARAMS, "PKEV1_TEST_PARAMS"),
-            (PKEV2_TEST_PARAMS, "PKEV2_TEST_PARAMS"),
-        ] {
-            let (public_param, public_commit, private_commit, metadata) = init_params_v1(params);
-            let effective_t = params.t >> 1;
-            let bits = (params.k as u32) * effective_t.ilog2();
-
-            for load in [ComputeLoad::Proof, ComputeLoad::Verify] {
-                let bench_id = format!("{bench_name}::{param_name}_{bits}_bits_packed_{load}");
-
-                let seed: u128 = rng.gen();
-
-                // Use GPU prove to generate the proof
-                let proof = pke::gpu::prove(
-                    (&public_param, &public_commit),
-                    &private_commit,
-                    &metadata,
-                    load,
-                    &seed.to_le_bytes(),
-                );
-
-                bench_group.bench_function(&bench_id, |b| {
-                    b.iter(|| {
-                        pke::gpu::verify(&proof, (&public_param, &public_commit), &metadata)
-                            .unwrap();
-                    })
-                });
-
-                write_to_json(&bench_id, params, param_name, bench_shortname);
-            }
-        }
-    }
-}
-
 criterion_group!(benches_pke_v1, bench_pke_v1_verify, bench_pke_v1_prove);
-
-#[cfg(feature = "gpu-experimental")]
-use gpu::{bench_pke_v1_prove_gpu, bench_pke_v1_verify_gpu};
-
-#[cfg(feature = "gpu-experimental")]
-criterion_group!(
-    benches_pke_v1_gpu,
-    bench_pke_v1_verify_gpu,
-    bench_pke_v1_prove_gpu
-);
-
-#[cfg(feature = "gpu-experimental")]
-criterion_main!(benches_pke_v1, benches_pke_v1_gpu);
-
-#[cfg(not(feature = "gpu-experimental"))]
 criterion_main!(benches_pke_v1);
--- a/tfhe-zk-pok/benches/pke_v2.rs
+++ b/tfhe-zk-pok/benches/pke_v2.rs
@@ -107,130 +107,5 @@ fn bench_pke_v2_verify(c: &mut Criterion) {
    }
 }

-#[cfg(feature = "gpu-experimental")]
-mod gpu {
-    use super::*;
-    use tfhe_zk_pok::proofs::pke_v2;
-
-    pub fn bench_pke_v2_prove_gpu(c: &mut Criterion) {
-        let bench_shortname = "pke_zk_proof_v2";
-        let bench_name = format!("tfhe_zk_pok::cuda::{bench_shortname}");
-        let mut bench_group = c.benchmark_group(&bench_name);
-        bench_group
-            .sample_size(15)
-            .measurement_time(std::time::Duration::from_secs(60));
-
-        let rng = &mut rand::thread_rng();
-
-        for ((params, param_name), load, bound) in itertools::iproduct!(
-            [
-                (PKEV1_TEST_PARAMS, "PKEV1_TEST_PARAMS"),
-                (PKEV2_TEST_PARAMS, "PKEV2_TEST_PARAMS"),
-            ],
-            [ComputeLoad::Proof, ComputeLoad::Verify],
-            [Bound::CS, Bound::GHL]
-        ) {
-            let (public_param, public_commit, private_commit, metadata) =
-                init_params_v2(params, bound);
-            let effective_t = params.t >> 1;
-            let bits = (params.k as u32) * effective_t.ilog2();
-
-            let bench_id =
-                format!("{bench_name}::{param_name}_{bits}_bits_packed_{load}_{bound:?}");
-            println!("{bench_id}");
-
-            let seed: u128 = rng.gen();
-
-            bench_group.bench_function(&bench_id, |b| {
-                b.iter(|| {
-                    pke_v2::gpu::prove(
-                        (&public_param, &public_commit),
-                        &private_commit,
-                        &metadata,
-                        load,
-                        &seed.to_le_bytes(),
-                    )
-                })
-            });
-
-            write_to_json(&bench_id, params, param_name, bench_shortname);
-        }
-    }
-
-    pub fn bench_pke_v2_verify_gpu(c: &mut Criterion) {
-        let bench_shortname = "pke_zk_verify_v2";
-        let bench_name = format!("tfhe_zk_pok::cuda::{bench_shortname}");
-        let mut bench_group = c.benchmark_group(&bench_name);
-        bench_group
-            .sample_size(15)
-            .measurement_time(std::time::Duration::from_secs(60));
-
-        let rng = &mut rand::thread_rng();
-
-        for ((params, param_name), load, bound, pairing_mode) in itertools::iproduct!(
-            [
-                (PKEV1_TEST_PARAMS, "PKEV1_TEST_PARAMS"),
-                (PKEV2_TEST_PARAMS, "PKEV2_TEST_PARAMS"),
-            ],
-            [ComputeLoad::Proof, ComputeLoad::Verify],
-            [Bound::CS, Bound::GHL],
-            [
-                VerificationPairingMode::TwoSteps,
-                VerificationPairingMode::Batched
-            ]
-        ) {
-            let (public_param, public_commit, private_commit, metadata) =
-                init_params_v2(params, bound);
-            let effective_t = params.t >> 1;
-            let bits = (params.k as u32) * effective_t.ilog2();
-
-            let bench_id = format!(
-                "{bench_name}::{param_name}_{bits}_bits_packed_{load}_{bound:?}_{pairing_mode:?}"
-            );
-            println!("{bench_id}");
-
-            let seed: u128 = rng.gen();
-
-            // Use GPU prove to generate the proof
-            let proof = pke_v2::gpu::prove(
-                (&public_param, &public_commit),
-                &private_commit,
-                &metadata,
-                load,
-                &seed.to_le_bytes(),
-            );
-
-            bench_group.bench_function(&bench_id, |b| {
-                b.iter(|| {
-                    pke_v2::gpu::verify(
-                        &proof,
-                        (&public_param, &public_commit),
-                        &metadata,
-                        pairing_mode,
-                    )
-                    .unwrap();
-                })
-            });
-
-            write_to_json(&bench_id, params, param_name, bench_shortname);
-        }
-    }
-}
-
 criterion_group!(benches_pke_v2, bench_pke_v2_verify, bench_pke_v2_prove);
-
-#[cfg(feature = "gpu-experimental")]
-use gpu::{bench_pke_v2_prove_gpu, bench_pke_v2_verify_gpu};
-
-#[cfg(feature = "gpu-experimental")]
-criterion_group!(
-    benches_pke_v2_gpu,
-    bench_pke_v2_verify_gpu,
-    bench_pke_v2_prove_gpu
-);
-
-#[cfg(feature = "gpu-experimental")]
-criterion_main!(benches_pke_v2, benches_pke_v2_gpu);
-
-#[cfg(not(feature = "gpu-experimental"))]
 criterion_main!(benches_pke_v2);
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Andrei Stoian	3cf0d7895f	fix(gpu): vector find lut	2026-02-25 17:34:22 +01:00
Andrei Stoian	07374cf3b7	fix(gpu): vector find lut number	2026-02-25 11:39:04 +01:00
Andrei Stoian	8493b609f1	fix(gpu): multiplication fix lut	2026-02-24 22:12:34 +01:00
Andrei Stoian	db6f7eec8f	fix(gpu): protect lut apply to subset on shift	2026-02-24 18:15:55 +01:00
Andrei Stoian	f55d31f38c	fix(gpu): avoid broadcast in comparison	2026-02-24 17:23:30 +01:00
Andrei Stoian	9041c8e602	fix(gpu): wrong assert	2026-02-24 10:53:00 +01:00
Andrei Stoian	f52b342db9	fix(gpu): protect lut re-use	2026-02-24 10:01:45 +01:00