revert optim flags

fix
draft
2026-04-28 03:01:21 -04:00 · 2025-05-13 10:05:23 +02:00 · 2025-05-12 17:35:31 +02:00 · 2025-05-12 15:38:19 +02:00
364 changed files with 1248 additions and 46640 deletions
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@@ -6,7 +6,6 @@ self-hosted-runner:
    - large_windows_16_latest
    - large_ubuntu_16
    - large_ubuntu_16-22.04
-    - v80-desktop
 # Configuration variables in array of strings defined in your repository or
 # organization. `null` means disabling configuration variables check.
 # Empty array means no configuration variable is allowed.
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -23,8 +23,8 @@ on:
  workflow_dispatch:
  pull_request:

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  setup-instance:
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -24,8 +24,8 @@ on:
  workflow_dispatch:
  pull_request:

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  should-run:
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -30,8 +30,8 @@ on:
    branches:
      - main

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  should-run:
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -30,8 +30,8 @@ on:
    branches:
      - main

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  should-run:
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -27,8 +27,8 @@ on:
    # Nightly tests @ 1AM after each work day
    - cron: "0 1 * * MON-FRI"

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  should-run:
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -23,8 +23,8 @@ on:
  pull_request:
    types: [ labeled ]

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  setup-instance:
--- a/.github/workflows/benchmark_boolean.yml
+++ b/.github/workflows/benchmark_boolean.yml
@@ -93,7 +93,7 @@ jobs:

      - name: Parse key sizes results
        run: |
-          python3 ./ci/benchmark_parser.py tfhe-benchmark/boolean_key_sizes.csv "${RESULTS_FILENAME}" \
+          python3 ./ci/benchmark_parser.py tfhe/boolean_key_sizes.csv "${RESULTS_FILENAME}" \
          --object-sizes \
          --append-results

--- a/.github/workflows/benchmark_dex.yml
+++ b/.github/workflows/benchmark_dex.yml
@@ -97,13 +97,13 @@ jobs:

      - name: Parse swap request PBS counts
        run: |
-          python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_request_pbs_count.csv "${RESULTS_FILENAME}" \
+          python3 ./ci/benchmark_parser.py tfhe/dex_swap_request_pbs_count.csv "${RESULTS_FILENAME}" \
          --object-sizes \
          --append-results

      - name: Parse swap claim PBS counts
        run: |
-          python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_claim_pbs_count.csv "${RESULTS_FILENAME}" \
+          python3 ./ci/benchmark_parser.py tfhe/dex_swap_claim_pbs_count.csv "${RESULTS_FILENAME}" \
          --object-sizes \
          --append-results

--- a/.github/workflows/benchmark_erc20.yml
+++ b/.github/workflows/benchmark_erc20.yml
@@ -98,7 +98,7 @@ jobs:

      - name: Parse PBS counts
        run: |
-          python3 ./ci/benchmark_parser.py tfhe-benchmark/erc20_pbs_count.csv "${RESULTS_FILENAME}" \
+          python3 ./ci/benchmark_parser.py tfhe/erc20_pbs_count.csv "${RESULTS_FILENAME}" \
          --object-sizes \
          --append-results

--- a/.github/workflows/benchmark_gpu.yml
+++ b/.github/workflows/benchmark_gpu.yml
@@ -10,14 +10,13 @@ on:
        type: choice
        options:
          - "l40 (n3-L40x1)"
-          - "4-l40 (n3-L40x4)"
-          - "multi-a100-nvlink (n3-A100x8-NVLink)"
          - "single-h100 (n3-H100x1)"
          - "2-h100 (n3-H100x2)"
          - "4-h100 (n3-H100x4)"
          - "multi-h100 (n3-H100x8)"
          - "multi-h100-nvlink (n3-H100x8-NVLink)"
          - "multi-h100-sxm5 (n3-H100x8-SXM5)"
+          - "multi-a100-nvlink (n3-A100x8-NVLink)"
      command:
        description: "Benchmark command to run"
        type: choice
--- a/.github/workflows/benchmark_gpu_4090.yml
+++ b/.github/workflows/benchmark_gpu_4090.yml
@@ -22,8 +22,8 @@ on:
    # Weekly benchmarks will be triggered each Friday at 9p.m.
    - cron: "0 21 * * 5"

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  cuda-integer-benchmarks:
--- a/.github/workflows/benchmark_gpu_common.yml
+++ b/.github/workflows/benchmark_gpu_common.yml
@@ -275,11 +275,11 @@ jobs:

      - name: Run benchmarks
        run: |
-          make BENCH_OP_FLAVOR="${OP_FLAVOR}" BENCH_TYPE="${BENCH_TYPE}" BENCH_PARAM_TYPE="${BENCH_PARAMS_TYPE}" bench_"${BENCH_COMMAND}"_gpu
+          make BENCH_OP_FLAVOR="${OP_FLAVOR}" BENCH_TYPE="${BENCH_TYPE}" BENCH_PARAM_TYPE="${PARAMS_TYPE}" bench_"${COMMAND}"_gpu
        env:
          OP_FLAVOR: ${{ matrix.op_flavor }}
          BENCH_TYPE: ${{ matrix.bench_type }}
-          BENCH_PARAMS_TYPE: ${{ matrix.params_type }}
+          BENCH_PARAM_TYPE: ${{ matrix.params_type }}
          BENCH_COMMAND: ${{ matrix.command }}

      - name: Parse results
--- a/.github/workflows/benchmark_gpu_dex.yml
+++ b/.github/workflows/benchmark_gpu_dex.yml
@@ -10,8 +10,6 @@ on:
        type: choice
        options:
          - "l40 (n3-L40x1)"
-          - "4-l40 (n3-L40x4)"
-          - "multi-a100-nvlink (n3-A100x8-NVLink)"
          - "single-h100 (n3-H100x1)"
          - "2-h100 (n3-H100x2)"
          - "4-h100 (n3-H100x4)"
--- a/.github/workflows/benchmark_gpu_erc20.yml
+++ b/.github/workflows/benchmark_gpu_erc20.yml
@@ -10,8 +10,6 @@ on:
        type: choice
        options:
          - "l40 (n3-L40x1)"
-          - "4-l40 (n3-L40x4)"
-          - "multi-a100-nvlink (n3-A100x8-NVLink)"
          - "single-h100 (n3-H100x1)"
          - "2-h100 (n3-H100x2)"
          - "4-h100 (n3-H100x4)"
--- a/.github/workflows/benchmark_hpu_integer.yml
+++ b/.github/workflows/benchmark_hpu_integer.yml
@@ -1,88 +0,0 @@
-# Run all integer benchmarks on a permanent HPU instance and return parsed results to Slab CI bot.
-name: Hpu Integer Benchmarks
-
-on:
-  workflow_dispatch:
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-
-permissions: {}
-
-jobs:
-  integer-benchmarks-hpu:
-    name: Execute integer & erc20 benchmarks for HPU backend
-    runs-on: v80-desktop
-    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
-      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-    timeout-minutes: 1440  # 24 hours
-    steps:
-      # Needed as long as hw_regmap repository is private
-      - name: Configure SSH
-        uses: webfactory/ssh-agent@a6f90b1f127823b31d4d4a8d96047790581349bd # v0.9.1
-        with:
-          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-      - name: Get benchmark details
-        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
-        with:
-          toolchain: nightly
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          repository: zama-ai/slab
-          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-      - name: Run benchmarks
-        run: |
-          make bench_integer_hpu
-          make bench_hlapi_erc20_hpu
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
-          --database tfhe_rs \
-          --hardware "hpu_x1" \
-          --backend hpu \
-          --project-version "${COMMIT_HASH}" \
-          --branch "${REF_NAME}" \
-          --commit-date "${COMMIT_DATE}" \
-          --bench-date "${BENCH_DATE}" \
-          --walk-subdirs
-        env:
-          REF_NAME: ${{ github.ref_name }}
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
-        with:
-          name: ${{ github.sha }}_integer_benchmarks
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
--- a/.github/workflows/benchmark_shortint.yml
+++ b/.github/workflows/benchmark_shortint.yml
@@ -137,7 +137,7 @@ jobs:
      - name: Parse key sizes results
        if: matrix.op_flavor == 'default'
        run: |
-          python3 ./ci/benchmark_parser.py tfhe-benchmark/shortint_key_sizes.csv "${RESULTS_FILENAME}" \
+          python3 ./ci/benchmark_parser.py tfhe/shortint_key_sizes.csv "${RESULTS_FILENAME}" \
          --object-sizes \
          --append-results

--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -146,7 +146,7 @@ jobs:
      - name: Parse results
        run: |
          make parse_wasm_benchmarks
-          python3 ./ci/benchmark_parser.py tfhe-benchmark/wasm_pk_gen.csv "${RESULTS_FILENAME}" \
+          python3 ./ci/benchmark_parser.py tfhe/wasm_pk_gen.csv "${RESULTS_FILENAME}" \
          --database tfhe_rs \
          --hardware "m6i.4xlarge" \
          --project-version "${COMMIT_HASH}" \
@@ -154,7 +154,7 @@ jobs:
          --commit-date "${COMMIT_DATE}" \
          --bench-date "${BENCH_DATE}" \
          --key-gen
-          rm tfhe-benchmark/wasm_pk_gen.csv
+          rm tfhe/wasm_pk_gen.csv
        env:
          REF_NAME: ${{ github.ref_name }}

@@ -167,7 +167,7 @@ jobs:
      - name: Parse key and ciphertext sizes results
        if:  matrix.browser == 'chrome'
        run: |
-          python3 ./ci/benchmark_parser.py tfhe-benchmark/hlapi_cpk_and_cctl_sizes.csv "${RESULTS_FILENAME}" \
+          python3 ./ci/benchmark_parser.py tfhe/hlapi_cpk_and_cctl_sizes.csv "${RESULTS_FILENAME}" \
          --key-gen \
          --append-results

--- a/.github/workflows/benchmark_zk_pke.yml
+++ b/.github/workflows/benchmark_zk_pke.yml
@@ -184,7 +184,7 @@ jobs:

      - name: Parse CRS sizes results
        run: |
-          python3 ./ci/benchmark_parser.py tfhe-benchmark/pke_zk_crs_sizes.csv "${RESULTS_FILENAME}" \
+          python3 ./ci/benchmark_parser.py tfhe/pke_zk_crs_sizes.csv "${RESULTS_FILENAME}" \
          --object-sizes \
          --append-results

--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -14,8 +14,8 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref }}
  cancel-in-progress: true

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  cargo-builds:
@@ -94,10 +94,5 @@ jobs:
        run: |
          make build_tfhe_coverage

-      - name: Run Hpu pcc checks
-        if: ${{ contains(matrix.os, 'ubuntu') }}
-        run: |
-          make pcc_hpu
-
      # The wasm build check is a bit annoying to set-up here and is done during the tests in
      # aws_tfhe_tests.yml
--- a/.github/workflows/cargo_build_tfhe_fft.yml
+++ b/.github/workflows/cargo_build_tfhe_fft.yml
@@ -12,8 +12,8 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref }}
  cancel-in-progress: true

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  cargo-builds-fft:
--- a/.github/workflows/cargo_build_tfhe_ntt.yml
+++ b/.github/workflows/cargo_build_tfhe_ntt.yml
@@ -12,8 +12,8 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref }}
  cancel-in-progress: true

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  cargo-builds-ntt:
--- a/.github/workflows/cargo_test_fft.yml
+++ b/.github/workflows/cargo_test_fft.yml
@@ -16,8 +16,8 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref }}
  cancel-in-progress: true

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  should-run:
@@ -51,7 +51,7 @@ jobs:
    runs-on: ${{ matrix.runner_type }}
    strategy:
      matrix:
-        runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
+        runner_type: [ubuntu-latest, macos-latest, windows-latest]
      fail-fast: false
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -82,7 +82,7 @@ jobs:
    runs-on: ${{ matrix.runner_type }}
    strategy:
      matrix:
-        runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
+        runner_type: [ubuntu-latest, macos-latest, windows-latest]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -16,8 +16,8 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref }}
  cancel-in-progress: true

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  should-run:
@@ -51,7 +51,7 @@ jobs:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
-        os: [ ubuntu-latest, macos-latest, windows-latest ]
+        os: [ubuntu-latest, macos-latest, windows-latest]
      fail-fast: false
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -77,7 +77,7 @@ jobs:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
-        os: [ ubuntu-latest, macos-latest, windows-latest ]
+        os: [ubuntu-latest, macos-latest, windows-latest]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
--- a/.github/workflows/check_commit.yml
+++ b/.github/workflows/check_commit.yml
@@ -3,9 +3,8 @@ name: Check commit and PR compliance
 on:
  pull_request:

-permissions:
-  contents: read
-  pull-requests: read # Permission needed to scan commits in a pull-request
+
+permissions: {}

 jobs:
  check-commit-pr:
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -9,8 +9,7 @@ env:
  ACTIONLINT_CHECKSUM: "023070a287cd8cccd71515fedc843f1985bf96c436b7effaecce67290e7e0757"
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

-permissions:
-  contents: read
+permissions: {}

 jobs:
  lint-check:
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -17,8 +17,8 @@ on:
  workflow_dispatch:
  # Code coverage workflow is only run via workflow_dispatch event since execution duration is not stabilized yet.

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  setup-instance:
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -21,8 +21,8 @@ on:
  pull_request:
    types: [ labeled ]

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  setup-instance:
--- a/.github/workflows/gpu_4090_tests.yml
+++ b/.github/workflows/gpu_4090_tests.yml
@@ -22,8 +22,8 @@ on:
    # Nightly tests @ 1AM after each work day
    - cron: "0 1 * * MON-FRI"

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  cuda-tests-linux:
--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -25,8 +25,8 @@ on:
  pull_request:
    types: [ labeled ]

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  should-run:
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -24,8 +24,8 @@ on:
  workflow_dispatch:
  pull_request:

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  should-run:
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -25,8 +25,8 @@ on:
  pull_request:
    types: [ labeled ]

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  should-run:
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -19,8 +19,8 @@ on:
    # Nightly tests will be triggered each evening 8p.m.
    - cron: "0 20 * * *"

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  setup-instance:
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -23,8 +23,8 @@ env:
 on:
  pull_request:

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  setup-instance:
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -25,8 +25,8 @@ on:
  pull_request:
    types: [ labeled ]

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  should-run:
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -25,8 +25,9 @@ on:
  pull_request:
    types: [ labeled ]

-permissions:
-  contents: read
+
+
+permissions: {}

 jobs:
  should-run:
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -29,8 +29,8 @@ on:
    # Nightly tests @ 1AM after each work day
    - cron: "0 1 * * MON-FRI"

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  should-run:
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -25,8 +25,9 @@ on:
  pull_request:
    types: [ labeled ]

-permissions:
-  contents: read
+
+
+permissions: {}

 jobs:
  should-run:
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -25,8 +25,8 @@ on:
  pull_request:
    types: [ labeled ]

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  should-run:
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -29,8 +29,8 @@ on:
    # Nightly tests @ 1AM after each work day
    - cron: "0 1 * * MON-FRI"

-permissions:
-  contents: read
+
+permissions: {}

 jobs:
  should-run:
--- a/.github/workflows/hpu_hlapi_tests.yml
+++ b/.github/workflows/hpu_hlapi_tests.yml
@@ -1,73 +0,0 @@
-# Test tfhe-fft
-name: Cargo Test HLAPI HPU
-
-on:
-  pull_request:
-  push:
-    branches:
-      - main
-
-env:
-  CARGO_TERM_COLOR: always
-  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref }}
-  cancel-in-progress: true
-
-
-permissions: { }
-
-jobs:
-  should-run:
-    runs-on: ubuntu-latest
-    permissions:
-      pull-requests: read
-    outputs:
-      hpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.hpu_any_changed }}
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
-
-      - name: Check for file changes
-        id: changed-files
-        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
-        with:
-          files_yaml: |
-            hpu:
-              - tfhe/Cargo.toml
-              - Makefile
-              - backends/tfhe-hpu-backend/**
-              - mockups/tfhe-hpu-mockup/**
-
-  cargo-tests-hpu:
-    needs: should-run
-    if: needs.should-run.outputs.hpu_test == 'true'
-    runs-on: large_ubuntu_16
-    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
-
-      - name: Install Rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
-        with:
-          toolchain: stable
-          override: true
-
-      - name: Install Just
-        run: |
-          cargo install just
-
-      - name: Test HLAPI HPU
-        run: |
-          source setup_hpu.sh
-          just -f mockups/tfhe-hpu-mockup/Justfile  BUILD_PROFILE=release mockup &
-          make HPU_CONFIG=sim test_high_level_api_hpu
-     
--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -27,8 +27,7 @@ concurrency:
  group: ${{ github.workflow_ref }}
  cancel-in-progress: true

-permissions:
-  contents: read
+permissions: {}

 jobs:
  cargo-builds-m1:
--- a/.github/workflows/make_release.yml
+++ b/.github/workflows/make_release.yml
@@ -110,10 +110,7 @@ jobs:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
        run: |
-          # DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish 
-          # would fail. This is safe since DRY_RUN is handled in the env section above.
-          # shellcheck disable=SC2086
-          cargo publish -p tfhe --token "${CRATES_TOKEN}" ${DRY_RUN}
+          cargo publish -p tfhe --token "${CRATES_TOKEN}" "${DRY_RUN}"

      - name: Generate hash
        id: published_hash
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -159,10 +159,7 @@ jobs:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
        run: |
-          # DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish 
-          # would fail. This is safe since DRY_RUN is handled in the env section above.
-          # shellcheck disable=SC2086
-          cargo publish -p tfhe-cuda-backend --token "${CRATES_TOKEN}" ${DRY_RUN}
+          cargo publish -p tfhe-cuda-backend --token "${CRATES_TOKEN}" "${DRY_RUN}"

      - name: Generate hash
        id: published_hash
--- a/.github/workflows/make_release_hpu.yml
+++ b/.github/workflows/make_release_hpu.yml
@@ -1,105 +0,0 @@
-name: Publish HPU release
-
-on:
-  workflow_dispatch:
-    inputs:
-      dry_run:
-        description: "Dry-run"
-        type: boolean
-        default: true
-
-env:
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-permissions: {}
-
-jobs:
-  verify_tag:
-    uses: ./.github/workflows/verify_tagged_commit.yml
-    secrets:
-      RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
-      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
-
-  package:
-    runs-on: ubuntu-latest
-    needs: verify_tag
-    outputs:
-      hash: ${{ steps.hash.outputs.hash }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      - name: Prepare package
-        run: |
-          cargo package -p tfhe-hpu-backend
-      - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
-        with:
-          name: crate
-          path: target/package/*.crate
-      - name: generate hash
-        id: hash
-        run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
-
-  provenance:
-    if: ${{ !inputs.dry_run  }}
-    needs: [package]
-    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
-    permissions:
-      # Needed to detect the GitHub Actions environment
-      actions: read
-      # Needed to create the provenance via GitHub OIDC
-      id-token: write
-      # Needed to upload assets/artifacts
-      contents: write
-    with:
-      # SHA-256 hashes of the Crate package.
-      base64-subjects: ${{ needs.package.outputs.hash }}
-
-  publish_release:
-    name: Publish tfhe-hpu-backend Release
-    runs-on: ubuntu-latest
-    needs: [verify_tag, package] # for comparing hashes
-    steps:
-      - name: Checkout
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-      - name: Publish crate.io package
-        env:
-          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
-          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
-        run: |
-          # DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish 
-          # would fail. This is safe since DRY_RUN is handled in the env section above.
-          # shellcheck disable=SC2086
-          cargo publish -p tfhe-hpu-backend --token "${CRATES_TOKEN}" ${DRY_RUN}
-
-      - name: Generate hash
-        id: published_hash
-        run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
-
-      - name: Slack notification (hashes comparison)
-        if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
-        env:
-          SLACK_COLOR: failure
-          SLACK_MESSAGE: "SLSA tfhe-hpu-backend crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
-
-      - name: Slack Notification
-        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "tfhe-hpu-backend release failed: (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/make_release_tfhe_csprng.yml
+++ b/.github/workflows/make_release_tfhe_csprng.yml
@@ -84,10 +84,7 @@ jobs:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
        run: |
-          # DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish 
-          # would fail. This is safe since DRY_RUN is handled in the env section above.
-          # shellcheck disable=SC2086
-          cargo publish -p tfhe-csprng --token "${CRATES_TOKEN}" ${DRY_RUN}
+          cargo publish -p tfhe-csprng --token "${CRATES_TOKEN}" "${DRY_RUN}"
      - name: Generate hash
        id: published_hash
        run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
--- a/.github/workflows/make_release_tfhe_fft.yml
+++ b/.github/workflows/make_release_tfhe_fft.yml
@@ -80,10 +80,7 @@ jobs:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
        run: |
-          # DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish 
-          # would fail. This is safe since DRY_RUN is handled in the env section above.
-          # shellcheck disable=SC2086
-          cargo publish -p tfhe-fft --token "${CRATES_TOKEN}" ${DRY_RUN}
+          cargo publish -p tfhe-fft --token "${CRATES_TOKEN}" "${DRY_RUN}"

      - name: Generate hash
        id: published_hash
--- a/.github/workflows/make_release_tfhe_ntt.yml
+++ b/.github/workflows/make_release_tfhe_ntt.yml
@@ -80,10 +80,7 @@ jobs:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
        run: |
-          # DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish 
-          # would fail. This is safe since DRY_RUN is handled in the env section above.
-          # shellcheck disable=SC2086
-          cargo publish -p tfhe-ntt --token "${CRATES_TOKEN}" ${DRY_RUN}
+          cargo publish -p tfhe-ntt --token "${CRATES_TOKEN}" "${DRY_RUN}"

      - name: Generate hash
        id: published_hash
--- a/.github/workflows/make_release_zk_pok.yml
+++ b/.github/workflows/make_release_zk_pok.yml
@@ -81,10 +81,7 @@ jobs:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
        run: |
-          # DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish 
-          # would fail. This is safe since DRY_RUN is handled in the env section above.
-          # shellcheck disable=SC2086
-          cargo publish -p tfhe-zk-pok --token "${CRATES_TOKEN}" ${DRY_RUN}
+          cargo publish -p tfhe-zk-pok --token "${CRATES_TOKEN}" "${DRY_RUN}"
      - name: Verify hash
        id: published_hash
        run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
--- a/.lfsconfig
+++ b/.lfsconfig
@@ -1,2 +0,0 @@
-[lfs]
-  fetchexclude = *
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,12 +9,10 @@ members = [
    "tasks",
    "tfhe-csprng",
    "backends/tfhe-cuda-backend",
-    "backends/tfhe-hpu-backend",
    "utils/tfhe-versionable",
    "utils/tfhe-versionable-derive",
    "utils/param_dedup",
    "tests",
-    "mockups/tfhe-hpu-mockup",
 ]

 exclude = [
--- a/79
+++ b/79
@@ -2,7 +2,6 @@ SHELL:=$(shell /usr/bin/env which bash)
 OS:=$(shell uname)
 RS_CHECK_TOOLCHAIN:=$(shell cat toolchain.txt | tr -d '\n')
 CARGO_RS_CHECK_TOOLCHAIN:=+$(RS_CHECK_TOOLCHAIN)
-CARGO_BUILD_JOBS=default
 CPU_COUNT=$(shell ./scripts/cpu_count.sh)
 RS_BUILD_TOOLCHAIN:=stable
 CARGO_RS_BUILD_TOOLCHAIN:=+$(RS_BUILD_TOOLCHAIN)
@@ -56,9 +55,6 @@ REGEX_PATTERN?=''
 TFHECUDA_SRC=backends/tfhe-cuda-backend/cuda
 TFHECUDA_BUILD=$(TFHECUDA_SRC)/build

-# tfhe-hpu-backend
-HPU_CONFIG=v80
-
 # Exclude these files from coverage reports
 define COVERAGE_EXCLUDED_FILES
 --exclude-files apps/trivium/src/trivium/* \
@@ -305,13 +301,6 @@ check_gpu: install_rs_check_toolchain
 		--all-targets \
 		-p $(TFHE_SPEC)

-.PHONY: clippy_hpu # Run clippy lints on tfhe with "hpu" enabled
-clippy_hpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean,shortint,integer,internal-keycache,hpu,pbs-stats,extended-types \
-		--all-targets \
-		-p $(TFHE_SPEC) -- --no-deps -D warnings
-
 .PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
 fix_newline: check_linelint_installed
 	linelint -a .
@@ -484,11 +473,6 @@ clippy_cuda_backend: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-cuda-backend -- --no-deps -D warnings

-.PHONY: clippy_hpu_backend # Run clippy lints on the tfhe-hpu-backend
-clippy_hpu_backend: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		-p tfhe-hpu-backend -- --no-deps -D warnings
-
 .PHONY: check_rust_bindings_did_not_change # Check rust bindings are up to date for tfhe-cuda-backend
 check_rust_bindings_did_not_change:
 	cargo build -p tfhe-cuda-backend && "$(MAKE)" fmt_gpu && \
@@ -718,28 +702,6 @@ test_signed_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_n
 		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --backend "gpu" \
 		--signed-only --tfhe-package "$(TFHE_SPEC)"

-.PHONY: test_integer_hpu_ci # Run the tests for integer ci on hpu backend
-test_integer_hpu_ci: install_rs_check_toolchain install_cargo_nextest
-	cargo test --release -p $(TFHE_SPEC) --features hpu-v80 --test hpu
-
-.PHONY: test_integer_hpu_mockup_ci # Run the tests for integer ci on hpu backend and mockup
-test_integer_hpu_mockup_ci: install_rs_check_toolchain install_cargo_nextest
-	source ./setup_hpu.sh --config sim ; \
-	cargo build --release --bin hpu_mockup; \
-    coproc target/release/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_pfail64_psi64.toml > mockup.log; \
-	HPU_TEST_ITER=1 \
-	cargo test --profile devo -p $(TFHE_SPEC) --features hpu --test hpu -- u32 && \
-	kill %1
-
-.PHONY: test_integer_hpu_mockup_ci_fast # Run the quick tests for integer ci on hpu backend and mockup.
-test_integer_hpu_mockup_ci_fast: install_rs_check_toolchain install_cargo_nextest
-	source ./setup_hpu.sh --config sim ; \
-	cargo build --profile devo --bin hpu_mockup; \
-    coproc target/devo/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_fast.toml > mockup.log; \
-	HPU_TEST_ITER=1 \
-	cargo test --profile devo -p $(TFHE_SPEC) --features hpu --test hpu -- u32 && \
-	kill %1
-
 .PHONY: test_boolean # Run the tests of the boolean module
 test_boolean: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -895,22 +857,6 @@ test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
 		--features=integer,internal-keycache,gpu -p $(TFHE_SPEC) \
 		-E "test(/high_level_api::.*gpu.*/)"

-test_high_level_api_hpu: install_rs_build_toolchain install_cargo_nextest
-ifeq ($(HPU_CONFIG), v80)
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
-		--build-jobs=$(CARGO_BUILD_JOBS) \
-		--test-threads=1 \
-		--features=integer,internal-keycache,hpu,hpu-v80 -p $(TFHE_SPEC) \
-		-E "test(/high_level_api::.*hpu.*/)"
-else
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
-		--build-jobs=$(CARGO_BUILD_JOBS) \
-		--test-threads=1 \
-		--features=integer,internal-keycache,hpu -p $(TFHE_SPEC) \
-		-E "test(/high_level_api::.*hpu.*/)"
-endif
-
-
 .PHONY: test_strings # Run the tests for strings ci
 test_strings: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -1154,12 +1100,6 @@ clippy_bench_gpu: install_rs_check_toolchain
 		--features=gpu,shortint,integer,internal-keycache,nightly-avx512,pbs-stats,zk-pok \
 		-p tfhe-benchmark -- --no-deps -D warnings

-.PHONY: clippy_bench_hpu # Run clippy lints on tfhe-benchmark
-clippy_bench_hpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=hpu,shortint,integer,internal-keycache,pbs-stats\
-		-p tfhe-benchmark -- --no-deps -D warnings
-
 .PHONY: print_doc_bench_parameters # Print parameters used in doc benchmarks
 print_doc_bench_parameters:
 	RUSTFLAGS="" cargo run --example print_doc_bench_parameters \
@@ -1193,14 +1133,6 @@ bench_signed_integer_gpu: install_rs_check_toolchain
 	--bench integer-signed-bench \
 	--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p tfhe-benchmark --

-.PHONY: bench_integer_hpu # Run benchmarks for integer on HPU backend
-bench_integer_hpu: install_rs_check_toolchain
-	source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench integer-bench \
-	--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark -- --quick
-
 .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
 bench_integer_compression: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
@@ -1392,14 +1324,6 @@ bench_hlapi_dex_gpu: install_rs_check_toolchain
 	--bench hlapi-dex \
 	--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --

-.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ECR20 operations on HPU
-bench_hlapi_erc20_hpu: install_rs_check_toolchain
-	source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
-	RUSTFLAGS="$(RUSTFLAGS)" \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-erc20 \
-	--features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark -- --quick
-
 .PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
 bench_tfhe_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" \
@@ -1499,9 +1423,6 @@ tfhe_lints
 pcc_gpu: check_rust_bindings_did_not_change clippy_rustdoc_gpu \
 clippy_gpu clippy_cuda_backend clippy_bench_gpu check_compile_tests_benches_gpu

-.PHONY: pcc_hpu # pcc stands for pre commit checks for HPU compilation
-pcc_hpu: clippy_hpu clippy_hpu_backend test_integer_hpu_mockup_ci_fast 
-
 .PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
 fpcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
 check_md_docs_are_tested clippy_fast check_compile_tests
--- a/_typos.toml
+++ b/_typos.toml
@@ -11,13 +11,11 @@ extend-ignore-identifiers-re = [
    # Example with string replacing "hello" with "herlo"
    "herlo",
    # Example in trivium
-    "C9217BA0D762ACA1",
-    "0x[0-9a-fA-F]+"
+    "C9217BA0D762ACA1"
 ]

 [files]
 extend-exclude = [
    "backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu",
    "backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu",
-    "backends/tfhe-hpu-backend/config_store/**/*.link_summary",
 ]
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -1116,14 +1116,32 @@ template <typename Torus> struct int_overflowing_sub_memory {
 };

 template <typename Torus> struct int_sum_ciphertexts_vec_memory {
-  CudaRadixCiphertextFFI *new_blocks;
-  CudaRadixCiphertextFFI *new_blocks_copy;
-  CudaRadixCiphertextFFI *old_blocks;
-  CudaRadixCiphertextFFI *small_lwe_vector;
-  int_radix_params params;

-  int32_t *d_smart_copy_in;
-  int32_t *d_smart_copy_out;
+  int_radix_params params;
+  uint32_t active_gpu_count;
+  size_t chunk_size;
+  size_t max_pbs_count;
+
+  // temporary buffers
+  CudaRadixCiphertextFFI *current_blocks;
+  CudaRadixCiphertextFFI *small_lwe_vector;
+
+  uint32_t *d_columns_data;
+  uint32_t *d_columns_counter;
+  uint32_t **d_columns;
+
+  uint32_t *d_new_columns_data;
+  uint32_t *d_new_columns_counter;
+  uint32_t **d_new_columns;
+
+  uint64_t *d_degrees;
+  uint32_t *d_pbs_counters;
+
+  // additional streams
+  cudaStream_t *helper_streams;
+
+  // lookup table for extracting message and carry
+  int_radix_lut<Torus> *luts_message_carry;

  bool mem_reuse = false;
  bool gpu_memory_allocated;
@@ -1137,100 +1155,139 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
                                 uint64_t *size_tracker) {
    this->params = params;
    gpu_memory_allocated = allocate_gpu_memory;
+    this->chunk_size = (params.message_modulus * params.carry_modulus - 1) /
+                       (params.message_modulus - 1);
+    this->max_pbs_count =
+        num_blocks_in_radix * max_num_radix_in_vec * 2 / chunk_size;
+    this->active_gpu_count = get_active_gpu_count(2 * max_pbs_count, gpu_count);

-    int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
+    size_t max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
+    uint32_t message_modulus = params.message_modulus;
+    printf("max_total_blocks_in_vec: %d\n", max_total_blocks_in_vec);
+    // process streams
+    helper_streams =
+        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
+    for (uint j = 0; j < active_gpu_count; j++) {
+      helper_streams[j] = cuda_create_stream(gpu_indexes[j]);
+    }

-    // allocate gpu memory for intermediate buffers
-    new_blocks = new CudaRadixCiphertextFFI;
+    current_blocks = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
-        streams[0], gpu_indexes[0], new_blocks, max_pbs_count,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-    new_blocks_copy = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams[0], gpu_indexes[0], new_blocks_copy, max_pbs_count,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-    old_blocks = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams[0], gpu_indexes[0], old_blocks, max_pbs_count,
+        streams[0], gpu_indexes[0], current_blocks, max_total_blocks_in_vec,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
    small_lwe_vector = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
-        streams[0], gpu_indexes[0], small_lwe_vector, max_pbs_count,
+        streams[0], gpu_indexes[0], small_lwe_vector, max_total_blocks_in_vec,
        params.small_lwe_dimension, size_tracker, allocate_gpu_memory);

-    d_smart_copy_in = (int32_t *)cuda_malloc_with_size_tracking_async(
-        max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
+    d_degrees = (uint64_t *)cuda_malloc_with_size_tracking_async(
+        max_total_blocks_in_vec * sizeof(uint64_t), streams[0], gpu_indexes[0],
        size_tracker, allocate_gpu_memory);
-    d_smart_copy_out = (int32_t *)cuda_malloc_with_size_tracking_async(
-        max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
-        size_tracker, allocate_gpu_memory);
-    cuda_memset_with_size_tracking_async(
-        d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), streams[0],
-        gpu_indexes[0], allocate_gpu_memory);
-    cuda_memset_with_size_tracking_async(
-        d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), streams[0],
-        gpu_indexes[0], allocate_gpu_memory);
-  }

-  int_sum_ciphertexts_vec_memory(
-      cudaStream_t const *streams, uint32_t const *gpu_indexes,
-      uint32_t gpu_count, int_radix_params params, uint32_t num_blocks_in_radix,
-      uint32_t max_num_radix_in_vec, CudaRadixCiphertextFFI *new_blocks,
-      CudaRadixCiphertextFFI *old_blocks,
-      CudaRadixCiphertextFFI *small_lwe_vector, bool allocate_gpu_memory,
-      uint64_t *size_tracker) {
-    mem_reuse = true;
-    gpu_memory_allocated = allocate_gpu_memory;
-    this->params = params;
+    d_pbs_counters = (uint32_t *)cuda_malloc_with_size_tracking_async(
+        3 * sizeof(uint32_t), streams[0], gpu_indexes[0], size_tracker,
+        allocate_gpu_memory);

-    int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
+    auto setup_columns = [num_blocks_in_radix, max_num_radix_in_vec, streams,
+                          gpu_indexes, size_tracker, allocate_gpu_memory](
+                             uint32_t **&columns, uint32_t *&columns_data,
+                             uint32_t *&columns_counter) {
+      columns_data = (uint32_t *)cuda_malloc_with_size_tracking_async(
+          num_blocks_in_radix * max_num_radix_in_vec * sizeof(uint32_t),
+          streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
+      columns_counter = (uint32_t *)cuda_malloc_with_size_tracking_async(
+          num_blocks_in_radix * sizeof(uint32_t), streams[0], gpu_indexes[0],
+          size_tracker, allocate_gpu_memory);
+      cuda_memset_with_size_tracking_async(
+          columns_counter, 0, num_blocks_in_radix * sizeof(uint32_t),
+          streams[0], gpu_indexes[0], allocate_gpu_memory);

-    // assign  gpu memory for intermediate buffers
-    this->new_blocks = new_blocks;
-    this->old_blocks = old_blocks;
-    this->small_lwe_vector = small_lwe_vector;
-    new_blocks_copy = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams[0], gpu_indexes[0], new_blocks_copy, max_pbs_count,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+      uint32_t **h_columns = new uint32_t *[num_blocks_in_radix];
+      for (int i = 0; i < num_blocks_in_radix; ++i) {
+        h_columns[i] = columns_data + i * max_num_radix_in_vec;
+      }
+      columns = (uint32_t **)cuda_malloc_with_size_tracking_async(
+          num_blocks_in_radix * sizeof(uint32_t *), streams[0], gpu_indexes[0],
+          size_tracker, allocate_gpu_memory);
+      cuda_memcpy_with_size_tracking_async_to_gpu(
+          columns, h_columns, num_blocks_in_radix * sizeof(uint32_t *),
+          streams[0], gpu_indexes[0], allocate_gpu_memory);
+      cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+      delete[] h_columns;
+    };

-    d_smart_copy_in = (int32_t *)cuda_malloc_with_size_tracking_async(
-        max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
-        size_tracker, allocate_gpu_memory);
-    d_smart_copy_out = (int32_t *)cuda_malloc_with_size_tracking_async(
-        max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
-        size_tracker, allocate_gpu_memory);
-    cuda_memset_with_size_tracking_async(
-        d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), streams[0],
-        gpu_indexes[0], allocate_gpu_memory);
-    cuda_memset_with_size_tracking_async(
-        d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), streams[0],
-        gpu_indexes[0], allocate_gpu_memory);
+    setup_columns(d_columns, d_columns_data, d_columns_counter);
+    setup_columns(d_new_columns, d_new_columns_data, d_new_columns_counter);
+
+    luts_message_carry = new int_radix_lut<Torus>(
+        streams, gpu_indexes, gpu_count, params, 2, max_total_blocks_in_vec,
+        allocate_gpu_memory, size_tracker);
+
+    auto message_acc = luts_message_carry->get_lut(0, 0);
+    auto carry_acc = luts_message_carry->get_lut(0, 1);
+
+    // define functions for each accumulator
+    auto lut_f_message = [message_modulus](Torus x) -> Torus {
+      return x % message_modulus;
+    };
+    auto lut_f_carry = [message_modulus](Torus x) -> Torus {
+      return x / message_modulus;
+    };
+
+    // generate accumulators
+    generate_device_accumulator<Torus>(
+        streams[0], gpu_indexes[0], message_acc,
+        luts_message_carry->get_degree(0),
+        luts_message_carry->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, message_modulus, params.carry_modulus,
+        lut_f_message, allocate_gpu_memory);
+    generate_device_accumulator<Torus>(
+        streams[0], gpu_indexes[0], carry_acc,
+        luts_message_carry->get_degree(1),
+        luts_message_carry->get_max_degree(1), params.glwe_dimension,
+        params.polynomial_size, message_modulus, params.carry_modulus,
+        lut_f_carry, allocate_gpu_memory);
+    luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
  }

  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
-    cuda_drop_with_size_tracking_async(d_smart_copy_in, streams[0],
-                                       gpu_indexes[0], gpu_memory_allocated);
-    cuda_drop_with_size_tracking_async(d_smart_copy_out, streams[0],
+    cuda_drop_with_size_tracking_async(d_degrees, streams[0], gpu_indexes[0],
+                                       gpu_memory_allocated);
+    cuda_drop_with_size_tracking_async(d_pbs_counters, streams[0],
                                       gpu_indexes[0], gpu_memory_allocated);

+    cuda_drop_with_size_tracking_async(d_columns_data, streams[0],
+                                       gpu_indexes[0], gpu_memory_allocated);
+    cuda_drop_with_size_tracking_async(d_columns_counter, streams[0],
+                                       gpu_indexes[0], gpu_memory_allocated);
+    cuda_drop_with_size_tracking_async(d_columns, streams[0], gpu_indexes[0],
+                                       gpu_memory_allocated);
+
+    cuda_drop_with_size_tracking_async(d_new_columns_data, streams[0],
+                                       gpu_indexes[0], gpu_memory_allocated);
+    cuda_drop_with_size_tracking_async(d_new_columns_counter, streams[0],
+                                       gpu_indexes[0], gpu_memory_allocated);
+    cuda_drop_with_size_tracking_async(d_new_columns, streams[0],
+                                       gpu_indexes[0], gpu_memory_allocated);
+
+    for (uint i = 0; i < active_gpu_count; i++) {
+      cuda_destroy_stream(helper_streams[i], gpu_indexes[i]);
+    }
+
+    free(helper_streams);
+
    if (!mem_reuse) {
-      release_radix_ciphertext_async(streams[0], gpu_indexes[0], new_blocks,
-                                     gpu_memory_allocated);
-      release_radix_ciphertext_async(streams[0], gpu_indexes[0], old_blocks,
+      release_radix_ciphertext_async(streams[0], gpu_indexes[0], current_blocks,
                                     gpu_memory_allocated);
      release_radix_ciphertext_async(streams[0], gpu_indexes[0],
                                     small_lwe_vector, gpu_memory_allocated);
-      cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-      delete new_blocks;
-      delete old_blocks;
+      luts_message_carry->release(streams, gpu_indexes, gpu_count);
+
+      delete luts_message_carry;
+      delete current_blocks;
      delete small_lwe_vector;
    }
-    release_radix_ciphertext_async(streams[0], gpu_indexes[0], new_blocks_copy,
-                                   gpu_memory_allocated);
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-    delete new_blocks_copy;
  }
 };
 // For sequential algorithm in group propagation
@@ -2604,8 +2661,7 @@ template <typename Torus> struct int_mul_memory {
    // create memory object for sum ciphertexts
    sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
        streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-        2 * num_radix_blocks, block_mul_res, vector_result_sb, small_lwe_vector,
-        allocate_gpu_memory, size_tracker);
+        2 * num_radix_blocks, allocate_gpu_memory, size_tracker);
    uint32_t uses_carry = 0;
    uint32_t requested_flag = outputFlag::FLAG_NONE;
    sc_prop_mem = new int_sc_prop_memory<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
@@ -212,6 +212,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, bool allocate_ms_array) {

+  printf("pbs_type: %d\n", pbs_type);
  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
                          ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -234,11 +235,6 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
  if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
    PANIC("Cuda error: input vector length should be a multiple of the "
          "output's number of radix blocks")
-  // FIXME: this should not be necessary, we should make sure sum_ctxt works in
-  // the general case
-  for (int i = 0; i < radix_lwe_vec->num_radix_blocks; i++) {
-    radix_lwe_vec->degrees[i] = mem->params.message_modulus - 1;
-  }
  switch (mem->params.polynomial_size) {
  case 512:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -20,6 +20,7 @@
 #include <fstream>
 #include <iostream>
 #include <omp.h>
+#include <queue>
 #include <sstream>
 #include <string>
 #include <vector>
@@ -123,6 +124,173 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
  }
 }

+__global__ inline void radix_vec_to_columns(
+    uint32_t *const *const columns, uint32_t *const columns_counter,
+    const uint64_t *const degrees, const uint32_t num_radix_blocks,
+    const uint32_t total_blocks_in_vec) {
+
+  const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (idx >= total_blocks_in_vec)
+    return;
+
+  const uint64_t degree = degrees[idx];
+  if (degree == 0)
+    return;
+
+  const uint32_t column_id = idx % num_radix_blocks;
+  const uint32_t out_idx = atomicAdd(&columns_counter[column_id], 1);
+  columns[column_id][out_idx] = idx;
+}
+
+template <typename Torus>
+__global__ inline void prepare_new_columns_and_pbs_indexes(
+    uint32_t *const *const new_columns, uint32_t *const new_columns_counter,
+    Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
+    Torus *const lut_indexes, uint32_t *const pbs_counters,
+    const uint32_t *const *const columns, const uint32_t *const columns_counter,
+    const uint32_t chunk_size) {
+  __shared__ uint32_t counter, sharedOr;
+
+  if (threadIdx.x == 0) {
+    counter = 0;
+    sharedOr = 0;
+  }
+  __syncthreads();
+
+  const uint32_t base_id = threadIdx.x;
+  const uint32_t column_len = columns_counter[base_id];
+
+  uint32_t ct_count = 0;
+  for (uint32_t i = 0; i + chunk_size <= column_len; i += chunk_size) {
+    // those indexes are for message ciphertexts
+    // for message ciphertexts in and out index should be same
+    const uint32_t in_index = columns[base_id][i];
+    new_columns[base_id][ct_count] = in_index;
+    const uint32_t pbs_index = atomicAdd(&counter, 1);
+    pbs_indexes_in[pbs_index] = in_index;
+    pbs_indexes_out[pbs_index] = in_index;
+    lut_indexes[pbs_index] = 0;
+    ++ct_count;
+  }
+
+  __syncthreads();
+  uint32_t message_count = counter;
+
+  if (base_id > 0) {
+    const uint32_t prev_base_id = base_id - 1;
+    const uint32_t prev_column_len = columns_counter[prev_base_id];
+
+    for (uint32_t i = 0; i + chunk_size <= prev_column_len; i += chunk_size) {
+      // those indexes are for carry ciphertexts
+      // for carry ciphertexts input is same as for message
+      // output will be placed to next block in the column
+      const uint32_t in_index = columns[prev_base_id][i];
+      const uint32_t out_index = columns[prev_base_id][i + 1];
+      new_columns[base_id][ct_count] = out_index;
+      const uint32_t pbs_index = atomicAdd(&counter, 1);
+      pbs_indexes_in[pbs_index] = in_index;
+      pbs_indexes_out[pbs_index] = out_index;
+      lut_indexes[pbs_index] = 1;
+      ++ct_count;
+    }
+  }
+
+  const uint32_t start_index = column_len - column_len % chunk_size;
+  for (uint32_t i = start_index; i < column_len; ++i) {
+    new_columns[base_id][ct_count] = columns[base_id][i];
+    ++ct_count;
+  }
+
+  new_columns_counter[base_id] = ct_count;
+
+  if (ct_count > chunk_size) {
+    atomicOr(&sharedOr, 1);
+  }
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    pbs_counters[0] = counter;
+    pbs_counters[1] = message_count;
+    pbs_counters[2] = sharedOr;
+  }
+}
+
+template <typename Torus>
+__global__ inline void prepare_final_pbs_indexes(
+    Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
+    Torus *const lut_indexes, const uint32_t num_radix_blocks) {
+  int idx = threadIdx.x;
+  pbs_indexes_in[idx] = idx % num_radix_blocks;
+  pbs_indexes_out[idx] = idx + idx / num_radix_blocks;
+  lut_indexes[idx] = idx / num_radix_blocks;
+}
+
+template <typename Torus>
+__global__ void calculate_chunks(Torus *const input_blocks,
+                                 const uint32_t *const *const columns,
+                                 const uint32_t *const columns_counter,
+                                 const uint32_t chunk_size,
+                                 const uint32_t block_size) {
+
+  const uint32_t part_size = blockDim.x;
+  const uint32_t base_id = blockIdx.x;
+  const uint32_t part_id = blockIdx.y;
+  const uint32_t coef_id = part_id * part_size + threadIdx.x;
+
+  if (coef_id >= block_size)
+    return;
+
+  const uint32_t column_len = columns_counter[base_id];
+
+  if (column_len >= chunk_size) {
+    const uint32_t num_chunks = column_len / chunk_size;
+    Torus result = 0;
+
+    for (uint32_t chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
+      const uint32_t first_ct_id = columns[base_id][chunk_id * chunk_size];
+      result = input_blocks[first_ct_id * block_size + coef_id];
+
+      for (uint32_t ct_id = 1; ct_id < chunk_size; ++ct_id) {
+        const uint32_t cur_ct_id =
+            columns[base_id][chunk_id * chunk_size + ct_id];
+        result += input_blocks[cur_ct_id * block_size + coef_id];
+      }
+
+      input_blocks[first_ct_id * block_size + coef_id] = result;
+    }
+  }
+}
+
+template <typename Torus>
+__global__ void calculate_final_chunk_into_radix(
+    Torus *const out_radix, const Torus *const input_blocks,
+    const uint32_t *const *const columns, const uint32_t *const columns_counter,
+    const uint32_t chunk_size, const uint32_t block_size) {
+
+  const uint32_t part_size = blockDim.x;
+  const uint32_t base_id = blockIdx.x;
+  const uint32_t part_id = blockIdx.y;
+  const uint32_t coef_id = part_id * part_size + threadIdx.x;
+
+  if (coef_id >= block_size)
+    return;
+
+  const uint32_t column_len = columns_counter[base_id];
+
+  Torus result = 0;
+  if (column_len) {
+    const uint32_t first_ct_id = columns[base_id][0];
+    result = input_blocks[first_ct_id * block_size + coef_id];
+
+    for (uint32_t i = 1; i < column_len; ++i) {
+      const uint32_t cur_ct_it = columns[base_id][i];
+      result += input_blocks[cur_ct_it * block_size + coef_id];
+    }
+  }
+  out_radix[base_id * block_size + coef_id] = result;
+}
+
 template <typename Torus, class params>
 __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
                                        Torus *msb_blocks,
@@ -167,6 +335,65 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
        (process_msb) ? cur_msb_ct[params::degree] : 0;
  }
 }
+
+inline bool at_least_one_column_needs_processing(
+    const uint64_t *const degrees, const uint32_t num_radix_blocks,
+    const uint32_t num_radix_in_vec, const uint32_t chunk_size) {
+  std::vector<uint32_t> columns_count(num_radix_blocks, 0);
+
+  for (size_t column = 0; column < num_radix_blocks; ++column) {
+    for (size_t block = 0; block < num_radix_in_vec; ++block) {
+      const size_t block_index = block * num_radix_blocks + column;
+      if (degrees[block_index]) {
+        columns_count[column]++;
+        if (columns_count[column] > chunk_size) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
+inline void calculate_final_degrees(uint64_t *const out_degrees,
+                                    const uint64_t *const input_degrees,
+                                    size_t num_blocks, size_t num_radix_in_vec,
+                                    size_t chunk_size,
+                                    uint64_t message_modulus) {
+
+  auto get_degree = [message_modulus](uint64_t degree) -> uint64_t {
+    return std::min(message_modulus - 1, degree);
+  };
+  std::vector<std::queue<uint64_t>> columns(num_blocks);
+  for (size_t i = 0; i < num_radix_in_vec; ++i) {
+    for (size_t j = 0; j < num_blocks; ++j) {
+      if (input_degrees[i * num_blocks + j])
+        columns[j].push(input_degrees[i * num_blocks + j]);
+    }
+  }
+
+  for (size_t i = 0; i < num_blocks; ++i) {
+    auto &col = columns[i];
+    while (col.size() > 1) {
+      uint32_t cur_degree = 0;
+      size_t mn = std::min(chunk_size, col.size());
+      for (int j = 0; j < mn; ++j) {
+        cur_degree += col.front();
+        col.pop();
+      }
+      const uint64_t new_degree = get_degree(cur_degree);
+      col.push(new_degree);
+      if ((i + 1) < num_blocks) {
+        columns[i + 1].push(new_degree);
+      }
+    }
+  }
+
+  for (int i = 0; i < num_blocks; i++) {
+    out_degrees[i] = (columns[i].empty()) ? 0 : columns[i].front();
+  }
+}
+
 template <typename Torus>
 __host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -181,6 +408,70 @@ __host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
  return size_tracker;
 }

+void static DEBUG_PRINT_COLUMNS(uint32_t *d_column_data,
+                                uint32_t *d_columns_count,
+                                uint64_t *d_pbs_indexes_in,
+                                uint64_t *d_pbs_indexes_out,
+                                uint64_t *d_lut_indexes, int L, int N,
+                                int pbs_cnt) {
+  cudaDeviceSynchronize(); // Ensure all device work is done
+
+  std::vector<uint64_t> h_pbs_indexes_in(pbs_cnt);
+  std::vector<uint64_t> h_pbs_indexes_out(pbs_cnt);
+  std::vector<uint64_t> h_lut_indexes(pbs_cnt);
+
+  check_cuda_error(cudaMemcpy(h_pbs_indexes_in.data(), d_pbs_indexes_in,
+                              pbs_cnt * sizeof(uint64_t),
+                              cudaMemcpyDeviceToHost));
+  check_cuda_error(cudaMemcpy(h_pbs_indexes_out.data(), d_pbs_indexes_out,
+                              pbs_cnt * sizeof(uint64_t),
+                              cudaMemcpyDeviceToHost));
+  check_cuda_error(cudaMemcpy(h_lut_indexes.data(), d_lut_indexes,
+                              pbs_cnt * sizeof(uint64_t),
+                              cudaMemcpyDeviceToHost));
+
+  std::vector<uint32_t> h_columns_count(L);
+  check_cuda_error(cudaMemcpy(h_columns_count.data(), d_columns_count,
+                              L * sizeof(uint32_t), cudaMemcpyDeviceToHost));
+
+  std::vector<uint32_t> h_column_data(L * N);
+  check_cuda_error(cudaMemcpy(h_column_data.data(), d_column_data,
+                              L * N * sizeof(uint32_t),
+                              cudaMemcpyDeviceToHost));
+  cudaDeviceSynchronize(); // Ensure all device work is done
+
+  std::cout << "column_counters: ";
+  for (auto a : h_columns_count) {
+    std::cout << a << " ";
+  }
+
+  std::cout << std::endl;
+
+  for (int col = 0; col < L; ++col) {
+    std::cout << "Column[" << col << "]: ";
+    uint32_t count = h_columns_count[col];
+    for (uint32_t i = 0; i < count; ++i) {
+      std::cout << h_column_data[col * N + i] << " ";
+    }
+    std::cout << "\n";
+  }
+
+  printf("pbs_indexes %d\n", pbs_cnt);
+  for (auto a : h_pbs_indexes_in) {
+    printf("%d ", a);
+  }
+  printf("\n");
+  for (auto a : h_pbs_indexes_out) {
+    printf("%d ", a);
+  }
+  printf("\n");
+  for (auto a : h_lut_indexes) {
+    printf("%d ", a);
+  }
+  printf("\n");
+  printf("=========================================================\n");
+}
+
 template <typename Torus, class params>
 __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -199,22 +490,30 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
    PANIC("Cuda error: input vector does not have enough blocks")
  if (num_radix_blocks > radix_lwe_out->num_radix_blocks)
    PANIC("Cuda error: output does not have enough blocks")
-  auto new_blocks = mem_ptr->new_blocks;
-  auto new_blocks_copy = mem_ptr->new_blocks_copy;
-  auto old_blocks = mem_ptr->old_blocks;
+
+  auto current_blocks = mem_ptr->current_blocks;
  auto small_lwe_vector = mem_ptr->small_lwe_vector;
+  auto d_degrees = mem_ptr->d_degrees;
+  auto d_columns = mem_ptr->d_columns;
+  auto d_columns_counter = mem_ptr->d_columns_counter;
+  auto d_new_columns = mem_ptr->d_new_columns;
+  auto d_new_columns_counter = mem_ptr->d_new_columns_counter;
+  auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
+  auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
+  auto d_pbs_counters = mem_ptr->d_pbs_counters;

-  auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
-  auto d_smart_copy_out = mem_ptr->d_smart_copy_out;
+  auto luts_message_carry = mem_ptr->luts_message_carry;

-  auto message_modulus = mem_ptr->params.message_modulus;
-  auto carry_modulus = mem_ptr->params.carry_modulus;
  auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
  auto big_lwe_size = big_lwe_dimension + 1;
  auto glwe_dimension = mem_ptr->params.glwe_dimension;
  auto polynomial_size = mem_ptr->params.polynomial_size;
  auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
  auto small_lwe_size = small_lwe_dimension + 1;
+  auto helper_streams = mem_ptr->helper_streams;
+  auto chunk_size = mem_ptr->chunk_size;
+
+  size_t total_blocks_in_vec = num_radix_blocks * num_radix_in_vec;

  // In the case of extracting a single LWE this parameters are dummy
  uint32_t num_many_lut = 1;
@@ -228,244 +527,153 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
                                             terms, 0, num_radix_blocks);
    return;
  }
-  if (old_blocks != terms) {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], old_blocks,
-                                       terms);
-  }
+
  if (num_radix_in_vec == 2) {
-    CudaRadixCiphertextFFI old_blocks_slice;
-    as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
-                                     num_radix_blocks, 2 * num_radix_blocks);
-    host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
-                         &old_blocks_slice, num_radix_blocks);
+    CudaRadixCiphertextFFI terms_slice;
+    as_radix_ciphertext_slice<Torus>(&terms_slice, terms, num_radix_blocks,
+                                     2 * num_radix_blocks);
+    host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, terms,
+                         &terms_slice, num_radix_blocks);
    return;
  }

-  size_t r = num_radix_in_vec;
-  size_t total_modulus = message_modulus * carry_modulus;
-  size_t message_max = message_modulus - 1;
-  size_t chunk_size = (total_modulus - 1) / message_max;
-
-  size_t h_lwe_idx_in[terms->num_radix_blocks];
-  size_t h_lwe_idx_out[terms->num_radix_blocks];
-  int32_t h_smart_copy_in[terms->num_radix_blocks];
-  int32_t h_smart_copy_out[terms->num_radix_blocks];
-
-  /// Here it is important to query the default max shared memory on device 0
-  /// instead of cuda_get_max_shared_memory,
-  /// to avoid bugs with tree_add_chunks trying to use too much shared memory
-  auto max_shared_memory = 0;
-  check_cuda_error(cudaDeviceGetAttribute(
-      &max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));
-
-  // create lut object for message and carry
-  // we allocate luts_message_carry in the host function (instead of scratch)
-  // to reduce average memory consumption
-  int_radix_lut<Torus> *luts_message_carry;
-  size_t ch_amount = r / chunk_size;
-  if (!ch_amount)
-    ch_amount++;
-  if (reused_lut == nullptr) {
-    luts_message_carry = new int_radix_lut<Torus>(
-        streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
-        2 * ch_amount * num_radix_blocks, true, nullptr);
-  } else {
-    luts_message_carry = new int_radix_lut<Torus>(
-        streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
-        2 * ch_amount * num_radix_blocks, reused_lut, true, nullptr);
+  if (current_blocks != terms) {
+    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
+                                       current_blocks, terms);
  }
-  auto message_acc = luts_message_carry->get_lut(0, 0);
-  auto carry_acc = luts_message_carry->get_lut(0, 1);

-  // define functions for each accumulator
-  auto lut_f_message = [message_modulus](Torus x) -> Torus {
-    return x % message_modulus;
-  };
-  auto lut_f_carry = [message_modulus](Torus x) -> Torus {
-    return x / message_modulus;
-  };
+  cuda_memcpy_async_to_gpu(d_degrees, current_blocks->degrees,
+                           total_blocks_in_vec * sizeof(uint64_t), streams[0],
+                           gpu_indexes[0]);

-  // generate accumulators
-  generate_device_accumulator<Torus>(
-      streams[0], gpu_indexes[0], message_acc,
-      luts_message_carry->get_degree(0), luts_message_carry->get_max_degree(0),
-      glwe_dimension, polynomial_size, message_modulus, carry_modulus,
-      lut_f_message, true);
-  generate_device_accumulator<Torus>(
-      streams[0], gpu_indexes[0], carry_acc, luts_message_carry->get_degree(1),
-      luts_message_carry->get_max_degree(1), glwe_dimension, polynomial_size,
-      message_modulus, carry_modulus, lut_f_carry, true);
-  luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
+  int number_of_threads = 512;
+  int number_of_blocks =
+      (total_blocks_in_vec + number_of_threads - 1) / number_of_threads;

-  while (r > 2) {
-    size_t cur_total_blocks = r * num_radix_blocks;
-    size_t ch_amount = r / chunk_size;
-    if (!ch_amount)
-      ch_amount++;
-    dim3 add_grid(ch_amount, num_radix_blocks, 1);
+  radix_vec_to_columns<<<number_of_blocks, number_of_threads, 0, streams[0]>>>(
+      d_columns, d_columns_counter, d_degrees, num_radix_blocks,
+      total_blocks_in_vec);

-    cuda_set_device(gpu_indexes[0]);
-    tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
-        (Torus *)new_blocks->ptr, (Torus *)old_blocks->ptr,
-        std::min(r, chunk_size), big_lwe_size, num_radix_blocks);
+  DEBUG_PRINT_COLUMNS(mem_ptr->d_columns_data, d_columns_counter,
+                      d_pbs_indexes_in, d_pbs_indexes_out,
+                      luts_message_carry->get_lut_indexes(0, 0),
+                      num_radix_blocks, num_radix_in_vec, 0);
+  bool needs_processing = at_least_one_column_needs_processing(
+      current_blocks->degrees, num_radix_blocks, num_radix_in_vec, chunk_size);

-    check_cuda_error(cudaGetLastError());
+  number_of_threads = min(256, params::degree);
+  int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
+  const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);

-    size_t total_count = 0;
-    size_t message_count = 0;
-    size_t carry_count = 0;
-    size_t sm_copy_count = 0;
+  // h_pbs_counters[0] - total ciphertexts
+  // h_pbs_counters[1] - message ciphertexts
+  // h_pbs_counters[2] - at_leaast_one_column_needs_processing
+  uint32_t *h_pbs_counters = (uint32_t *)malloc(3 * sizeof(uint32_t));

-    generate_ids_update_degrees(
-        terms->degrees, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
-        h_smart_copy_out, ch_amount, r, num_radix_blocks, chunk_size,
-        message_max, total_count, message_count, carry_count, sm_copy_count);
-    auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
-    auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
-    luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
-                                        h_lwe_idx_in, h_lwe_idx_out);
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+  int DEBUG_I = 0;
+  while (needs_processing) {
+    calculate_chunks<Torus>
+        <<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
+            (Torus *)(current_blocks->ptr), d_columns, d_columns_counter,
+            chunk_size, big_lwe_size);

-    size_t copy_size = sm_copy_count * sizeof(int32_t);
-    cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
-                             streams[0], gpu_indexes[0]);
-    cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
-                             streams[0], gpu_indexes[0]);
+    prepare_new_columns_and_pbs_indexes<<<1, num_radix_blocks, 0,
+                                          helper_streams[0]>>>(
+        d_new_columns, d_new_columns_counter, d_pbs_indexes_in,
+        d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0),
+        d_pbs_counters, d_columns, d_columns_counter, chunk_size);

-    // inside d_smart_copy_in there are only -1 values
-    // it's fine to call smart_copy with same pointer
-    // as source and destination
-    copy_radix_ciphertext_slice_async<Torus>(
-        streams[0], gpu_indexes[0], new_blocks_copy, 0, r * num_radix_blocks,
-        new_blocks, 0, r * num_radix_blocks);
-    smart_copy<Torus><<<sm_copy_count, 1024, 0, streams[0]>>>(
-        (Torus *)new_blocks->ptr, (Torus *)new_blocks_copy->ptr,
-        d_smart_copy_out, d_smart_copy_in, big_lwe_size);
-    check_cuda_error(cudaGetLastError());
+    cuda_memcpy_async_to_cpu(h_pbs_counters, d_pbs_counters,
+                             3 * sizeof(uint32_t), helper_streams[0],
+                             gpu_indexes[0]);

-    if (carry_count > 0)
-      cuda_set_value_async<Torus>(
-          streams[0], gpu_indexes[0],
-          luts_message_carry->get_lut_indexes(0, message_count), 1,
-          carry_count);
+    cuda_synchronize_stream(helper_streams[0], gpu_indexes[0]);

-    luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
+    const uint32_t total_ciphertexts = h_pbs_counters[0];
+    const uint32_t total_messages = h_pbs_counters[1];
+    needs_processing = (h_pbs_counters[2] != 0);

-    /// For multi GPU execution we create vectors of pointers for inputs and
-    /// outputs
-    std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
-    std::vector<Torus *> small_lwe_vector_vec =
-        luts_message_carry->lwe_after_ks_vec;
-    std::vector<Torus *> lwe_after_pbs_vec =
-        luts_message_carry->lwe_after_pbs_vec;
-    std::vector<Torus *> lwe_trivial_indexes_vec =
-        luts_message_carry->lwe_trivial_indexes_vec;
-
-    auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
-    if (active_gpu_count == 1) {
-      /// Apply KS to go from a big LWE dimension to a small LWE dimension
-      /// After this keyswitch execution, we need to synchronize the streams
-      /// because the keyswitch and PBS do not operate on the same number of
-      /// inputs
-      execute_keyswitch_async<Torus>(
-          streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
-          lwe_indexes_in, (Torus *)new_blocks->ptr, lwe_indexes_in, ksks,
-          polynomial_size * glwe_dimension, small_lwe_dimension,
-          mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, message_count);
-
-      /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
-      /// dimension to a big LWE dimension
-      execute_pbs_async<Torus>(
-          streams, gpu_indexes, 1, (Torus *)new_blocks->ptr, lwe_indexes_out,
-          luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
-          (Torus *)small_lwe_vector->ptr, lwe_indexes_in, bsks,
-          ms_noise_reduction_key, luts_message_carry->buffer, glwe_dimension,
-          small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log,
-          mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
-          total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
+    if (DEBUG_I % 2 == 0) {
+      DEBUG_PRINT_COLUMNS(
+          mem_ptr->d_new_columns_data, d_new_columns_counter, d_pbs_indexes_in,
+          d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0),
+          num_radix_blocks, num_radix_in_vec, total_ciphertexts);
    } else {
-      cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
-      multi_gpu_scatter_lwe_async<Torus>(
-          streams, gpu_indexes, active_gpu_count, new_blocks_vec,
-          (Torus *)new_blocks->ptr, luts_message_carry->h_lwe_indexes_in,
-          luts_message_carry->using_trivial_lwe_indexes, message_count,
-          big_lwe_size);
-
-      /// Apply KS to go from a big LWE dimension to a small LWE dimension
-      /// After this keyswitch execution, we need to synchronize the streams
-      /// because the keyswitch and PBS do not operate on the same number of
-      /// inputs
-      execute_keyswitch_async<Torus>(
-          streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec,
-          lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
-          ksks, big_lwe_dimension, small_lwe_dimension,
-          mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count);
-
-      /// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
-      /// different configuration
-      multi_gpu_gather_lwe_async<Torus>(
-          streams, gpu_indexes, gpu_count, (Torus *)small_lwe_vector->ptr,
-          small_lwe_vector_vec, luts_message_carry->h_lwe_indexes_in,
-          luts_message_carry->using_trivial_lwe_indexes, message_count,
-          small_lwe_size);
-      /// Synchronize all GPUs
-      for (uint i = 0; i < active_gpu_count; i++) {
-        cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-      }
-
-      multi_gpu_scatter_lwe_async<Torus>(
-          streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
-          (Torus *)small_lwe_vector->ptr, luts_message_carry->h_lwe_indexes_in,
-          luts_message_carry->using_trivial_lwe_indexes, total_count,
-          small_lwe_size);
-
-      /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
-      /// dimension to a big LWE dimension
-      execute_pbs_async<Torus>(
-          streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
-          lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
-          luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
-          lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
-          luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
-          polynomial_size, mem_ptr->params.pbs_base_log,
-          mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
-          total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
-
-      multi_gpu_gather_lwe_async<Torus>(
-          streams, gpu_indexes, active_gpu_count, (Torus *)new_blocks->ptr,
-          lwe_after_pbs_vec, luts_message_carry->h_lwe_indexes_out,
-          luts_message_carry->using_trivial_lwe_indexes, total_count,
-          big_lwe_size);
-      /// Synchronize all GPUs
-      for (uint i = 0; i < active_gpu_count; i++) {
-        cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-      }
-    }
-    for (uint i = 0; i < total_count; i++) {
-      auto degrees_index = luts_message_carry->h_lut_indexes[i];
-      new_blocks->degrees[i] = luts_message_carry->degrees[degrees_index];
-      new_blocks->noise_levels[i] = NoiseLevel::NOMINAL;
+      DEBUG_PRINT_COLUMNS(
+          mem_ptr->d_columns_data, d_new_columns_counter, d_pbs_indexes_in,
+          d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0),
+          num_radix_blocks, num_radix_in_vec, total_ciphertexts);
    }

-    int rem_blocks = (r > chunk_size) ? r % chunk_size * num_radix_blocks : 0;
-    int new_blocks_created = 2 * ch_amount * num_radix_blocks;
+    cudaDeviceSynchronize();

-    if (rem_blocks > 0)
-      copy_radix_ciphertext_slice_async<Torus>(
-          streams[0], gpu_indexes[0], new_blocks, new_blocks_created,
-          new_blocks_created + rem_blocks, old_blocks,
-          cur_total_blocks - rem_blocks, cur_total_blocks);
-    std::swap(new_blocks, old_blocks);
-    r = (new_blocks_created + rem_blocks) / num_radix_blocks;
+    printf("total_messages: %d\n", total_messages);
+    printf("total_ct: %d\n", total_ciphertexts);
+    execute_keyswitch_async<Torus>(
+        streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
+        d_pbs_indexes_in, (Torus *)current_blocks->ptr, d_pbs_indexes_in, ksks,
+        big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
+        mem_ptr->params.ks_level, total_messages);
+
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
+        d_pbs_indexes_out, luts_message_carry->lut_vec,
+        luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
+        d_pbs_indexes_in, bsks, ms_noise_reduction_key,
+        luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
+        polynomial_size, mem_ptr->params.pbs_base_log,
+        mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
+        total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
+
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    std::swap(d_columns, d_new_columns);
+    std::swap(d_columns_counter, d_new_columns_counter);
+    ++DEBUG_I;
  }
-  luts_message_carry->release(streams, gpu_indexes, gpu_count);
-  delete (luts_message_carry);

-  CudaRadixCiphertextFFI old_blocks_slice;
-  as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
+  calculate_final_chunk_into_radix<Torus>
+      <<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
+          (Torus *)(radix_lwe_out->ptr), (Torus *)(current_blocks->ptr),
+          d_columns, d_columns_counter, chunk_size, big_lwe_size);
+
+  prepare_final_pbs_indexes<Torus>
+      <<<1, 2 * num_radix_blocks, 0, helper_streams[0]>>>(
+          d_pbs_indexes_in, d_pbs_indexes_out,
+          luts_message_carry->get_lut_indexes(0, 0), num_radix_blocks);
+
+  cuda_memset_async(
+      (Torus *)(current_blocks->ptr) + big_lwe_size * num_radix_blocks, 0,
+      big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
+
+  cuda_synchronize_stream(helper_streams[0], gpu_indexes[0]);
+
+  print_debug<Torus>("indexes_in", d_pbs_indexes_in, 2 * num_radix_blocks);
+  print_debug<Torus>("indexes_out", d_pbs_indexes_out, 2 * num_radix_blocks);
+  print_debug<Torus>("lut_indexes", luts_message_carry->get_lut_indexes(0, 0),
+                     2 * num_radix_blocks);
+
+  execute_keyswitch_async<Torus>(
+      streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr, d_pbs_indexes_in,
+      (Torus *)radix_lwe_out->ptr, d_pbs_indexes_in, ksks, big_lwe_dimension,
+      small_lwe_dimension, mem_ptr->params.ks_base_log,
+      mem_ptr->params.ks_level, num_radix_blocks);
+
+  execute_pbs_async<Torus>(
+      streams, gpu_indexes, 1, (Torus *)current_blocks->ptr, d_pbs_indexes_out,
+      luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
+      (Torus *)small_lwe_vector->ptr, d_pbs_indexes_in, bsks,
+      ms_noise_reduction_key, luts_message_carry->buffer, glwe_dimension,
+      small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log,
+      mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
+      2 * num_radix_blocks, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
+
+  CudaRadixCiphertextFFI current_blocks_slice;
+  as_radix_ciphertext_slice<Torus>(&current_blocks_slice, current_blocks,
                                   num_radix_blocks, 2 * num_radix_blocks);
-  host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
-                       &old_blocks_slice, num_radix_blocks);
+
+  host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out,
+                       current_blocks, &current_blocks_slice, num_radix_blocks);
 }

 template <typename Torus, class params>
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -118,7 +118,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)

    add_to_torus<Torus, params>(accumulator_fft, accumulator_rotated, true);
  }
-
+  __syncthreads();
  auto accumulator = accumulator_rotated;

  if (blockIdx.z == 0) {
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
@@ -357,16 +357,19 @@ uint64_t scratch_cuda_programmable_bootstrap_64(
 #endif
      if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
              glwe_dimension, polynomial_size, level_count,
-              input_lwe_ciphertext_count, max_shared_memory))
+              input_lwe_ciphertext_count, max_shared_memory)) {
+    printf("it is cg\n");
    return scratch_cuda_programmable_bootstrap_cg<uint64_t>(
        stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
        lwe_dimension, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
-  else
+  } else {
+    printf("it is default\n");
    return scratch_cuda_programmable_bootstrap<uint64_t>(
        stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
        lwe_dimension, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
+  }
 }

 template <typename Torus>
--- a/backends/tfhe-hpu-backend/.gitattributes
+++ b/backends/tfhe-hpu-backend/.gitattributes
@@ -1,3 +0,0 @@
-*.xclbin filter=lfs diff=lfs merge=lfs -text
-*.pdi filter=lfs diff=lfs merge=lfs -text
-python/lib/example.json filter=lfs diff=lfs merge=lfs -text
--- a/backends/tfhe-hpu-backend/.gitignore
+++ b/backends/tfhe-hpu-backend/.gitignore
@@ -1,3 +0,0 @@
-ngt_*
-config
-kogge_cfg.toml
--- a/backends/tfhe-hpu-backend/Cargo.toml
+++ b/backends/tfhe-hpu-backend/Cargo.toml
@@ -1,88 +0,0 @@
-[package]
-name = "tfhe-hpu-backend"
-version = "0.1.0"
-edition = "2021"
-license = "BSD-3-Clause-Clear"
-description = "HPU implementation on FPGA of TFHE-rs primitives."
-homepage = "https://www.zama.ai/"
-documentation = "https://docs.zama.ai/tfhe-rs"
-repository = "https://github.com/zama-ai/tfhe-rs"
-readme = "README.md"
-keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography", "hardware", "fpga"]
-
-[features]
-hw-xrt = []
-hw-v80 = []
-io-dump = ["num-traits"]
-rtl_graph = ["dot2"]
-utils = ["clap", "clap-num", "bitvec", "serde_json"]
-
-[build-dependencies]
-cxx-build = "1.0"
-
-[dependencies]
-cxx = "1.0"
-hw_regmap = "0.1.0"
-
-strum = { version = "0.26.2", features = ["derive"] }
-strum_macros = "0.26.2"
-enum_dispatch = "0.3.13"
-tracing = "0.1.40"
-tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
-serde = { version = "1", features = ["derive"] }
-toml = { version = "0.8.*", features = [] }
-paste = "1.0.15"
-thiserror = "1.0.61"
-bytemuck = "1.16.0"
-anyhow = "1.0.82"
-lazy_static = "1.4.0"
-rand = "0.8.5"
-regex = "1.10.4"
-bitflags = { version = "2.5.0", features = ["serde"] }
-itertools = "0.11.0"
-lru = "0.12.3"
-bitfield-struct = "0.10.0"
-crossbeam = { version = "0.8.4", features = ["crossbeam-queue"] }
-rayon = { workspace = true }
-
-# Dependencies used for Sim feature
-ipc-channel = "0.18.3"
-
-# Dependencies used for debug feature
-num-traits = { version = "*", optional = true }
-clap = { version = "4.4.4", features = ["derive"], optional = true }
-clap-num = { version = "1.1.1", optional = true }
-nix = { version = "0.29.0", features = ["ioctl", "uio"] }
-
-# Dependencies used for rtl_graph features
-dot2 = { version = "*", optional = true }
-
-bitvec = { version = "*", optional = true }
-serde_json = { version = "*", optional = true }
-
-# Binary for manual debugging
-# Enable to access Hpu register and drive some custom sequence by hand
-[[bin]]
-name = "hputil"
-path = "src/utils/hputil.rs"
-required-features = ["utils"]
-
-# Binary for asm manipulation
-# Enable to convert back and forth between asm/hex format
-[[bin]]
-name = "dop_fmt"
-path = "src/utils/dop_fmt.rs"
-required-features = ["utils"]
-
-# Enable to convert back and forth between asm/hex format
-[[bin]]
-name = "iop_fmt"
-path = "src/utils/iop_fmt.rs"
-required-features = ["utils"]
-
-# Firmware generation
-# Enable to expand IOp in list of Dop for inspection
-[[bin]]
-name = "fw"
-path = "src/utils/fw.rs"
-required-features = ["utils"]
--- a/backends/tfhe-hpu-backend/LICENSE
+++ b/backends/tfhe-hpu-backend/LICENSE
@@ -1,28 +0,0 @@
-BSD 3-Clause Clear License
-
-Copyright © 2025 ZAMA.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
-list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice, this
-list of conditions and the following disclaimer in the documentation and/or other
-materials provided with the distribution.
-
-3. Neither the name of ZAMA nor the names of its contributors may be used to endorse
-or promote products derived from this software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE.
-THIS SOFTWARE IS PROVIDED BY THE ZAMA AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
-ZAMA OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
-OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/backends/tfhe-hpu-backend/Readme.md
+++ b/backends/tfhe-hpu-backend/Readme.md
@@ -1,261 +0,0 @@
-# TFHE-hpu-backend
-
-## Brief
-The `tfhe-hpu-backend` holds the code to interface with the HPU accelerator of TFHE.
-It contains a `HpuDevice` abstraction that enables easy configuration and dispatching of TFHE operations on the HPU accelerator.
-
-The user API exposes the following functions for hardware setup:
- `HpuDevice::new`, `HpuDevice::from_config`: Instantiates abstraction device from configuration file.
- `HpuDevice::init`: Configures and uploads the required public material.
- `new_var_from`: Creates a HPU ciphertext from `tfhe-rs` ciphertext.
-
-HPU device could also be used from `integer` with the help of the following function:
- `tfhe::integer::hpu::init_device`: Init given HPU device with server key.
- `tfhe::integer::hpu::ciphertext::HpuRadixCiphertext::from_radix_ciphertext`: Convert a CpuRadixCiphertext in it's HPU counterpart.
- 
-HPU device could also be used seamlessly from `hl-api` by setting up a thread-local HPU server key:
- `tfhe::Config::from_hpu_device`: Extract hl-api configuration from HpuDevice.
- `tfhe::set_server_key`: Register the Hpu server key in the current thread.
-
-HPU variables could also be created from a `high-level-api` object, with the help of the `hw-xfer` feature.
-This implements a trait that enables `clone_on`, `mv_on` `FheUint` object on the HPU accelerator, and cast back `from` them.
-
-These objects implement the `std::ops` trait and could be used to dispatch operations on HPU hardware.
-
-### Backend structure
-`tfhe-hpu-backend` is split in various modules:
- `entities`: Defines structure handled by HPU accelerator. Conversion traits from/into those objects are implemented in `tfhe-rs`.
- `asm`: Describes assembly-like language for the HPU. It enables abstract HPU behavior and easily updates it through micro-code.
- `fw`: Abstraction to help the micro-code designer. Uses a simple rust program for describing new HPU operations. Helps with register/heap management.
- `interface`:
-  + `device`: High-level structure that exposes the User API.
-  + `backend`: Inner private structure that contains HPU modules
-  + `variable`: Wraps HPU ciphertexts. It enables to hook an hardware object lifetime within the `rust` borrow-checker.
-  + `memory`: Handles on-board memory allocation and synchronization
-  + `config`: Helps to configure HPU accelerator through a TOML configuration file
-  + `cmd`: Translates operation over `variable` in concrete HPU commands
-  + `regmap`: Communicates with the HPU internal register with ease.
-  + `rtl`: Defines concrete `rust` structure populated from HPU's status/configuration registers
-
-
-Below is an overview of the internal structure of the Backend.
-![HPU backend structure](./figures/tfhe-hpu-backend.excalidraw.png)
-
-This picture depicts the internal modules of `tfhe-hpu-backend`, Device is the main entry point for the user. Its lifecycle is as follows:
-
-1. Create HpuDevice, open link with the associated FPGA. Configure associated drivers and upload the bitstream. Read FPGA registers to extract supported configuration and features. Build Firmware conversion table (IOp -> DOps stream).
-
-2. Allocate required memory chunks in the on-board memory. Upload public material required by TFHE computation.
-
-3. Create HPU variables that handle TFHE Ciphertexts. It wraps TFHE Ciphertext with required internal resources and enforces the correct lifetime management. This abstraction enforces that during the variable lifecycle all required resources are valid.
-
-4. Users could trigger HPU operation from the HPU variable.
-  Variable abstraction enforces that required objects are correctly synced on the hardware and converts each operation in a concrete HPU command.
-  When HPU operation is acknowledged by the hardware, the internal state of the associated variable is updated.
-  This mechanism enables asynchronous operation and minimal amount of Host to/from HW memory transfer.
-  This mechanism also enables offloading a computation graph to the HPU and requires a synchronization only on the final results.
-
-## Example
-### Configuration file
-HPU configuration knobs are gathered in a TOML configuration file. This file describes the targeted FPGA with its associated configuration:
-```toml
-[fpga] # FPGA target
-  # Register layout in the FPGA
-  regmap=["${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_cfg_1in3.toml",
-          "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_cfg_3in3.toml",
-          "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_prc_1in3.toml",
-          "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_prc_3in3.toml"]
-  polling_us=10
-[fpga.ffi.V80] # Hardware properties
-  ami_dev="/dev/ami1" # Name of ami device
-  qdma_h2c="/dev/qdma${V80_PCIE_DEV}001-MM-0" # QDma host to card device
-  qdma_c2h="/dev/qdma${V80_PCIE_DEV}001-MM-1" # QDma card to host device
-
-[rtl] # RTL option
-  bpip_used = true # BPIP/IPIP mode
-  bpip_use_opportunism = false # Use strict flush paradigm
-  bpip_timeout = 100_000 # BPIP timeout in clock `cycles`
-
-[board] # Board configuration
-  ct_mem = 32768 # Number of allocated ciphertext
-  ct_pc = [ # Memory used for ciphertext
-    {Hbm= {pc=32}},
-    {Hbm= {pc=33}},
-  ]
-  heap_size = 16384 # Number of slots reserved for heap
-
-  lut_mem = 256 # Number of allocated LUT table
-  lut_pc = {Hbm={pc=34}} # Memory used for LUT
-
-  fw_size= 16777216 # Size in byte of the Firmware translation table
-  fw_pc = {Ddr= {offset= 0x3900_0000}} # Memory used for firmware translation table
-
-  bsk_pc = [ # Memory used for Bootstrapping key
-    {Hbm={pc=8}},
-    {Hbm={pc=12}},
-    {Hbm={pc=24}},
-    {Hbm={pc=28}},
-    {Hbm={pc=40}},
-    {Hbm={pc=44}},
-    {Hbm={pc=56}},
-    {Hbm={pc=60}}
-  ]
-
-  ksk_pc = [ # Memory used for Keyswitching key
-    {Hbm={pc=0}},
-    {Hbm={pc=1}},
-    {Hbm={pc=2}},
-    {Hbm={pc=3}},
-    {Hbm={pc=4}},
-    {Hbm={pc=5}},
-    {Hbm={pc=6}},
-    {Hbm={pc=7}},
-    {Hbm={pc=16}},
-    {Hbm={pc=17}},
-    {Hbm={pc=18}},
-    {Hbm={pc=19}},
-    {Hbm={pc=20}},
-    {Hbm={pc=21}},
-    {Hbm={pc=22}},
-    {Hbm={pc=23}}
-  ]
-
-  trace_pc = {Hbm={pc=35}} # Memory used for trace log
-  trace_depth = 32 # Size of Memory in MiB allocated for trace log
-
-[firmware] # Firmware properties
-  implementation = "Llt" # Firmware flavor to use
-  integer_w=[4,6,8,10,12,14,16,32,64,128] # List of supported IOp width
-  min_batch_size = 11 # Minimum batch size for maximum throughput
-  kogge_cfg            = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml"
-  custom_iop.'IOP[0]'  = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_0.asm"
-
-# Default firmware configuration. Could be edited on per-IOp basis
-[firmware.op_cfg.default]
-  fill_batch_fifo = true
-  min_batch_size = false
-  use_tiers = false
-  flush_behaviour = "Patient"
-  flush = true
-  ```
-
-### Device setup
-Following code snippet shows how to instantiate and configure a `HpuDevice`:
-```rust
-    // Following code snippets used the HighLevelApi abstraction
-    // Instantiate HpuDevice --------------------------------------------------
-    let hpu_device = HpuDevice::from_config(&args.config.expand());
-
-    // Generate keys ----------------------------------------------------------
-    let config = Config::from_hpu_device(&hpu_device);
-
-    let cks = ClientKey::generate(config);
-    let csks = CompressedServerKey::new(&cks);
-
-    // Register HpuDevice and key as thread-local engine
-    set_server_key((hpu_device, csks));
-```
-
-### Clone CPU ciphertext on HPU
-Following code snippet shows how to convert CPU ciphertext in HPU one:
-``` rust
-    // Draw random value as input
-    let a = rand::thread_rng().gen_range(0..u8::MAX);
-
-    // Encrypt them on Cpu side
-    let a_fhe = FheUint8::encrypt(a, &cks);
-
-    // Clone a ciphertext and move them in HpuWorld
-    // NB: Data doesn't move over Pcie at this stage
-    //     Data are only arranged in Hpu ordered an copy in the host internal buffer
-    let a_hpu = a_fhe.clone_on(&hpu_device);
-```
-
-### Dispatch operation on HPU
-Once registered as thread-local engine, HighLevel FheUint are converted in Hpu format.
-Following code snippets show how to start operation on HPU:
-
-``` rust
-  // Sum -------------------------------------------------------------
-  // Generate random inputs value and compute expected result
-  let in_a = rng.gen_range(0..u64::max_value());
-  let in_b = rng.gen_range(0..u64::max_value());
-  let clear_sum_ab = in_a.wrapping_add(in_b);
-
-  // Encrypt input value
-  let fhe_a = FheUint64::encrypt(in_a, cks);
-  let fhe_b = FheUint64::encrypt(in_b, cks);
-
-  // Triggered operation on HPU through hl_api
-  let fhe_sum_ab = fhe_a+fhe_b;
-
-  // Decrypt values
-  let dec_sum_ab: u64 = fhe_sum_ab.decrypt(cks);
-```
-
-## Pre-made Examples
-There are some example applications already available in `tfhe/examples/hpu`:
- * hpu_hlapi: Depict the used of HPU device through HighLevelApi.
- * hpu_bench: Depict the used of HPU device through Integer abstraction level.
-
-In order to run those applications on hardware, user must build from the project root (i.e `tfhe-rs-internal`) with `hpu-v80` features:
-
-> NB: Running examples required to have correctly pulled the `.pdi` files. Those files, due to their size, are backed by git-lfs and disabled by default.
-> In order to retrieve them, use the following command:
-> ```bash
-> git lfs pull --include="*" --exclude=""
-> ```
-
-``` bash
-cargo build --release --features="hpu-v80" --example hpu_hlapi --example hpu_bench
-# Correctly setup environment with setup_hpu.sh script
-source setup_hpu.sh --config v80 --init-qdma
-./target/release/examples/hpu_bench --integer-w 64 --integer-w 32 --iop MUL --iter 10
-./target/release/examples/hpu_hlapi
-```
-
-## Test framework
-There is also a set of tests backed in tfhe-rs. Tests are gather in testbundle over various integer width.
-Those tests have 5 sub-kind:
-* `alu`: Run and check all ct x ct IOp
-* `alus`: Run and check all ct x scalar IOp
-* `bitwise`: Run and check all bitwise IOp
-* `cmp`: Run and check all comparison IOp
-* `ternary`: Run and check ternary operation
-* `algo`: Run and check IOp dedicated to offload small algorithms
-
-
-Snippets below give some example of command that could be used for testing:
-``` bash
-# Correctly setup environment with setup_hpu.sh script
-source setup_hpu.sh --config v80 --init-qdma
-
-# Run all sub-kind for 64b integer width
-cargo test --release --features="hpu-v80" --test hpu -- u64
-
-# Run only `bitwise` sub-kind for all integer width IOp
-cargo test --release --features="hpu-v80" --test hpu -- bitwise
-```
-
-## Benches framework
-HPU is completely integrated in tfhe benchmark system. Performances results could be extracted from HighLevelApi or Integer Api.
-Three benchmarks could be started, through the following Makefile target for simplicity:
-``` bash
-# Do not forget to correctly set environment before hand
-source setup_hpu.sh --config v80 --init-qdma
-
-# Run hlapi benches
-make test_high_level_api_hpu
-
-# Run hlapi erc20 benches
-make bench_hlapi_erc20_hpu 
-
-# Run integer level benches
-make bench_integer_hpu
-```
-
-## Eager to start without real Hardware ?
-You are still waiting your FPGA board and are frustrated by lead time ?
-Don't worry, you have backed-up. A dedicated simulation infrastructure with accurate performance estimation is available in tfhe-rs.
-You can use it on any linux/MacOs to test HPU integration within tfhe-rs and optimized your application for HPU target.
-Simply through an eye to [Hpu mockup](../../mockups/tfhe-hpu-mockup/Reaadme.md), and follow the instruction.
--- a/backends/tfhe-hpu-backend/build.rs
+++ b/backends/tfhe-hpu-backend/build.rs
@@ -1,26 +0,0 @@
-fn main() {
-    if cfg!(feature = "hw-xrt") {
-        println!("cargo:rustc-link-search=/opt/xilinx/xrt/lib");
-        println!("cargo:rustc-link-lib=dylib=stdc++");
-        println!("cargo:rustc-link-lib=dl");
-        println!("cargo:rustc-link-lib=rt");
-        println!("cargo:rustc-link-lib=uuid");
-        println!("cargo:rustc-link-lib=dylib=xrt_coreutil");
-
-        cxx_build::bridge("src/ffi/xrt/mod.rs")
-            .file("src/ffi/xrt/cxx/hpu_hw.cc")
-            .file("src/ffi/xrt/cxx/mem_zone.cc")
-            .flag_if_supported("-std=c++23")
-            .include("/opt/xilinx/xrt/include") // Enhance: support parsing bash env instead of hard path
-            .flag("-fmessage-length=0")
-            .compile("hpu-hw-ffi");
-
-        println!("cargo:rerun-if-changed=src/ffi/xrt/mod.rs");
-        println!("cargo:rerun-if-changed=src/ffi/xrt/cxx/hpu_hw.cc");
-        println!("cargo:rerun-if-changed=src/ffi/xrt/cxx/hpu_hw.h");
-        println!("cargo:rerun-if-changed=src/ffi/xrt/cxx/mem_zone.cc");
-        println!("cargo:rerun-if-changed=src/ffi/xrt/cxx/mem_zone.h");
-    } else {
-        // Simulation ffi -> nothing to do
-    }
-}
--- a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_0.asm
+++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_0.asm
@@ -1,15 +0,0 @@
-# CUST_0
-# Simple IOp to check the xfer between Hpu/Cpu
-# Construct constant in dest slot -> 249 (0xf9)
-SUB R0 R0 R0
-ADDS R0 R0 1
-ST TD[0].0 R0 
-SUB R1 R1 R1
-ADDS R1 R1 2
-ST TD[0].1 R1 
-SUB R2 R2 R2
-ADDS R2 R2 3
-ST TD[0].2 R2 
-SUB R3 R3 R3
-ADDS R3 R3 3
-ST TD[0].3 R3 
--- a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_1.asm
+++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_1.asm
@@ -1,11 +0,0 @@
-# CUST_1
-# Simple IOp to check the xfer between Hpu/Cpu
-# Dest <- Src_a
-LD R0   TS[0].0
-LD R1   TS[0].1
-LD R2   TS[0].2
-LD R3   TS[0].3
-ST TD[0].0 R0 
-ST TD[0].1 R1 
-ST TD[0].2 R2 
-ST TD[0].3 R3 
--- a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_10.asm
+++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_10.asm
@@ -1,25 +0,0 @@
-; CUST_8
-; Simple IOp to check the ALU operation 
-; Dst[0].0 <- Src[0].0 + Src[1].0
-LD R1 TS[0].0
-LD R2 TS[1].0
-ADD R0 R1 R2
-ST TD[0].0 R0 
-
-; Dst[0].1 <- Src[0].1 + Src[1].1
-LD R5 TS[0].1
-LD R6 TS[1].1
-ADD R4 R5 R6
-ST TD[0].2 R4 
-
-; Dst[0].2 <- Src[0].2 + Src[1].2
-LD R9 TS[0].2
-LD R10 TS[1].2
-ADD R8 R9 R10
-ST TD[0].2 R8 
-
-; Dst[0].3 <- Src[0].3 + Src[1].3
-LD R13 TS[0].3
-LD R14 TS[1].3
-ADD R12 R13 R14
-ST TD[0].3 R0 
--- a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_16.asm
+++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_16.asm
@@ -1,6 +0,0 @@
-# CUST_16
-# Simple IOp to check PBS behavior
-# Dest <- PBSNone(Src_a.0)
-LD   R0   TS[0].0
-PBS_F  R0   R0    PbsNone
-ST   TD[0].0 R0 
--- a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_17.asm
+++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_17.asm
@@ -1,15 +0,0 @@
-# CUST_17
-# Simple IOp to check PBS behavior
-# Dest <- PBSNone(Src_a)
-LD    R0   TS[0].0
-PBS   R0   R0    PbsNone
-ST    TD[0].0 R0 
-LD    R1   TS[0].1
-PBS   R1   R1    PbsNone
-ST    TD[0].1 R1 
-LD    R2   TS[0].2
-PBS   R2   R2    PbsNone
-ST    TD[0].2 R2 
-LD    R3   TS[0].3
-PBS_F R3   R3    PbsNone
-ST    TD[0].3 R3 
--- a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_18.asm
+++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_18.asm
@@ -1,23 +0,0 @@
-; CUST_18
-; Simple IOp to check extraction pattern
-; Correct result:
-;   * Dst[0,1] <- Src[0][0,1]
-;   * Dst[2,3] <- Src[1][0,1]
-
-; Pack Src[0][0,1] with a Mac and extract Carry/Msg in Dst[0][0,1]
-LD R0 TS[0].0
-LD R1 TS[0].1
-MAC R3 R1 R0 4
-PBS R4 R3 PbsMsgOnly
-PBS R5 R3 PbsCarryInMsg
-ST TD[0].0 R4 
-ST TD[0].1 R5 
-
-; Pack Src[1][0,1] with a Mac and extract Carry/Msg in Dst[0][2,3]
-LD R10 TS[1].0
-LD R11 TS[1].1
-MAC R13 R11 R10 4
-PBS R14 R13 PbsMsgOnly
-PBS R15 R13 PbsCarryInMsg
-ST TD[0].2 R14 
-ST TD[0].3 R15 
--- a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_19.asm
+++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_19.asm
@@ -1,19 +0,0 @@
-; CUST_19
-; Simple IOp to check PbsMl2
-; Correct result:
-;   * Dst[0][0]   <- Src[0][0]
-;   * Dst[0][1]   <- 0
-;   * Dst[0][2]   <- Src[0][0] +1
-;   * Dst[0][3]   <- 0
-; i.e Cust_19(0x2) => 0x32 
-
-; Construct a 0 for destination padding
-SUB R16 R16 R16
-
-; Apply PbsMl2 on Src[0] result goes in dest[0][0-3] (0-padded)
-LD R0 TS[0].0
-PBS_ML2_F R0 R0 PbsTestMany2
-ST TD[0].0 R0 
-ST TD[0].1 R16
-ST TD[0].2 R1 
-ST TD[0].3 R16
--- a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_2.asm
+++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_2.asm
@@ -1,11 +0,0 @@
-# CUST_2
-# Simple IOp to check the xfer between Hpu/Cpu
-# Dest <- Src_b
-LD R0   TS[1].0
-LD R1   TS[1].1
-LD R2   TS[1].2
-LD R3   TS[1].3
-ST TD[0].0 R0 
-ST TD[0].1 R1 
-ST TD[0].2 R2 
-ST TD[0].3 R3 
--- a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_20.asm
+++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_20.asm
@@ -1,22 +0,0 @@
-; CUST_20
-; Simple IOp to check PbsMl4
-; Correct result:
-;   * Dst[0][0]   <- Src[0][0]
-;   * Dst[0][1]   <- Src[0][0] +1
-;   * Dst[0][2]   <- Src[0][0] +2
-;   * Dst[0][3]   <- Src[0][0] +3
-; i.e Cust_20(0x0) => 0xe4 
-
-SUB R16 R16 R16
-ST TD[0].0 R0
-ST TD[0].1 R0
-ST TD[0].2 R0
-ST TD[0].3 R0
-
-; Apply PbsMl4 on Src[0] result goes in dest[0][0-3]
-LD R0 TS[0].0
-PBS_ML4_F R0 R0 PbsTestMany4
-ST TD[0].0 R0
-ST TD[0].1 R1
-ST TD[0].2 R2
-ST TD[0].3 R3
--- a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_21.asm
+++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_21.asm
@@ -1,24 +0,0 @@
-; CUST_21
-; Simple IOp to check PbsMl8
-; WARN: This operation required 16b ct width
-; Correct result:
-;   * Dst[0][0]   <- Src[0][0]
-;   * Dst[0][1]   <- Src[0][0] +1
-;   * Dst[0][2]   <- Src[0][0] +2
-;   * Dst[0][3]   <- Src[0][0] +3
-;   * Dst[0][4]   <- Src[0][0] +4
-;   * Dst[0][5]   <- Src[0][0] +5
-;   * Dst[0][6]   <- Src[0][0] +6
-;   * Dst[0][7]   <- Src[0][0] +7
-
-; Apply PbsMl8 on Src[0] result goes in dest[0][0-7]
-LD R0 TS[0].0
-PBS_ML8_F R0 R0 PbsTestMany8
-ST TD[0].0 R0
-ST TD[0].1 R1
-ST TD[0].2 R2
-ST TD[0].3 R3
-ST TD[0].4 R4
-ST TD[0].5 R5
-ST TD[0].6 R6
-ST TD[0].7 R7
--- a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_3.asm
+++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_3.asm
@@ -1,16 +0,0 @@
-# CUST_3
-# Simple IOp to check isc behavior
-# Generate obvious deps and check that isc correctly issued the dop
-# Correct result must bu Dest <- Src[0]
-LD R0   TS[0].0
-LD R1   TS[0].1
-LD R2   TS[0].2
-LD R3   TS[0].3
-PBS R4 R0 PbsNone
-ST TD[0].0 R4 
-PBS R4 R1 PbsNone
-ST TD[0].1 R4 
-PBS R4 R2 PbsNone
-ST TD[0].2 R4 
-PBS_F R4 R3 PbsNone
-ST TD[0].3 R4 
--- a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_8.asm
+++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_8.asm
@@ -1,19 +0,0 @@
-; CUST_8
-; Simple IOp to check the ALU operation 
-; Dst[0].0 <- Src[0].0 + Src[1].0
-LD R1 TS[0].0
-LD R2 TS[1].0
-ADD R0 R1 R2
-ST TD[0].0 R0 
-
-; Dst[0].1 <- Src[0].1 - Src[1].1
-LD R5 TS[0].1
-LD R6 TS[1].1
-SUB R4 R5 R6
-ST TD[0].1 R4 
-
-; Dst[0].2 <- Src[0].2 + (Src[1].2 *4)
-LD R9 TS[0].2
-LD R10 TS[1].2
-MAC R8 R9 R10 4
-ST TD[0].2 R8 
--- a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_9.asm
+++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_9.asm
@@ -1,21 +0,0 @@
-; CUST_9
-; Simple IOp to check the ALU Scalar operation 
-; Dst[0].0 <- Src[0].0 + Imm[0].0
-LD R1 TS[0].0
-ADDS R0 R1 TI[0].0
-ST TD[0].0 R0 
-
-; Dst[0].1 <- Src[0].1 - Imm[0].1
-LD R5 TS[0].1
-SUBS R4 R5 TI[0].1
-ST TD[0].1 R4 
-
-; Dst[0].2 <- Imm[0].2 - Src[0].2
-LD R9 TS[0].2
-SSUB R8 R9 TI[0].2
-ST TD[0].2 R8 
-
-; Dst[0].3 <- Src[0].3 * Imm[0].3
-LD R13 TS[0].3
-MULS R12 R13 TI[0].3
-ST TD[0].3 R12 
--- a/backends/tfhe-hpu-backend/config_store/sim/hpu_config.toml
+++ b/backends/tfhe-hpu-backend/config_store/sim/hpu_config.toml
@@ -1,108 +0,0 @@
- 
-[fpga]
-  regmap=["${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_cfg_1in3.toml",
-          "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_cfg_3in3.toml",
-          "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_prc_1in3.toml",
-          "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_prc_3in3.toml",
-          "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/tb_hpu_regif_dummy.toml"]
-  polling_us=100000
-[fpga.ffi.Sim]
-  ipc_name="/tmp/${USER}/hpu_mockup_ipc"
-
-[rtl]
-  bpip_use = true
-  bpip_use_opportunism = true
-  bpip_timeout = 100_000
-
-[board]
-  ct_mem = 32768
-  ct_pc = [
-    {Hbm= {pc=32}},
-    {Hbm= {pc=33}},
-  ]
-  heap_size = 16384
-
-  lut_mem = 256
-  lut_pc = {Hbm={pc=34}}
-
-  fw_size= 16777215 # i.e. 16 MiB
-  fw_pc = {Ddr= {offset= 0x3900_0000}} # NB: Allocation must take place in the Discret DDR
-
-  bsk_pc = [
-    {Hbm={pc=8}},
-    {Hbm={pc=12}},
-    {Hbm={pc=24}},
-    {Hbm={pc=28}},
-    {Hbm={pc=40}},
-    {Hbm={pc=44}},
-    {Hbm={pc=56}},
-    {Hbm={pc=60}}
-  ]
-
-  ksk_pc = [
-    {Hbm={pc=0}},
-    {Hbm={pc=1}},
-    {Hbm={pc=2}},
-    {Hbm={pc=3}},
-    {Hbm={pc=4}},
-    {Hbm={pc=5}},
-    {Hbm={pc=6}},
-    {Hbm={pc=7}},
-    {Hbm={pc=16}},
-    {Hbm={pc=17}},
-    {Hbm={pc=18}},
-    {Hbm={pc=19}},
-    {Hbm={pc=20}},
-    {Hbm={pc=21}},
-    {Hbm={pc=22}},
-    {Hbm={pc=23}}
-  ]
-
-  trace_pc = {Hbm={pc=35}}
-  trace_depth = 32 # In MB
-
-[firmware]
-  implementation = "Llt"
-  integer_w=[2,4,6,8,10,12,14,16,32,64,128]
-  min_batch_size = 11
-  kogge_cfg            = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml"
-  custom_iop.'IOP[0]'  = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_0.asm"
-  custom_iop.'IOP[1]'  = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_1.asm"
-  custom_iop.'IOP[2]'  = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_2.asm"
-  custom_iop.'IOP[3]'  = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_3.asm"
-  custom_iop.'IOP[8]'  = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_8.asm"
-  custom_iop.'IOP[9]'  = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_9.asm"
-  custom_iop.'IOP[16]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_16.asm"
-  custom_iop.'IOP[17]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_17.asm"
-  custom_iop.'IOP[18]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_18.asm"
-  custom_iop.'IOP[19]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_19.asm"
-  custom_iop.'IOP[20]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_20.asm"
-  custom_iop.'IOP[21]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_21.asm"
-
-[firmware.op_cfg.default]
-  fill_batch_fifo = true
-  min_batch_size = false
-  use_tiers = false
-  flush_behaviour = "Patient"
-  flush = true
-
-[firmware.op_cfg.by_op.MUL]
-  fill_batch_fifo = false
-  min_batch_size = false
-  use_tiers = false
-  flush_behaviour = "Patient"
-  flush = true
-
-[firmware.op_cfg.by_op.MULS]
-  fill_batch_fifo = false
-  min_batch_size = false
-  use_tiers = false
-  flush_behaviour = "Patient"
-  flush = true
-
-[firmware.op_cfg.by_op.ERC_20]
-  fill_batch_fifo = true
-  min_batch_size = false
-  use_tiers = true
-  flush_behaviour = "Patient"
-  flush = true
--- a/backends/tfhe-hpu-backend/config_store/sim/hpu_regif_core_cfg_1in3.toml
+++ b/backends/tfhe-hpu-backend/config_store/sim/hpu_regif_core_cfg_1in3.toml
@@ -1,256 +0,0 @@
-module_name="hpu_regif_core_cfg_1in3"
-description="HPU top-level register interface. Used by the host to retrieve design information, and to configure it."
-word_size_b = 32
-offset = 0x00
-range = 0x10000
-ext_pkg = ["axi_if_common_param_pkg", "axi_if_shell_axil_pkg"]
-
-# =====================================================================================================================
-[section.entry_cfg_1in3]
-description="entry_cfg_1in3 section with known value used for debug."
-offset= 0x0
-
-[section.entry_cfg_1in3.register.dummy_val0]
-  description="RTL version"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Cst=0x01010101}
-
-[section.entry_cfg_1in3.register.dummy_val1]
-  description="RTL version"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Cst=0x11111111}
-
-[section.entry_cfg_1in3.register.dummy_val2]
-  description="RTL version"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Cst=0x21212121}
-
-
-[section.entry_cfg_1in3.register.dummy_val3]
-  description="RTL version"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Cst=0x31313131}
-
-# =====================================================================================================================
-[section.info]
-description="RTL architecture parameters"
-offset= 0x10
-
-[section.info.register.version]
-  description="RTL version"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Param="VERSION"}
-
-[section.info.register.ntt_architecture]
-  description="NTT architecture"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Param="NTT_CORE_ARCH"}
-
-[section.info.register.ntt_structure]
-  description="NTT structure parameters"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  field.radix = { size_b=8, offset_b=0 , default={Param="R"}, description="NTT radix"}
-  field.psi   = { size_b=8, offset_b=8 , default={Param="PSI"}, description="NTT psi"}
-  field.div   = { size_b=8, offset_b=16, default={Param="BWD_PSI_DIV"}, description="NTT backward div"}
-  field.delta = { size_b=8, offset_b=24, default={Param="DELTA"}, description="NTT network delta (for wmm arch)"}
-
-[section.info.register.ntt_rdx_cut]
-  description="NTT radix cuts, in log2 unit (for gf64 arch)"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  field.radix_cut0 = { size_b=4, offset_b=0 , default={Param="NTT_RDX_CUT_S_0"}, description="NTT radix cut #0"}
-  field.radix_cut1 = { size_b=4, offset_b=4 , default={Param="NTT_RDX_CUT_S_1"}, description="NTT radix cut #1"}
-  field.radix_cut2 = { size_b=4, offset_b=8 , default={Param="NTT_RDX_CUT_S_2"}, description="NTT radix cut #2"}
-  field.radix_cut3 = { size_b=4, offset_b=12, default={Param="NTT_RDX_CUT_S_3"}, description="NTT radix cut #3"}
-  field.radix_cut4 = { size_b=4, offset_b=16, default={Param="NTT_RDX_CUT_S_4"}, description="NTT radix cut #4"}
-  field.radix_cut5 = { size_b=4, offset_b=20, default={Param="NTT_RDX_CUT_S_5"}, description="NTT radix cut #5"}
-  field.radix_cut6 = { size_b=4, offset_b=24, default={Param="NTT_RDX_CUT_S_6"}, description="NTT radix cut #6"}
-  field.radix_cut7 = { size_b=4, offset_b=28, default={Param="NTT_RDX_CUT_S_7"}, description="NTT radix cut #7"}
-
-[section.info.register.ntt_pbs]
-  description="Maximum number of PBS in the NTT pipeline"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  field.batch_pbs_nb = { size_b=8, offset_b=0 , default={Param="BATCH_PBS_NB"}, description="Maximum number of PBS in the NTT pipe"}
-  field.total_pbs_nb = { size_b=8, offset_b=8 , default={Param="TOTAL_PBS_NB"}, description="Maximum number of PBS stored in PEP buffer"}
-
-[section.info.register.ntt_modulo]
-  description="Code associated to the NTT prime"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Param="MOD_NTT_NAME"}
-
-[section.info.register.application]
-  description="Code associated with the application"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Param="APPLICATION_NAME"}
-
-[section.info.register.ks_structure]
-  description="Key-switch structure parameters"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  field.x = { size_b=8, offset_b=0 , default={Param="LBX"}, description="Number of coefficients on X dimension"}
-  field.y = { size_b=8, offset_b=8 , default={Param="LBY"}, description="Number of coefficients on Y dimension"}
-  field.z = { size_b=8, offset_b=16, default={Param="LBZ"}, description="Number of coefficients on Z dimension"}
-
-[section.info.register.ks_crypto_param]
-  description="Key-switch crypto parameters"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  field.mod_ksk_w = { size_b=8, offset_b=0 , default={Param="MOD_KSK_W"}, description="Width of KSK modulo"}
-  field.ks_l      = { size_b=8, offset_b=8 , default={Param="KS_L"}, description="Number of KS decomposition level"}
-  field.ks_b      = { size_b=8, offset_b=16, default={Param="KS_B_W"}, description="Width of KS decomposition base"}
-
-[section.info.register.regf_structure]
-  description="Register file structure parameters"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  field.reg_nb  = { size_b=8, offset_b=0 , default={Param="REGF_REG_NB"}, description="Number of registers in regfile"}
-  field.coef_nb = { size_b=8, offset_b=8 , default={Param="REGF_COEF_NB"}, description="Number of coefficients at regfile interface"}
-
-[section.info.register.isc_structure]
-  description="Instruction scheduler structure parameters"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  field.depth = { size_b=8, offset_b=0 , default={Param="ISC_DEPTH"}, description="Number of slots in ISC lookahead buffer."}
-  field.min_iop_size  = { size_b=8, offset_b=8 , default={Param="MIN_IOP_SIZE"}, description="Minimum number of DOp per IOp to prevent sync_id overflow."}
-
-[section.info.register.pe_properties]
-  description="Processing elements parameters"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  field.alu_nb =          { size_b=8, offset_b=24 , default={Param="PEA_ALU_NB"}, description="Number of coefficients processed in parallel in pe_alu"}
-  field.pep_regf_period = { size_b=8, offset_b=16 , default={Param="PEP_REGF_PERIOD"}, description="Number of cycles between 2 consecutive data transfer between PEP and regfile"}
-  field.pem_regf_period = { size_b=8, offset_b=8 ,  default={Param="PEM_REGF_PERIOD"}, description="Number of cycles between 2 consecutive data transfer between PEM and regfile"}
-  field.pea_regf_period = { size_b=8, offset_b=0 ,  default={Param="PEA_REGF_PERIOD"}, description="Number of cycles between 2 consecutive data transfer between PEA and regfile"}
-
-[section.info.register.bsk_structure]
-  description="BSK manager structure parameters"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  field.bsk_cut_nb = { size_b=8, offset_b=8 , default={Param="BSK_CUT_NB"}, description="BSK cut nb"}
-
-[section.info.register.ksk_structure]
-  description="KSK manager structure parameters"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  field.ksk_cut_nb = { size_b=8, offset_b=8 , default={Param="KSK_CUT_NB"}, description="KSK cut nb"}
-
-[section.info.register.hbm_axi4_nb]
-  description="Number of AXI4 connections to HBM"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  field.bsk_pc  = { size_b=8, offset_b=0 , default={Param="BSK_PC"},  description="Number of HBM connections for BSK"}
-  field.ksk_pc  = { size_b=8, offset_b=8,  default={Param="KSK_PC"},  description="Number of HBM connections for KSK"}
-  field.pem_pc  = { size_b=8, offset_b=16, default={Param="PEM_PC"},  description="Number of HBM connections for ciphertexts (PEM)"}
-  field.glwe_pc = { size_b=8, offset_b=24, default={Param="GLWE_PC"}, description="Number of HBM connections for GLWE"}
-
-[section.info.register.hbm_axi4_dataw_pem]
-  description="Ciphertext HBM AXI4 connection data width"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Param="AXI4_PEM_DATA_W"}
-
-[section.info.register.hbm_axi4_dataw_glwe]
-  description="GLWE HBM AXI4 connection data width"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Param="AXI4_GLWE_DATA_W"}
-
-[section.info.register.hbm_axi4_dataw_bsk]
-  description="BSK HBM AXI4 connection data width"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Param="AXI4_BSK_DATA_W"}
-
-[section.info.register.hbm_axi4_dataw_ksk]
-  description="KSK HBM AXI4 connection data width"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Param="AXI4_KSK_DATA_W"}
-
-
-# =====================================================================================================================
-[section.hbm_axi4_addr_1in3]
-offset= 0x1000
-description="HBM AXI4 connection address offset"
-
-[section.hbm_axi4_addr_1in3.register.ct]
-  description="Address offset for each ciphertext HBM AXI4 connection"
-  owner="User"
-  read_access="Read"
-  write_access="Write"
-  duplicate=["_pc0_lsb", "_pc0_msb","_pc1_lsb", "_pc1_msb"]
-
-[section.hbm_axi4_addr_1in3.register.glwe]
-  description="Address offset for each GLWE HBM AXI4 connection"
-  owner="User"
-  read_access="Read"
-  write_access="Write"
-  duplicate=["_pc0_lsb", "_pc0_msb"]
-
-
-[section.hbm_axi4_addr_1in3.register.ksk]
-  description="Address offset for each KSK HBM AXI4 connection"
-  owner="User"
-  read_access="Read"
-  write_access="Write"
-  duplicate=["_pc0_lsb", "_pc0_msb", "_pc1_lsb", "_pc1_msb", "_pc2_lsb", "_pc2_msb", "_pc3_lsb", "_pc3_msb", "_pc4_lsb", "_pc4_msb", "_pc5_lsb", "_pc5_msb", "_pc6_lsb", "_pc6_msb", "_pc7_lsb", "_pc7_msb", "_pc8_lsb", "_pc8_msb", "_pc9_lsb", "_pc9_msb", "_pc10_lsb", "_pc10_msb", "_pc11_lsb", "_pc11_msb", "_pc12_lsb", "_pc12_msb", "_pc13_lsb", "_pc13_msb", "_pc14_lsb", "_pc14_msb", "_pc15_lsb", "_pc15_msb"]
-
- [section.hbm_axi4_addr_1in3.register.trc]
-  description="Address offset for each trace HBM AXI4 connection"
-  owner="User"
-  read_access="Read"
-  write_access="Write"
-  duplicate=["_pc0_lsb", "_pc0_msb"]
-
-# =====================================================================================================================
-[section.bpip]
-offset= 0x2000
-description="BPIP configuration"
-
-[section.bpip.register.use]
-  description="(1) Use BPIP mode, (0) use IPIP mode (default)"
-  owner="User"
-  read_access="Read"
-  write_access="Write"
-  field.use_bpip = { size_b=1, offset_b=0 , default={Cst=1}, description="use"}
-  field.use_opportunism = { size_b=1, offset_b=1 , default={Cst=0}, description="use opportunistic PBS flush"}
-
-[section.bpip.register.timeout]
-  description="Timeout for BPIP mode"
-  owner="User"
-  read_access="Read"
-  write_access="Write"
-  default={Cst=0xffffffff}
--- a/backends/tfhe-hpu-backend/config_store/sim/hpu_regif_core_cfg_3in3.toml
+++ b/backends/tfhe-hpu-backend/config_store/sim/hpu_regif_core_cfg_3in3.toml
@@ -1,51 +0,0 @@
-module_name="hpu_regif_core_cfg_3in3"
-description="HPU top-level register interface. Used by the host to retrieve design information, and to configure it."
-word_size_b = 32
-offset = 0x20000
-range = 0x10000
-ext_pkg = ["axi_if_common_param_pkg", "axi_if_shell_axil_pkg"]
-
-# =====================================================================================================================
-[section.entry_cfg_3in3]
-description="entry_cfg_3in3 section with known value used for debug."
-offset= 0x0
-
-[section.entry_cfg_3in3.register.dummy_val0]
-  description="RTL version"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Cst=0x03030303}
-
-[section.entry_cfg_3in3.register.dummy_val1]
-  description="RTL version"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Cst=0x13131313}
-
-[section.entry_cfg_3in3.register.dummy_val2]
-  description="RTL version"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Cst=0x23232323}
-
-[section.entry_cfg_3in3.register.dummy_val3]
-  description="RTL version"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Cst=0x33333333}
-
-# =====================================================================================================================
-[section.hbm_axi4_addr_3in3]
-description="HBM AXI4 connection address offset"
-offset= 0x10
-
-[section.hbm_axi4_addr_3in3.register.bsk]
-  description="Address offset for each BSK HBM AXI4 connection"
-  owner="User"
-  read_access="Read"
-  write_access="Write"
-  duplicate=["_pc0_lsb", "_pc0_msb", "_pc1_lsb", "_pc1_msb", "_pc2_lsb", "_pc2_msb", "_pc3_lsb", "_pc3_msb", "_pc4_lsb", "_pc4_msb", "_pc5_lsb", "_pc5_msb", "_pc6_lsb", "_pc6_msb", "_pc7_lsb", "_pc7_msb", "_pc8_lsb", "_pc8_msb", "_pc9_lsb", "_pc9_msb", "_pc10_lsb", "_pc10_msb", "_pc11_lsb", "_pc11_msb", "_pc12_lsb", "_pc12_msb", "_pc13_lsb", "_pc13_msb", "_pc14_lsb", "_pc14_msb", "_pc15_lsb", "_pc15_msb"]
--- a/backends/tfhe-hpu-backend/config_store/sim/hpu_regif_core_prc_1in3.toml
+++ b/backends/tfhe-hpu-backend/config_store/sim/hpu_regif_core_prc_1in3.toml
@@ -1,336 +0,0 @@
-module_name="hpu_regif_core_prc_1in3"
-description="HPU top-level register interface. Used by the host to retrieve design information, and to configure it."
-word_size_b = 32
-offset = 0x10000
-range = 0x10000
-ext_pkg = ["axi_if_common_param_pkg", "axi_if_shell_axil_pkg"]
-
-# =====================================================================================================================
-[section.entry_prc_1in3]
-description="entry_prc_1in3 section with known value used for debug."
-offset= 0x0
-
-[section.entry_prc_1in3.register.dummy_val0]
-  description="RTL version"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Cst=0x02020202}
-
-[section.entry_prc_1in3.register.dummy_val1]
-  description="RTL version"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Cst=0x12121212}
-
-[section.entry_prc_1in3.register.dummy_val2]
-  description="RTL version"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Cst=0x22222222}
-
-[section.entry_prc_1in3.register.dummy_val3]
-  description="RTL version"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Cst=0x32323232}
-
-# =====================================================================================================================
-[section.status_1in3]
-description="HPU status of part 1in3"
-offset= 0x10
-
-[section.status_1in3.register.error]
-  description="Error register (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-  field.pbs   = { size_b=32, offset_b=0 , default={Cst=0}, description="HPU error part 1in3"}
-
-# =====================================================================================================================
-[section.ksk_avail]
-description="KSK availability configuration"
-offset= 0x1000
-
-[section.ksk_avail.register.avail]
-  description="KSK available bit"
-  owner="User"
-  read_access="Read"
-  write_access="Write"
-  field.avail = { size_b=1, offset_b=0 , default={Cst=0}, description="avail"}
-
-[section.ksk_avail.register.reset]
-  description="KSK reset sequence"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-  field.request = { size_b=1, offset_b=0 ,  default={Cst=0}, description="request"}
-  field.done    = { size_b=1, offset_b=31 , default={Cst=0}, description="done"}
-
-# =====================================================================================================================
-[section.runtime_1in3]
-description="Runtime information"
-offset= 0x2000
-
-[section.runtime_1in3.register.pep_cmux_loop]
-  description="PEP: CMUX iteration loop number"
-  owner="Kernel"
-  read_access="Read"
-  write_access="None"
-  field.br_loop   = { size_b=15, offset_b=0 ,  default={Cst=0}, description="PBS current BR-loop"}
-  field.br_loop_c = { size_b=1,  offset_b=15 , default={Cst=0}, description="PBS current BR-loop parity"}
-  field.ks_loop   = { size_b=15, offset_b=16 , default={Cst=0}, description="KS current KS-loop"}
-  field.ks_loop_c = { size_b=1,  offset_b=31 , default={Cst=0}, description="KS current KS-loop parity"}
-
-[section.runtime_1in3.register.pep_pointer_0]
-  description="PEP: pointers (part 1)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="None"
-  field.pool_rp = { size_b=8, offset_b=0  , default={Cst=0}, description="PEP pool_rp"}
-  field.pool_wp = { size_b=8, offset_b=8  , default={Cst=0}, description="PEP pool_wp"}
-  field.ldg_pt  = { size_b=8, offset_b=16 , default={Cst=0}, description="PEP ldg_pt"}
-  field.ldb_pt  = { size_b=8, offset_b=24 , default={Cst=0}, description="PEP ldb_pt"}
-
-[section.runtime_1in3.register.pep_pointer_1]
-  description="PEP: pointers (part 2)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="None"
-  field.ks_in_rp  = { size_b=8, offset_b=0  , default={Cst=0}, description="PEP ks_in_rp"}
-  field.ks_in_wp  = { size_b=8, offset_b=8  , default={Cst=0}, description="PEP ks_in_wp"}
-  field.ks_out_rp = { size_b=8, offset_b=16 , default={Cst=0}, description="PEP ks_out_rp"}
-  field.ks_out_wp = { size_b=8, offset_b=24 , default={Cst=0}, description="PEP ks_out_wp"}
-
-[section.runtime_1in3.register.pep_pointer_2]
-  description="PEP: pointers (part 3)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="None"
-  field.pbs_in_rp  = { size_b=8, offset_b=0  , default={Cst=0}, description="PEP pbs_in_rp"}
-  field.pbs_in_wp  = { size_b=8, offset_b=8  , default={Cst=0}, description="PEP pbs_in_wp"}
-  field.ipip_flush_last_pbs_in_loop = { size_b=16, offset_b=16  , default={Cst=0}, description="PEP IPIP flush last pbs_in_loop"}
-
-[section.runtime_1in3.register.isc_latest_instruction]
-  description="ISC: 4 latest instructions received ([0] is the most recent)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="None"
-  duplicate=["_0","_1","_2","_3"]
-
-[section.runtime_1in3.register.pep_seq_bpip_batch_cnt]
-  description="PEP: BPIP batch counter (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pep_seq_bpip_batch_flush_cnt]
-  description="PEP: BPIP batch triggered by a flush counter (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pep_seq_bpip_batch_timeout_cnt]
-  description="PEP: BPIP batch triggered by a timeout counter (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pep_seq_bpip_waiting_batch_cnt]
-  description="PEP: BPIP batch that waits the trigger counter (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pep_seq_bpip_batch_filling_cnt]
-  description="PEP: Count batch with filled with a given number of CT (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-  duplicate=["_1","_2","_3","_4","_5","_6","_7","_8","_9","_10","_11","_12","_13","_14","_15","_16"]
-
-[section.runtime_1in3.register.pep_seq_ld_ack_cnt]
-  description="PEP: load BLWE ack counter (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pep_seq_cmux_not_full_batch_cnt]
-  description="PEP: not full batch CMUX counter (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pep_seq_ipip_flush_cnt]
-  description="PEP: IPIP flush CMUX counter (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pep_ldb_rcp_dur]
-  description="PEP: load BLWE reception max duration (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pep_ldg_req_dur]
-  description="PEP: load GLWE request max duration (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pep_ldg_rcp_dur]
-  description="PEP: load GLWE reception max duration (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pep_load_ksk_rcp_dur]
-  description="PEP: load KSK slice reception max duration (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-  duplicate=["_pc0","_pc1","_pc2","_pc3","_pc4","_pc5","_pc6","_pc7","_pc8","_pc9","_pc10","_pc11","_pc12","_pc13","_pc14","_pc15"]
-
-
-[section.runtime_1in3.register.pep_mmacc_sxt_rcp_dur]
-  description="PEP: MMACC SXT reception duration (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pep_mmacc_sxt_req_dur]
-  description="PEP: MMACC SXT request duration (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pep_mmacc_sxt_cmd_wait_b_dur]
-  description="PEP: MMACC SXT command wait for b duration (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pep_inst_cnt]
-  description="PEP: input instruction counter (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pep_ack_cnt]
-  description="PEP: instruction acknowledge counter (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pem_load_inst_cnt]
-  description="PEM: load input instruction counter (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pem_load_ack_cnt]
-  description="PEM: load instruction acknowledge counter (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pem_store_inst_cnt]
-  description="PEM: store input instruction counter (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pem_store_ack_cnt]
-  description="PEM: store instruction acknowledge counter (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pea_inst_cnt]
-  description="PEA: input instruction counter (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pea_ack_cnt]
-  description="PEA: instruction acknowledge counter (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.isc_inst_cnt]
-  description="ISC: input instruction counter (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.isc_ack_cnt]
-  description="ISC: instruction acknowledge counter (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.runtime_1in3.register.pem_load_info_0]
-  description="PEM: load first data)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="None"
-  duplicate=["_pc0_0","_pc0_1","_pc0_2","_pc0_3","_pc1_0","_pc1_1","_pc1_2","_pc1_3"]
-
-[section.runtime_1in3.register.pem_load_info_1]
-  description="PEM: load first address"
-  owner="Kernel"
-  read_access="Read"
-  write_access="None"
-  duplicate=["_pc0_lsb","_pc0_msb","_pc1_lsb","_pc1_msb"]
-
-[section.runtime_1in3.register.pem_store_info_0]
-  description="PEM: store info 0)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="None"
-  field.cmd_vld             = { size_b=1, offset_b=0  , default={Cst=0}, description="PEM_ST cmd vld"}
-  field.cmd_rdy             = { size_b=1, offset_b=1  , default={Cst=0}, description="PEM_ST cmd rdy"}
-  field.pem_regf_rd_req_vld = { size_b=1, offset_b=2  , default={Cst=0}, description="PEM_ST pem_regf_rd_req_vld"}
-  field.pem_regf_rd_req_rdy = { size_b=1, offset_b=3  , default={Cst=0}, description="PEM_ST pem_regf_rd_req_rdy"}
-  field.brsp_fifo_in_vld    = { size_b=4, offset_b=4  , default={Cst=0}, description="PEM_ST brsp_fifo_in_vld"}
-  field.brsp_fifo_in_rdy    = { size_b=4, offset_b=8  , default={Cst=0}, description="PEM_ST brsp_fifo_in_rdy"}
-  field.rcp_fifo_in_vld     = { size_b=4, offset_b=12 , default={Cst=0}, description="PEM_ST rcp_fifo_in_vld"}
-  field.rcp_fifo_in_rdy     = { size_b=4, offset_b=16 , default={Cst=0}, description="PEM_ST rcp_fifo_in_rdy"}
-  field.r2_axi_vld          = { size_b=4, offset_b=20 , default={Cst=0}, description="PEM_ST r2_axi_vld"}
-  field.r2_axi_rdy          = { size_b=4, offset_b=24 , default={Cst=0}, description="PEM_ST r2_axi_rdy"}
-  field.c0_enough_location  = { size_b=4, offset_b=28 , default={Cst=0}, description="PEM_ST c0_enough_location"}
-
-[section.runtime_1in3.register.pem_store_info_1]
-  description="PEM: store info 1"
-  owner="Kernel"
-  read_access="Read"
-  write_access="None"
-  field.s0_cmd_vld          = { size_b=4, offset_b=0  , default={Cst=0}, description="PEM_ST s0_cmd_vld"}
-  field.s0_cmd_rdy          = { size_b=4, offset_b=4  , default={Cst=0}, description="PEM_ST s0_cmd_rdy"}
-  field.m_axi_bvalid        = { size_b=4, offset_b=8  , default={Cst=0}, description="PEM_ST m_axi_bvalid"}
-  field.m_axi_bready        = { size_b=4, offset_b=12 , default={Cst=0}, description="PEM_ST m_axi_bready"}
-  field.m_axi_wvalid        = { size_b=4, offset_b=16 , default={Cst=0}, description="PEM_ST m_axi_wvalid"}
-  field.m_axi_wready        = { size_b=4, offset_b=20 , default={Cst=0}, description="PEM_ST m_axi_wready"}
-  field.m_axi_awvalid       = { size_b=4, offset_b=24 , default={Cst=0}, description="PEM_ST m_axi_awvalid"}
-  field.m_axi_awready       = { size_b=4, offset_b=28 , default={Cst=0}, description="PEM_ST m_axi_awready"}
-
-[section.runtime_1in3.register.pem_store_info_2]
-  description="PEM: store info 2"
-  owner="Kernel"
-  read_access="Read"
-  write_access="None"
-  field.c0_free_loc_cnt      = { size_b=16, offset_b=0  , default={Cst=0}, description="PEM_ST c0_free_loc_cnt"}
-  field.brsp_bresp_cnt       = { size_b=16, offset_b=16 , default={Cst=0}, description="PEM_ST brsp_bresp_cnt"}
-
-[section.runtime_1in3.register.pem_store_info_3]
-  description="PEM: store info 3"
-  owner="Kernel"
-  read_access="Read"
-  write_access="None"
-  field.brsp_ack_seen      = { size_b=16, offset_b=0  , default={Cst=0}, description="PEM_ST brsp_ack_seen"}
-  field.c0_cmd_cnt         = { size_b=8,  offset_b=16 , default={Cst=0}, description="PEM_ST c0_cmd_cnt"}
--- a/backends/tfhe-hpu-backend/config_store/sim/hpu_regif_core_prc_3in3.toml
+++ b/backends/tfhe-hpu-backend/config_store/sim/hpu_regif_core_prc_3in3.toml
@@ -1,100 +0,0 @@
-module_name="hpu_regif_core_prc_3in3"
-description="HPU top-level register interface. Used by the host to retrieve design information, and to configure it."
-word_size_b = 32
-offset = 0x30000
-range = 0x10000
-ext_pkg = ["axi_if_common_param_pkg", "axi_if_shell_axil_pkg"]
-
-# =====================================================================================================================
-[section.entry_prc_3in3]
-description="entry_prc_3in3 section with known value used for debug."
-offset= 0x0
-
-[section.entry_prc_3in3.register.dummy_val0]
-  description="RTL version"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Cst=0x04040404}
-
-[section.entry_prc_3in3.register.dummy_val1]
-  description="RTL version"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Cst=0x14141414}
-
-[section.entry_prc_3in3.register.dummy_val2]
-  description="RTL version"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Cst=0x24242424}
-
-[section.entry_prc_3in3.register.dummy_val3]
-  description="RTL version"
-  owner="Parameter"
-  read_access="Read"
-  write_access="None"
-  default={Cst=0x34343434}
-
-# =====================================================================================================================
-[section.status_3in3]
-description="HPU status of parts 2in3 and 3in3"
-offset= 0x10
-
-[section.status_3in3.register.error]
-  description="Error register (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-  field.pbs   = { size_b=32, offset_b=0 , default={Cst=0}, description="HPU error part 3in3"}
-
-# =====================================================================================================================
-[section.bsk_avail]
-description="BSK availability configuration"
-offset= 0x1000
-
-[section.bsk_avail.register.avail]
-  description="BSK available bit"
-  owner="User"
-  read_access="Read"
-  write_access="Write"
-  field.avail = { size_b=1, offset_b=0 , default={Cst=0}, description="avail"}
-
-[section.bsk_avail.register.reset]
-  description="BSK reset sequence"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-  field.request = { size_b=1, offset_b=0 ,  default={Cst=0}, description="request"}
-  field.done    = { size_b=1, offset_b=31 , default={Cst=0}, description="done"}
-
-# =====================================================================================================================
-[section.runtime_3in3]
-description="Runtime information"
-offset= 0x2000
-
-[section.runtime_3in3.register.pep_load_bsk_rcp_dur]
-  description="PEP: load BSK slice reception max duration (Could be reset by user)"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-  duplicate=["_pc0","_pc1","_pc2","_pc3","_pc4","_pc5","_pc6","_pc7","_pc8","_pc9","_pc10","_pc11","_pc12","_pc13","_pc14","_pc15"]
-
-[section.runtime_3in3.register.pep_bskif_req_info_0]
-  description="PEP: BSK_IF: requester info 0"
-  owner="Kernel"
-  read_access="Read"
-  write_access="None"
-  field.req_br_loop_rp   = { size_b=16, offset_b=0  , default={Cst=0}, description="PEP BSK_IF requester BSK read pointer"}
-  field.req_br_loop_wp   = { size_b=16, offset_b=16 , default={Cst=0}, description="PEP BSK_IF requester BSK write pointer"}
-
-[section.runtime_3in3.register.pep_bskif_req_info_1]
-  description="PEP: BSK_IF: requester info 0"
-  owner="Kernel"
-  read_access="Read"
-  write_access="None"
-  field.req_prf_br_loop  = { size_b=16, offset_b=0  , default={Cst=0}, description="PEP BSK_IF requester BSK prefetch pointer"}
-  field.req_parity       = { size_b=1,  offset_b=16 , default={Cst=0}, description="PEP BSK_IF requester BSK pointer parity"}
-  field.req_assigned     = { size_b=1,  offset_b=31 , default={Cst=0}, description="PEP BSK_IF requester assignment"}
--- a/backends/tfhe-hpu-backend/config_store/sim/tb_hpu_regif_dummy.toml
+++ b/backends/tfhe-hpu-backend/config_store/sim/tb_hpu_regif_dummy.toml
@@ -1,22 +0,0 @@
-module_name="tb_hpu_regif_dummy"
-description="Fake registers needed by the mockup"
-word_size_b = 32
-offset = 0x40000
-range = 0x10000
-ext_pkg = ["axi_if_common_param_pkg", "axi_if_shell_axil_pkg"]
-
-# ==============================================================================
-[section.WorkAck]
-description="Purpose of this section"
-
-[section.WorkAck.register.workq]
-  description="Insert work in workq and read status"
-  owner="Kernel"
-  read_access="Read"
-  write_access="WriteNotify"
-
-[section.WorkAck.register.ackq]
-  description="Pop ack from in ackq"
-  owner="Kernel"
-  read_access="ReadNotify"
-  write_access="None"
--- a/backends/tfhe-hpu-backend/config_store/u55c_gf64/Readme.md
+++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/Readme.md
@@ -1,6 +0,0 @@
-# Fpga version
-
-Built with the following command: (i.e. xrt/run_syn_hpu_msplit_3parts_64b.sh)
-```
-just zaxl-build hpu_msplit_3parts 3 "0:300" "-F TOP_MSPLIT TOP_MSPLIT_1 -F TOP_BATCH TOP_BATCH_TOPhpu_BPBS8_TPBS32 -F TOP_PCMAX  TOP_PCMAX_pem2_glwe1_bsk8_ksk8 -F TOP_PC TOP_PC_pem2_glwe1_bsk4_ksk4 -F APPLICATION APPLI_msg2_carry2 -F NTT_MOD NTT_MOD_goldilocks -F NTT_CORE_ARCH NTT_CORE_ARCH_gf64 -F NTT_CORE_R_PSI NTT_CORE_R2_PSI16 -F NTT_CORE_RDX_CUT NTT_CORE_RDX_CUT_n5c5c1 -F NTT_CORE_DIV NTT_CORE_DIV_1 -F BSK_SLOT_CUT BSK_SLOT8_CUT4 -F KSK_SLOT_CUT KSK_SLOT8_CUT4 -F KSLB KSLB_x2y32z3 -F HPU_PART HPU_PART_gf64 -F AXI_DATA_W AXI_DATA_W_512" "1:${PROJECT_DIR}/hw/output/micro_code/ucore_fw.elf" 'D:MEMORY_FILE_PATH=\\\"${PROJECT_DIR}/hw/\\\"' | tee build_out.log
-```
--- a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_0.asm
+++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_0.asm
@@ -1,15 +0,0 @@
-# CUST_0
-# Simple IOp to check the xfer between Hpu/Cpu
-# Construct constant in dest slot -> 249 (0xf9)
-SUB R0 R0 R0
-ADDS R0 R0 1
-ST TD[0].0 R0 
-SUB R1 R1 R1
-ADDS R1 R1 2
-ST TD[0].1 R1 
-SUB R2 R2 R2
-ADDS R2 R2 3
-ST TD[0].2 R2 
-SUB R3 R3 R3
-ADDS R3 R3 3
-ST TD[0].3 R3 
--- a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_1.asm
+++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_1.asm
@@ -1,11 +0,0 @@
-# CUST_1
-# Simple IOp to check the xfer between Hpu/Cpu
-# Dest <- Src_a
-LD R0   TS[0].0
-LD R1   TS[0].1
-LD R2   TS[0].2
-LD R3   TS[0].3
-ST TD[0].0 R0 
-ST TD[0].1 R1 
-ST TD[0].2 R2 
-ST TD[0].3 R3 
--- a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_10.asm
+++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_10.asm
@@ -1,25 +0,0 @@
-; CUST_8
-; Simple IOp to check the ALU operation 
-; Dst[0].0 <- Src[0].0 + Src[1].0
-LD R1 TS[0].0
-LD R2 TS[1].0
-ADD R0 R1 R2
-ST TD[0].0 R0 
-
-; Dst[0].1 <- Src[0].1 + Src[1].1
-LD R5 TS[0].1
-LD R6 TS[1].1
-ADD R4 R5 R6
-ST TD[0].2 R4 
-
-; Dst[0].2 <- Src[0].2 + Src[1].2
-LD R9 TS[0].2
-LD R10 TS[1].2
-ADD R8 R9 R10
-ST TD[0].2 R8 
-
-; Dst[0].3 <- Src[0].3 + Src[1].3
-LD R13 TS[0].3
-LD R14 TS[1].3
-ADD R12 R13 R14
-ST TD[0].3 R0 
--- a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_16.asm
+++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_16.asm
@@ -1,6 +0,0 @@
-# CUST_16
-# Simple IOp to check PBS behavior
-# Dest <- PBSNone(Src_a.0)
-LD   R0   TS[0].0
-PBS_F  R0   R0    PbsNone
-ST   TD[0].0 R0 
--- a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_17.asm
+++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_17.asm
@@ -1,15 +0,0 @@
-# CUST_17
-# Simple IOp to check PBS behavior
-# Dest <- PBSNone(Src_a)
-LD    R0   TS[0].0
-PBS   R0   R0    PbsNone
-ST    TD[0].0 R0 
-LD    R1   TS[0].1
-PBS   R1   R1    PbsNone
-ST    TD[0].1 R1 
-LD    R2   TS[0].2
-PBS   R2   R2    PbsNone
-ST    TD[0].2 R2 
-LD    R3   TS[0].3
-PBS_F R3   R3    PbsNone
-ST    TD[0].3 R3 
--- a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_18.asm
+++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_18.asm
@@ -1,23 +0,0 @@
-; CUST_18
-; Simple IOp to check extraction pattern
-; Correct result:
-;   * Dst[0,1] <- Src[0][0,1]
-;   * Dst[2,3] <- Src[1][0,1]
-
-; Pack Src[0][0,1] with a Mac and extract Carry/Msg in Dst[0][0,1]
-LD R0 TS[0].0
-LD R1 TS[0].1
-MAC R3 R1 R0 4
-PBS R4 R3 PbsMsgOnly
-PBS R5 R3 PbsCarryInMsg
-ST TD[0].0 R4 
-ST TD[0].1 R5 
-
-; Pack Src[1][0,1] with a Mac and extract Carry/Msg in Dst[0][2,3]
-LD R10 TS[1].0
-LD R11 TS[1].1
-MAC R13 R11 R10 4
-PBS R14 R13 PbsMsgOnly
-PBS R15 R13 PbsCarryInMsg
-ST TD[0].2 R14 
-ST TD[0].3 R15 
--- a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_19.asm
+++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_19.asm
@@ -1,19 +0,0 @@
-; CUST_19
-; Simple IOp to check PbsMl2
-; Correct result:
-;   * Dst[0][0]   <- Src[0][0]
-;   * Dst[0][1]   <- 0
-;   * Dst[0][2]   <- Src[0][0] +1
-;   * Dst[0][3]   <- 0
-; i.e Cust_19(0x2) => 0x32 
-
-; Construct a 0 for destination padding
-SUB R16 R16 R16
-
-; Apply PbsMl2 on Src[0] result goes in dest[0][0-3] (0-padded)
-LD R0 TS[0].0
-PBS_ML2_F R0 R0 PbsTestMany2
-ST TD[0].0 R0 
-ST TD[0].1 R16
-ST TD[0].2 R1 
-ST TD[0].3 R16
--- a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_2.asm
+++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_2.asm
@@ -1,11 +0,0 @@
-# CUST_2
-# Simple IOp to check the xfer between Hpu/Cpu
-# Dest <- Src_b
-LD R0   TS[1].0
-LD R1   TS[1].1
-LD R2   TS[1].2
-LD R3   TS[1].3
-ST TD[0].0 R0 
-ST TD[0].1 R1 
-ST TD[0].2 R2 
-ST TD[0].3 R3 
--- a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_20.asm
+++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_20.asm
@@ -1,22 +0,0 @@
-; CUST_20
-; Simple IOp to check PbsMl4
-; Correct result:
-;   * Dst[0][0]   <- Src[0][0]
-;   * Dst[0][1]   <- Src[0][0] +1
-;   * Dst[0][2]   <- Src[0][0] +2
-;   * Dst[0][3]   <- Src[0][0] +3
-; i.e Cust_20(0x0) => 0xe4 
-
-SUB R16 R16 R16
-ST TD[0].0 R0
-ST TD[0].1 R0
-ST TD[0].2 R0
-ST TD[0].3 R0
-
-; Apply PbsMl4 on Src[0] result goes in dest[0][0-3]
-LD R0 TS[0].0
-PBS_ML4_F R0 R0 PbsTestMany4
-ST TD[0].0 R0
-ST TD[0].1 R1
-ST TD[0].2 R2
-ST TD[0].3 R3
--- a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_21.asm
+++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_21.asm
@@ -1,24 +0,0 @@
-; CUST_21
-; Simple IOp to check PbsMl8
-; WARN: This operation required 16b ct width
-; Correct result:
-;   * Dst[0][0]   <- Src[0][0]
-;   * Dst[0][1]   <- Src[0][0] +1
-;   * Dst[0][2]   <- Src[0][0] +2
-;   * Dst[0][3]   <- Src[0][0] +3
-;   * Dst[0][4]   <- Src[0][0] +4
-;   * Dst[0][5]   <- Src[0][0] +5
-;   * Dst[0][6]   <- Src[0][0] +6
-;   * Dst[0][7]   <- Src[0][0] +7
-
-; Apply PbsMl8 on Src[0] result goes in dest[0][0-7]
-LD R0 TS[0].0
-PBS_ML8_F R0 R0 PbsTestMany8
-ST TD[0].0 R0
-ST TD[0].1 R1
-ST TD[0].2 R2
-ST TD[0].3 R3
-ST TD[0].4 R4
-ST TD[0].5 R5
-ST TD[0].6 R6
-ST TD[0].7 R7
--- a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_3.asm
+++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_3.asm
@@ -1,16 +0,0 @@
-# CUST_3
-# Simple IOp to check isc behavior
-# Generate obvious deps and check that isc correctly issued the dop
-# Correct result must bu Dest <- Src[0]
-LD R0   TS[0].0
-LD R1   TS[0].1
-LD R2   TS[0].2
-LD R3   TS[0].3
-PBS R4 R0 PbsNone
-ST TD[0].0 R4 
-PBS R4 R1 PbsNone
-ST TD[0].1 R4 
-PBS R4 R2 PbsNone
-ST TD[0].2 R4 
-PBS_F R4 R3 PbsNone
-ST TD[0].3 R4 
--- a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_8.asm
+++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_8.asm
@@ -1,19 +0,0 @@
-; CUST_8
-; Simple IOp to check the ALU operation 
-; Dst[0].0 <- Src[0].0 + Src[1].0
-LD R1 TS[0].0
-LD R2 TS[1].0
-ADD R0 R1 R2
-ST TD[0].0 R0 
-
-; Dst[0].1 <- Src[0].1 - Src[1].1
-LD R5 TS[0].1
-LD R6 TS[1].1
-SUB R4 R5 R6
-ST TD[0].1 R4 
-
-; Dst[0].2 <- Src[0].2 + (Src[1].2 *4)
-LD R9 TS[0].2
-LD R10 TS[1].2
-MAC R8 R9 R10 4
-ST TD[0].2 R8 
--- a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_9.asm
+++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_9.asm
@@ -1,21 +0,0 @@
-; CUST_9
-; Simple IOp to check the ALU Scalar operation 
-; Dst[0].0 <- Src[0].0 + Imm[0].0
-LD R1 TS[0].0
-ADDS R0 R1 TI[0].0
-ST TD[0].0 R0 
-
-; Dst[0].1 <- Src[0].1 - Imm[0].1
-LD R5 TS[0].1
-SUBS R4 R5 TI[0].1
-ST TD[0].1 R4 
-
-; Dst[0].2 <- Imm[0].2 - Src[0].2
-LD R9 TS[0].2
-SSUB R8 R9 TI[0].2
-ST TD[0].2 R8 
-
-; Dst[0].3 <- Src[0].3 * Imm[0].3
-LD R13 TS[0].3
-MULS R12 R13 TI[0].3
-ST TD[0].3 R12 
--- a/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_config.toml
+++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_config.toml
@@ -1,98 +0,0 @@
- 
-[fpga]
-  regmap=["${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core.toml"]
-  polling_us=10
-[fpga.ffi.Xrt]
-  id= 0
-  kernel= "hpu_msplit_3parts_1in3"
-  xclbin="${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_msplit_3parts.xclbin"
-
-[rtl]
-  bpip_use = true
-  bpip_use_opportunism = true
-  bpip_timeout = 100_000
-
-[board]
-  ct_mem = 4096
-  ct_pc = [
-    {Hbm= {pc=10}},
-    {Hbm= {pc=11}},
-  ]
-  heap_size = 3584
-
-  lut_mem = 256
-  lut_pc = {Hbm={pc=12}}
-
-  fw_size= 65536
-  fw_pc = {Hbm={pc=1}}
-
-  bsk_pc = [
-    {Hbm={pc=2}},
-    {Hbm={pc=3}},
-    {Hbm={pc=4}},
-    {Hbm={pc=5}},
-    {Hbm={pc=6}},
-    {Hbm={pc=7}},
-    {Hbm={pc=8}},
-    {Hbm={pc=9}}
-  ]
-
-  ksk_pc = [
-    {Hbm={pc=24}},
-    {Hbm={pc=25}},
-    {Hbm={pc=26}},
-    {Hbm={pc=27}},
-    {Hbm={pc=28}},
-    {Hbm={pc=29}},
-    {Hbm={pc=30}},
-    {Hbm={pc=31}}
-  ]
-
-  trace_pc = {Hbm={pc=0}}
-  trace_depth = 4 # In MB
-
-[firmware]
-  implementation = "Llt"
-  integer_w=[4,6,8,10,12,14,16,32,64,128]
-  min_batch_size = 6
-  kogge_cfg            = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml"
-  custom_iop.'IOP[0]'  = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_0.asm"
-  custom_iop.'IOP[1]'  = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_1.asm"
-  custom_iop.'IOP[2]'  = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_2.asm"
-  custom_iop.'IOP[3]'  = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_3.asm"
-  custom_iop.'IOP[8]'  = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_8.asm"
-  custom_iop.'IOP[9]'  = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_9.asm"
-  custom_iop.'IOP[16]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_16.asm"
-  custom_iop.'IOP[17]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_17.asm"
-  custom_iop.'IOP[18]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_18.asm"
-  custom_iop.'IOP[19]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_19.asm"
-  custom_iop.'IOP[20]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_20.asm"
-  custom_iop.'IOP[21]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_21.asm"
-
-[firmware.op_cfg.default]
-  fill_batch_fifo = true
-  min_batch_size = false
-  use_tiers = false
-  flush_behaviour = "Patient"
-  flush = true
-
-[firmware.op_cfg.by_op.MUL]
-  fill_batch_fifo = false
-  min_batch_size = false
-  use_tiers = false
-  flush_behaviour = "Patient"
-  flush = true
-
-[firmware.op_cfg.by_op.MULS]
-  fill_batch_fifo = false
-  min_batch_size = false
-  use_tiers = false
-  flush_behaviour = "Patient"
-  flush = true
-
-[firmware.op_cfg.by_op.ERC_20]
-  fill_batch_fifo = false
-  min_batch_size = true
-  use_tiers = true
-  flush_behaviour = "Patient"
-  flush = true
--- a/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_msplit_3parts.xclbin
+++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_msplit_3parts.xclbin
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:35ad67cf9760e37256a6c92cf29ea67334690b724fd3b7b859919ee9b0bde6d3
-size 78194785
--- a/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_msplit_3parts.xclbin.info
+++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_msplit_3parts.xclbin.info
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Agnes Leroy	84af5e983a	revert optim flags	2025-05-13 10:05:23 +02:00
Agnes Leroy	069d47309d	fix	2025-05-12 17:35:31 +02:00
Beka Barbakadze	7389494c45	draft	2025-05-12 15:38:19 +02:00