Compare commits

...

62 Commits

Author SHA1 Message Date
Yuxi Zhao
bddc35459d GITBOOK-5: Update TOC 2024-03-05 14:32:28 +00:00
Yuxi Zhao
27c421b359 GITBOOK-4: V2 design details 2024-02-28 15:27:05 +00:00
Yuxi Zhao
2adeff44f3 GITBOOK-3: correct a typo 2024-02-28 14:54:38 +00:00
Yuxi Zhao
d0042aed54 GITBOOK-2: No subject 2024-02-28 14:23:50 +00:00
Yuxi Zhao
5eabdeab55 GITBOOK-1: Remove extra sentences 2024-02-28 14:11:06 +00:00
yuxizama
0152c212af Update SUMMARY.md 2024-02-28 15:07:14 +01:00
yuxizama
9a2c4a3784 Rename what-is-tfhe-rs to what-is-tfhe-rs.md 2024-02-28 15:06:45 +01:00
yuxizama
c14aad5656 V2 resorting 6 2024-02-28 14:57:24 +01:00
yuxizama
702e0ef306 V2 resorting 5 2024-02-28 14:54:21 +01:00
yuxizama
515d2e009f V2 resorting 4 2024-02-28 14:30:49 +01:00
yuxizama
711b5151dc V2 resorting 3 2024-02-28 14:28:58 +01:00
yuxizama
ceaee2f910 V2 resorting 2 2024-02-28 14:28:18 +01:00
yuxizama
41015db7a1 V2 resorting 1 2024-02-28 14:27:56 +01:00
Yuxi Zhao
485b2a7693 GITBOOK-15: V2 change images and adjust wording 2024-02-28 13:13:02 +00:00
Yuxi Zhao
7d903d5f7a GITBOOK-13: update v2 2024-02-27 16:01:36 +00:00
Yuxi Zhao
19ac6eb123 GITBOOK-1: New structure 2024-02-27 16:27:16 +01:00
tmontaigu
5b653864b7 chore(tfhe): bump version to 0.5.2 2024-02-23 10:21:47 +01:00
Arthur Meyre
a1d189b415 chore(ci): update macOS runner for cargo builds 2024-02-23 10:21:47 +01:00
sarah el kazdadi
c59434f183 chore(ci): update toolchain, fix clippy warnings 2024-02-23 10:21:47 +01:00
David Testé
83239e6afa chore(bench): implement integer casting benchmarks 2024-02-23 10:21:47 +01:00
sarah el kazdadi
ef8cb0273f fix(tfhe): update pulp and bytemuck to fix nightly breakage 2024-02-23 10:21:47 +01:00
tmontaigu
9b353bac2d fix(integer): correct degree in small comparisons 2024-02-23 10:21:47 +01:00
tmontaigu
46d65f1f87 fix(capi): add missing functions on FheBool
- safe ser/de
- classical ser/de
- comparisons
- scalar binary fn/comparisons
- compact & compressed fhe bool encryption
2024-02-23 10:21:47 +01:00
tmontaigu
a63a2cb725 chore(hlapi): add tests for fhe_bool 2024-02-23 10:21:47 +01:00
tmontaigu
c45af05ec6 fix(integer): make encrypt_bool specify the degree
encrypt_one_block does not leak information
about the message.
BooleanBlocks are meant for when we want to
be explicit that the value is a boolean
and we are fine with that fact being public.

Thus encrypt_bool needs to correctly set the degree to 1
so that other operations can properly take advantage of it.
2024-02-23 10:21:47 +01:00
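A minimal cleartext sketch of the degree bookkeeping described in the commit above, using hypothetical types (`Block`, `encrypt_bool_block`) rather than the actual tfhe-rs API: advertising a degree of 1 on a boolean block lets later operations rely on that bound.

```rust
// Hypothetical sketch, not the tfhe-rs internals: only the degree-tracking idea.
// `degree` is an upper bound on the cleartext a block may hold; a boolean block
// should advertise a degree of 1 so downstream operations can exploit it.
#[derive(Clone, Debug)]
struct Block {
    degree: u64, // known upper bound on the encrypted message
}

fn encrypt_bool_block(value: bool) -> Block {
    let _ = value; // the metadata does not depend on the encrypted value itself
    Block { degree: 1 } // explicitly a boolean: the message is 0 or 1
}

// For an AND between two blocks, the result is bounded by the smaller degree,
// so with two boolean blocks no extra clean-up step is required.
fn bitand_result_degree(lhs: &Block, rhs: &Block) -> u64 {
    lhs.degree.min(rhs.degree)
}

fn main() {
    let a = encrypt_bool_block(true);
    let b = encrypt_bool_block(false);
    assert_eq!(bitand_result_degree(&a, &b), 1);
}
```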
tmontaigu
584eaeb4ed fix(shortint): fix bitwise ops degree
We used `after_bitand/or/xor` on the ct_left
**after** the lut had already changed its degree,
so the `after_bit` function computed the
resulting degree using a wrong degree for the left
ct.
2024-02-23 10:21:47 +01:00
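A simplified illustration of the ordering bug fixed above, with made-up types (`Ct`, `after_bitand`) standing in for the shortint internals: the result degree has to be derived from the operands' degrees captured before the LUT application overwrites the left ciphertext's degree.

```rust
// Hypothetical model of the fix: capture the left operand's degree before the
// LUT evaluation mutates it, then compute the result degree from the captured
// value. The previous code read ct_left.degree after the LUT had changed it.
#[derive(Clone, Copy)]
struct Ct {
    degree: u64,
}

fn after_bitand(lhs_degree: u64, rhs_degree: u64) -> u64 {
    lhs_degree.min(rhs_degree) // an AND result is bounded by the smaller operand
}

fn bitand(ct_left: &mut Ct, ct_right: &Ct) {
    let lhs_degree_before_lut = ct_left.degree;
    // Stand-in for the bivariate LUT evaluation, which rewrites the stored degree.
    ct_left.degree = 1;
    // Fixed order: use the degree captured *before* the LUT, not the overwritten one.
    ct_left.degree = after_bitand(lhs_degree_before_lut, ct_right.degree);
}

fn main() {
    let mut left = Ct { degree: 3 };
    let right = Ct { degree: 2 };
    bitand(&mut left, &right);
    assert_eq!(left.degree, 2);
}
```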
tmontaigu
8d94ed2512 fix(hlapi): bind missing cuda bitnot 2024-02-23 10:21:47 +01:00
tmontaigu
b8d9dbe85b refactor(hlapi): split long files of hlapi
This splits the long base.rs files into multiple ones,
to make it easier to navigate.

There are no code changes apart from moving code around.
2024-02-23 10:21:47 +01:00
tmontaigu
ad25340c33 feat(capi): add Cuda support
- This adds GPU support in the C API
- Also make ctest (cmake test launcher) print
  test output when it fails
2024-02-23 10:21:47 +01:00
Arthur Meyre
ad1ae0c8c2 chore(ci): update scripts and Makefile for future forward compatibility 2024-01-31 18:22:15 +01:00
Arthur Meyre
ee40906b8b chore(ci): convert some make targets to be semver trick compatible 2024-01-31 18:22:15 +01:00
Arthur Meyre
bf6b4cc541 chore(tfhe): bump version to 0.5.1 2024-01-30 10:51:39 +01:00
Arthur Meyre
24404567a4 chore(tfhe): bump tfhe-cuda-backend version to 0.1.3 2024-01-30 10:51:39 +01:00
tmontaigu
052dd4a60e feat(integer): fuse two PBS in comparisons
In comparisons, we were reducing a vec of orderings
(inferior, equal, superior) into one final ordering,
and then we would do one final PBS to transform that
into a boolean value (0 or 1) depending on what was wanted
(<=, <, >, >=).

This fuses the last PBS (ordering -> boolean value) with
the last round of reduction, when there are only two blocks left
to be reduced.

This saves one PBS, meaning that for ciphertext/ciphertext
comparisons we get back the performance lost to
the fix in f4c220c1, and comparisons between a clear value and
a ciphertext get an improvement.
2024-01-30 10:51:39 +01:00
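A cleartext sketch of the reduction scheme described above, with hypothetical names (`Ord3`, `combine`): per-block orderings are reduced pairwise, and the final ordering-to-boolean mapping is folded into the last reduction step instead of running as a separate pass, which in the encrypted setting saves one PBS.

```rust
// Hypothetical cleartext model of the fused reduction (not the encrypted code).
#[derive(Clone, Copy, PartialEq)]
enum Ord3 {
    Inferior,
    Equal,
    Superior,
}

// Combine two per-block orderings, most-significant block first: the higher
// block decides unless it compares equal.
fn combine(high: Ord3, low: Ord3) -> Ord3 {
    if high == Ord3::Equal { low } else { high }
}

// Reduce the vector of orderings and answer "<=" in the same final step.
fn reduce_is_le(mut orderings: Vec<Ord3>) -> bool {
    while orderings.len() > 2 {
        orderings = orderings
            .chunks(2)
            .map(|c| if c.len() == 2 { combine(c[0], c[1]) } else { c[0] })
            .collect();
    }
    // Fused last step: combine the remaining pair and map to a boolean at once.
    let last = if orderings.len() == 2 {
        combine(orderings[0], orderings[1])
    } else {
        orderings[0]
    };
    matches!(last, Ord3::Inferior | Ord3::Equal)
}

fn main() {
    // Top blocks compare equal, the least-significant block decides.
    assert!(reduce_is_le(vec![Ord3::Equal, Ord3::Equal, Ord3::Inferior]));
    assert!(!reduce_is_le(vec![Ord3::Superior, Ord3::Equal]));
}
```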
tmontaigu
f8d829d076 fix(integer): add noise cleaning pbs in comparisons
In comparisons we were packing blocks to then do a subtraction
between them. However, this goes above the noise limit
that would guarantee the advertised error probability.

To fix that we add a PBS to clean the noise. This PBS only needs
to be added in the ciphertext/ciphertext comparisons, making them slower
by 1 PBS.
2024-01-30 10:51:39 +01:00
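A toy noise-budget model of the fix, with made-up numbers and names (`NoisyCt`, `MAX_NOISE_LEVEL`): packing and then subtracting pushes the accumulated noise past the bound the parameters guarantee, so a cleaning PBS resets it, while paths that stay below the bound need no extra PBS.

```rust
// Hypothetical bookkeeping only; the real check depends on the parameter set.
#[derive(Clone, Copy)]
struct NoisyCt {
    noise_level: u32,
}

const MAX_NOISE_LEVEL: u32 = 5; // stand-in for the parameter-set guarantee

fn pack_then_sub(a: NoisyCt, b: NoisyCt) -> NoisyCt {
    // Packing and subtracting both add noise.
    NoisyCt { noise_level: a.noise_level + b.noise_level + 1 }
}

fn cleaning_pbs(_ct: NoisyCt) -> NoisyCt {
    // A PBS outputs a ciphertext back at nominal noise.
    NoisyCt { noise_level: 1 }
}

fn compare_blocks(a: NoisyCt, b: NoisyCt) -> NoisyCt {
    let packed = pack_then_sub(a, b);
    if packed.noise_level > MAX_NOISE_LEVEL {
        // Ciphertext/ciphertext path: pay one extra PBS to stay within bounds.
        cleaning_pbs(packed)
    } else {
        packed
    }
}

fn main() {
    let out = compare_blocks(NoisyCt { noise_level: 3 }, NoisyCt { noise_level: 4 });
    assert!(out.noise_level <= MAX_NOISE_LEVEL);
}
```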
dependabot[bot]
d9761ca17e chore(deps): bump codecov/codecov-action from 3.1.4 to 3.1.5
Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 3.1.4 to 3.1.5.
- [Release notes](https://github.com/codecov/codecov-action/releases)
- [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md)
- [Commits](eaaf4bedf3...4fe8c5f003)

---
updated-dependencies:
- dependency-name: codecov/codecov-action
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-01-30 10:51:39 +01:00
dependabot[bot]
8d2e15347b chore(deps): bump tj-actions/changed-files from 42.0.0 to 42.0.2
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 42.0.0 to 42.0.2.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](ae82ed4ae0...90a06d6ba9)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-01-30 10:51:39 +01:00
dependabot[bot]
a368257bc7 chore(deps): bump actions/upload-artifact from 4.1.0 to 4.3.0
Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.1.0 to 4.3.0.
- [Release notes](https://github.com/actions/upload-artifact/releases)
- [Commits](https://github.com/actions/upload-artifact/compare/v4.1.0...26f96dfa697d77e81fd5907df203aa23a56210a8)

---
updated-dependencies:
- dependency-name: actions/upload-artifact
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-01-30 10:51:39 +01:00
David Testé
76d23d0c91 chore(bench): add ciphertexts sum to integer benchmarks 2024-01-30 10:51:39 +01:00
David Testé
ddc5002232 chore(bench): add pbs benchmarks on gpu 2024-01-30 10:51:39 +01:00
tmontaigu
c08c479616 docs(hlapi): document trivial encryption to debug 2024-01-30 10:51:39 +01:00
tmontaigu
f26afc16de docs(hlapi): document how to use rayon 2024-01-30 10:51:39 +01:00
yuxizama
13f533f6fb chore(docs): update readme links and badges 2024-01-30 10:51:39 +01:00
yuxizama
d9541e472b chore(docs): update README.md
Change support banner
2024-01-30 10:51:39 +01:00
Agnes Leroy
3453e45258 fix(gpu): make all async functions unsafe, fix cuda_drop binding, add missing sync 2024-01-30 10:51:39 +01:00
David Testé
55de96f046 chore(ci): add gpu tests from user documentation 2024-01-30 10:51:39 +01:00
Agnes Leroy
9747c06f6e chore(gpu): fix formatting command 2024-01-30 10:51:39 +01:00
Agnes Leroy
00f72d2c13 chore(gpu): fix compilation when no nvidia gpu is available 2024-01-30 10:51:39 +01:00
tmontaigu
01f5cb9056 fix(integer): is_scalar_out_of_bounds handles bigger ct
Fix a bug in is_scalar_out_of_bounds: if the scalar was
negative and the ciphertext was a signed one with more blocks than
the decomposed scalar, we would do an out-of-bounds access
(i.e. a panic).

This also fixes signed_overflowing_mul on 256 bits,
where the bug first appeared.
2024-01-30 10:51:39 +01:00
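A small sketch of the bounds-safe access pattern behind the fix, using hypothetical block values: when the decomposed scalar has fewer blocks than the ciphertext, missing scalar blocks are treated as the sign-extension block instead of being read out of bounds.

```rust
// Hypothetical illustration only; block values and the sign-extension encoding
// are made up, the point is the bounds-safe access pattern.
fn scalar_block_or_sign_extension(scalar_blocks: &[u64], index: usize, sign_block: u64) -> u64 {
    // Previously this was an unchecked index, which panicked when `index`
    // went past the scalar's decomposition.
    scalar_blocks.get(index).copied().unwrap_or(sign_block)
}

fn main() {
    // A 256-bit signed ciphertext can have far more blocks than a small
    // negative scalar once decomposed.
    let scalar_blocks = [1u64, 3, 2]; // hypothetical decomposition of the scalar
    let sign_block = 3; // "all ones" block standing in for sign extension
    assert_eq!(scalar_block_or_sign_extension(&scalar_blocks, 1, sign_block), 3);
    assert_eq!(scalar_block_or_sign_extension(&scalar_blocks, 7, sign_block), 3);
}
```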
David Testé
d66e313fa4 chore(ci): fix inputs for gpu full benchmark workflow 2024-01-30 10:51:39 +01:00
Arthur Meyre
c9d530e642 fix(core): ignore value in the body when doing LWE encryption 2024-01-30 10:51:39 +01:00
Agnes Leroy
6c2096fe52 chore(gpu): rename "test vector" -> "luts" and "tvi" -> "lut_indexes" 2024-01-30 10:51:39 +01:00
Agnes Leroy
1e94134dda chore(gpu): move around code in integer.h for better readability 2024-01-30 10:51:39 +01:00
tmontaigu
c76a60111c fix(integer): fix cast in scalar_shift/rotate
In scalar_shift/rotate, we get the number of bits to shift/rotate
as a generic type that can be cast to u64.

We compute the total number of bits the ciphertext has, cast that number
to the same type as the scalar, and do "shift % num_bits".

However, if the number of bits computed exceeds the max value the scalar
type can hold, we could end up doing a remainder with 0.

e.g. a 256-bit ciphertext and a scalar of type u8 => 256u64 cast to u8 results
in 0.

Fix that by casting the scalar value to u64 instead.
2024-01-30 10:51:39 +01:00
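A self-contained demonstration of the cast pitfall described above, with hypothetical helper names: narrowing the bit count to the scalar's type turns 256 into 0 and the modulo panics, while widening the shift amount to u64 behaves as intended.

```rust
// Illustration only; these helpers are not the tfhe-rs functions.
#[allow(dead_code)]
fn buggy_reduce_shift(shift: u8, num_bits_in_ct: u64) -> u8 {
    let num_bits = num_bits_in_ct as u8; // 256u64 as u8 == 0
    shift % num_bits // panics: remainder with a divisor of zero
}

fn fixed_reduce_shift(shift: u8, num_bits_in_ct: u64) -> u64 {
    // Fix: widen the shift amount to u64 instead of narrowing the bit count.
    u64::from(shift) % num_bits_in_ct
}

fn main() {
    assert_eq!(fixed_reduce_shift(9, 256), 9);
    // buggy_reduce_shift(9, 256) would panic with
    // "attempt to calculate the remainder with a divisor of zero".
}
```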
tmontaigu
18ff400df2 chore(hlapi): remove leftover file
This file was not correctly removed during the refactor
2024-01-30 10:51:39 +01:00
David Testé
3d31d09be5 chore(ci): change rust-toolchain action
The GitHub third-party action actions-rs/toolchain is not maintained
anymore. We switch to dtolnay/rust-toolchain.
2024-01-30 10:51:39 +01:00
David Testé
76322606f2 chore(ci): set RUST_BACKTRACE var to full to ease debugging on failure 2024-01-30 10:51:39 +01:00
dependabot[bot]
bf58a9f0c6 chore(deps): bump actions/upload-artifact from 3.1.2 to 4.2.0
Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 3.1.2 to 4.2.0.
- [Release notes](https://github.com/actions/upload-artifact/releases)
- [Commits](https://github.com/actions/upload-artifact/compare/v3.1.2...694cdabd8bdb0f10b2cea11669e1bf5453eed0a6)

---
updated-dependencies:
- dependency-name: actions/upload-artifact
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-01-30 10:51:39 +01:00
dependabot[bot]
64461c82b4 chore(deps): bump tj-actions/changed-files from 41.1.1 to 42.0.0
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 41.1.1 to 42.0.0.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](62f4729b5d...ae82ed4ae0)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-01-30 10:51:39 +01:00
dependabot[bot]
339c84fbd9 chore(deps): bump actions/checkout from 3.5.3 to 4.1.1
Bumps [actions/checkout](https://github.com/actions/checkout) from 3.5.3 to 4.1.1.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](https://github.com/actions/checkout/compare/v3.5.3...b4ffde65f46336ab88eb53be808477a3936bae11)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-01-30 10:51:39 +01:00
Arthur Meyre
bc682a5ffb docs(bench): add scalar benchmarks for integer 2024-01-29 16:42:32 +01:00
Arthur Meyre
2920daf2d9 chore(docs): fix link to 0.4 semver doc 2024-01-23 10:50:25 +01:00
195 changed files with 12349 additions and 10168 deletions

View File

@@ -5,6 +5,7 @@ env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -61,10 +62,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
default: true
- name: Run concrete-csprng tests
run: |

View File

@@ -5,6 +5,7 @@ env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -63,7 +64,7 @@ jobs:
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
- name: Checkout tfhe-rs
uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: ${{ inputs.fork_repo }}
ref: ${{ inputs.fork_git_sha }}
@@ -73,10 +74,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
default: true
- name: Export CUDA variables
if: ${{ !cancelled() }}
@@ -102,3 +102,11 @@ jobs:
- name: Run all tests
run: |
make test_gpu
- name: Run user docs tests
run: |
make test_user_doc_gpu
- name: Test C API
run: |
make test_c_api_gpu

View File

@@ -4,6 +4,7 @@ env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -60,10 +61,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
default: true
- name: Gen Keys if required
run: |

View File

@@ -4,6 +4,7 @@ env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -60,10 +61,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
default: true
- name: Gen Keys if required
run: |

View File

@@ -4,6 +4,7 @@ env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -60,10 +61,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
default: true
- name: Run concrete-csprng tests
run: |

View File

@@ -4,6 +4,7 @@ env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -60,10 +61,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
default: true
- name: Run js on wasm API tests
run: |

View File

@@ -32,6 +32,7 @@ env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
jobs:
run-boolean-benchmarks:
@@ -61,10 +62,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Run benchmarks with AVX512
run: |
@@ -96,7 +96,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_boolean
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -6,6 +6,7 @@ on:
env:
CARGO_TERM_COLOR: always
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
@@ -17,7 +18,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
os: [ubuntu-latest, macos-latest-large, windows-latest]
fail-fast: false
steps:

View File

@@ -4,6 +4,7 @@ env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -61,14 +62,13 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
default: true
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@62f4729b5df35e6e0e01265fa70a82ccaf196b4b
uses: tj-actions/changed-files@90a06d6ba9543371ab4df8eeca0be07ca6054959
with:
files_yaml: |
tfhe:
@@ -98,7 +98,7 @@ jobs:
make test_shortint_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@eaaf4bedf32dbdc6b720b63067d99c4d77d6047d
uses: codecov/codecov-action@4fe8c5f003fae66aa5ebb77cfd3e7bfbbda0b6b0
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}

View File

@@ -4,6 +4,7 @@ env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -52,10 +53,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
default: true
- name: Dieharder randomness test suite
run: |

View File

@@ -25,6 +25,7 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
jobs:
run-integer-benchmarks:
@@ -54,10 +55,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Run benchmarks with AVX512
run: |
@@ -69,7 +69,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -90,7 +90,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -28,6 +28,7 @@ env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
jobs:
prepare-matrix:
@@ -88,10 +89,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Checkout Slab repo
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
@@ -118,7 +118,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -25,6 +25,7 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
jobs:
run-integer-benchmarks:
@@ -54,7 +55,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
@@ -64,10 +65,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Export CUDA variables
if: ${{ !cancelled() }}
@@ -96,7 +96,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -118,13 +118,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab

View File

@@ -19,11 +19,20 @@ on:
request_id:
description: "Slab request ID"
type: string
# This input is not used in this workflow but still mandatory since a calling workflow could
# use it. If a triggering command include a user_inputs field, then the triggered workflow
# must include this very input, otherwise the workflow won't be called.
# See start_full_benchmarks.yml as example.
user_inputs:
description: "Type of benchmarks to run"
type: string
default: "weekly_benchmarks"
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
jobs:
integer-benchmarks:
@@ -53,7 +62,7 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
@@ -69,10 +78,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Export CUDA variables
if: ${{ !cancelled() }}
@@ -92,7 +100,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Checkout Slab repo
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
@@ -117,7 +125,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -25,6 +25,7 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
jobs:
run-integer-benchmarks:
@@ -54,10 +55,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Run multi-bit benchmarks with AVX512
run: |
@@ -69,7 +69,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -90,7 +90,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -25,6 +25,7 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
jobs:
run-integer-benchmarks:
@@ -55,7 +56,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
@@ -65,10 +66,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Export CUDA variables
if: ${{ !cancelled() }}
@@ -97,7 +97,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -119,13 +119,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab

View File

@@ -14,6 +14,7 @@ on:
env:
CARGO_TERM_COLOR: always
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
FAST_TESTS: "TRUE"
@@ -30,10 +31,9 @@ jobs:
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
- name: Install latest stable
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
default: true
- name: Run pcc checks
run: |

View File

@@ -32,6 +32,7 @@ env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
jobs:
run-pbs-benchmarks:
@@ -61,10 +62,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Run benchmarks with AVX512
run: |
@@ -86,7 +86,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_pbs
path: ${{ env.RESULTS_FILENAME }}

142
.github/workflows/pbs_gpu_benchmark.yml vendored Normal file
View File

@@ -0,0 +1,142 @@
# Run PBS benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
name: PBS GPU benchmarks
on:
workflow_dispatch:
inputs:
instance_id:
description: "Instance ID"
type: string
instance_image_id:
description: "Instance AMI ID"
type: string
instance_type:
description: "Instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
# This input is not used in this workflow but still mandatory since a calling workflow could
# use it. If a triggering command include a user_inputs field, then the triggered workflow
# must include this very input, otherwise the workflow won't be called.
# See start_full_benchmarks.yml as example.
user_inputs:
description: "Type of benchmarks to run"
type: string
default: "weekly_benchmarks"
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
jobs:
run-pbs-benchmarks:
name: Execute PBS benchmarks in EC2
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
steps:
- name: Instance configuration used
run: |
echo "IDs: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Get benchmark date
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Run benchmarks with AVX512
run: |
make AVX512_SUPPORT=ON bench_pbs_gpu
- name: Parse results
run: |
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
COMMIT_HASH="$(git describe --tags --dirty)"
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware ${{ inputs.instance_type }} \
--backend gpu \
--project-version "${COMMIT_HASH}" \
--branch ${{ github.ref_name }} \
--commit-date "${COMMIT_DATE}" \
--bench-date "${{ env.BENCH_DATE }}" \
--name-suffix avx512 \
--walk-subdirs \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_pbs
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on downloaded artifact"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "PBS GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -24,6 +24,7 @@ env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
jobs:
run-shortint-benchmarks:
@@ -53,10 +54,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Run benchmarks with AVX512
run: |
@@ -88,7 +88,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_shortint
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -32,6 +32,7 @@ env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
jobs:
shortint-benchmarks:
@@ -67,10 +68,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Checkout Slab repo
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
@@ -112,7 +112,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -25,6 +25,7 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
jobs:
run-integer-benchmarks:
@@ -54,10 +55,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Run benchmarks with AVX512
run: |
@@ -69,7 +69,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -90,7 +90,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -28,6 +28,7 @@ env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
jobs:
integer-benchmarks:
@@ -66,10 +67,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Checkout Slab repo
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
@@ -96,7 +96,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -25,6 +25,7 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
jobs:
run-integer-benchmarks:
@@ -54,10 +55,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Run multi-bit benchmarks with AVX512
run: |
@@ -69,7 +69,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -90,7 +90,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -36,6 +36,10 @@ on:
description: "Run PBS benches"
type: boolean
default: true
pbs_gpu_bench:
description: "Run PBS benches on GPU"
type: boolean
default: true
wasm_client_bench:
description: "Run WASM client benches"
type: boolean
@@ -50,7 +54,7 @@ jobs:
integer_bench, integer_multi_bit_bench,
signed_integer_bench, signed_integer_multi_bit_bench,
integer_gpu_bench, integer_multi_bit_gpu_bench,
pbs_bench, wasm_client_bench ]
pbs_bench, pbs_gpu_bench, wasm_client_bench ]
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
@@ -60,7 +64,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@62f4729b5df35e6e0e01265fa70a82ccaf196b4b
uses: tj-actions/changed-files@90a06d6ba9543371ab4df8eeca0be07ca6054959
with:
files_yaml: |
common_benches:

View File

@@ -24,8 +24,9 @@ jobs:
if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
strategy:
matrix:
command: [ boolean_bench, shortint_full_bench, integer_full_bench, signed_integer_full_bench, integer_gpu_full_bench,
pbs_bench, wasm_client_bench ]
command: [ boolean_bench, shortint_full_bench,
integer_full_bench, signed_integer_full_bench, integer_gpu_full_bench,
pbs_bench, pbs_gpu_bench, wasm_client_bench ]
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs

View File

@@ -17,7 +17,7 @@ jobs:
with:
fetch-depth: 0
- name: Save repo
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: repo-archive
path: '.'

View File

@@ -32,6 +32,7 @@ env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
jobs:
run-wasm-client-benchmarks:
@@ -61,10 +62,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Run benchmarks
run: |
@@ -97,7 +97,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
with:
name: ${{ github.sha }}_wasm
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -17,6 +17,7 @@ FAST_TESTS?=FALSE
FAST_BENCH?=FALSE
BENCH_OP_FLAVOR?=DEFAULT
NODE_VERSION=20
FORWARD_COMPAT?=OFF
# sed: -n, do not print input stream, -e means a script/expression
# 1,/version/ indicates from the first line, to the line matching version at the start of the line
# p indicates to print, so we keep only the start of the Cargo.toml until we hit the first version
@@ -49,12 +50,18 @@ else
COVERAGE_ONLY=
endif
ifeq ($(FORWARD_COMPAT),ON)
FORWARD_COMPAT_FEATURE=forward_compatibility
else
FORWARD_COMPAT_FEATURE=
endif
# Variables used only for regex_engine example
REGEX_STRING?=''
REGEX_PATTERN?=''
# tfhe-cuda-backend
TFHECUDA_SRC="backends/tfhe-cuda-backend/implementation"
TFHECUDA_SRC="backends/tfhe-cuda-backend/cuda"
TFHECUDA_BUILD=$(TFHECUDA_SRC)/build
# Exclude these files from coverage reports
@@ -154,7 +161,7 @@ check_fmt: install_rs_check_toolchain
clippy_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=$(TARGET_ARCH_FEATURE),integer,shortint,gpu \
-p tfhe -- --no-deps -D warnings
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
fix_newline: check_linelint_installed
@@ -172,6 +179,12 @@ clippy_core: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=$(TARGET_ARCH_FEATURE),experimental \
-p $(TFHE_SPEC) -- --no-deps -D warnings
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=$(TARGET_ARCH_FEATURE),nightly-avx512 \
-p $(TFHE_SPEC) -- --no-deps -D warnings
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=$(TARGET_ARCH_FEATURE),experimental,nightly-avx512 \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: clippy_boolean # Run clippy lints enabling the boolean features
clippy_boolean: install_rs_check_toolchain
@@ -286,15 +299,23 @@ symlink_c_libs_without_fingerprint:
.PHONY: build_c_api # Build the C API for boolean, shortint and integer
build_c_api: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,$(FORWARD_COMPAT_FEATURE) \
-p $(TFHE_SPEC)
@"$(MAKE)" symlink_c_libs_without_fingerprint
.PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
build_c_api_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,gpu \
-p $(TFHE_SPEC)
@"$(MAKE)" symlink_c_libs_without_fingerprint
.PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,experimental-force_fft_algo_dif4 \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
-p $(TFHE_SPEC)
@"$(MAKE)" symlink_c_libs_without_fingerprint
.PHONY: build_web_js_api # Build the js API targeting the web browser
build_web_js_api: install_rs_build_toolchain install_wasm_pack
@@ -354,16 +375,16 @@ test_gpu: test_core_crypto_gpu test_integer_gpu
.PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
test_core_crypto_gpu: install_rs_build_toolchain install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p tfhe -- core_crypto::gpu::
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p tfhe -- core_crypto::gpu::
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
.PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
test_integer_gpu: install_rs_build_toolchain install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p tfhe -- integer::gpu::server_key::
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p tfhe -- integer::gpu::server_key::
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
.PHONY: test_boolean # Run the tests of the boolean module
test_boolean: install_rs_build_toolchain
@@ -392,19 +413,23 @@ test_c_api_c: build_c_api
.PHONY: test_c_api # Run all the tests for the C API
test_c_api: test_c_api_rs test_c_api_c
.PHONY: test_c_api_gpu # Run the C tests for the C API
test_c_api_gpu: build_c_api_gpu
./scripts/c_api_tests.sh --gpu
.PHONY: test_shortint_ci # Run the tests for shortint ci
test_shortint_ci: install_rs_build_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
FAST_TESTS="$(FAST_TESTS)" \
./scripts/shortint-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)"
--cargo-profile "$(CARGO_PROFILE)" --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_shortint_multi_bit_ci # Run the tests for shortint ci running only multibit tests
test_shortint_multi_bit_ci: install_rs_build_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
FAST_TESTS="$(FAST_TESTS)" \
./scripts/shortint-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --multi-bit
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_shortint # Run all the tests for shortint
test_shortint: install_rs_build_toolchain
@@ -424,7 +449,8 @@ test_integer_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
FAST_TESTS="$(FAST_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)"
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
--tfhe-package "$(TFHE_SPEC)"
.PHONY: test_unsigned_integer_ci # Run the tests for unsigned integer ci
test_unsigned_integer_ci: install_rs_check_toolchain install_cargo_nextest
@@ -432,7 +458,7 @@ test_unsigned_integer_ci: install_rs_check_toolchain install_cargo_nextest
FAST_TESTS="$(FAST_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
--unsigned-only
--unsigned-only --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_signed_integer_ci # Run the tests for signed integer ci
test_signed_integer_ci: install_rs_check_toolchain install_cargo_nextest
@@ -440,14 +466,15 @@ test_signed_integer_ci: install_rs_check_toolchain install_cargo_nextest
FAST_TESTS="$(FAST_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
--signed-only
--signed-only --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_integer_multi_bit_ci # Run the tests for integer ci running only multibit tests
test_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
FAST_TESTS="$(FAST_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)"
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
--tfhe-package "$(TFHE_SPEC)"
.PHONY: test_unsigned_integer_multi_bit_ci # Run the tests for nsigned integer ci running only multibit tests
test_unsigned_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
@@ -455,7 +482,7 @@ test_unsigned_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nex
FAST_TESTS="$(FAST_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
--unsigned-only
--unsigned-only --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_signed_integer_multi_bit_ci # Run the tests for nsigned integer ci running only multibit tests
test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
@@ -463,7 +490,7 @@ test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nexte
FAST_TESTS="$(FAST_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
--signed-only
--signed-only --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_safe_deserialization # Run the tests for safe deserialization
test_safe_deserialization: install_rs_build_toolchain install_cargo_nextest
@@ -487,6 +514,12 @@ test_user_doc: install_rs_build_toolchain
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) \
-- test_user_docs::
.PHONY: test_user_doc_gpu # Run tests for GPU from the .md documentation
test_user_doc_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu -p $(TFHE_SPEC) \
-- test_user_docs::
.PHONY: test_regex_engine # Run tests for regex_engine example
test_regex_engine: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -530,7 +563,7 @@ docs: doc
lint_doc: install_rs_check_toolchain
RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p tfhe --no-deps
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p $(TFHE_SPEC) --no-deps
.PHONY: lint_docs # Build rust doc with linting enabled alias for lint_doc
lint_docs: lint_doc
@@ -621,7 +654,7 @@ bench_integer_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,$(AVX512_FEATURE) -p tfhe --
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
bench_integer_multi_bit: install_rs_check_toolchain
@@ -645,7 +678,7 @@ bench_integer_multi_bit_gpu: install_rs_check_toolchain
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,$(AVX512_FEATURE) -p tfhe --
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
.PHONY: bench_shortint # Run benchmarks for shortint
bench_shortint: install_rs_check_toolchain
@@ -659,11 +692,11 @@ bench_oprf: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench oprf-shortint-bench \
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,$(AVX512_FEATURE) -p tfhe
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench oprf-integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p tfhe
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
@@ -688,6 +721,12 @@ bench_pbs: install_rs_check_toolchain
--bench pbs-bench \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
.PHONY: bench_pbs_gpu # Run benchmarks for PBS on GPU backend
bench_pbs_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench pbs-bench \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
.PHONY: bench_web_js_api_parallel # Run benchmarks for the web wasm api
bench_web_js_api_parallel: build_web_js_api_parallel
$(MAKE) -C tfhe/web_wasm_parallel_tests bench

View File

@@ -4,13 +4,17 @@
</p>
<hr/>
<p align="center">
<a href="https://docs.zama.ai/tfhe-rs"> 📒 Read documentation</a> | <a href="https://zama.ai/community"> 💛 Community support</a>
<a href="https://docs.zama.ai/tfhe-rs"> 📒 Read documentation</a> | <a href="https://zama.ai/community"> 💛 Community support</a> | <a href="https://github.com/zama-ai/awesome-zama"> 📚 FHE resources</a>
</p>
<p align="center">
<!-- Version badge using shields.io -->
<a href="https://github.com/zama-ai/tfhe-rs/releases">
<img src="https://img.shields.io/github/v/release/zama-ai/tfhe-rs?style=flat-square">
</a>
<!-- Link to tutorials badge using shields.io -->
<a href="#license">
<img src="https://img.shields.io/badge/License-BSD--3--Clause--Clear-orange?style=flat-square">
</a>
<!-- Zama Bounty Program -->
<a href="https://github.com/zama-ai/bounty-program">
<img src="https://img.shields.io/badge/Contribute-Zama%20Bounty%20Program-yellow?style=flat-square">
@@ -138,9 +142,11 @@ libraries.
## Need support?
<a target="_blank" href="https://community.zama.ai">
<img src="https://user-images.githubusercontent.com/5758427/231115030-21195b55-2629-4c01-9809-be5059243999.png">
<img src="https://github.com/zama-ai/tfhe-rs/assets/157474013/33d856dc-f25d-454b-a010-af12bff2aa7d">
</a>
## Citing TFHE-rs
To cite TFHE-rs in academic papers, please use the following entry:

View File

@@ -1,6 +1,6 @@
[package]
name = "tfhe-cuda-backend"
version = "0.1.2"
version = "0.1.3"
edition = "2021"
authors = ["Zama team"]
license = "BSD-3-Clause-Clear"

View File

@@ -34,13 +34,14 @@ The Cuda project held in `tfhe-cuda-backend` can be compiled independently from
following way:
```
git clone git@github.com:zama-ai/tfhe-rs
cd backends/tfhe-cuda-backend/implementation
cd backends/tfhe-cuda-backend/cuda
mkdir build
cd build
cmake ..
make
```
The compute capability is detected automatically (with the first GPU information) and set accordingly.
If your machine does not have an available Nvidia GPU, the compilation will work if you have the nvcc compiler installed. The generated executable will target a 7.0 compute capability (sm_70).
## Links

View File

@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.24 FATAL_ERROR)
project(tfhe_cuda_backend LANGUAGES CXX CUDA)
project(tfhe_cuda_backend LANGUAGES CXX)
# See if the minimum CUDA version is available. If not, only enable documentation building.
set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
@@ -56,9 +56,10 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler ${OpenMP_CXX_FLAGS}")
set(CMAKE_CUDA_ARCHITECTURES native)
if(NOT CUDA_NVCC_FLAGS)
set(CUDA_NVCC_FLAGS -arch=sm_70)
if(${CUDA_SUCCESS})
set(CMAKE_CUDA_ARCHITECTURES native)
else()
set(CMAKE_CUDA_ARCHITECTURES 70)
endif()
# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills, and -lineinfo for better debugging

View File

@@ -40,7 +40,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
@@ -48,7 +48,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
void cleanup_cuda_bootstrap_amortized(cuda_stream_t *stream,
int8_t **pbs_buffer);
@@ -71,7 +71,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
@@ -79,7 +79,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
void cleanup_cuda_bootstrap_low_latency(cuda_stream_t *stream,
int8_t **pbs_buffer);

View File

@@ -15,7 +15,7 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t chunk_size = 0);
void scratch_cuda_multi_bit_pbs_64(

View File

@@ -32,34 +32,6 @@ enum COMPARISON_TYPE {
};
enum IS_RELATIONSHIP { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };
/*
* generate bivariate accumulator for device pointer
* v_stream - cuda stream
* acc - device pointer for bivariate accumulator
* ...
* f - wrapping function with two Torus inputs
*/
template <typename Torus>
void generate_device_accumulator_bivariate(
cuda_stream_t *stream, Torus *acc_bivariate, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f);
/*
* generate univariate accumulator for device pointer
* v_stream - cuda stream
* acc - device pointer for univariate accumulator
* ...
* f - evaluating function with one Torus input
*/
template <typename Torus>
void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t message_modulus,
uint32_t carry_modulus,
std::function<Torus(Torus)> f);
extern "C" {
void scratch_cuda_full_propagation_64(
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t lwe_dimension,
@@ -226,6 +198,34 @@ void cleanup_cuda_propagate_single_carry_low_latency(cuda_stream_t *stream,
int8_t **mem_ptr_void);
}
/*
* generate bivariate accumulator (lut) for device pointer
* v_stream - cuda stream
* acc_bivariate - device pointer for bivariate accumulator
* ...
* f - wrapping function with two Torus inputs
*/
template <typename Torus>
void generate_device_accumulator_bivariate(
cuda_stream_t *stream, Torus *acc_bivariate, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f);
/*
* generate univariate accumulator (lut) for device pointer
* v_stream - cuda stream
* acc - device pointer for univariate accumulator
* ...
* f - evaluating function with one Torus input
*/
template <typename Torus>
void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t message_modulus,
uint32_t carry_modulus,
std::function<Torus(Torus)> f);
struct int_radix_params {
PBS_TYPE pbs_type;
uint32_t glwe_dimension;
@@ -326,7 +326,7 @@ template <typename Torus> struct int_radix_lut {
if (allocate_gpu_memory) {
// Allocate LUT
// LUT is used as a trivial encryption and must be initialized outside
// this contructor
// this constructor
lut = (Torus *)cuda_malloc_async(num_luts * lut_buffer_size, stream);
lut_indexes = (Torus *)cuda_malloc_async(lut_indexes_size, stream);
@@ -408,7 +408,7 @@ template <typename Torus> struct int_radix_lut {
return &lut[ind * (params.glwe_dimension + 1) * params.polynomial_size];
}
Torus *get_tvi(size_t ind) { return &lut_indexes[ind]; }
Torus *get_lut_indexes(size_t ind) { return &lut_indexes[ind]; }
void release(cuda_stream_t *stream) {
cuda_drop_async(lut_indexes, stream);
cuda_drop_async(lwe_indexes, stream);
@@ -437,10 +437,10 @@ template <typename Torus> struct int_sc_prop_memory {
Torus *generates_or_propagates;
Torus *step_output;
// test_vector_array[2] = {lut_does_block_generate_carry,
// luts_array[2] = {lut_does_block_generate_carry,
// lut_does_block_generate_or_propagate}
int_radix_lut<Torus> *test_vector_array;
int_radix_lut<Torus> *lut_carry_propagation_sum;
int_radix_lut<Torus> *luts_array;
int_radix_lut<Torus> *luts_carry_propagation_sum;
int_radix_lut<Torus> *message_acc;
int_radix_params params;
@@ -461,7 +461,7 @@ template <typename Torus> struct int_sc_prop_memory {
step_output = (Torus *)cuda_malloc_async(
num_radix_blocks * big_lwe_size_bytes, stream);
// declare functions for test vector generation
// declare functions for lut generation
auto f_lut_does_block_generate_carry = [message_modulus](Torus x) -> Torus {
if (x >= message_modulus)
return OUTPUT_CARRY::GENERATED;
@@ -477,7 +477,7 @@ template <typename Torus> struct int_sc_prop_memory {
return OUTPUT_CARRY::NONE;
};
auto f_lut_carry_propagation_sum = [](Torus msb, Torus lsb) -> Torus {
auto f_luts_carry_propagation_sum = [](Torus msb, Torus lsb) -> Torus {
if (msb == OUTPUT_CARRY::PROPAGATED)
return lsb;
return msb;
@@ -487,18 +487,18 @@ template <typename Torus> struct int_sc_prop_memory {
return x % message_modulus;
};
// create test vector objects
test_vector_array = new int_radix_lut<Torus>(
// create lut objects
luts_array = new int_radix_lut<Torus>(
stream, params, 2, num_radix_blocks, allocate_gpu_memory);
lut_carry_propagation_sum = new struct int_radix_lut<Torus>(
luts_carry_propagation_sum = new struct int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
message_acc = new struct int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
auto lut_does_block_generate_carry = test_vector_array->get_lut(0);
auto lut_does_block_generate_or_propagate = test_vector_array->get_lut(1);
auto lut_does_block_generate_carry = luts_array->get_lut(0);
auto lut_does_block_generate_or_propagate = luts_array->get_lut(1);
// generate test vectors
// generate luts (aka accumulators)
generate_device_accumulator<Torus>(
stream, lut_does_block_generate_carry, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_lut_does_block_generate_carry);
@@ -507,12 +507,12 @@ template <typename Torus> struct int_sc_prop_memory {
polynomial_size, message_modulus, carry_modulus,
f_lut_does_block_generate_or_propagate);
cuda_set_value_async<Torus>(&(stream->stream),
test_vector_array->get_tvi(1), 1,
luts_array->get_lut_indexes(1), 1,
num_radix_blocks - 1);
generate_device_accumulator_bivariate<Torus>(
stream, lut_carry_propagation_sum->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_lut_carry_propagation_sum);
stream, luts_carry_propagation_sum->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_luts_carry_propagation_sum);
generate_device_accumulator<Torus>(stream, message_acc->lut, glwe_dimension,
polynomial_size, message_modulus,
@@ -523,12 +523,12 @@ template <typename Torus> struct int_sc_prop_memory {
cuda_drop_async(generates_or_propagates, stream);
cuda_drop_async(step_output, stream);
test_vector_array->release(stream);
lut_carry_propagation_sum->release(stream);
luts_array->release(stream);
luts_carry_propagation_sum->release(stream);
message_acc->release(stream);
delete test_vector_array;
delete lut_carry_propagation_sum;
delete luts_array;
delete luts_carry_propagation_sum;
delete message_acc;
}
};
@@ -538,9 +538,9 @@ template <typename Torus> struct int_mul_memory {
Torus *block_mul_res;
Torus *small_lwe_vector;
Torus *lwe_pbs_out_array;
int_radix_lut<Torus> *test_vector_array; // lsb msb
int_radix_lut<Torus> *test_vector_message;
int_radix_lut<Torus> *test_vector_carry;
int_radix_lut<Torus> *luts_array; // lsb msb
int_radix_lut<Torus> *luts_message;
int_radix_lut<Torus> *luts_carry;
int_sc_prop_memory<Torus> *scp_mem;
int_radix_params params;
@@ -583,18 +583,18 @@ template <typename Torus> struct int_mul_memory {
stream);
// create int_radix_lut objects for lsb, msb, message, carry
// test_vector_array -> lut = {lsb_acc, msb_acc}
test_vector_array = new int_radix_lut<Torus>(
// luts_array -> lut = {lsb_acc, msb_acc}
luts_array = new int_radix_lut<Torus>(
stream, params, 2, total_block_count, allocate_gpu_memory);
test_vector_message = new int_radix_lut<Torus>(
stream, params, 1, total_block_count, test_vector_array);
test_vector_carry = new int_radix_lut<Torus>(
stream, params, 1, total_block_count, test_vector_array);
luts_message = new int_radix_lut<Torus>(
stream, params, 1, total_block_count, luts_array);
luts_carry = new int_radix_lut<Torus>(
stream, params, 1, total_block_count, luts_array);
auto lsb_acc = test_vector_array->get_lut(0);
auto msb_acc = test_vector_array->get_lut(1);
auto message_acc = test_vector_message->get_lut(0);
auto carry_acc = test_vector_carry->get_lut(0);
auto lsb_acc = luts_array->get_lut(0);
auto msb_acc = luts_array->get_lut(1);
auto message_acc = luts_message->get_lut(0);
auto carry_acc = luts_carry->get_lut(0);
// define functions for each accumulator
auto lut_f_lsb = [message_modulus](Torus x, Torus y) -> Torus {
@@ -624,12 +624,12 @@ template <typename Torus> struct int_mul_memory {
stream, msb_acc, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, lut_f_msb);
// tvi for test_vector_array should be reinitialized
// lut_indexes for luts_array should be reinitialized
// first lsb_vector_block_count value should reference to lsb_acc
// last msb_vector_block_count values should reference to msb_acc
// for message and carry default tvi is fine
// for message and carry default lut_indexes is fine
cuda_set_value_async<Torus>(
&(stream->stream), test_vector_array->get_tvi(lsb_vector_block_count),
&(stream->stream), luts_array->get_lut_indexes(lsb_vector_block_count),
1, msb_vector_block_count);
}
@@ -639,15 +639,15 @@ template <typename Torus> struct int_mul_memory {
cuda_drop_async(small_lwe_vector, stream);
cuda_drop_async(lwe_pbs_out_array, stream);
test_vector_array->release(stream);
test_vector_message->release(stream);
test_vector_carry->release(stream);
luts_array->release(stream);
luts_message->release(stream);
luts_carry->release(stream);
scp_mem->release(stream);
delete test_vector_array;
delete test_vector_message;
delete test_vector_carry;
delete luts_array;
delete luts_message;
delete luts_carry;
delete scp_mem;
}
@@ -681,12 +681,12 @@ template <typename Torus> struct int_shift_buffer {
// LUT
// pregenerate lut vector and indexes
// lut for left shift
// here we generate 'num_bits_in_block' times test_vector
// here we generate 'num_bits_in_block' times lut
// one for each 'shift_within_block' = 'shift' % 'num_bits_in_block'
// even though test_vector_left contains 'num_bits_in_block' lut
// tvi will have indexes for single lut only and those indexes will be 0
// even though lut_left contains 'num_bits_in_block' lut
// lut_indexes will have indexes for single lut only and those indexes will be 0
// it means for pbs corresponding lut should be selected and pass along
// tvi filled with zeros
// lut_indexes filled with zeros
// calculate bivariate lut for each 'shift_within_block'
for (int s_w_b = 1; s_w_b < num_bits_in_block; s_w_b++) {
@@ -738,11 +738,11 @@ template <typename Torus> struct int_shift_buffer {
lut_buffers_bivariate.push_back(cur_lut_bivariate);
}
// here we generate 'message_modulus' times test_vector
// here we generate 'message_modulus' times lut
// one for each 'shift'
// tvi will have indexes for single lut only and those indexes will be 0
// lut_indexes will have indexes for single lut only and those indexes will be 0
// it means for pbs corresponding lut should be selected and pass along
// tvi filled with zeros
// lut_indexes filled with zeros
// calculate lut for each 'shift'
for (int shift = 0; shift < params.message_modulus; shift++) {
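
For illustration, here is a hedged sketch of how the accumulator-generation helpers declared earlier in this header (`generate_device_accumulator` and `generate_device_accumulator_bivariate`) can be called; it assumes this header is included, a valid `cuda_stream_t` exists, and the device buffers were allocated beforehand (for example with `cuda_malloc_async`). The lambdas mirror the message-extraction and sum-style functions that appear elsewhere in this diff; the exact functions and parameter values are placeholders, not library defaults.

```
// Illustrative sketch only: generate one univariate and one bivariate LUT
// (accumulator) on the device using the helpers declared in this header.
template <typename Torus>
void example_generate_luts(cuda_stream_t *stream, Torus *acc,
                           Torus *acc_bivariate, uint32_t glwe_dimension,
                           uint32_t polynomial_size, uint32_t message_modulus,
                           uint32_t carry_modulus) {
  // Univariate LUT: keep only the message part of each block.
  generate_device_accumulator<Torus>(
      stream, acc, glwe_dimension, polynomial_size, message_modulus,
      carry_modulus,
      [message_modulus](Torus x) -> Torus { return x % message_modulus; });

  // Bivariate LUT: wrap a two-input function, here a modular sum of the
  // two operands (placeholder behaviour).
  generate_device_accumulator_bivariate<Torus>(
      stream, acc_bivariate, glwe_dimension, polynomial_size, message_modulus,
      carry_modulus, [message_modulus](Torus x, Torus y) -> Torus {
        return (x + y) % message_modulus;
      });
}
```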

View File

@@ -13,10 +13,6 @@ set(SOURCES
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h)
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
set_target_properties(
tfhe_cuda_backend
PROPERTIES CUDA_SEPARABLE_COMPILATION ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON
CUDA_ARCHITECTURES native)
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
target_include_directories(tfhe_cuda_backend PRIVATE .)

View File

@@ -22,7 +22,7 @@ void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, PBS_TYPE pbs_type) {
if (sizeof(Torus) == sizeof(uint32_t)) {
// 32 bits
@@ -37,7 +37,7 @@ void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
num_lut_vectors, lwe_idx, max_shared_memory);
num_luts, lwe_idx, max_shared_memory);
break;
case AMORTIZED:
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
@@ -45,7 +45,7 @@ void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
num_lut_vectors, lwe_idx, max_shared_memory);
num_luts, lwe_idx, max_shared_memory);
break;
default:
break;
@@ -59,7 +59,7 @@ void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, base_log, level_count,
input_lwe_ciphertext_count, num_lut_vectors, lwe_idx,
input_lwe_ciphertext_count, num_luts, lwe_idx,
max_shared_memory);
break;
case LOW_LAT:
@@ -68,7 +68,7 @@ void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
num_lut_vectors, lwe_idx, max_shared_memory);
num_luts, lwe_idx, max_shared_memory);
break;
case AMORTIZED:
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
@@ -76,7 +76,7 @@ void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
num_lut_vectors, lwe_idx, max_shared_memory);
num_luts, lwe_idx, max_shared_memory);
break;
default:
break;
@@ -303,7 +303,7 @@ void generate_device_accumulator_bivariate(
generate_lookup_table_bivariate<Torus>(h_lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f);
// copy host lut and tvi to device
// copy host lut and lut_indexes to device
cuda_memcpy_async_to_gpu(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream);
@@ -335,7 +335,7 @@ void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f);
// copy host lut and tvi to device
// copy host lut and lut_indexes to device
cuda_memcpy_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream);
@@ -370,13 +370,13 @@ void host_propagate_single_carry_low_latency(cuda_stream_t *stream,
auto generates_or_propagates = mem->generates_or_propagates;
auto step_output = mem->step_output;
auto test_vector_array = mem->test_vector_array;
auto lut_carry_propagation_sum = mem->lut_carry_propagation_sum;
auto luts_array = mem->luts_array;
auto luts_carry_propagation_sum = mem->luts_carry_propagation_sum;
auto message_acc = mem->message_acc;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, generates_or_propagates, lwe_array, bsk, ksk, num_blocks,
test_vector_array);
luts_array);
// compute prefix sum with hillis&steele
@@ -392,7 +392,7 @@ void host_propagate_single_carry_low_latency(cuda_stream_t *stream,
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, cur_blocks, cur_blocks, prev_blocks, bsk, ksk, cur_total_blocks,
lut_carry_propagation_sum);
luts_carry_propagation_sum);
cuda_memcpy_async_gpu_to_gpu(&generates_or_propagates[space * big_lwe_size],
cur_blocks,
@@ -414,7 +414,7 @@ void host_propagate_single_carry_low_latency(cuda_stream_t *stream,
/*
* input_blocks: input radix ciphertext propagation will happen inplace
* acc_message_carry: list of two luts, [(message_acc), (carry_acc)]
* tvi_message_carry: tvi for message and carry, should always be {0, 1}
* lut_indexes_message_carry: lut_indexes for message and carry, should always be {0, 1}
* small_lwe_vector: output of keyswitch should have
* size = 2 * (lwe_dimension + 1) * sizeof(Torus)
* big_lwe_vector: output of pbs should have

View File

@@ -282,21 +282,21 @@ __host__ void host_integer_mult_radix_kb(
// glwe_dimension * polynomial_size + 1 coefficients
auto lwe_pbs_out_array = mem_ptr->lwe_pbs_out_array;
// it contains two test vector, first for lsb extraction,
// it contains two lut, first for lsb extraction,
// second for msb extraction, with total length =
// 2 * (glwe_dimension + 1) * polynomial_size
auto test_vector_array = mem_ptr->test_vector_array;
auto luts_array = mem_ptr->luts_array;
// accumulator to extract message
// with length (glwe_dimension + 1) * polynomial_size
auto test_vector_message = mem_ptr->test_vector_message;
auto luts_message = mem_ptr->luts_message;
// accumulator to extract carry
// with length (glwe_dimension + 1) * polynomial_size
auto test_vector_carry = mem_ptr->test_vector_carry;
auto luts_carry = mem_ptr->luts_carry;
// to be used as default indexing
auto lwe_indexes = test_vector_array->lwe_indexes;
auto lwe_indexes = luts_array->lwe_indexes;
auto vector_result_lsb = &vector_result_sb[0];
auto vector_result_msb =
@@ -316,7 +316,7 @@ __host__ void host_integer_mult_radix_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, block_mul_res, block_mul_res, vector_result_sb, bsk, ksk,
total_block_count, test_vector_array);
total_block_count, luts_array);
vector_result_lsb = &block_mul_res[0];
vector_result_msb = &block_mul_res[lsb_vector_block_count *
@@ -409,17 +409,17 @@ __host__ void host_integer_mult_radix_kb(
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_copied);
execute_pbs<Torus>(
stream, message_blocks_vector, lwe_indexes, test_vector_message->lut,
test_vector_message->lut_indexes, small_lwe_vector, lwe_indexes, bsk,
test_vector_message->pbs_buffer, glwe_dimension, lwe_dimension,
stream, message_blocks_vector, lwe_indexes, luts_message->lut,
luts_message->lut_indexes, small_lwe_vector, lwe_indexes, bsk,
luts_message->pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
message_count, 1, 0, max_shared_memory, mem_ptr->params.pbs_type);
execute_pbs<Torus>(stream, carry_blocks_vector, lwe_indexes,
test_vector_carry->lut, test_vector_carry->lut_indexes,
luts_carry->lut, luts_carry->lut_indexes,
&small_lwe_vector[message_count * (lwe_dimension + 1)],
lwe_indexes, bsk, test_vector_carry->pbs_buffer,
lwe_indexes, bsk, luts_carry->pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, carry_count, 1, 0,
@@ -455,10 +455,10 @@ __host__ void host_integer_mult_radix_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, vector_result_sb, radix_lwe_out, bsk, ksk, num_blocks,
test_vector_message);
luts_message);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, &block_mul_res[big_lwe_size], radix_lwe_out, bsk, ksk, num_blocks,
test_vector_carry);
luts_carry);
cuda_memset_async(block_mul_res, 0, big_lwe_size * sizeof(Torus), stream);
@@ -544,16 +544,16 @@ void apply_lookup_table(Torus *input_ciphertexts, Torus *output_ciphertexts,
// when message and carry have to be extracted
// for first message_count blocks we need message_acc
// for last carry_count blocks we need carry_acc
Torus *cur_tvi;
Torus *cur_lut_indexes;
if (lsb_msb_mode) {
cur_tvi = (big_lwe_start_index < lsb_message_blocks_count)
? mem_ptr->tvi_lsb_multi_gpu[i]
: mem_ptr->tvi_msb_multi_gpu[i];
cur_lut_indexes = (big_lwe_start_index < lsb_message_blocks_count)
? mem_ptr->lut_indexes_lsb_multi_gpu[i]
: mem_ptr->lut_indexes_msb_multi_gpu[i];
} else {
cur_tvi = (big_lwe_start_index < lsb_message_blocks_count)
? mem_ptr->tvi_message_multi_gpu[i]
: mem_ptr->tvi_carry_multi_gpu[i];
cur_lut_indexes = (big_lwe_start_index < lsb_message_blocks_count)
? mem_ptr->lut_indexes_message_multi_gpu[i]
: mem_ptr->lut_indexes_carry_multi_gpu[i];
}
// execute keyswitch on a current gpu with corresponding input and output
@@ -568,7 +568,7 @@ void apply_lookup_table(Torus *input_ciphertexts, Torus *output_ciphertexts,
// execute pbs on a current gpu with corresponding input and output
cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
this_stream, i, mem_ptr->pbs_output_multi_gpu[i],
mem_ptr->test_vector_multi_gpu[i], cur_tvi,
mem_ptr->lut_multi_gpu[i], cur_lut_indexes,
mem_ptr->pbs_input_multi_gpu[i], mem_ptr->bsk_multi_gpu[i],
mem_ptr->pbs_buffer_multi_gpu[i], lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, pbs_base_log, pbs_level,
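
To make the `lut_indexes` layout used by the multiplication concrete (see the `int_mul_memory` comments earlier in this diff: the first `lsb_vector_block_count` entries must point to `lsb_acc` and the remaining `msb_vector_block_count` entries to `msb_acc`), here is a hedged host-side sketch of the mapping that the `cuda_set_value_async` call produces; the helper name is hypothetical and the real code fills the device buffer directly.

```
// Illustrative sketch only: the lut_indexes contents expected for luts_array
// in the multiplication. Block i selects LUT 0 (lsb_acc) for the first
// lsb_vector_block_count blocks and LUT 1 (msb_acc) for the rest.
#include <cstdint>
#include <vector>

std::vector<uint64_t>
expected_mul_lut_indexes(uint32_t lsb_vector_block_count,
                         uint32_t msb_vector_block_count) {
  std::vector<uint64_t> indexes(lsb_vector_block_count + msb_vector_block_count,
                                0); // default index 0 -> lsb_acc
  for (size_t i = lsb_vector_block_count; i < indexes.size(); ++i)
    indexes[i] = 1; // -> msb_acc
  return indexes;
}
```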

View File

@@ -148,7 +148,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
checks_bootstrap_amortized(32, base_log, polynomial_size);
@@ -159,7 +159,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
polynomial_size, base_log, level_count, num_samples, num_luts,
lwe_idx, max_shared_memory);
break;
case 512:
@@ -168,7 +168,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
polynomial_size, base_log, level_count, num_samples, num_luts,
lwe_idx, max_shared_memory);
break;
case 1024:
@@ -177,7 +177,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
polynomial_size, base_log, level_count, num_samples, num_luts,
lwe_idx, max_shared_memory);
break;
case 2048:
@@ -186,7 +186,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
polynomial_size, base_log, level_count, num_samples, num_luts,
lwe_idx, max_shared_memory);
break;
case 4096:
@@ -195,7 +195,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
polynomial_size, base_log, level_count, num_samples, num_luts,
lwe_idx, max_shared_memory);
break;
case 8192:
@@ -204,7 +204,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
polynomial_size, base_log, level_count, num_samples, num_luts,
lwe_idx, max_shared_memory);
break;
case 16384:
@@ -213,7 +213,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
polynomial_size, base_log, level_count, num_samples, num_luts,
lwe_idx, max_shared_memory);
break;
default:
@@ -228,11 +228,11 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* - lut_vector: should hold as many luts of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* num_luts vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to
* which test vector of lut_vector to use for each LWE input in
* which lut of lut_vector to use for each LWE input in
* lwe_array_in
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
@@ -244,17 +244,17 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
* bsk is thus a tensor of size (k+1)^2.l.N.n
* where l is the number of decomposition levels and
* k is the GLWE dimension, N is the polynomial size for
* GLWE. The polynomial size for GLWE and the test vector
* GLWE. The polynomial size for GLWE and the lut
* are the same because they have to be in the same ring
* to be multiplied.
* - input_lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - polynomial_size: size of the test polynomial (test vector) and size of the
* - polynomial_size: size of the test polynomial (lut) and size of the
* GLWE polynomials (~1024) (where `size` refers to the polynomial degree + 1).
* - base_log: log of the base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - num_samples: number of encrypted input messages
* - num_lut_vectors: parameter to set the actual number of test vectors to be
* - num_luts: parameter to set the actual number of luts to be
* used
* - lwe_idx: the index of the LWE input to consider for the GPU of index
* gpu_index. In case of multi-GPU computing, it is assumed that only a part of
@@ -292,7 +292,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
checks_bootstrap_amortized(64, base_log, polynomial_size);
@@ -303,7 +303,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
polynomial_size, base_log, level_count, num_samples, num_luts,
lwe_idx, max_shared_memory);
break;
case 512:
@@ -312,7 +312,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
polynomial_size, base_log, level_count, num_samples, num_luts,
lwe_idx, max_shared_memory);
break;
case 1024:
@@ -321,7 +321,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
polynomial_size, base_log, level_count, num_samples, num_luts,
lwe_idx, max_shared_memory);
break;
case 2048:
@@ -330,7 +330,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
polynomial_size, base_log, level_count, num_samples, num_luts,
lwe_idx, max_shared_memory);
break;
case 4096:
@@ -339,7 +339,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
polynomial_size, base_log, level_count, num_samples, num_luts,
lwe_idx, max_shared_memory);
break;
case 8192:
@@ -348,7 +348,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
polynomial_size, base_log, level_count, num_samples, num_luts,
lwe_idx, max_shared_memory);
break;
case 16384:
@@ -357,7 +357,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
polynomial_size, base_log, level_count, num_samples, num_luts,
lwe_idx, max_shared_memory);
break;
default:
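
As a hedged illustration of how `lut_vector`, `lut_vector_indexes` and `num_luts` relate (per the parameter documentation above): `lut_vector` holds `num_luts` luts, and `lut_vector_indexes` assigns one of them to each of the `num_samples` inputs, so several inputs may share a lut. The helper below is hypothetical; it only shows one plausible way to build such an index array on the host (assuming 64-bit indexes for the `_64` entry points) before copying it to the device.

```
// Illustrative sketch only: assign lut (i % num_luts) to input ciphertext i.
// Any mapping with values in [0, num_luts) is valid; frequently every input
// simply uses lut 0.
#include <cstdint>
#include <vector>

std::vector<uint64_t> make_lut_vector_indexes(uint32_t num_samples,
                                              uint32_t num_luts) {
  std::vector<uint64_t> indexes(num_samples);
  for (uint32_t i = 0; i < num_samples; ++i)
    indexes[i] = i % num_luts;
  return indexes;
}
```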

View File

@@ -24,10 +24,10 @@ template <typename Torus, class params, sharedMemDegree SMD>
* Uses shared memory to increase performance
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* - lut_vector: should hold as many luts of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to which test vector
* num_luts vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to which lut
* to use for each sample in lut_vector
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
@@ -37,7 +37,7 @@ template <typename Torus, class params, sharedMemDegree SMD>
* == NOSM or PARTIALSM)
* - lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - polynomial_size: size of the test polynomial (test vector) and size of the
* - polynomial_size: size of the test polynomial (lut) and size of the
* GLWE polynomial (~1024)
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
@@ -288,7 +288,7 @@ __host__ void host_bootstrap_amortized(
Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t input_lwe_ciphertext_count, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory) {
cudaSetDevice(stream->gpu_index);

View File

@@ -336,7 +336,7 @@ __host__ void host_bootstrap_fast_low_latency(
Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t input_lwe_ciphertext_count, uint32_t num_luts,
uint32_t max_shared_memory) {
cudaSetDevice(stream->gpu_index);

View File

@@ -206,7 +206,7 @@ __host__ void host_fast_multi_bit_pbs(
Torus *lwe_input_indexes, uint64_t *bootstrapping_key, int8_t *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
cudaSetDevice(stream->gpu_index);

View File

@@ -368,7 +368,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
checks_bootstrap_low_latency(32, glwe_dimension, level_count, base_log,
polynomial_size, num_samples);
@@ -387,7 +387,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<256>>(
stream, static_cast<uint32_t *>(lwe_array_out),
@@ -398,7 +398,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
break;
case 512:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
@@ -413,7 +413,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<512>>(
stream, static_cast<uint32_t *>(lwe_array_out),
@@ -424,7 +424,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
break;
case 1024:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
@@ -439,7 +439,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<1024>>(
stream, static_cast<uint32_t *>(lwe_array_out),
@@ -450,7 +450,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
break;
case 2048:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
@@ -465,7 +465,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<2048>>(
stream, static_cast<uint32_t *>(lwe_array_out),
@@ -476,7 +476,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
break;
case 4096:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
@@ -491,7 +491,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<4096>>(
stream, static_cast<uint32_t *>(lwe_array_out),
@@ -502,7 +502,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
break;
case 8192:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
@@ -517,7 +517,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<8192>>(
stream, static_cast<uint32_t *>(lwe_array_out),
@@ -528,7 +528,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
break;
case 16384:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<
@@ -543,7 +543,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<16384>>(
stream, static_cast<uint32_t *>(lwe_array_out),
@@ -554,7 +554,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
break;
default:
break;
@@ -572,11 +572,11 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* - lut_vector: should hold as many luts of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* num_luts vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to
* which test vector to use for each sample in
* which lut to use for each sample in
* lut_vector
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
@@ -588,7 +588,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
* bsk is thus a tensor of size (k+1)^2.l.N.n
* where l is the number of decomposition levels and
* k is the GLWE dimension, N is the polynomial size for
* GLWE. The polynomial size for GLWE and the test vector
* GLWE. The polynomial size for GLWE and the lut
* are the same because they have to be in the same ring
* to be multiplied.
* - lwe_dimension: size of the Torus vector used to encrypt the input
@@ -596,12 +596,12 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
* - glwe_dimension: size of the polynomial vector used to encrypt the LUT
* GLWE ciphertexts - referred to as k above. Only the value 1 is supported for
* this parameter.
* - polynomial_size: size of the test polynomial (test vector) and size of the
* - polynomial_size: size of the test polynomial (lut) and size of the
* GLWE polynomial (~1024)
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - num_samples: number of encrypted input messages
* - num_lut_vectors: parameter to set the actual number of test vectors to be
* - num_luts: parameter to set the actual number of luts to be
* used
* - lwe_idx: the index of the LWE input to consider for the GPU of index
* gpu_index. In case of multi-GPU computing, it is assumed that only a part of
@@ -643,7 +643,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
checks_bootstrap_low_latency(64, glwe_dimension, level_count, base_log,
polynomial_size, num_samples);
@@ -661,7 +661,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<256>>(
stream, static_cast<uint64_t *>(lwe_array_out),
@@ -672,7 +672,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
break;
case 512:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
@@ -687,7 +687,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<512>>(
stream, static_cast<uint64_t *>(lwe_array_out),
@@ -698,7 +698,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
break;
case 1024:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
@@ -713,7 +713,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<1024>>(
stream, static_cast<uint64_t *>(lwe_array_out),
@@ -724,7 +724,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
break;
case 2048:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
@@ -739,7 +739,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<2048>>(
stream, static_cast<uint64_t *>(lwe_array_out),
@@ -750,7 +750,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
break;
case 4096:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
@@ -765,7 +765,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<4096>>(
stream, static_cast<uint64_t *>(lwe_array_out),
@@ -776,7 +776,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
break;
case 8192:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
@@ -791,7 +791,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<8192>>(
stream, static_cast<uint64_t *>(lwe_array_out),
@@ -802,7 +802,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
break;
case 16384:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<
@@ -817,7 +817,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<16384>>(
stream, static_cast<uint64_t *>(lwe_array_out),
@@ -828,7 +828,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_lut_vectors, max_shared_memory);
num_luts, max_shared_memory);
default:
break;
}

View File

@@ -435,7 +435,7 @@ __host__ void host_bootstrap_low_latency(
Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t input_lwe_ciphertext_count, uint32_t num_luts,
uint32_t max_shared_memory) {
cudaSetDevice(stream->gpu_index);

View File

@@ -19,7 +19,7 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
checks_multi_bit_pbs(polynomial_size);
@@ -38,7 +38,7 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
@@ -50,7 +50,7 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
@@ -67,7 +67,7 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
@@ -79,7 +79,7 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
@@ -96,7 +96,7 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
@@ -108,7 +108,7 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
@@ -125,7 +125,7 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
@@ -137,7 +137,7 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
@@ -154,7 +154,7 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
@@ -166,7 +166,7 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
@@ -183,7 +183,7 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
@@ -195,7 +195,7 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
@@ -212,7 +212,7 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
@@ -224,7 +224,7 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;

View File

@@ -395,7 +395,7 @@ __host__ void host_multi_bit_pbs(
Torus *lwe_input_indexes, uint64_t *bootstrapping_key, int8_t *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
cudaSetDevice(stream->gpu_index);

View File

@@ -61,7 +61,7 @@ extern "C" {
pub fn cuda_drop_async(ptr: *mut c_void, v_stream: *const c_void) -> i32;
/// Free memory for pointer `ptr` on GPU `gpu_index` synchronously
pub fn cuda_drop(ptr: *mut c_void) -> i32;
pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32) -> i32;
/// Get the maximum amount of shared memory on GPU `gpu_index`
pub fn cuda_get_max_shared_memory(gpu_index: u32) -> i32;

View File

@@ -138,6 +138,11 @@ workflow = "pbs_benchmark.yml"
profile = "bench"
check_run_name = "PBS CPU AWS Benchmarks"
[command.pbs_gpu_bench]
workflow = "pbs_gpu_benchmark.yml"
profile = "gpu-bench"
check_run_name = "PBS GPU AWS Benchmarks"
[command.wasm_client_bench]
workflow = "wasm_client_benchmark.yml"
profile = "cpu-small"

View File

@@ -7,11 +7,12 @@ function usage() {
echo
echo "--help Print this message"
echo "--build-only Pass to only build the tests without running them"
echo "--gpu Enable GPU support"
echo
}
BUILD_ONLY=0
WITH_FEATURE_GPU="OFF"
while [ -n "$1" ]
do
case "$1" in
@@ -24,6 +25,9 @@ do
BUILD_ONLY=1
;;
"--gpu" )
WITH_FEATURE_GPU="ON"
;;
*)
echo "Unknown param : $1"
exit 1
@@ -40,7 +44,7 @@ mkdir -p "${TFHE_BUILD_DIR}"
cd "${TFHE_BUILD_DIR}"
cmake .. -DCMAKE_BUILD_TYPE=RELEASE -DCARGO_PROFILE="${CARGO_PROFILE}"
cmake .. -DCMAKE_BUILD_TYPE=RELEASE -DCARGO_PROFILE="${CARGO_PROFILE}" -DWITH_FEATURE_GPU="${WITH_FEATURE_GPU}"
make -j
@@ -55,5 +59,8 @@ if [[ $(uname) == "Darwin" ]]; then
nproc_bin="sysctl -n hw.logicalcpu"
fi
# Let's go parallel
ARGS="-j$(${nproc_bin})" make test
if [ "${WITH_FEATURE_GPU}" == "ON" ]; then
ctest --output-on-failure --test-dir "." --parallel "$(${nproc_bin})" --tests-regex ".*cuda.*"
else
ctest --output-on-failure --test-dir "." --parallel "$(${nproc_bin})" --exclude-regex ".*cuda.*"
fi

View File

@@ -12,6 +12,7 @@ function usage() {
echo "--signed-only Run only signed integer tests, by default both signed and unsigned tests are run"
echo "--cargo-profile The cargo profile used to build tests"
echo "--avx512-support Set to ON to enable avx512"
echo "--tfhe-package The package spec like tfhe@0.4.2, default=tfhe"
echo
}
@@ -23,6 +24,7 @@ signed=""
not_signed=""
cargo_profile="release"
avx512_feature=""
tfhe_package="tfhe"
while [ -n "$1" ]
do
@@ -64,6 +66,11 @@ do
fi
;;
"--tfhe-package" )
shift
tfhe_package="$1"
;;
*)
echo "Unknown param : $1"
exit 1
@@ -153,7 +160,7 @@ fi
cargo "${RUST_TOOLCHAIN}" nextest run \
--tests \
--cargo-profile "${cargo_profile}" \
--package tfhe \
--package "${tfhe_package}" \
--profile ci \
--features="${ARCH_FEATURE}",integer,internal-keycache,"${avx512_feature}" \
--test-threads "${test_threads}" \
@@ -162,7 +169,7 @@ cargo "${RUST_TOOLCHAIN}" nextest run \
if [[ "${multi_bit}" == "" ]]; then
cargo "${RUST_TOOLCHAIN}" test \
--profile "${cargo_profile}" \
--package tfhe \
--package "${tfhe_package}" \
--features="${ARCH_FEATURE}",integer,internal-keycache,"${avx512_feature}" \
--doc \
-- --test-threads="${doctest_threads}" integer::

View File

@@ -9,12 +9,14 @@ function usage() {
echo "--rust-toolchain The toolchain to run the tests with default: stable"
echo "--multi-bit Run multi-bit tests only: default off"
echo "--cargo-profile The cargo profile used to build tests"
echo "--tfhe-package The package spec like tfhe@0.4.2, default=tfhe"
echo
}
RUST_TOOLCHAIN="+stable"
multi_bit=""
cargo_profile="release"
tfhe_package="tfhe"
while [ -n "$1" ]
do
@@ -38,6 +40,11 @@ do
cargo_profile="$1"
;;
"--tfhe-package" )
shift
tfhe_package="$1"
;;
*)
echo "Unknown param : $1"
exit 1
@@ -111,7 +118,7 @@ and not test(~smart_add_and_mul)""" # This test is too slow
cargo "${RUST_TOOLCHAIN}" nextest run \
--tests \
--cargo-profile "${cargo_profile}" \
--package tfhe \
--package "${tfhe_package}" \
--profile ci \
--features="${ARCH_FEATURE}",shortint,internal-keycache \
--test-threads "${n_threads_small}" \
@@ -128,7 +135,7 @@ and not test(~smart_add_and_mul)"""
cargo "${RUST_TOOLCHAIN}" nextest run \
--tests \
--cargo-profile "${cargo_profile}" \
--package tfhe \
--package "${tfhe_package}" \
--profile ci \
--features="${ARCH_FEATURE}",shortint,internal-keycache \
--test-threads "${n_threads_big}" \
@@ -137,7 +144,7 @@ and not test(~smart_add_and_mul)"""
if [[ "${multi_bit}" == "" ]]; then
cargo "${RUST_TOOLCHAIN}" test \
--profile "${cargo_profile}" \
--package tfhe \
--package "${tfhe_package}" \
--features="${ARCH_FEATURE}",shortint,internal-keycache \
--doc \
-- shortint::
@@ -177,7 +184,7 @@ and not test(~smart_add_and_mul)""" # This test is too slow
cargo "${RUST_TOOLCHAIN}" nextest run \
--tests \
--cargo-profile "${cargo_profile}" \
--package tfhe \
--package "${tfhe_package}" \
--profile ci \
--features="${ARCH_FEATURE}",shortint,internal-keycache \
--test-threads "$(${nproc_bin})" \
@@ -186,7 +193,7 @@ and not test(~smart_add_and_mul)""" # This test is too slow
if [[ "${multi_bit}" == "" ]]; then
cargo "${RUST_TOOLCHAIN}" test \
--profile "${cargo_profile}" \
--package tfhe \
--package "${tfhe_package}" \
--features="${ARCH_FEATURE}",shortint,internal-keycache \
--doc \
-- --test-threads="$(${nproc_bin})" shortint::

View File

@@ -196,7 +196,7 @@ fn find_contiguous_doc_test<'a>(
fn find_contiguous_part_in_doc_test_or_comment(
part_is_code_block: bool,
full_doc_comment_content: &Vec<CommentContent>,
full_doc_comment_content: &[CommentContent],
part_start_idx: usize,
) -> (usize, usize) {
let mut next_line_idx = part_start_idx + 1;
@@ -348,7 +348,7 @@ fn process_doc_lines_until_impossible<'a>(
}
fn process_non_doc_lines_until_impossible(
lines: &Vec<&str>,
lines: &[&str],
rewritten_content: &mut String,
mut line_idx: usize,
) -> usize {
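These two hunks replace `&Vec<_>` parameters with slices, the form clippy recommends because a slice parameter accepts more argument types without changing existing call sites. A standalone illustration of the difference (not code from the repository):

```rust
// Standalone illustration (not repo code): a slice parameter accepts both
// borrowed Vecs and array/slice literals, while `&Vec<T>` only accepts the former.
fn count_nonempty(lines: &[&str]) -> usize {
    lines.iter().filter(|line| !line.is_empty()).count()
}

fn main() {
    let owned: Vec<&str> = vec!["doc", "", "comment"];
    assert_eq!(count_nonempty(&owned), 2); // &Vec<&str> coerces to &[&str]
    assert_eq!(count_nonempty(&["x", ""]), 1); // works directly on an array too
}
```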

View File

@@ -1,6 +1,6 @@
[package]
name = "tfhe"
version = "0.5.0"
version = "0.5.2"
edition = "2021"
readme = "../README.md"
keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
@@ -60,8 +60,8 @@ lazy_static = { version = "1.4.0", optional = true }
serde = { version = "1.0", features = ["derive"] }
rayon = { version = "1.5.0" }
bincode = "1.3.3"
concrete-fft = { version = "0.3.0", features = ["serde", "fft128"] }
pulp = "0.13"
concrete-fft = { version = "0.4.0", features = ["serde", "fft128"] }
pulp = "0.18.8"
tfhe-cuda-backend = { version = "0.1.2", path = "../backends/tfhe-cuda-backend", optional = true }
aligned-vec = { version = "0.5", features = ["serde"] }
dyn-stack = { version = "0.9" }
@@ -79,7 +79,7 @@ js-sys = { version = "0.3", optional = true }
console_error_panic_hook = { version = "0.1.7", optional = true }
serde-wasm-bindgen = { version = "0.6.0", optional = true }
getrandom = { version = "0.2.8", optional = true }
bytemuck = "1.13.1"
bytemuck = "1.14.3"
[features]
boolean = []

View File

@@ -8,6 +8,7 @@ use serde::Serialize;
use tfhe::boolean::parameters::{
BooleanParameters, DEFAULT_PARAMETERS, PARAMETERS_ERROR_PROB_2_POW_MINUS_165,
};
use tfhe::core_crypto::prelude::*;
use tfhe::keycache::NamedParam;
use tfhe::shortint::parameters::*;
@@ -43,29 +44,6 @@ const BOOLEAN_BENCH_PARAMS: [(&str, BooleanParameters); 2] = [
),
];
criterion_group!(
name = pbs_group;
config = Criterion::default().sample_size(2000);
targets = mem_optimized_pbs::<u64>, mem_optimized_pbs::<u32>
);
criterion_group!(
name = multi_bit_pbs_group;
config = Criterion::default().sample_size(2000);
targets = multi_bit_pbs::<u64>,
multi_bit_pbs::<u32>,
multi_bit_deterministic_pbs::<u64>,
multi_bit_deterministic_pbs::<u32>,
);
criterion_group!(
name = pbs_throughput_group;
config = Criterion::default().sample_size(100);
targets = pbs_throughput::<u64>, pbs_throughput::<u32>
);
criterion_main!(pbs_group, multi_bit_pbs_group, pbs_throughput_group);
fn benchmark_parameters<Scalar: UnsignedInteger>() -> Vec<(String, CryptoParametersRecord<Scalar>)>
{
if Scalar::BITS == 64 {
@@ -121,25 +99,35 @@ fn throughput_benchmark_parameters<Scalar: UnsignedInteger>(
fn multi_bit_benchmark_parameters<Scalar: UnsignedInteger + Default>(
) -> Vec<(String, CryptoParametersRecord<Scalar>, LweBskGroupingFactor)> {
if Scalar::BITS == 64 {
vec![
PARAM_MULTI_BIT_MESSAGE_1_CARRY_1_GROUP_2_KS_PBS,
PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_2_KS_PBS,
PARAM_MULTI_BIT_MESSAGE_3_CARRY_3_GROUP_2_KS_PBS,
PARAM_MULTI_BIT_MESSAGE_1_CARRY_1_GROUP_3_KS_PBS,
PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS,
PARAM_MULTI_BIT_MESSAGE_3_CARRY_3_GROUP_3_KS_PBS,
]
.iter()
.map(|params| {
(
params.name(),
<MultiBitPBSParameters as Into<PBSParameters>>::into(*params)
.to_owned()
.into(),
params.grouping_factor,
)
})
.collect()
let parameters = if cfg!(feature = "gpu") {
vec![
PARAM_MULTI_BIT_MESSAGE_1_CARRY_1_GROUP_3_KS_PBS,
PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS,
PARAM_MULTI_BIT_MESSAGE_3_CARRY_3_GROUP_3_KS_PBS,
]
} else {
vec![
PARAM_MULTI_BIT_MESSAGE_1_CARRY_1_GROUP_2_KS_PBS,
PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_2_KS_PBS,
PARAM_MULTI_BIT_MESSAGE_3_CARRY_3_GROUP_2_KS_PBS,
PARAM_MULTI_BIT_MESSAGE_1_CARRY_1_GROUP_3_KS_PBS,
PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS,
PARAM_MULTI_BIT_MESSAGE_3_CARRY_3_GROUP_3_KS_PBS,
]
};
parameters
.iter()
.map(|params| {
(
params.name(),
<MultiBitPBSParameters as Into<PBSParameters>>::into(*params)
.to_owned()
.into(),
params.grouping_factor,
)
})
.collect()
} else {
// For now there are no parameters available to test multi bit PBS on 32 bits.
vec![]
@@ -547,3 +535,294 @@ fn pbs_throughput<Scalar: UnsignedTorus + CastInto<usize> + Sync + Send + Serial
}
}
}
#[cfg(feature = "gpu")]
mod cuda {
use super::{benchmark_parameters, multi_bit_benchmark_parameters};
use crate::utilities::{write_to_json, OperatorType};
use criterion::{black_box, criterion_group, Criterion};
use serde::Serialize;
use tfhe::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use tfhe::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey;
use tfhe::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use tfhe::core_crypto::gpu::lwe_multi_bit_bootstrap_key::CudaLweMultiBitBootstrapKey;
use tfhe::core_crypto::gpu::{
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext,
cuda_programmable_bootstrap_lwe_ciphertext, CudaDevice, CudaStream,
};
use tfhe::core_crypto::prelude::*;
fn cuda_pbs<Scalar: UnsignedTorus + CastInto<usize> + Serialize>(c: &mut Criterion) {
let bench_name = "cuda::pbs";
let mut bench_group = c.benchmark_group(bench_name);
// Create the PRNG
let mut seeder = new_seeder();
let seeder = seeder.as_mut();
let mut encryption_generator =
EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(seeder.seed(), seeder);
let mut secret_generator =
SecretRandomGenerator::<ActivatedRandomGenerator>::new(seeder.seed());
let gpu_index = 0;
let device = CudaDevice::new(gpu_index);
let stream = CudaStream::new_unchecked(device);
for (name, params) in benchmark_parameters::<Scalar>().iter() {
// Create the LweSecretKey
let input_lwe_secret_key = allocate_and_generate_new_binary_lwe_secret_key(
params.lwe_dimension.unwrap(),
&mut secret_generator,
);
let output_glwe_secret_key: GlweSecretKeyOwned<Scalar> =
allocate_and_generate_new_binary_glwe_secret_key(
params.glwe_dimension.unwrap(),
params.polynomial_size.unwrap(),
&mut secret_generator,
);
let output_lwe_secret_key = output_glwe_secret_key.into_lwe_secret_key();
let bsk = LweBootstrapKey::new(
Scalar::ZERO,
params.glwe_dimension.unwrap().to_glwe_size(),
params.polynomial_size.unwrap(),
params.pbs_base_log.unwrap(),
params.pbs_level.unwrap(),
params.lwe_dimension.unwrap(),
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
);
let bsk_gpu = CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, &stream);
// Allocate a new LweCiphertext and encrypt our plaintext
let lwe_ciphertext_in = allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key,
Plaintext(Scalar::ZERO),
params.lwe_modular_std_dev.unwrap(),
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
&mut encryption_generator,
);
let lwe_ciphertext_in_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &stream);
let accumulator = GlweCiphertext::new(
Scalar::ZERO,
params.glwe_dimension.unwrap().to_glwe_size(),
params.polynomial_size.unwrap(),
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
);
let accumulator_gpu =
CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream);
// Allocate the LweCiphertext to store the result of the PBS
let mut out_pbs_ct = LweCiphertext::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
);
let mut out_pbs_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&out_pbs_ct, &stream);
let h_indexes = &[Scalar::ZERO];
stream.synchronize();
let mut d_input_indexes = unsafe { stream.malloc_async::<Scalar>(1u32) };
let mut d_output_indexes = unsafe { stream.malloc_async::<Scalar>(1u32) };
let mut d_lut_indexes = unsafe { stream.malloc_async::<Scalar>(1u32) };
unsafe {
stream.copy_to_gpu_async(&mut d_input_indexes, h_indexes.as_ref());
stream.copy_to_gpu_async(&mut d_output_indexes, h_indexes.as_ref());
stream.copy_to_gpu_async(&mut d_lut_indexes, h_indexes.as_ref());
}
stream.synchronize();
let id = format!("{bench_name}_{name}");
{
bench_group.bench_function(&id, |b| {
b.iter(|| {
cuda_programmable_bootstrap_lwe_ciphertext(
&lwe_ciphertext_in_gpu,
&mut out_pbs_ct_gpu,
&accumulator_gpu,
&d_lut_indexes,
&d_output_indexes,
&d_input_indexes,
LweCiphertextCount(1),
&bsk_gpu,
&stream,
);
black_box(&mut out_pbs_ct_gpu);
})
});
}
let bit_size = (params.message_modulus.unwrap_or(2) as u32).ilog2();
write_to_json(
&id,
*params,
name,
"pbs",
&OperatorType::Atomic,
bit_size,
vec![bit_size],
);
}
}
fn cuda_multi_bit_pbs<
Scalar: UnsignedTorus + CastInto<usize> + CastFrom<usize> + Default + Serialize + Sync,
>(
c: &mut Criterion,
) {
let bench_name = "cuda::pbs";
let mut bench_group = c.benchmark_group(bench_name);
// Create the PRNG
let mut seeder = new_seeder();
let seeder = seeder.as_mut();
let mut encryption_generator =
EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(seeder.seed(), seeder);
let mut secret_generator =
SecretRandomGenerator::<ActivatedRandomGenerator>::new(seeder.seed());
let gpu_index = 0;
let device = CudaDevice::new(gpu_index);
let stream = CudaStream::new_unchecked(device);
for (name, params, grouping_factor) in multi_bit_benchmark_parameters::<Scalar>().iter() {
// Create the LweSecretKey
let input_lwe_secret_key = allocate_and_generate_new_binary_lwe_secret_key(
params.lwe_dimension.unwrap(),
&mut secret_generator,
);
let output_glwe_secret_key: GlweSecretKeyOwned<Scalar> =
allocate_and_generate_new_binary_glwe_secret_key(
params.glwe_dimension.unwrap(),
params.polynomial_size.unwrap(),
&mut secret_generator,
);
let output_lwe_secret_key = output_glwe_secret_key.into_lwe_secret_key();
let multi_bit_bsk = LweMultiBitBootstrapKey::new(
Scalar::ZERO,
params.glwe_dimension.unwrap().to_glwe_size(),
params.polynomial_size.unwrap(),
params.pbs_base_log.unwrap(),
params.pbs_level.unwrap(),
params.lwe_dimension.unwrap(),
*grouping_factor,
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
);
let multi_bit_bsk_gpu = CudaLweMultiBitBootstrapKey::from_lwe_multi_bit_bootstrap_key(
&multi_bit_bsk,
&stream,
);
// Allocate a new LweCiphertext and encrypt our plaintext
let lwe_ciphertext_in = allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key,
Plaintext(Scalar::ZERO),
params.lwe_modular_std_dev.unwrap(),
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
&mut encryption_generator,
);
let lwe_ciphertext_in_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &stream);
let accumulator = GlweCiphertext::new(
Scalar::ZERO,
params.glwe_dimension.unwrap().to_glwe_size(),
params.polynomial_size.unwrap(),
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
);
let accumulator_gpu =
CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream);
// Allocate the LweCiphertext to store the result of the PBS
let mut out_pbs_ct = LweCiphertext::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
);
let mut out_pbs_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&out_pbs_ct, &stream);
let h_indexes = &[Scalar::ZERO];
stream.synchronize();
let mut d_input_indexes = unsafe { stream.malloc_async::<Scalar>(1u32) };
let mut d_output_indexes = unsafe { stream.malloc_async::<Scalar>(1u32) };
let mut d_lut_indexes = unsafe { stream.malloc_async::<Scalar>(1u32) };
unsafe {
stream.copy_to_gpu_async(&mut d_input_indexes, h_indexes.as_ref());
stream.copy_to_gpu_async(&mut d_output_indexes, h_indexes.as_ref());
stream.copy_to_gpu_async(&mut d_lut_indexes, h_indexes.as_ref());
}
stream.synchronize();
let id = format!("{bench_name}_{name}");
bench_group.bench_function(&id, |b| {
b.iter(|| {
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext(
&lwe_ciphertext_in_gpu,
&mut out_pbs_ct_gpu,
&accumulator_gpu,
&d_lut_indexes,
&d_output_indexes,
&d_input_indexes,
&multi_bit_bsk_gpu,
&stream,
);
black_box(&mut out_pbs_ct_gpu);
})
});
let bit_size = params.message_modulus.unwrap().ilog2();
write_to_json(
&id,
*params,
name,
"pbs",
&OperatorType::Atomic,
bit_size,
vec![bit_size],
);
}
}
criterion_group!(
name = cuda_pbs_group;
config = Criterion::default().sample_size(2000);
targets = cuda_pbs::<u64>
);
criterion_group!(
name = cuda_multi_bit_pbs_group;
config = Criterion::default().sample_size(2000);
targets = cuda_multi_bit_pbs::<u64>
);
}
#[cfg(feature = "gpu")]
use cuda::{cuda_multi_bit_pbs_group, cuda_pbs_group};
criterion_group!(
name = pbs_group;
config = Criterion::default().sample_size(2000);
targets = mem_optimized_pbs::<u64>, mem_optimized_pbs::<u32>
);
criterion_group!(
name = multi_bit_pbs_group;
config = Criterion::default().sample_size(2000);
targets = multi_bit_pbs::<u64>,
multi_bit_pbs::<u32>,
multi_bit_deterministic_pbs::<u64>,
multi_bit_deterministic_pbs::<u32>,
);
criterion_group!(
name = pbs_throughput_group;
config = Criterion::default().sample_size(100);
targets = pbs_throughput::<u64>, pbs_throughput::<u32>
);
#[cfg(not(feature = "gpu"))]
criterion_main!(pbs_group, multi_bit_pbs_group, pbs_throughput_group);
#[cfg(feature = "gpu")]
criterion_main!(cuda_pbs_group, cuda_multi_bit_pbs_group);

View File

@@ -3,7 +3,7 @@
#[path = "../utilities.rs"]
mod utilities;
use crate::utilities::{write_to_json, OperatorType};
use crate::utilities::{write_to_json, EnvConfig, OperatorType};
use std::env;
use criterion::{criterion_group, Criterion};
@@ -11,8 +11,9 @@ use itertools::iproduct;
use rand::prelude::*;
use rand::Rng;
use std::vec::IntoIter;
use tfhe::core_crypto::algorithms::misc::divide_ceil;
use tfhe::integer::keycache::KEY_CACHE;
use tfhe::integer::{IntegerKeyKind, RadixCiphertext, ServerKey};
use tfhe::integer::{IntegerKeyKind, RadixCiphertext, RadixClientKey, ServerKey};
use tfhe::keycache::NamedParam;
use tfhe::integer::U256;
@@ -28,9 +29,6 @@ use tfhe::shortint::parameters::{
/// It must be as big as the largest bit size tested
type ScalarType = U256;
const FAST_BENCH_BIT_SIZES: [usize; 1] = [32];
const BENCH_BIT_SIZES: [usize; 7] = [8, 16, 32, 40, 64, 128, 256];
fn gen_random_u256(rng: &mut ThreadRng) -> U256 {
let clearlow = rng.gen::<u128>();
let clearhigh = rng.gen::<u128>();
@@ -48,37 +46,15 @@ struct ParamsAndNumBlocksIter {
impl Default for ParamsAndNumBlocksIter {
fn default() -> Self {
let is_multi_bit = match env::var("__TFHE_RS_BENCH_TYPE") {
Ok(val) => val.to_lowercase() == "multi_bit",
Err(_) => false,
};
let env_config = EnvConfig::new();
let is_fast_bench = match env::var("__TFHE_RS_FAST_BENCH") {
Ok(val) => val.to_lowercase() == "true",
Err(_) => false,
};
let bit_sizes = if is_fast_bench {
FAST_BENCH_BIT_SIZES.to_vec()
} else {
BENCH_BIT_SIZES.to_vec()
};
if is_multi_bit {
if env_config.is_multi_bit {
#[cfg(feature = "gpu")]
let params = vec![PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS.into()];
#[cfg(not(feature = "gpu"))]
let params = vec![PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_2_KS_PBS.into()];
let bit_sizes = if is_fast_bench {
vec![32]
} else if cfg!(feature = "gpu") {
BENCH_BIT_SIZES.to_vec()
} else {
vec![8, 16, 32, 40, 64]
};
let params_and_bit_sizes = iproduct!(params, bit_sizes);
let params_and_bit_sizes = iproduct!(params, env_config.bit_sizes());
Self {
params_and_bit_sizes,
}
@@ -91,7 +67,7 @@ impl Default for ParamsAndNumBlocksIter {
// PARAM_MESSAGE_4_CARRY_4_KS_PBS.into(),
];
let params_and_bit_sizes = iproduct!(params, bit_sizes);
let params_and_bit_sizes = iproduct!(params, env_config.bit_sizes());
Self {
params_and_bit_sizes,
}
@@ -566,6 +542,65 @@ fn if_then_else_parallelized(c: &mut Criterion) {
bench_group.finish()
}
fn ciphertexts_sum_parallelized(c: &mut Criterion) {
let bench_name = "integer::sum_ciphertexts_parallelized";
let display_name = "sum_ctxts";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
let mut rng = rand::thread_rng();
for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
let param_name = param.name();
let max_for_bit_size = ScalarType::MAX >> (ScalarType::BITS as usize - bit_size);
for len in [5, 10, 20] {
let bench_id = format!("{bench_name}_{len}_ctxts::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let nb_ctxt = divide_ceil(bit_size, param.message_modulus().0.ilog2() as usize);
let cks = RadixClientKey::from((cks, nb_ctxt));
let encrypt_values = || {
let clears = (0..len)
.map(|_| gen_random_u256(&mut rng) & max_for_bit_size)
.collect::<Vec<_>>();
// encryption of integers
let ctxts = clears
.iter()
.copied()
.map(|clear| cks.encrypt(clear))
.collect::<Vec<_>>();
ctxts
};
b.iter_batched(
encrypt_values,
|ctxts| sks.sum_ciphertexts_parallelized(&ctxts),
criterion::BatchSize::SmallInput,
)
});
write_to_json::<u64, _>(
&bench_id,
param,
param.name(),
display_name,
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus().0.ilog2(); num_block],
);
}
}
bench_group.finish()
}
macro_rules! define_server_key_bench_unary_fn (
(method_name: $server_key_method:ident, display_name:$name:ident) => {
fn $server_key_method(c: &mut Criterion) {
@@ -1036,12 +1071,6 @@ define_server_key_bench_unary_default_fn!(method_name: abs_parallelized, display
define_server_key_bench_unary_default_fn!(method_name: unchecked_abs_parallelized, display_name: abs);
define_server_key_bench_unary_fn!(method_name: full_propagate, display_name: carry_propagation);
define_server_key_bench_unary_fn!(
method_name: full_propagate_parallelized,
display_name: carry_propagation
);
define_server_key_bench_default_fn!(method_name: unchecked_max, display_name: max);
define_server_key_bench_default_fn!(method_name: unchecked_min, display_name: min);
define_server_key_bench_default_fn!(method_name: unchecked_eq, display_name: equal);
@@ -1927,6 +1956,7 @@ criterion_group!(
right_shift_parallelized,
rotate_left_parallelized,
rotate_right_parallelized,
ciphertexts_sum_parallelized,
);
criterion_group!(
@@ -2086,6 +2116,92 @@ criterion_group!(
unchecked_scalar_ge_parallelized,
);
//================================================================================
// Miscellaneous Benches
//================================================================================
fn bench_server_key_cast_function<F>(
c: &mut Criterion,
bench_name: &str,
display_name: &str,
cast_op: F,
) where
F: Fn(&ServerKey, RadixCiphertext, usize),
{
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(30));
let mut rng = rand::thread_rng();
let env_config = EnvConfig::new();
for (param, num_blocks, bit_size) in ParamsAndNumBlocksIter::default() {
let all_num_blocks = env_config
.bit_sizes()
.iter()
.copied()
.map(|bit| divide_ceil(bit, param.message_modulus().0.ilog2() as usize))
.collect::<Vec<_>>();
let param_name = param.name();
for target_num_blocks in all_num_blocks.iter().copied() {
let target_bit_size = target_num_blocks * param.message_modulus().0.ilog2() as usize;
let bench_id = format!("{bench_name}::{param_name}::{bit_size}_to_{target_bit_size}");
bench_group.bench_function(&bench_id, |b| {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let encrypt_one_value = || cks.encrypt_radix(gen_random_u256(&mut rng), num_blocks);
b.iter_batched(
encrypt_one_value,
|ct| {
cast_op(&sks, ct, target_num_blocks);
},
criterion::BatchSize::SmallInput,
)
});
write_to_json::<u64, _>(
&bench_id,
param,
param.name(),
display_name,
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus().0.ilog2(); num_blocks],
);
}
}
bench_group.finish()
}
macro_rules! define_server_key_bench_cast_fn (
(method_name: $server_key_method:ident, display_name:$name:ident) => {
fn $server_key_method(c: &mut Criterion) {
bench_server_key_cast_function(
c,
concat!("integer::", stringify!($server_key_method)),
stringify!($name),
|server_key, lhs, rhs| {
server_key.$server_key_method(lhs, rhs);
})
}
}
);
define_server_key_bench_cast_fn!(method_name: cast_to_unsigned, display_name: cast_to_unsigned);
define_server_key_bench_cast_fn!(method_name: cast_to_signed, display_name: cast_to_signed);
criterion_group!(cast_ops, cast_to_unsigned, cast_to_signed);
define_server_key_bench_unary_fn!(method_name: full_propagate, display_name: carry_propagation);
define_server_key_bench_unary_fn!(
method_name: full_propagate_parallelized,
display_name: carry_propagation
);
criterion_group!(misc, full_propagate, full_propagate_parallelized);
#[cfg(feature = "gpu")]
@@ -2109,7 +2225,8 @@ fn go_through_cpu_bench_groups(val: &str) {
default_parallelized_ops();
default_parallelized_ops_comp();
default_scalar_parallelized_ops();
default_scalar_parallelized_ops_comp()
default_scalar_parallelized_ops_comp();
cast_ops()
}
"smart" => {
smart_ops();
@@ -2143,7 +2260,8 @@ fn main() {
default_parallelized_ops();
default_parallelized_ops_comp();
default_scalar_parallelized_ops();
default_scalar_parallelized_ops_comp()
default_scalar_parallelized_ops_comp();
cast_ops()
}
};

View File

@@ -1,7 +1,7 @@
#[path = "../utilities.rs"]
mod utilities;
use crate::utilities::{write_to_json, OperatorType};
use crate::utilities::{write_to_json, EnvConfig, OperatorType};
use std::env;
use criterion::{criterion_group, Criterion};
@@ -9,6 +9,7 @@ use itertools::iproduct;
use rand::prelude::*;
use rand::Rng;
use std::vec::IntoIter;
use tfhe::core_crypto::algorithms::misc::divide_ceil;
use tfhe::integer::keycache::KEY_CACHE;
use tfhe::integer::{IntegerKeyKind, RadixCiphertext, ServerKey, SignedRadixCiphertext, I256};
use tfhe::keycache::NamedParam;
@@ -34,26 +35,12 @@ struct ParamsAndNumBlocksIter {
impl Default for ParamsAndNumBlocksIter {
fn default() -> Self {
let is_multi_bit = match env::var("__TFHE_RS_BENCH_TYPE") {
Ok(val) => val.to_lowercase() == "multi_bit",
Err(_) => false,
};
let env_config = EnvConfig::new();
let is_fast_bench = match env::var("__TFHE_RS_FAST_BENCH") {
Ok(val) => val.to_lowercase() == "true",
Err(_) => false,
};
if is_multi_bit {
if env_config.is_multi_bit {
let params = vec![PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_2_KS_PBS.into()];
let bit_sizes = if is_fast_bench {
vec![32]
} else {
vec![8, 16, 32, 40, 64]
};
let params_and_bit_sizes = iproduct!(params, bit_sizes);
let params_and_bit_sizes = iproduct!(params, env_config.bit_sizes());
Self {
params_and_bit_sizes,
}
@@ -66,13 +53,7 @@ impl Default for ParamsAndNumBlocksIter {
// PARAM_MESSAGE_4_CARRY_4_KS_PBS.into(),
];
let bit_sizes = if is_fast_bench {
vec![32]
} else {
vec![8, 16, 32, 40, 64, 128, 256]
};
let params_and_bit_sizes = iproduct!(params, bit_sizes);
let params_and_bit_sizes = iproduct!(params, env_config.bit_sizes());
Self {
params_and_bit_sizes,
}
@@ -1125,6 +1106,83 @@ criterion_group!(
unchecked_scalar_min_parallelized,
);
fn bench_server_key_signed_cast_function<F>(
c: &mut Criterion,
bench_name: &str,
display_name: &str,
cast_op: F,
) where
F: Fn(&ServerKey, SignedRadixCiphertext, usize),
{
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(30));
let mut rng = rand::thread_rng();
let env_config = EnvConfig::new();
for (param, num_blocks, bit_size) in ParamsAndNumBlocksIter::default() {
let all_num_blocks = env_config
.bit_sizes()
.iter()
.copied()
.map(|bit| divide_ceil(bit, param.message_modulus().0.ilog2() as usize))
.collect::<Vec<_>>();
let param_name = param.name();
for target_num_blocks in all_num_blocks.iter().copied() {
let target_bit_size = target_num_blocks * param.message_modulus().0.ilog2() as usize;
let bench_id = format!("{bench_name}::{param_name}::{bit_size}_to_{target_bit_size}");
bench_group.bench_function(&bench_id, |b| {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let encrypt_one_value =
|| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_blocks);
b.iter_batched(
encrypt_one_value,
|ct| {
cast_op(&sks, ct, target_num_blocks);
},
criterion::BatchSize::SmallInput,
)
});
write_to_json::<u64, _>(
&bench_id,
param,
param.name(),
display_name,
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus().0.ilog2(); num_blocks],
);
}
}
bench_group.finish()
}
macro_rules! define_server_key_bench_cast_fn (
(method_name: $server_key_method:ident, display_name:$name:ident) => {
fn $server_key_method(c: &mut Criterion) {
bench_server_key_signed_cast_function(
c,
concat!("integer::signed::", stringify!($server_key_method)),
stringify!($name),
|server_key, lhs, rhs| {
server_key.$server_key_method(lhs, rhs);
})
}
}
);
define_server_key_bench_cast_fn!(method_name: cast_to_unsigned, display_name: cast_to_unsigned);
define_server_key_bench_cast_fn!(method_name: cast_to_signed, display_name: cast_to_signed);
criterion_group!(cast_ops, cast_to_unsigned, cast_to_signed);
fn main() {
match env::var("__TFHE_RS_BENCH_OP_FLAVOR") {
Ok(val) => {
@@ -1133,7 +1191,8 @@ fn main() {
default_parallelized_ops();
default_parallelized_ops_comp();
default_scalar_parallelized_ops();
default_scalar_parallelized_ops_comp()
default_scalar_parallelized_ops_comp();
cast_ops()
}
"unchecked" => {
unchecked_ops();
@@ -1147,6 +1206,7 @@ fn main() {
Err(_) => {
default_parallelized_ops();
default_scalar_parallelized_ops();
cast_ops()
}
};

View File

@@ -1,6 +1,6 @@
use serde::Serialize;
use std::fs;
use std::path::PathBuf;
use std::{env, fs};
#[cfg(feature = "boolean")]
use tfhe::boolean::parameters::BooleanParameters;
use tfhe::core_crypto::prelude::*;
@@ -226,6 +226,54 @@ pub fn write_to_json<
fs::write(params_directory, serde_json::to_string(&record).unwrap()).unwrap();
}
const FAST_BENCH_BIT_SIZES: [usize; 1] = [32];
const BENCH_BIT_SIZES: [usize; 7] = [8, 16, 32, 40, 64, 128, 256];
/// User configuration in which benchmarks must be run.
#[derive(Default)]
pub struct EnvConfig {
pub is_multi_bit: bool,
pub is_fast_bench: bool,
}
impl EnvConfig {
#[allow(dead_code)]
pub fn new() -> Self {
let is_multi_bit = match env::var("__TFHE_RS_BENCH_TYPE") {
Ok(val) => val.to_lowercase() == "multi_bit",
Err(_) => false,
};
let is_fast_bench = match env::var("__TFHE_RS_FAST_BENCH") {
Ok(val) => val.to_lowercase() == "true",
Err(_) => false,
};
EnvConfig {
is_multi_bit,
is_fast_bench,
}
}
/// Get precisions values to benchmark.
#[allow(dead_code)]
pub fn bit_sizes(&self) -> Vec<usize> {
if self.is_multi_bit {
if self.is_fast_bench {
FAST_BENCH_BIT_SIZES.to_vec()
} else if cfg!(feature = "gpu") {
BENCH_BIT_SIZES.to_vec()
} else {
vec![8, 16, 32, 40, 64]
}
} else if self.is_fast_bench {
FAST_BENCH_BIT_SIZES.to_vec()
} else {
BENCH_BIT_SIZES.to_vec()
}
}
}
// Empty main to please clippy.
#[allow(dead_code)]
pub fn main() {}

View File

@@ -55,6 +55,8 @@ fn gen_c_api() {
"shortint",
#[cfg(feature = "integer")]
"integer",
#[cfg(feature = "gpu")]
"gpu",
];
let parse_expand_vec = if parse_expand_features_vec.is_empty() {

View File

@@ -7,6 +7,8 @@ if(NOT CARGO_PROFILE)
endif()
set(TFHE_C_API_RELEASE "${CMAKE_CURRENT_SOURCE_DIR}/../../target/${CARGO_PROFILE}")
option(WITH_FEATURE_GPU "Enable if tfhe-rs C API was compiled with the 'gpu' feature activated" OFF)
include_directories(${TFHE_C_API_RELEASE})
# This one is to fetch the dynamic buffer header
include_directories(${TFHE_C_API_RELEASE}/deps)
@@ -22,6 +24,11 @@ if(APPLE)
endif()
endif()
if (WITH_FEATURE_GPU)
find_package(CUDAToolkit 10.0 REQUIRED)
find_package(OpenMP REQUIRED)
endif()
file(GLOB TEST_CASES test_*.c)
foreach (testsourcefile ${TEST_CASES})
get_filename_component(testname ${testsourcefile} NAME_WLE)
@@ -34,6 +41,12 @@ foreach (testsourcefile ${TEST_CASES})
)
target_include_directories(${testname} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(${testname} LINK_PUBLIC Tfhe TfheDynamicBuffer m pthread dl)
if (WITH_FEATURE_GPU)
target_link_libraries(${testname} LINK_PUBLIC CUDA::cudart -lstdc++ OpenMP::OpenMP_CXX)
target_compile_definitions(${testname} PUBLIC -DWITH_FEATURE_GPU)
endif()
if(APPLE)
target_link_libraries(${testname} LINK_PUBLIC ${SECURITY_FRAMEWORK})
endif()

View File

@@ -0,0 +1,103 @@
#if defined(WITH_FEATURE_GPU)
#include <tfhe.h>
#include <assert.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdint.h>
int uint8_client_key(const ClientKey *client_key) {
int ok;
FheUint8 *lhs = NULL;
FheUint8 *rhs = NULL;
FheUint8 *result = NULL;
uint8_t lhs_clear = 123;
uint8_t rhs_clear = 14;
ok = fhe_uint8_try_encrypt_with_client_key_u8(lhs_clear, client_key, &lhs);
assert(ok == 0);
ok = fhe_uint8_try_encrypt_with_client_key_u8(rhs_clear, client_key, &rhs);
assert(ok == 0);
uint8_t clear;
// Check addition
{
ok = fhe_uint8_add(lhs, rhs, &result);
assert(ok == 0);
ok = fhe_uint8_decrypt(result, client_key, &clear);
assert(ok == 0);
assert(clear == (lhs_clear + rhs_clear));
}
// Check sum
{
FheUint8 *sum_result;
const FheUint8 *data[2] = {lhs, rhs};
ok = fhe_uint8_sum(data, 2, &sum_result);
assert(ok == 0);
clear = 0;
ok = fhe_uint8_decrypt(sum_result, client_key, &clear);
assert(ok == 0);
assert(clear == (lhs_clear + rhs_clear));
fhe_uint8_destroy(sum_result);
}
fhe_uint8_destroy(lhs);
fhe_uint8_destroy(rhs);
fhe_uint8_destroy(result);
return ok;
}
int main(void) {
int ok = 0;
{
ConfigBuilder *builder;
Config *config;
ok = config_builder_default(&builder);
assert(ok == 0);
ok = config_builder_build(builder, &config);
assert(ok == 0);
ClientKey *client_key = NULL;
CompressedServerKey *compressed_sks = NULL;
CudaServerKey *cuda_server_key = NULL;
ok = client_key_generate(config, &client_key);
assert(ok == 0);
ok = compressed_server_key_new(client_key, &compressed_sks);
assert(ok == 0);
ok = compressed_server_key_decompress_to_gpu(compressed_sks, &cuda_server_key);
assert(ok == 0);
ok = set_cuda_server_key(cuda_server_key);
assert(ok == 0);
uint8_client_key(client_key);
client_key_destroy(client_key);
compressed_server_key_destroy(compressed_sks);
cuda_server_key_destroy(cuda_server_key);
}
return ok;
}
#else
#include <stdio.h>
int main(void) {
fputs("tfhe-rs was not compiled with gpu support\n", stdout);
return 0;
}
#endif
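For comparison, the high-level Rust flow this C test mirrors would look roughly like the sketch below. It assumes the `gpu` feature and the `CompressedServerKey::new` / `decompress_to_gpu` / `set_server_key` calls that the new bindings (`compressed_server_key_new`, `compressed_server_key_decompress_to_gpu`, `set_cuda_server_key`) appear to wrap; check the crate documentation for the exact names in your version.

```rust
// Sketch only (assumes the tfhe `gpu` feature); mirrors the C test above.
use tfhe::prelude::*;
use tfhe::{set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheUint8};

fn main() {
    let config = ConfigBuilder::default().build();
    let client_key = ClientKey::generate(config);
    let compressed_server_key = CompressedServerKey::new(&client_key);

    // Decompress the server key directly onto the GPU and install it.
    let gpu_key = compressed_server_key.decompress_to_gpu();
    set_server_key(gpu_key);

    let a = FheUint8::encrypt(123u8, &client_key);
    let b = FheUint8::encrypt(14u8, &client_key);
    let result = a + b;

    let clear: u8 = result.decrypt(&client_key);
    assert_eq!(clear, 123 + 14);
}
```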

View File

@@ -45,7 +45,7 @@ usize_is_size_t = true
[defines]
# "target_os = freebsd" = "DEFINE_FREEBSD"
# "feature = serde" = "DEFINE_SERDE"
"feature = gpu" = "WITH_FEATURE_GPU"
[export]

Binary files not shown: 17 new image assets added (sizes ranging from 3.8 KiB to 242 KiB).

View File

@@ -1,36 +1,42 @@
# What is TFHE-rs?
# Welcome to TFHE-rs
📁 [Github](https://github.com/zama-ai/tfhe-rs) | 💛 [Community support](https://zama.ai/community) | 🟨 [Zama Bounty Program](https://github.com/zama-ai/bounty-program)
⭐️ [Star the repo on Github](https://github.com/zama-ai/tfhe-rs) | 📚 [FHE resources by Zama](https://github.com/zama-ai/awesome-zama/tree/main) | 💬 [Community support](https://community.zama.ai/)
![](\_static/tfhe-rs-doc-home.png)
<figure><img src=".gitbook/assets/doc header.png" alt=""><figcaption></figcaption></figure>
TFHE-rs is a pure Rust implementation of TFHE for Boolean and integer arithmetics over encrypted data. It includes a Rust and C API, as well as a client-side WASM API.
TFHE-rs is meant for developers and researchers who want full control over what they can do with TFHE, while not worrying about the low level implementation.
### Start here
The goal is to have a stable, simple, high-performance, and production-ready library for all the advanced features of TFHE.
Learn the basics of TFHE-rs, set it up, and make it run with ease.
## Key cryptographic concepts
<table data-view="cards"><thead><tr><th align="center"></th><th data-hidden data-card-target data-type="content-ref"></th></tr></thead><tbody><tr><td align="center">What is TFHE-rs?</td><td><a href="getting-started/what-is-tfhe-rs.md">what-is-tfhe-rs.md</a></td></tr><tr><td align="center">Installation</td><td><a href="getting_started/installation.md">installation.md</a></td></tr><tr><td align="center">Quick start</td><td><a href="getting_started/quick_start.md">quick_start.md</a></td></tr></tbody></table>
The TFHE-rs library implements Zama's variant of Fully Homomorphic Encryption over the Torus (TFHE). TFHE is based on Learning With Errors (LWE), a well-studied cryptographic primitive believed to be secure even against quantum computers.
### Build with TFHE-rs
In cryptography, a raw value is called a message (also sometimes called a cleartext), while an encoded message is called a plaintext and an encrypted plaintext is called a ciphertext.
Start building with TFHE-rs by exploring its core features, discovering essential guides, and learning more with user-friendly tutorials.
The idea of homomorphic encryption is that you can compute on ciphertexts while not knowing messages encrypted within them. A scheme is said to be _fully homomorphic_, meaning any program can be evaluated with it, if at least two of the following operations are supported ($$x$$ is a plaintext and $$E[x]$$ is the corresponding ciphertext):
<table data-view="cards"><thead><tr><th></th><th></th><th></th><th data-hidden data-card-cover data-type="files"></th><th data-hidden data-card-target data-type="content-ref"></th></tr></thead><tbody><tr><td><strong>Fundamentals</strong></td><td>Explore the core features and basics of TFHE-rs.</td><td><ul><li><a href="fundamentals/configure-and-create-keys.md">Configure and create keys</a></li><li><a href="fundamentals/set-the-server-key.md">Set the server key</a></li><li><a href="fundamentals/encrypt-data.md">Encrypt data</a></li><li><a href="fundamentals/compute-and-decrypt.md">Compute and decrypt</a></li></ul></td><td><a href=".gitbook/assets/3.png">3.png</a></td><td></td></tr><tr><td><strong>Guides</strong></td><td>Discover essential guides to work with TFHE-rs.</td><td><ul><li><a href="guides/run_on_gpu.md">Run on GPU</a></li><li><a href="guides/rust_configuration.md">Configure Rust</a></li><li><a href="guides/overflow_operations.md">Detect overflow</a></li><li><a href="guides/trait_bounds.md">Generic function bounds</a></li></ul></td><td><a href=".gitbook/assets/2 (1).png">2 (1).png</a></td><td></td></tr><tr><td><strong>Tutorials</strong></td><td>Learn more about TFHE-rs with our tutorials.</td><td><p></p><ul><li><a href="tutorials/see-all-tutorials.md#start-here">Start here</a></li><li><a href="tutorials/see-all-tutorials.md#go-further">Go further</a></li><li><a href="tutorials/see-all-tutorials.md">See all tutorials</a></li></ul></td><td><a href=".gitbook/assets/1 (1).png">1 (1).png</a></td><td></td></tr></tbody></table>
* homomorphic univariate function evaluation: $$f(E[x]) = E[f(x)]$$
* homomorphic addition: $$E[x] + E[y] = E[x + y]$$
* homomorphic multiplication: $$E[x] * E[y] = E[x * y]$$
### References
Zama's variant of TFHE is fully homomorphic and deals with fixed-precision numbers as messages. It implements all needed homomorphic operations, such as addition and function evaluation via **Programmable Bootstrapping**. You can read more about Zama's TFHE variant in the [preliminary whitepaper](https://whitepaper.zama.ai/).
Take a deep dive into TFHE-rs, exploring APIs from the highest to the lowest level of abstraction.
Using FHE in a Rust program with TFHE-rs consists of:
<table data-view="cards"><thead><tr><th></th><th></th><th data-hidden data-card-target data-type="content-ref"></th></tr></thead><tbody><tr><td><strong>API References</strong></td><td>High-level API that abstracts cryptographic complexities and simplifies the development process</td><td><a href="https://docs.rs/tfhe/latest/tfhe/">https://docs.rs/tfhe/latest/tfhe/</a></td></tr><tr><td><strong>Fine-grained APIs</strong></td><td>Mid-level API that enables evaluation of Boolean, short integer, and integer circuits</td><td><a href="references/fine-grained-apis/">fine-grained-apis</a></td></tr><tr><td><strong>Crypto core API</strong></td><td>Low-level API with the primitive functions and types of the TFHE scheme</td><td><a href="references/crypto-core-api/">crypto-core-api</a></td></tr></tbody></table>
* generating a client key and a server key using secure parameters:
* a client key encrypts/decrypts data and must be kept secret
* a server key is used to perform operations on encrypted data and could be public (also called an evaluation key)
* encrypting plaintexts using the client key to produce ciphertexts
* operating homomorphically on ciphertexts with the server key
* decrypting the resulting ciphertexts into plaintexts using the client key
### Support
If you would like to know more about the problems that FHE solves, we suggest you review our [6 minute introduction to homomorphic encryption](https://6min.zama.ai/).
Our team of experts usually answers within 24 hours on working days.
<table data-card-size="large" data-view="cards"><thead><tr><th></th><th></th><th data-hidden data-card-target data-type="content-ref"></th></tr></thead><tbody><tr><td>💬 <strong>Community Forum</strong></td><td>Ask technical questions to the Zama team and find solutions to common issues</td><td><a href="https://community.zama.ai/">https://community.zama.ai/</a></td></tr><tr><td>👾 <strong>Discord Channel</strong></td><td>Discuss FHE-related topics with the FHE community in real-time</td><td><a href="https://discord.com/invite/fhe-org">https://discord.com/invite/fhe-org</a></td></tr></tbody></table>
### Developers
* [Contribute to TFHE-rs](dev/contributing.md)
* Read the [latest release note](https://github.com/zama-ai/tfhe-rs/releases)
* Request a [feature](https://github.com/zama-ai/tfhe-rs/issues/new?assignees=\&labels=feature\_request\&projects=\&template=feature\_request.md\&title=)
* Report a [bug](https://github.com/zama-ai/tfhe-rs/issues/new?assignees=\&labels=triage\_required\&projects=\&template=bug\_report.md\&title=)
***
<figure><picture><source srcset=".gitbook/assets/feedback_banner_dark.png" media="(prefers-color-scheme: dark)"><img src=".gitbook/assets/feedback_banner_light.png" alt=""></picture><figcaption></figcaption></figure>

View File

@@ -1,61 +1,73 @@
# Table of contents
* [What is TFHE-rs?](README.md)
* [Welcome to TFHE-rs](README.md)
## Getting Started
* [Installation](getting_started/installation.md)
* [Quick Start](getting_started/quick_start.md)
* [Types & Operations](getting_started/operations.md)
* [Benchmarks](getting_started/benchmarks.md)
* [Security and Cryptography](getting_started/security_and_cryptography.md)
* [What is TFHE-rs?](getting-started/what-is-tfhe-rs.md)
* [Installation](getting\_started/installation.md)
* [Quick start](getting\_started/quick\_start.md)
* [Types & Operations](getting\_started/operations.md)
* [Benchmarks](getting\_started/benchmarks.md)
* [Security and cryptography](getting\_started/security\_and\_cryptography.md)
## Fundamentals
* [Configure and create keys](fundamentals/configure-and-create-keys.md)
* [Set the server key](fundamentals/set-the-server-key.md)
* [Encrypt data](fundamentals/encrypt-data.md)
* [Compute and decrypt](fundamentals/compute-and-decrypt.md)
* [Serialize/Deserialize](fundamentals/serialization.md)
* [Compress ciphertexts/keys](fundamentals/compress.md)
* [Simulator mode](fundamentals/trivial\_ciphertext.md)
## Guides
* [Run on GPU](guides/run\_on\_gpu.md)
* [Configure Rust](guides/rust\_configuration.md)
* [Detect overflow](guides/overflow\_operations.md)
* [Generic function bounds](guides/trait\_bounds.md)
* [Use public key encryption](guides/public\_key.md)
* [Use parallelized PBS](guides/parallelized\_pbs.md)
* [Migrate data to newer versions of TFHE-rs](guides/migrate\_data.md)
* [Use the C API](guides/c\_api.md)
* [Use the JS on WASM API](guides/js\_on\_wasm\_api.md)
* [Use multi-threading using the rayon crate](guides/rayon\_crate.md)
* [Debug](guides/debug.md)
* [Count PBS](guides/count-pbs.md)
* [PRF Generate homomorphic randomness](guides/prf-generate-homomorphic-randomness.md)
## Tutorials
* [Homomorphic Parity Bit](tutorials/parity_bit.md)
* [Homomorphic Case Changing on Ascii String](tutorials/ascii_fhe_string.md)
## How To
* [Run on GPU](how_to/run_on_gpu.md)
* [Configure Rust](how_to/rust_configuration.md)
* [Detect Overflow](how_to/overflow_operations.md)
* [Serialize/Deserialize](how_to/serialization.md)
* [Migrate Data to Newer Versions of TFHE-rs](how_to/migrate_data.md)
* [Compress Ciphertexts/Keys](how_to/compress.md)
* [Use Public Key Encryption](how_to/public_key.md)
* [Use Trivial Ciphertext](how_to/trivial_ciphertext.md)
* [Generic Function Bounds](how_to/trait_bounds.md)
* [Use Parallelized PBS](how_to/parallelized_pbs.md)
* [Use the C API](how_to/c_api.md)
* [Use the JS on WASM API](how_to/js_on_wasm_api.md)
* [See all tutorials](tutorials/see-all-tutorials.md)
* [Homomorphic parity bit](tutorials/parity\_bit.md)
* [Homomorphic case changing on Ascii string](tutorials/ascii\_fhe\_string.md)
* [SHA256 with Boolean API](tutorials/sha256\_bool.md)
* [Dark market with integer API](tutorials/dark\_market.md)
* [Homomorphic regular expressions integer API](tutorials/regex.md)
## Fine-grained APIs
* [Quick Start](fine_grained_api/quick_start.md)
* [Boolean](fine_grained_api/Boolean/readme.md)
* [Operations](fine_grained_api/Boolean/operations.md)
* [Cryptographic Parameters](fine_grained_api/Boolean/parameters.md)
* [Serialization/Deserialization](fine_grained_api/Boolean/serialization.md)
## References
* [Shortint](fine_grained_api/shortint/readme.md)
* [Operations](fine_grained_api/shortint/operations.md)
* [Cryptographic Parameters](fine_grained_api/shortint/parameters.md)
* [Serialization/Deserialization](fine_grained_api/shortint/serialization.md)
* [Integer](fine_grained_api/integer/readme.md)
* [Operations](fine_grained_api/integer/operations.md)
* [Cryptographic Parameters](fine_grained_api/integer/parameters.md)
* [Serialization/Deserialization](fine_grained_api/integer/serialization.md)
## Application Tutorials
* [SHA256 with *Boolean API*](application_tutorials/sha256_bool.md)
* [Dark Market with *Integer API*](application_tutorials/dark_market.md)
* [Homomorphic Regular Expressions *Integer API*](application_tutorials/regex.md)
## Crypto Core API [Advanced users]
* [Quick Start](core_crypto/presentation.md)
* [Tutorial](core_crypto/tutorial.md)
* [API references](references/api-references/README.md)
* [docs.rs](https://docs.rs/tfhe/)
* [Fine-grained APIs](references/fine-grained-apis/README.md)
* [Quick Start](references/fine-grained-apis/quick\_start.md)
* [Boolean](references/fine-grained-apis/boolean/README.md)
* [Operations](references/fine-grained-apis/boolean/operations.md)
* [Cryptographic Parameters](references/fine-grained-apis/boolean/parameters.md)
* [Serialization/Deserialization](references/fine-grained-apis/boolean/serialization.md)
* [Shortint](references/fine-grained-apis/shortint/README.md)
* [Operations](references/fine-grained-apis/shortint/operations.md)
* [Cryptographic Parameters](references/fine-grained-apis/shortint/parameters.md)
* [Serialization/Deserialization](references/fine-grained-apis/shortint/serialization.md)
* [Integer](references/fine-grained-apis/integer/README.md)
* [Operations](references/fine-grained-apis/integer/operations.md)
* [Cryptographic Parameters](references/fine-grained-apis/integer/parameters.md)
* [Serialization/Deserialization](references/fine-grained-apis/integer/serialization.md)
* [Crypto core API](references/crypto-core-api/README.md)
* [Quick Start](references/crypto-core-api/presentation.md)
* [Tutorial](references/crypto-core-api/tutorial.md)
## Developers
* [Contributing](dev/contributing.md)
## API references
* [docs.rs](https://docs.rs/tfhe/)
* [Contribute](dev/contributing.md)

View File

@@ -1,4 +1,5 @@
# Reducing the size of keys and ciphertexts
# Compress Ciphertexts/Keys
TFHE-rs includes features to reduce the size of both keys and ciphertexts, by compressing them. Most TFHE-rs entities contain random numbers generated by a Pseudo Random Number Generator (PRNG). A PRNG is deterministic, therefore storing only the random seed used to generate those numbers is enough to keep all the required information: using the same PRNG and the same seed, the full chain of random values can be reconstructed when decompressing the entity.
In the library, entities that can be compressed are prefixed by `Compressed`. For instance, the type of a compressed `FheUint256` is `CompressedFheUint256`.
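The page's own example code is elided by this diff; as a rough sketch of the size comparison it performs, something along these lines would do. The `CompressedFheUint16` type name follows the `Compressed` prefix convention just described, and the `try_encrypt`/`bincode` calls are assumptions based on the surrounding text, not a verbatim copy of the page's example.

```rust
// Sketch: compare the serialized size of a regular vs. a compressed ciphertext.
// CompressedFheUint16 is assumed from the `Compressed` prefix convention above.
use tfhe::prelude::*;
use tfhe::{generate_keys, CompressedFheUint16, ConfigBuilder, FheUint16};

fn main() {
    let config = ConfigBuilder::default().build();
    let (client_key, _server_key) = generate_keys(config);

    let clear = 12_837u16;
    let regular = FheUint16::encrypt(clear, &client_key);
    let compressed = CompressedFheUint16::try_encrypt(clear, &client_key).unwrap();

    // The compressed entity only stores the PRNG seed plus the body, so it
    // serializes to far fewer bytes than the fully expanded ciphertext.
    let regular_size = bincode::serialize(&regular).unwrap().len();
    let compressed_size = bincode::serialize(&compressed).unwrap().len();
    println!("regular: {regular_size} bytes, compressed: {compressed_size} bytes");
    assert!(compressed_size < regular_size);
}
```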
@@ -6,6 +7,7 @@ In the library, entities that can be compressed are prefixed by `Compressed`. Fo
In the following example code, we use the `bincode` crate dependency to serialize in a binary format and compare serialized sizes.
## Compressed ciphertexts
This example shows how to compress a ciphertext encrypting messages over 16 bits.
```rust
@@ -35,8 +37,8 @@ fn main() {
}
```
## Compressed server keys
This example shows how to compress the server keys.
```rust
@@ -75,8 +77,8 @@ fn main() {
```
## Compressed public keys
This example shows how to compress the classical public keys.
{% hint style="warning" %}
@@ -106,10 +108,9 @@ fn main() {
}
```
## Compressed compact public key
This example shows how to use compressed compact public keys.
```rust
use tfhe::prelude::*;

View File

@@ -0,0 +1,17 @@
# Compute and decrypt
Computations should be as easy to write as in normal Rust, thanks to operator overloading.
```Rust
let result = a + b;
```
Decryption is done with the `decrypt` method, which comes from the `FheDecrypt` trait.
```Rust
let decrypted_result: u8 = result.decrypt(&client_key);
let clear_result = clear_a + clear_b;
assert_eq!(decrypted_result, clear_result);
```
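Putting the two snippets together, a minimal end-to-end sketch (key generation, server key installation, and encryption are taken from the previous pages; `FheUint8` with the default configuration is assumed):

```rust
use tfhe::prelude::*;
use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheUint8};

fn main() {
    // Setup from the previous pages: configuration, keys, and encrypted inputs.
    let config = ConfigBuilder::default().build();
    let (client_key, server_key) = generate_keys(config);
    set_server_key(server_key);

    let clear_a = 27u8;
    let clear_b = 128u8;
    let a = FheUint8::encrypt(clear_a, &client_key);
    let b = FheUint8::encrypt(clear_b, &client_key);

    // Computation via operator overloading, then decryption with the client key.
    let result = a + b;
    let decrypted_result: u8 = result.decrypt(&client_key);
    assert_eq!(decrypted_result, clear_a + clear_b);
}
```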

View File

@@ -0,0 +1,24 @@
# Configure and create keys
The first step is to create the configuration. The configuration declares which types you will (or will not) use and lets you set custom crypto-parameters for these types. Custom parameters should only be used for more advanced usage and/or testing.
A configuration is created using the `ConfigBuilder` type.
In this example, 8-bit unsigned integers with default parameters are used. The `integers` feature must also be enabled, as per the table on [this page](../guides/rust\_configuration.md#choosing-your-features).
The config is generated by first creating a builder with all types deactivated. Then, the integer types with default parameters are activated, since we are going to use FheUint8 values.
```rust
use tfhe::{ConfigBuilder, generate_keys};
fn main() {
let config = ConfigBuilder::default().build();
let (client_key, server_key) = generate_keys(config);
}
```
The `generate_keys` command returns a client key and a server key.
The `client_key` is meant to stay private and not leave the client, whereas the `server_key` can be made public and sent to a server for it to enable FHE computations.

View File

@@ -0,0 +1,13 @@
# Encrypt data
Encrypting data is achieved via the `encrypt` associated function of the `FheEncrypt` trait.
Types exposed by this crate implement at least one of `FheEncrypt` or `FheTryEncrypt` to allow encryption.
```Rust
let clear_a = 27u8;
let clear_b = 128u8;
let a = FheUint8::encrypt(clear_a, &client_key);
let b = FheUint8::encrypt(clear_b, &client_key);
```
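For the fallible `FheTryEncrypt` path mentioned above, a minimal sketch could look like this (error handling kept deliberately simple):

```rust
use tfhe::prelude::*;
use tfhe::{generate_keys, ConfigBuilder, FheUint8};

fn main() {
    let config = ConfigBuilder::default().build();
    let (client_key, _server_key) = generate_keys(config);

    // Fallible encryption: try_encrypt returns a Result instead of panicking.
    let clear_a = 27u8;
    let a = FheUint8::try_encrypt(clear_a, &client_key).expect("encryption failed");

    let decrypted: u8 = a.decrypt(&client_key);
    assert_eq!(decrypted, clear_a);
}
```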

View File

@@ -1,4 +1,6 @@
# Serialization/Deserialization
# Serialize/Deserialize
## Serialization/Deserialization
As explained in the Introduction, most types are meant to be shared with the server that performs the computations.
@@ -11,7 +13,7 @@ To serialize our data, a [data format](https://serde.rs/#data-formats) should be
[dependencies]
# ...
tfhe = { version = "0.5.0", features = ["integer","x86_64-unix"]}
tfhe = { version = "0.5.2", features = ["integer","x86_64-unix"]}
bincode = "1.3.3"
```
@@ -70,25 +72,17 @@ fn server_function(serialized_data: &[u8]) -> Result<Vec<u8>, Box<dyn std::error
}
```
## Safe Serialization/Deserialization
# Safe Serialization/Deserialization
For some types, safe serialization and deserialization functions are available. Bincode is used internally.
For some types, safe serialization and deserialization functions are available.
Bincode is used internally.
Safe-deserialization must take as input the output of a safe-serialization. On this condition, validation of the following is done:
Safe-deserialization must take as input the output of a safe-serialization.
On this condition, validation of the following is done:
- type: trying to deserialize `type A` from a serialized `type B` raises an error along the lines of *On deserialization, expected type A, got type B* instead of a generic deserialization error (or less likely a meaningless result of `type A`)
- version: trying to deserialize `type A` (version 0.2) from a serialized `type A` (incompatible version 0.1) raises an error along the lines of *On deserialization, expected serialization version 0.2, got version 0.1* instead of a generic deserialization error (or less likely a meaningless result of `type A` (version 0.2))
- parameter compatibility: trying to deserialize into an object of `type A` with some crypto parameters from an object of `type A` with other crypto parameters raises an error along the lines of *Deserialized object of type A not conformant with given parameter set*.
If both parameter sets 1 and 2 have the same lwe dimension for ciphertexts, a ciphertext from param 1 may not fail this deserialization check with param 2 even if doing this deserialization may not make sense.
Also, this check can't distinguish ciphertexts/server keys from independent client keys with the same parameters (which makes no sense combining to do homomorphic operations).
This check is meant to prevent runtime errors in server homomorphic operations by checking that server keys and ciphertexts are compatible with the same parameter set.
* type: trying to deserialize `type A` from a serialized `type B` raises an error along the lines of _On deserialization, expected type A, got type B_ instead of a generic deserialization error (or less likely a meaningless result of `type A`)
* version: trying to deserialize `type A` (version 0.2) from a serialized `type A` (incompatible version 0.1) raises an error along the lines of _On deserialization, expected serialization version 0.2, got version 0.1_ instead of a generic deserialization error (or less likely a meaningless result of `type A` (version 0.2))
* parameter compatibility: trying to deserialize into an object of `type A` with some crypto parameters from an object of `type A` with other crypto parameters raises an error along the lines of _Deserialized object of type A not conformant with given parameter set_. If both parameter sets 1 and 2 have the same LWE dimension for ciphertexts, a ciphertext from param 1 might not fail this deserialization check with param 2, even though doing so may not make sense. Also, this check can't distinguish ciphertexts/server keys from independent client keys with the same parameters (which it makes no sense to combine for homomorphic operations). This check is meant to prevent runtime errors in server homomorphic operations by checking that server keys and ciphertexts are compatible with the same parameter set.
Moreover, a size limit (in number of bytes) for the serialized data is expected on both serialization and deserialization.
On serialization, an error is raised if the serialized output would be bigger than the given limit.
On deserialization, an error is raised if the serialized input is bigger than the given limit.
It is meant to gracefully return an error in case of an attacker trying to cause an out of memory error on deserialization.
Moreover, a size limit (in number of bytes) for the serialized data is expected on both serialization and deserialization. On serialization, an error is raised if the serialized output would be bigger than the given limit. On deserialization, an error is raised if the serialized input is bigger than the given limit. This is meant to gracefully return an error if an attacker tries to cause an out-of-memory error on deserialization.
A standalone `is_conformant` method is also available on those types to do a parameter compatibility check.
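The size-limit behaviour can be pictured with plain bincode, which is what TFHE-rs uses internally. The sketch below only illustrates the concept with bincode's own limit option; it is not the TFHE-rs safe serialization API itself:
```rust
// Illustration of a byte-size limit on (de)serialization, using bincode directly.
// This is NOT the TFHE-rs safe serialization API, just the underlying idea.
use bincode::Options;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let data = vec![0u8; 16];

    // Serialization errors out if the output would exceed the 1 MiB limit.
    let bytes = bincode::options().with_limit(1u64 << 20).serialize(&data)?;

    // Deserialization errors out if the input needs more than the limit,
    // which is what protects a server against out-of-memory attempts.
    let back: Vec<u8> = bincode::options().with_limit(1u64 << 20).deserialize(&bytes)?;
    assert_eq!(back, data);
    Ok(())
}
```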

View File

@@ -0,0 +1,17 @@
# Set the server key
The next step is to call `set_server_key`.
This function will **move** the server key to an internal state of the crate and manage the details to give a simpler interface.
```rust
use tfhe::{ConfigBuilder, generate_keys, set_server_key};
fn main() {
let config = ConfigBuilder::default().build();
let (client_key, server_key) = generate_keys(config);
set_server_key(server_key);
}
```

View File

@@ -1,11 +1,8 @@
# Trivial Ciphertext
# Use Trivial Ciphertext
Sometimes, the server side needs to initialize a value.
For example, when computing the sum of a list of ciphertext,
one might want to initialize the `sum` variable to `0`.
Sometimes, the server side needs to initialize a value. For example, when computing the sum of a list of ciphertexts, one might want to initialize the `sum` variable to `0`.
Instead of asking the client to send a real encryption of zero,
the server can do a *trivial encryption*
Instead of asking the client to send a real encryption of zero, the server can do a _trivial encryption_.
```rust
use tfhe::prelude::*;
@@ -22,13 +19,9 @@ let clear: u8 = a.decrypt(&client_key);
assert_eq!(clear, 234);
```
A *trivial encryption* will create a ciphertext that contains
the desired value, however, the 'encryption' is trivial that is,
it is not really encrypted: anyone, any key can decrypt it.
A _trivial encryption_ creates a ciphertext that contains the desired value; however, the 'encryption' is trivial, that is, it is not really encrypted: anyone, with any key, can decrypt it.
Note that when you want to do an operation that involves a ciphertext
and a clear value, you should only use a trivial encryption of the clear
value if the ciphertext/clear-value operation (often called scalar operation) you want to run is not supported.
Note that when you want to do an operation that involves a ciphertext and a clear value, you should only use a trivial encryption of the clear value if the ciphertext/clear-value operation (often called a scalar operation) you want to run is not supported.
### Example
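As a minimal sketch (assuming `FheUint8` and the high-level API), a server-side sum can be seeded with a trivial zero like this:
```rust
use tfhe::prelude::*;
use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheUint8};

fn main() {
    let config = ConfigBuilder::default().build();
    let (client_key, server_key) = generate_keys(config);
    set_server_key(server_key);

    // Client side: real encryptions sent to the server.
    let values = [10u8, 20, 30];
    let cts: Vec<FheUint8> = values
        .iter()
        .map(|&v| FheUint8::encrypt(v, &client_key))
        .collect();

    // Server side: initialize the accumulator with a trivial zero
    // instead of asking the client for a real encryption of 0.
    let mut sum = FheUint8::encrypt_trivial(0u8);
    for ct in &cts {
        sum = &sum + ct;
    }

    // Client side: the result decrypts as usual.
    let clear_sum: u8 = sum.decrypt(&client_key);
    assert_eq!(clear_sum, 60);
}
```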

View File

@@ -0,0 +1,32 @@
# What is TFHE-rs?
TFHE-rs is a pure Rust implementation of TFHE for Boolean and integer arithmetics over encrypted data. It includes a Rust and C API, as well as a client-side WASM API.
TFHE-rs is meant for developers and researchers who want full control over what they can do with TFHE, while not worrying about the low level implementation.
The goal is to have a stable, simple, high-performance, and production-ready library for all the advanced features of TFHE.
## Key cryptographic concepts
The TFHE-rs library implements Zama's variant of Fully Homomorphic Encryption over the Torus (TFHE). TFHE is based on Learning With Errors (LWE), a well-studied cryptographic primitive believed to be secure even against quantum computers.
In cryptography, a raw value is called a message (also sometimes called a cleartext), while an encoded message is called a plaintext and an encrypted plaintext is called a ciphertext.
The idea of homomorphic encryption is that you can compute on ciphertexts without knowing the messages encrypted within them. A scheme is said to be _fully homomorphic_, meaning any program can be evaluated with it, if at least two of the following operations are supported ($$x$$ is a plaintext and $$E[x]$$ is the corresponding ciphertext):
* homomorphic univariate function evaluation: $$f(E[x]) = E[f(x)]$$
* homomorphic addition: $$E[x] + E[y] = E[x + y]$$
* homomorphic multiplication: $$E[x] * E[y] = E[x * y]$$
Zama's variant of TFHE is fully homomorphic and deals with fixed-precision numbers as messages. It implements all needed homomorphic operations, such as addition and function evaluation via **Programmable Bootstrapping**. You can read more about Zama's TFHE variant in the [preliminary whitepaper](https://whitepaper.zama.ai/).
Using FHE in a Rust program with TFHE-rs consists of the following steps (a minimal sketch follows the list):
* generating a client key and a server key using secure parameters:
* a client key encrypts/decrypts data and must be kept secret
* a server key is used to perform operations on encrypted data and could be public (also called an evaluation key)
* encrypting plaintexts using the client key to produce ciphertexts
* operating homomorphically on ciphertexts with the server key
* decrypting the resulting ciphertexts into plaintexts using the client key
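A minimal sketch of that workflow with the high-level API (assuming 8-bit unsigned integers and the default configuration):
```rust
use tfhe::prelude::*;
use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheUint8};

fn main() {
    // Client: generate keys with secure default parameters.
    let config = ConfigBuilder::default().build();
    let (client_key, server_key) = generate_keys(config);

    // Client: encrypt the inputs.
    let a = FheUint8::encrypt(3u8, &client_key);
    let b = FheUint8::encrypt(5u8, &client_key);

    // Server: install the server (evaluation) key and compute on ciphertexts.
    set_server_key(server_key);
    let encrypted_sum = &a + &b;

    // Client: decrypt the result.
    let clear_sum: u8 = encrypted_sum.decrypt(&client_key);
    assert_eq!(clear_sum, 8);
}
```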
If you would like to know more about the problems that FHE solves, we suggest you review our [6 minute introduction to homomorphic encryption](https://6min.zama.ai/).

View File

@@ -3,30 +3,44 @@
Due to their nature, homomorphic operations are naturally slower than their cleartext equivalents. Some timings are exposed for basic operations. For completeness, benchmarks for other libraries are also given.
{% hint style="info" %}
All benchmarks were launched on an AWS m6i.metal with the following specifications: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz and 512GB of RAM.
All benchmarks were launched on an AWS hpc7a.96xlarge instance with the following specifications: AMD EPYC 9R14 CPU @ 2.60GHz and 740GB of RAM.
{% endhint %}
## Integer
This measures the execution time for some operation sets of tfhe-rs::integer (the unsigned version). Note that the timings for `FheInt` (i.e., the signed integers) are similar.
The table below reports the timing when the inputs of the benchmarked operation are encrypted.
| Operation \ Size | `FheUint8` | `FheUint16` | `FheUint32` | `FheUint64` | `FheUint128` | `FheUint256` |
|--------------------------------------------------------|------------|-------------|-------------|-------------|--------------|--------------|
| Negation (`-`) | 70.9 ms | 99.3 ms | 129 ms | 180 ms | 239 ms | 333 ms |
| Add / Sub (`+`,`-`) | 70.5 ms | 100 ms | 132 ms | 186 ms | 249 ms | 334 ms |
| Mul (`x`) | 144 ms | 216 ms | 333 ms | 832 ms | 2.50 s | 8.85 s |
| Equal / Not Equal (`eq`, `ne`) | 36.1 ms | 36.5 ms | 57.4 ms | 64.2 ms | 67.3 ms | 78.1 ms |
| Comparisons (`ge`, `gt`, `le`, `lt`) | 52.6 ms | 73.1 ms | 98.8 ms | 124 ms | 165 ms | 201 ms |
| Max / Min (`max`,`min`) | 76.2 ms | 102 ms | 135 ms | 171 ms | 212 ms | 301 ms |
| Bitwise operations (`&`, `\|`, `^`) | 19.4 ms | 20.3 ms | 21.0 ms | 27.2 ms | 31.6 ms | 40.2 ms |
| Div / Rem (`/`, `%`) | 729 ms | 1.93 s | 4.81 s | 12.2 s | 30.7 s | 89.6 s |
| Left / Right Shifts (`<<`, `>>`) | 99.4 ms | 129 ms | 180 ms | 243 ms | 372 ms | 762 ms |
| Left / Right Rotations (`left_rotate`, `right_rotate`) | 103 ms | 128 ms | 182 ms | 241 ms | 374 ms | 763 ms |
| ------------------------------------------------------ | ---------- | ----------- | ----------- | ----------- | ------------ | ------------ |
| Negation (`-`) | 55.4 ms | 79.7 ms | 105 ms | 133 ms | 163 ms | 199 ms |
| Add / Sub (`+`,`-`) | 58.9 ms | 86.0 ms | 106 ms | 124 ms | 151 ms | 193 ms |
| Mul (`x`) | 122 ms | 164 ms | 227 ms | 410 ms | 1.04 s | 3.41 s |
| Equal / Not Equal (`eq`, `ne`) | 32.0 ms | 32.0 ms | 50.4 ms | 50.9 ms | 53.1 ms | 54.6 ms |
| Comparisons (`ge`, `gt`, `le`, `lt`) | 43.7 ms | 65.2 ms | 84.3 ms | 107 ms | 132 ms | 159 ms |
| Max / Min (`max`,`min`) | 68.4 ms | 86.8 ms | 106 ms | 132 ms | 160 ms | 200 ms |
| Bitwise operations (`&`, `\|`, `^`) | 17.1 ms | 17.3 ms | 17.8 ms | 18.8 ms | 20.2 ms | 22.2 ms |
| Div / Rem (`/`, `%`) | 631 ms | 1.59 s | 3.77 s | 8.64 s | 20.3 s | 53.4 s |
| Left / Right Shifts (`<<`, `>>`) | 82.8 ms | 99.2 ms | 121 ms | 149 ms | 194 ms | 401 ms |
| Left / Right Rotations (`left_rotate`, `right_rotate`) | 82.1 ms | 99.4 ms | 120 ms | 149 ms | 194 ms | 402 ms |
The table below reports the timing when the left input of the benchmarked operation is encrypted and the other is a clear scalar of the same size.
All timings are related to parallelized Radix-based integer operations, where each block is encrypted using the default parameters (i.e., PARAM\_MESSAGE\_2\_CARRY\_2\_KS\_PBS, more information about parameters can be found [here](../fine_grained_api/shortint/parameters.md)).
To ensure predictable timings, the operation flavor is the `default` one: the carry is propagated if needed. The operation costs may be reduced by using `unchecked`, `checked`, or `smart`.
| Operation \ Size | `FheUint8` | `FheUint16` | `FheUint32` | `FheUint64` | `FheUint128` | `FheUint256` |
| ------------------------------------------------------ | ---------- | ----------- | ----------- | ----------- | ------------ | ------------ |
| Add / Sub (`+`,`-`) | 68.3 ms | 82.4 ms | 102 ms | 122 ms | 151 ms | 191 ms |
| Mul (`x`) | 93.7 ms | 139 ms | 178 ms | 242 ms | 516 ms | 1.02 s |
| Equal / Not Equal (`eq`, `ne`) | 30.2 ms | 30.8 ms | 32.7 ms | 50.4 ms | 51.2 ms | 54.8 ms |
| Comparisons (`ge`, `gt`, `le`, `lt`) | 47.3 ms | 69.9 ms | 96.3 ms | 102 ms | 138 ms | 141 ms |
| Max / Min (`max`,`min`) | 75.4 ms | 99.7 ms | 120 ms | 126 ms | 150 ms | 186 ms |
| Bitwise operations (`&`, `\|`, `^`) | 17.1 ms | 17.4 ms | 18.2 ms | 19.2 ms | 19.7 ms | 22.6 ms |
| Div (`/`) | 160 ms | 212 ms | 272 ms | 402 ms | 796 ms | 2.27 s |
| Rem (`%`) | 315 ms | 428 ms | 556 ms | 767 ms | 1.27 s | 2.86 s |
| Left / Right Shifts (`<<`, `>>`) | 16.8 ms | 16.8 ms | 17.3 ms | 18.0 ms | 18.9 ms | 22.6 ms |
| Left / Right Rotations (`left_rotate`, `right_rotate`) | 16.8 ms | 16.9 ms | 17.3 ms | 18.3 ms | 19.0 ms | 22.8 ms |
All timings are related to parallelized Radix-based integer operations, where each block is encrypted using the default parameters (i.e., PARAM\_MESSAGE\_2\_CARRY\_2\_KS\_PBS, more information about parameters can be found [here](../references/fine-grained-apis/shortint/parameters.md)). To ensure predictable timings, the operation flavor is the `default` one: the carry is propagated if needed. The operation costs may be reduced by using `unchecked`, `checked`, or `smart`.
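To make the flavor distinction concrete, here is a rough sketch at the fine-grained integer level; the `gen_keys_radix`, `add_parallelized` and `unchecked_add_parallelized` entry points are assumed from the integer module of this version, so check the API reference for exact signatures:
```rust
// Rough sketch of operation flavors on radix integers (entry-point names assumed).
use tfhe::integer::gen_keys_radix;
use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;

fn main() {
    // 4 blocks of 2 message bits each -> 8-bit integers.
    let (cks, sks) = gen_keys_radix(PARAM_MESSAGE_2_CARRY_2_KS_PBS, 4);

    let ct1 = cks.encrypt(130u64);
    let ct2 = cks.encrypt(37u64);

    // Default flavor: carries are propagated if needed, so timings stay predictable.
    let ct_default = sks.add_parallelized(&ct1, &ct2);

    // Unchecked flavor: cheaper, but the caller must know the carries cannot overflow.
    let ct_unchecked = sks.unchecked_add_parallelized(&ct1, &ct2);

    let d1: u64 = cks.decrypt(&ct_default);
    let d2: u64 = cks.decrypt(&ct_unchecked);
    assert_eq!(d1, 167);
    assert_eq!(d2, 167);
}
```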
## Shortint
@@ -35,12 +49,11 @@ This measures the execution time for some operations using various parameter set
This uses the Concrete FFT + AVX-512 configuration.
| Parameter set | PARAM\_MESSAGE\_1\_CARRY\_1 | PARAM\_MESSAGE\_2\_CARRY\_2 | PARAM\_MESSAGE\_3\_CARRY\_3 | PARAM\_MESSAGE\_4\_CARRY\_4 |
|------------------------------------|-----------------------------|-----------------------------|-----------------------------|-----------------------------|
| unchecked\_add | 348 ns | 413 ns | 2.95 µs | 12.1 µs |
| add | 7.59 ms | 17.0 ms | 121 ms | 835 ms |
| mul\_lsb | 8.13 ms | 16.8 ms | 121 ms | 827 ms |
| keyswitch\_programmable\_bootstrap | 7.28 ms | 16.6 ms | 121 ms | 811 ms |
| ---------------------------------- | --------------------------- | --------------------------- | --------------------------- | --------------------------- |
| unchecked\_add | 341 ns | 555 ns | 2.47 µs | 9.77 µs |
| add | 5.96 ms | 12.6 ms | 102 ms | 508 ms |
| mul\_lsb | 5.99 ms | 12.3 ms | 101 ms | 500 ms |
| keyswitch\_programmable\_bootstrap | 6.40 ms | 12.9 ms | 104 ms | 489 ms |
## Boolean
@@ -49,26 +62,25 @@ This measures the execution time of a single binary Boolean gate.
### tfhe-rs::boolean.
| Parameter set | Concrete FFT + AVX-512 |
|------------------------------------------------------|------------------------|
| DEFAULT\_PARAMETERS\_KS\_PBS | 9.19 ms |
| PARAMETERS\_ERROR\_PROB\_2\_POW\_MINUS\_165\_KS\_PBS | 14.1 ms |
| TFHE\_LIB\_PARAMETERS | 10.0 ms |
| ---------------------------------------------------- | ---------------------- |
| DEFAULT\_PARAMETERS\_KS\_PBS | 8.49 ms |
| PARAMETERS\_ERROR\_PROB\_2\_POW\_MINUS\_165\_KS\_PBS | 13.7 ms |
| TFHE\_LIB\_PARAMETERS | 9.90 ms |
### tfhe-lib.
Using the same m6i.metal machine as the one for tfhe-rs, the timings are:
Using the same hpc7a.96xlarge machine as the one for tfhe-rs, the timings are:
| Parameter set | spqlios-fma |
|--------------------------------------------------|-------------|
| default\_128bit\_gate\_bootstrapping\_parameters | 15.4 ms |
| ------------------------------------------------ | ----------- |
| default\_128bit\_gate\_bootstrapping\_parameters | 13.5 ms |
### OpenFHE (v1.1.1).
### OpenFHE (v1.1.2).
Following the official instructions from OpenFHE, `clang14` and the following command are used to setup the project:
`cmake -DNATIVE_SIZE=32 -DWITH_NATIVEOPT=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DWITH_OPENMP=OFF ..`
Following the official instructions from OpenFHE, `clang14` and the following command are used to set up the project: `cmake -DNATIVE_SIZE=32 -DWITH_NATIVEOPT=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DWITH_OPENMP=OFF ..`
To use the HEXL library, the configuration used is as follows:
```bash
export CXX=clang++
export CC=clang
@@ -80,13 +92,12 @@ hexl -> y
scripts/build-openfhe-development-hexl.sh
```
Using the same m6i.metal machine as the one for tfhe-rs, the timings are:
| Parameter set | GINX | GINX w/ Intel HEXL |
|----------------------------------|---------|--------------------|
| FHEW\_BINGATE/STD128\_OR | 40.2 ms | 31.0 ms |
| FHEW\_BINGATE/STD128\_LMKCDEY_OR | 38.6 ms | 28.4 ms |
Using the same hpc7a.96xlarge machine as the one for tfhe-rs, the timings are:
| Parameter set | GINX | GINX w/ Intel HEXL |
| --------------------------------- | ------- | ------------------ |
| FHEW\_BINGATE/STD128\_OR | 25.5 ms | 21.6 ms |
| FHEW\_BINGATE/STD128\_LMKCDEY\_OR | 25.4 ms | 19.9 ms |
## How to reproduce TFHE-rs benchmarks

View File

@@ -8,12 +8,12 @@ To use `TFHE-rs` in your project, you first need to add it as a dependency in yo
If you are using an `x86` machine:
```toml
tfhe = { version = "0.5.0", features = [ "boolean", "shortint", "integer", "x86_64-unix" ] }
tfhe = { version = "0.5.2", features = [ "boolean", "shortint", "integer", "x86_64-unix" ] }
```
If you are using an `ARM` machine:
```toml
tfhe = { version = "0.5.0", features = [ "boolean", "shortint", "integer", "aarch64-unix" ] }
tfhe = { version = "0.5.2", features = [ "boolean", "shortint", "integer", "aarch64-unix" ] }
```
{% hint style="info" %}

View File

@@ -1,9 +1,11 @@
# Homomorphic Types and Operations
# Types & Operations
## Types
`TFHE-rs` includes two main types to represent encrypted data:
- `FheUint`: this is the homomorphic equivalent of Rust unsigned integers `u8, u16, ...`
- `FheInt`: this is the homomorphic equivalent of Rust (signed) integers `i8, i16, ...`
* `FheUint`: this is the homomorphic equivalent of Rust unsigned integers `u8, u16, ...`
* `FheInt`: this is the homomorphic equivalent of Rust (signed) integers `i8, i16, ...`
As in many programming languages, the number of bits used to represent the data must be chosen when declaring a variable. For instance:
@@ -19,35 +21,35 @@ In the same manner as many programming languages, the number of bits used to rep
```
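For example (a small sketch assuming the `FheUint8` and `FheInt16` types with the default configuration), the bit width is carried by the type itself:
```rust
// Sketch: the precision is chosen through the type.
use tfhe::prelude::*;
use tfhe::{generate_keys, ConfigBuilder, FheInt16, FheUint8};

fn main() {
    let (client_key, _server_key) = generate_keys(ConfigBuilder::default().build());

    // 8-bit unsigned value.
    let small = FheUint8::encrypt(200u8, &client_key);
    // 16-bit signed value.
    let signed = FheInt16::encrypt(-1234i16, &client_key);

    let dec_small: u8 = small.decrypt(&client_key);
    let dec_signed: i16 = signed.decrypt(&client_key);
    assert_eq!((dec_small, dec_signed), (200, -1234));
}
```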
## Operation list
The table below contains an overview of the available operations in `TFHE-rs`. The notation `Enc` (for Encypted) either refers to `FheInt` or `FheUint`, for any size between 1 and 256-bits.
The table below contains an overview of the available operations in `TFHE-rs`. The notation `Enc` (for Encrypted) either refers to `FheInt` or `FheUint`, for any size between 1 and 256 bits.
More details, and further examples, are given in the following sections.
| name | symbol | `Enc`/`Enc` | `Enc`/ `Int` |
|-----------------------|----------------|--------------------|--------------------------|
| Neg | `-` | :heavy_check_mark: | :heavy_check_mark: |
| Add | `+` | :heavy_check_mark: | :heavy_check_mark: |
| Sub | `-` | :heavy_check_mark: | :heavy_check_mark: |
| Mul | `*` | :heavy_check_mark: | :heavy_check_mark: |
| Div | `/` | :heavy_check_mark: | :heavy_check_mark: |
| Rem | `%` | :heavy_check_mark: | :heavy_check_mark: |
| Not | `!` | :heavy_check_mark: | :heavy_check_mark: |
| BitAnd | `&` | :heavy_check_mark: | :heavy_check_mark: |
| BitOr | `\|` | :heavy_check_mark: | :heavy_check_mark: |
| BitXor | `^` | :heavy_check_mark: | :heavy_check_mark: |
| Shr | `>>` | :heavy_check_mark: | :heavy_check_mark: |
| Shl | `<<` | :heavy_check_mark: | :heavy_check_mark: |
| Min | `min` | :heavy_check_mark: | :heavy_check_mark: |
| Max | `max` | :heavy_check_mark: | :heavy_check_mark: |
| Greater than | `gt` | :heavy_check_mark: | :heavy_check_mark: |
| Greater or equal than | `ge` | :heavy_check_mark: | :heavy_check_mark: |
| Lower than | `lt` | :heavy_check_mark: | :heavy_check_mark: |
| Lower or equal than | `le` | :heavy_check_mark: | :heavy_check_mark: |
| Equal | `eq` | :heavy_check_mark: | :heavy_check_mark: |
| Cast (into dest type) | `cast_into` | :heavy_check_mark: | :heavy_multiplication_x: |
| Cast (from src type) | `cast_from` | :heavy_check_mark: | :heavy_multiplication_x: |
| Ternary operator | `if_then_else` | :heavy_check_mark: | :heavy_multiplication_x: |
| name | symbol | `Enc`/`Enc` | `Enc`/ `Int` |
| --------------------- | -------------- | -------------------- | -------------------------- |
| Neg | `-` | :heavy\_check\_mark: | :heavy\_check\_mark: |
| Add | `+` | :heavy\_check\_mark: | :heavy\_check\_mark: |
| Sub | `-` | :heavy\_check\_mark: | :heavy\_check\_mark: |
| Mul | `*` | :heavy\_check\_mark: | :heavy\_check\_mark: |
| Div | `/` | :heavy\_check\_mark: | :heavy\_check\_mark: |
| Rem | `%` | :heavy\_check\_mark: | :heavy\_check\_mark: |
| Not | `!` | :heavy\_check\_mark: | :heavy\_check\_mark: |
| BitAnd | `&` | :heavy\_check\_mark: | :heavy\_check\_mark: |
| BitOr | `\|` | :heavy\_check\_mark: | :heavy\_check\_mark: |
| BitXor | `^` | :heavy\_check\_mark: | :heavy\_check\_mark: |
| Shr | `>>` | :heavy\_check\_mark: | :heavy\_check\_mark: |
| Shl | `<<` | :heavy\_check\_mark: | :heavy\_check\_mark: |
| Min | `min` | :heavy\_check\_mark: | :heavy\_check\_mark: |
| Max | `max` | :heavy\_check\_mark: | :heavy\_check\_mark: |
| Greater than | `gt` | :heavy\_check\_mark: | :heavy\_check\_mark: |
| Greater or equal than | `ge` | :heavy\_check\_mark: | :heavy\_check\_mark: |
| Lower than | `lt` | :heavy\_check\_mark: | :heavy\_check\_mark: |
| Lower or equal than | `le` | :heavy\_check\_mark: | :heavy\_check\_mark: |
| Equal | `eq` | :heavy\_check\_mark: | :heavy\_check\_mark: |
| Cast (into dest type) | `cast_into` | :heavy\_check\_mark: | :heavy\_multiplication\_x: |
| Cast (from src type) | `cast_from` | :heavy\_check\_mark: | :heavy\_multiplication\_x: |
| Ternary operator | `if_then_else` | :heavy\_check\_mark: | :heavy\_multiplication\_x: |
## Integer
@@ -59,17 +61,16 @@ Homomorphic integer types support arithmetic operations.
The list of supported operations is:
| name | symbol | type |
|----------------------------------------------------------|--------|--------|
| [Neg](https://doc.rust-lang.org/std/ops/trait.Neg.html) | `-` | Unary |
| [Add](https://doc.rust-lang.org/std/ops/trait.Add.html) | `+` | Binary |
| [Sub](https://doc.rust-lang.org/std/ops/trait.Sub.html) | `-` | Binary |
| [Mul](https://doc.rust-lang.org/std/ops/trait.Mul.html) | `*` | Binary |
| [Div](https://doc.rust-lang.org/std/ops/trait.Div.html)* | `/` | Binary |
| [Rem](https://doc.rust-lang.org/std/ops/trait.Rem.html)* | `%` | Binary |
| name | symbol | type |
| --------------------------------------------------------- | ------ | ------ |
| [Neg](https://doc.rust-lang.org/std/ops/trait.Neg.html) | `-` | Unary |
| [Add](https://doc.rust-lang.org/std/ops/trait.Add.html) | `+` | Binary |
| [Sub](https://doc.rust-lang.org/std/ops/trait.Sub.html) | `-` | Binary |
| [Mul](https://doc.rust-lang.org/std/ops/trait.Mul.html) | `*` | Binary |
| [Div](https://doc.rust-lang.org/std/ops/trait.Div.html)\* | `/` | Binary |
| [Rem](https://doc.rust-lang.org/std/ops/trait.Rem.html)\* | `%` | Binary |
For division by 0, the convention is to return `modulus - 1`. For instance, for `FheUint8`, the modulus is $$2^8=256$$, so a division by 0 will return an encryption of 255.
For the remainder operator, the convention is to return the first input without any modification. For instance, if `ct1 = FheUint8(63)` and `ct2 = FheUint8(0)` then `ct1 % ct2` will return `FheUint8(63)`.
For division by 0, the convention is to return `modulus - 1`. For instance, for `FheUint8`, the modulus is $$2^8=256$$, so a division by 0 will return an encryption of 255. For the remainder operator, the convention is to return the first input without any modification. For instance, if `ct1 = FheUint8(63)` and `ct2 = FheUint8(0)` then `ct1 % ct2` will return `FheUint8(63)`.
A simple example of how to use these operations:
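Before the full example, here is a small sketch (assuming `FheUint8` and the high-level API) of the division and remainder conventions described above:
```rust
// Sketch: division and remainder by an encrypted zero on FheUint8.
use tfhe::prelude::*;
use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheUint8};

fn main() {
    let (client_key, server_key) = generate_keys(ConfigBuilder::default().build());
    set_server_key(server_key);

    let a = FheUint8::encrypt(63u8, &client_key);
    let zero = FheUint8::encrypt(0u8, &client_key);

    let div = &a / &zero; // convention: modulus - 1
    let rem = &a % &zero; // convention: left operand unchanged

    let dec_div: u8 = div.decrypt(&client_key);
    let dec_rem: u8 = rem.decrypt(&client_key);
    assert_eq!(dec_div, 255);
    assert_eq!(dec_rem, 63);
}
```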
@@ -116,16 +117,16 @@ Homomorphic integer types support some bitwise operations.
The list of supported operations is:
| name | symbol | type |
|--------------------------------------------------------------------------------------|----------------|--------|
| [Not](https://doc.rust-lang.org/std/ops/trait.Not.html) | `!` | Unary |
| [BitAnd](https://doc.rust-lang.org/std/ops/trait.BitAnd.html) | `&` | Binary |
| [BitOr](https://doc.rust-lang.org/std/ops/trait.BitOr.html) | `\|` | Binary |
| [BitXor](https://doc.rust-lang.org/std/ops/trait.BitXor.html) | `^` | Binary |
| [Shr](https://doc.rust-lang.org/std/ops/trait.Shr.html) | `>>` | Binary |
| [Shl](https://doc.rust-lang.org/std/ops/trait.Shl.html) | `<<` | Binary |
| [Rotate Right](https://doc.rust-lang.org/std/primitive.u32.html#method.rotate_right) | `rotate_right` | Binary |
| [Rotate Left](https://doc.rust-lang.org/std/primitive.u32.html#method.rotate_left) | `rotate_left` | Binary |
| name | symbol | type |
| ------------------------------------------------------------------------------------- | -------------- | ------ |
| [Not](https://doc.rust-lang.org/std/ops/trait.Not.html) | `!` | Unary |
| [BitAnd](https://doc.rust-lang.org/std/ops/trait.BitAnd.html) | `&` | Binary |
| [BitOr](https://doc.rust-lang.org/std/ops/trait.BitOr.html) | `\|` | Binary |
| [BitXor](https://doc.rust-lang.org/std/ops/trait.BitXor.html) | `^` | Binary |
| [Shr](https://doc.rust-lang.org/std/ops/trait.Shr.html) | `>>` | Binary |
| [Shl](https://doc.rust-lang.org/std/ops/trait.Shl.html) | `<<` | Binary |
| [Rotate Right](https://doc.rust-lang.org/std/primitive.u32.html#method.rotate\_right) | `rotate_right` | Binary |
| [Rotate Left](https://doc.rust-lang.org/std/primitive.u32.html#method.rotate\_left) | `rotate_left` | Binary |
A simple example of how to use these operations:
@@ -173,13 +174,13 @@ You will need to use different methods instead of using symbols for the comparis
The list of supported operations is:
| name | symbol | type |
|-----------------------------------------------------------------------------|--------|--------|
| [Equal ](https://doc.rust-lang.org/std/cmp/trait.PartialEq.html) | `eq` | Binary |
| [Not Equal ](https://doc.rust-lang.org/std/cmp/trait.PartialEq.html) | `ne` | Binary |
| [Greater Than ](https://doc.rust-lang.org/std/cmp/trait.PartialOrd.html) | `gt` | Binary |
| --------------------------------------------------------------------------- | ------ | ------ |
| [Equal](https://doc.rust-lang.org/std/cmp/trait.PartialEq.html) | `eq` | Binary |
| [Not Equal](https://doc.rust-lang.org/std/cmp/trait.PartialEq.html) | `ne` | Binary |
| [Greater Than](https://doc.rust-lang.org/std/cmp/trait.PartialOrd.html) | `gt` | Binary |
| [Greater or Equal](https://doc.rust-lang.org/std/cmp/trait.PartialOrd.html) | `ge` | Binary |
| [Lower ](https://doc.rust-lang.org/std/cmp/trait.PartialOrd.html) | `lt` | Binary |
| [Lower or Equal ](https://doc.rust-lang.org/std/cmp/trait.PartialOrd.html) | `le` | Binary |
| [Lower](https://doc.rust-lang.org/std/cmp/trait.PartialOrd.html) | `lt` | Binary |
| [Lower or Equal](https://doc.rust-lang.org/std/cmp/trait.PartialOrd.html) | `le` | Binary |
A simple example of how to use these operations:
@@ -260,10 +261,11 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
```
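As a small sketch (assuming comparisons return an encrypted Boolean, `FheBool`, in this version):
```rust
// Sketch: comparison methods on encrypted integers.
use tfhe::prelude::*;
use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheUint8};

fn main() {
    let (client_key, server_key) = generate_keys(ConfigBuilder::default().build());
    set_server_key(server_key);

    let a = FheUint8::encrypt(17u8, &client_key);
    let b = FheUint8::encrypt(42u8, &client_key);

    let a_lt_b = a.lt(&b); // encrypted Boolean
    let a_eq_b = a.eq(&b);

    let lt: bool = a_lt_b.decrypt(&client_key);
    let eq: bool = a_eq_b.decrypt(&client_key);
    assert!(lt);
    assert!(!eq);
}
```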
### Ternary conditional operator.
The ternary conditional operator allows computing conditional instructions of the form `if cond { choice_if } else { choice_else }`.
| name | symbol | type |
|------------------|----------------|---------|
| ---------------- | -------------- | ------- |
| Ternary operator | `if_then_else` | Ternary |
The syntax is `encrypted_condition.if_then_else(encrypted_choice_if, encrypted_choice_else)`. The `encrypted_condition` should be an encryption of 0 or 1 in order to be valid.
@@ -309,10 +311,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
}
```
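As a small sketch (the condition below comes from an encrypted comparison; the exact reference-taking form of `if_then_else` is assumed, so check the API reference):
```rust
// Sketch: select the larger of two encrypted values with if_then_else.
use tfhe::prelude::*;
use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheUint8};

fn main() {
    let (client_key, server_key) = generate_keys(ConfigBuilder::default().build());
    set_server_key(server_key);

    let a = FheUint8::encrypt(12u8, &client_key);
    let b = FheUint8::encrypt(9u8, &client_key);

    // The condition is itself encrypted (here, the result of a comparison).
    let cond = a.gt(&b);
    let max = cond.if_then_else(&a, &b);

    let clear_max: u8 = max.decrypt(&client_key);
    assert_eq!(clear_max, 12);
}
```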
### Casting.
Casting between integer types is possible via the `cast_from` associated function
or the `cast_into` method.
Casting between integer types is possible via the `cast_from` associated function or the `cast_into` method.
```rust
use tfhe::prelude::*;
@@ -370,7 +371,6 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
}
```
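A minimal sketch of casting (assuming the cast traits are brought in through the prelude):
```rust
// Sketch: widening and narrowing casts between encrypted integer types.
use tfhe::prelude::*;
use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheUint32, FheUint8};

fn main() {
    let (client_key, server_key) = generate_keys(ConfigBuilder::default().build());
    set_server_key(server_key);

    let small = FheUint8::encrypt(200u8, &client_key);

    // Widen 8 -> 32 bits with the associated function...
    let wide = FheUint32::cast_from(small);
    // ...and narrow back with the method (the value is reduced mod 2^8).
    let narrow: FheUint8 = wide.cast_into();

    let dec: u8 = narrow.decrypt(&client_key);
    assert_eq!(dec, 200);
}
```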
## Boolean Operations
Native homomorphic Booleans support common Boolean operations.

View File

@@ -1,15 +1,13 @@
# Tutorial
## Quick Start
# Quick start
The basic steps for using the high-level API of TFHE-rs are:
1. Importing the TFHE-rs prelude;
2. Client-side: Configuring and creating keys;
3. Client-side: Encrypting data;
4. Server-side: Setting the server key;
5. Server-side: Computing over encrypted data;
6. Client-side: Decrypting data.
1. [Importing the TFHE-rs prelude;](quick\_start.md#imports)
2. Client-side: [Configuring and creating keys;](../fundamentals/configure-and-create-keys.md)
3. Client-side: [Encrypting data;](../fundamentals/encrypt-data.md)
4. Server-side: [Setting the server key;](../fundamentals/set-the-server-key.md)
5. Server-side: [Computing over encrypted data;](../fundamentals/compute-and-decrypt.md)
6. Client-side: [Decrypting data.](../fundamentals/compute-and-decrypt.md)
Here is a full example (combining the client and server parts):
@@ -43,94 +41,19 @@ fn main() {
```
The default configuration for x86 Unix machines:
```toml
tfhe = { version = "0.5.0", features = ["integer", "x86_64-unix"]}
tfhe = { version = "0.5.2", features = ["integer", "x86_64-unix"]}
```
Configuration options for different platforms can be found [here](../getting_started/installation.md). Other rust and homomorphic types features can be found [here](../how_to/rust_configuration.md).
Configuration options for different platforms can be found [here](installation.md). Other Rust and homomorphic-type features can be found [here](../guides/rust\_configuration.md).
### Imports.
### Imports
`tfhe` uses `traits` to have a consistent API for creating FHE types and to enable users to write generic functions. To be able to use associated functions and methods of a trait, the trait has to be in scope.
To make it easier, the `prelude` 'pattern' is used. All of the important `tfhe` traits are in a `prelude` module that you can **glob import**. With this, there is no need to remember or know the traits that you want to import.
```rust
use tfhe::prelude::*;
```
### 1. Configuring and creating keys.
The first step is the creation of the configuration. The configuration is used to declare which type you will (or will not) use, as well as enabling you to use custom crypto-parameters for these types. Custom parameters should only be used for more advanced usage and/or testing.
A configuration can be created by using the ConfigBuilder type.
In this example, 8-bit unsigned integers with default parameters are used. The `integers`
feature must also be enabled, as per the table on [this page](../how_to/rust_configuration.md#choosing-your-features).
The config is generated by first creating a builder with all types deactivated. Then, the integer types with default parameters are activated, since we are going to use FheUint8 values.
```rust
use tfhe::{ConfigBuilder, generate_keys};
fn main() {
let config = ConfigBuilder::default().build();
let (client_key, server_key) = generate_keys(config);
}
```
The `generate_keys` command returns a client key and a server key.
The `client_key` is meant to stay private and not leave the client, whereas the `server_key` can be made public and sent to a server for it to enable FHE computations.
### 2. Setting the server key.
The next step is to call `set_server_key`
This function will **move** the server key to an internal state of the crate and manage the details to give a simpler interface.
```rust
use tfhe::{ConfigBuilder, generate_keys, set_server_key};
fn main() {
let config = ConfigBuilder::default().build();
let (client_key, server_key) = generate_keys(config);
set_server_key(server_key);
}
```
### 3. Encrypting data.
Encrypting data is achieved via the `encrypt` associated function of the FheEncrypt trait.
Types exposed by this crate implement at least one of FheEncrypt or FheTryEncrypt to allow encryption.
```Rust
let clear_a = 27u8;
let clear_b = 128u8;
let a = FheUint8::encrypt(clear_a, &client_key);
let b = FheUint8::encrypt(clear_b, &client_key);
```
### 4. Computation and decryption.
Computations should be as easy as normal Rust to write, thanks to the usage of operator overloading.
```Rust
let result = a + b;
```
The decryption is achieved by using the `decrypt` method, which comes from the FheDecrypt trait.
```Rust
let decrypted_result: u8 = result.decrypt(&client_key);
let clear_result = clear_a + clear_b;
assert_eq!(decrypted_result, clear_result);
use tfhe::prelude::*;
```

View File

@@ -0,0 +1,2 @@
# How to count PBS

tfhe/docs/guides/debug.md
View File

@@ -0,0 +1,66 @@
# Debug
Since tfhe-rs 0.5, trivial ciphertexts have another application. They can be used to allow debugging via a debugger or print statements, as well as to speed up execution, so that you won't have to spend minutes waiting for execution to progress.
This can greatly improve the pace at which one develops FHE applications.
{% hint style="warning" %}
Keep in mind that trivial ciphertexts are not secure at all; thus, an application released/deployed in production must never receive trivial ciphertexts from a client.
{% endhint %}
## Example
To use this feature, simply call your circuits/functions with trivially encrypted values (made using `encrypt_trivial`) instead of real encryptions (made using `encrypt`).
```rust
use tfhe::prelude::*;
use tfhe::{set_server_key, generate_keys, ConfigBuilder, FheUint128};
fn mul_all(a: &FheUint128, b: &FheUint128, c: &FheUint128) -> FheUint128 {
// Use the debug format ('{:?}') if you don't want to unwrap()
// and panic when the value is not a trivial ciphertext.
println!(
"a: {:?}, b: {:?}, c: {:?}",
a.try_decrypt_trivial::<u128>(),
b.try_decrypt_trivial::<u128>(),
c.try_decrypt_trivial::<u128>(),
);
let tmp = a * b;
println!("a * b = {:?}", tmp.try_decrypt_trivial::<u128>());
tmp * c
}
fn main() {
let (cks, sks) = generate_keys(ConfigBuilder::default().build());
set_server_key(sks);
let a = FheUint128::encrypt_trivial(1234u128);
let b = FheUint128::encrypt_trivial(4567u128);
let c = FheUint128::encrypt_trivial(89101112u128);
// since all inputs are trivially encrypted, this is going to be
// much faster
let result = mul_all(&a, &b, &c);
}
```
This example is going to print:
```
a: Ok(1234), b: Ok(4567), c: Ok(89101112)
a * b = Ok(5635678)
```
If any input to `mul_all` is not a trivial ciphertext, the computation is done entirely in FHE, and the program outputs:
```
a: Err(NotTrivialCiphertextError), b: Err(NotTrivialCiphertextError), c: Err(NotTrivialCiphertextError)
a * b = Err(NotTrivialCiphertextError)
```
Using trivial encryptions as input, the example runs in **980 ms** on a standard 12-core laptop; using real encryptions, it would run in **7.5 seconds** on a 128-core machine.

View File

@@ -1,3 +1,3 @@
# Migrating Data to TFHE-rs 0.5.0 (This Release)
# Migrating Data to TFHE-rs 0.5.2 (This Release)
Forward compatibility code to migrate data from TFHE-rs 0.4 to TFHE-rs 0.5 has been added in a minor release of TFHE-rs 0.4, the documentation about the process can be found [here](https://docs.zama.ai/tfhe-rs/0.4-1/how-to/migrate_data).
Forward compatibility code to migrate data from TFHE-rs 0.4 to TFHE-rs 0.5 has been added in a minor release of TFHE-rs 0.4; the documentation about the process can be found [here](https://docs.zama.ai/tfhe-rs/v/0.4-1/how-to/migrate_data).

Some files were not shown because too many files have changed in this diff.