Compare commits

...

78 Commits

Author SHA1 Message Date
Agnes Leroy
a01814e991 chore(gpu): change multi-gpu tests to run on rtx so it's cheaper 2024-09-12 14:03:42 +02:00
Agnes Leroy
b5d50cec5a chore(gpu): set test threads to 1 when BIG_INSTANCE is false to get a better view of failures in the ci 2024-09-12 13:24:07 +02:00
Agnes Leroy
c619eb479e chore(gpu): add comment, remove unnecessary sync 2024-09-12 13:24:07 +02:00
Agnes Leroy
4e8bdc4380 chore(gpu): add scalar div and signed scalar div to hl api 2024-09-12 10:49:41 +02:00
Agnes Leroy
1deaaf5249 feat(gpu): signed scalar div 2024-09-12 10:49:41 +02:00
Agnes Leroy
abd2fe1f4e chore(gpu): return if chunk_size is 0 2024-09-12 10:49:41 +02:00
Agnes Leroy
47d671b043 fix(gpu): return early in sum_ct if num radix is 2, pass different pointers to smart copy 2024-09-12 10:40:13 +02:00
Agnes Leroy
f700016776 chore(gpu): fix partial sum ct with 0 or 1 inputs in the vec
Also refactor the interface for Hillis & Steele prefix sum
2024-09-12 09:22:42 +02:00
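For context, a minimal plain-Rust sketch of the Hillis & Steele inclusive prefix sum whose interface this commit refactors (the real implementation runs over encrypted radix blocks on the GPU; this toy version uses clear integers):

fn hillis_steele_scan(input: &[u64]) -> Vec<u64> {
    let mut cur = input.to_vec();
    let mut offset = 1;
    while offset < cur.len() {
        // Double-buffer so every element reads the previous pass, as on the GPU.
        let prev = cur.clone();
        for i in offset..cur.len() {
            cur[i] = prev[i] + prev[i - offset];
        }
        offset *= 2;
    }
    cur
}

fn main() {
    // log2(n) passes, each adding the element 2^k positions to the left.
    assert_eq!(hillis_steele_scan(&[1, 2, 3, 4]), vec![1, 3, 6, 10]);
}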
Agnes Leroy
6fabe6bab0 chore(gpu): fix templates and refactor radix negation 2024-09-12 09:21:54 +02:00
Arthur Meyre
91171c738d chore: bump version of tfhe to 0.8.0-alpha.5 2024-09-11 18:06:25 +02:00
Arthur Meyre
7bf0dc157d chore: bump tfhe-zk-pok version to 0.3.0-alpha.1 2024-09-11 18:06:25 +02:00
Arthur Meyre
0612ef5be5 feat(integer): plug metadata into lower level ZK APIs 2024-09-11 18:06:25 +02:00
Arthur Meyre
aee4c1ed18 feat(shortint): plug metadata API in the lower level ZK APIs 2024-09-11 18:06:25 +02:00
Arthur Meyre
e2a3ef151a feat(core): plug metadata into ZK APIs 2024-09-11 18:06:25 +02:00
Arthur Meyre
6f77bea5e0 feat(zk): add metadata management to v1
- the proof function takes an additional u8 slice which is hashed into the proof
- verification cannot happen without the same metadata being provided
again
2024-09-11 18:06:25 +02:00
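A toy sketch of the metadata-binding behavior described above (illustrative only: `DefaultHasher` stands in for the cryptographic hash used inside the real proof, and none of these names come from the tfhe-zk-pok API):

use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

struct Proof {
    digest: u64,
}

// The prover hashes the caller-supplied metadata into the proof.
fn prove(statement: &[u8], metadata: &[u8]) -> Proof {
    let mut h = DefaultHasher::new();
    statement.hash(&mut h);
    metadata.hash(&mut h);
    Proof { digest: h.finish() }
}

// Verification only succeeds when byte-identical metadata is provided again.
fn verify(proof: &Proof, statement: &[u8], metadata: &[u8]) -> bool {
    prove(statement, metadata).digest == proof.digest
}

fn main() {
    let proof = prove(b"ciphertext bytes", b"session-42");
    assert!(verify(&proof, b"ciphertext bytes", b"session-42"));
    assert!(!verify(&proof, b"ciphertext bytes", b"other-metadata"));
}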
Arthur Meyre
e4f72dab30 chore(ci): make a check for wasm bindings with and without zk-pok 2024-09-11 18:06:25 +02:00
Arthur Meyre
7ed3fded4a chore(ci): the detect handles option from jest is freezing the runner
- trying to find the cause is making the problem worse, reverting
2024-09-11 17:25:40 +02:00
David Testé
488c942a3a refactor(shortint): move parameters set to their own directory
This is done to ease automatic parameter updates.
2024-09-11 13:54:23 +02:00
Mayeul@Zama
c0d98394fa refactor(integer): add compression key types 2024-09-11 13:53:04 +02:00
Mayeul@Zama
93ff6992e2 refactor(all): refactor oprf integer and hl APIs 2024-09-11 10:49:39 +02:00
Pedro Alves
2a4026c761 fix(gpu): fix some edge-cases (and booleans) on compression 2024-09-10 23:11:20 +02:00
Pedro Alves
39c424b14d chore(gpu): add debug/release modes 2024-09-09 14:02:10 +02:00
Guillermo Oyarzun
46a7a3b43b refactor(gpu): avoid synchronizations in the keybundle 2024-09-09 14:01:15 +02:00
Mayeul@Zama
38b5759e88 chore(all): fix new lints 2024-09-09 11:57:45 +02:00
Mayeul@Zama
d6f8e59394 chore(all): update toolchain 2024-09-09 11:57:45 +02:00
dependabot[bot]
a95db07003 chore(deps): bump tj-actions/changed-files from 45.0.0 to 45.0.1
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 45.0.0 to 45.0.1.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](40853de9f8...e9772d1404)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-09 11:05:21 +02:00
David Testé
6544e6f6a3 chore(ci): use python script to send benchmark results
Using this script simplifies writing the corresponding workflow step.
Moreover, a failed upload now translates into a workflow
failure.
2024-09-09 11:04:06 +02:00
Agnes Leroy
1d549dfd8a chore(gpu): pass over all cuda bind 2024-09-06 17:47:59 +02:00
Arthur Meyre
019548daa5 chore(ci): add a flag to jest to indicate what might be stuck when running 2024-09-06 17:41:22 +02:00
Arthur Meyre
26b666955a chore(ci): timeout wasm bench and test at the GitHub runner level
- avoids a stuck runner for 6 hours
- actions timeouts are slightly larger than the test runner timeout to
have a chance to get a log out
2024-09-06 17:41:22 +02:00
Arthur Meyre
ce9da12e65 feat(zk): implement faster pke proof
- original work by Sarah El kazdadi

co-authored-by: sarah el kazdadi <sarah.elkazdadi@zama.ai>
2024-09-06 14:25:57 +02:00
Arthur Meyre
32b45ac4bc chore(js): increase timeout for ZK test as it can be surpassed
- this seemed to cause the test runner to hang forever
- also add a timeout in the GitHub workflow, to avoid having the test
runner wait forever (or in this case 6 hours because of default timeout)
2024-09-06 14:19:07 +02:00
Arthur Meyre
26055b236e feat(tfhe): allow unpacking packed compact ciphertext lists in js/wasm 2024-09-06 14:19:07 +02:00
Agnes Leroy
ce9e355c15 chore(gpu): reduce the amount of weekly multi-gpu bench 2024-09-06 11:55:34 +02:00
tmontaigu
85cc638c62 chore(gpu): fix bad merge 2024-09-06 10:21:00 +02:00
Agnes Leroy
d454b5386b chore(gpu): remove device synchronization in drop for CudaVec 2024-09-05 14:13:06 +02:00
tmontaigu
426f3bd192 feat(hlapi): add tag system
Tag

The `Tag` allows storing bytes alongside entities (keys and ciphertexts);
the main purpose of this system is to `tag` / identify ciphertexts with their keys.

* When encrypted, a ciphertext gets the tag of the key used to encrypt it.
* Ciphertexts resulting from operations (add, sub, etc.) get the tag from the ServerKey used
* PublicKey gets its tag from the ClientKey that was used to create it
* ServerKey gets its tag from the ClientKey that was used to create it

Users can change the tag of any entity at any point.

BREAKING CHANGE: Many of the into_raw_parts and from_raw_parts signatures changed
to accommodate the addition of the `tag`.
2024-09-05 10:32:35 +02:00
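A toy model of the propagation rules listed above (illustrative only; the real `Tag` lives in the tfhe-rs hlapi and the types here are simplified stand-ins):

#[derive(Clone, Debug, PartialEq)]
struct Tag(Vec<u8>);

struct ClientKey { tag: Tag }
struct ServerKey { tag: Tag }
struct Ciphertext { tag: Tag }

impl ClientKey {
    // A ServerKey gets its tag from the ClientKey that created it.
    fn generate_server_key(&self) -> ServerKey {
        ServerKey { tag: self.tag.clone() }
    }
    // When encrypted, a ciphertext gets the tag of the encrypting key.
    fn encrypt(&self, _value: u64) -> Ciphertext {
        Ciphertext { tag: self.tag.clone() }
    }
}

impl ServerKey {
    // Ciphertexts resulting from operations get the tag of the ServerKey used.
    fn add(&self, _lhs: &Ciphertext, _rhs: &Ciphertext) -> Ciphertext {
        Ciphertext { tag: self.tag.clone() }
    }
}

fn main() {
    let ck = ClientKey { tag: Tag(b"tenant-1".to_vec()) };
    let sk = ck.generate_server_key();
    let ct = ck.encrypt(3);
    assert_eq!(sk.add(&ct, &ct).tag, ct.tag);
}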
tmontaigu
4c707e79d8 feat(hlapi): bind cuda's trailing/leading_ones/zeros, ilog2 2024-09-04 19:38:14 +02:00
Arthur Meyre
e1afb8126d chore: bump version to 0.8.0-alpha.4 2024-09-04 17:30:43 +02:00
Agnes Leroy
0d1ef0af7e chore(gpu): add ilog2 bench 2024-09-04 17:03:20 +02:00
Arthur Meyre
15e3474cda feat(pbs): slightly improve f64 pbs perf
co-authored-by: sarah el kazdadi <sarah.elkazdadi@zama.ai>
2024-09-03 19:31:14 +02:00
Arthur Meyre
10be6f9423 chore(ci): update node project packages 2024-09-03 17:14:36 +02:00
David Testé
c521c2ca2e chore(ci): avoid running integer tests on push to internal repo 2024-09-03 15:29:15 +02:00
David Testé
39c46056f6 chore(ci): rename benchmark workflows to ease file navigation 2024-09-03 10:34:14 +02:00
Pedro Alves
aa2b27460c fix(gpu): update the internal benchmark tool for the TBC pbs 2024-09-02 13:16:18 +02:00
dependabot[bot]
c258d53625 chore(deps): bump actions/upload-artifact from 4.3.6 to 4.4.0
Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.3.6 to 4.4.0.
- [Release notes](https://github.com/actions/upload-artifact/releases)
- [Commits](834a144ee9...50769540e7)

---
updated-dependencies:
- dependency-name: actions/upload-artifact
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-02 09:34:34 +02:00
tmontaigu
8ddee20a57 feat(tfhe): add get_kind_of to CompactCiphertextList
This adds the ability to query the length and types
contained in a CompactCiphertextList and ProvenCompactCiphertextList
without having to expand or verify the proof(s)
2024-08-30 21:01:46 +02:00
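A hypothetical usage sketch (the method name comes from the commit title; the import path, `len`, and the return type are assumptions about the tfhe API, not confirmed signatures):

use tfhe::CompactCiphertextList;

fn inspect(list: &CompactCiphertextList) {
    // Both calls work on the packed list directly, with no expansion
    // and no proof verification.
    for i in 0..list.len() {
        println!("slot {i}: {:?}", list.get_kind_of(i));
    }
}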
tmontaigu
1d786b7202 feat(wasm): bind CompactListExpander::get_kind_of
And other things to allow wasm users to explore
what kind of data is stored in the compact list.
2024-08-30 21:01:46 +02:00
tmontaigu
7267d60e01 feat(integer): implement unsigned_overflowing_scalar_sub 2024-08-29 19:09:48 +02:00
Arthur Meyre
0148a6ffc8 chore(tfhe): update dependencies with breaking changes
- concrete-fft to 0.5 and concrete-ntt to 0.2.0 due to a Rust AVX512 breaking
change (fix for bad args in a function)
- dyn-stack to 0.10 due to concrete-fft update
2024-08-29 17:36:19 +02:00
tmontaigu
63571a07ae feat(integer): add is_even/is_odd functions
These are pretty simple, so they are also implemented directly for the GPU
2024-08-29 14:24:40 +02:00
Arthur Meyre
6e2908ad4e chore(bench): fix CRS size for integer ZK bench 2024-08-29 09:41:35 +02:00
sarah el kazdadi
d3d06c905f feat(tfhe): replace asm with rust intrinsics 2024-08-29 09:41:20 +02:00
Arthur Meyre
051f33f166 chore(hl): remove second server key generation
- bad merge led to two server key generations in the HL API, fix that
2024-08-28 15:25:35 +02:00
Mayeul@Zama
11a8f97a1c chore(all): use destructuring in conformance 2024-08-26 17:28:05 +02:00
tmontaigu
35a9c323a7 chore(integer): make remaining non-parallel test use defined test cases
This makes the remaining non-parallel ops implementations use the same
test cases that are used for parallel implementations.

There are still some tests that do not share the test cases, but that is either
because they do not have a parallel impl (not interesting to have) or
because they are tests about encryption/decryption.

Closes https://github.com/zama-ai/tfhe-rs-internal/issues/265
2024-08-26 10:13:11 +02:00
dependabot[bot]
641f47b775 chore(deps): bump tj-actions/changed-files from 44.5.7 to 45.0.0
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 44.5.7 to 45.0.0.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](c65cd88342...40853de9f8)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-08-26 09:52:06 +02:00
tmontaigu
456d0ced1b chore(integer): addition test based on trivial inputs
This adds `overflowing_add` and `add` tests that
run on trivial inputs. As these are faster to run, they
can be more extensive than tests on true encryptions.

This also updates the advanced_add_assign function tests
to include overflow computation.

On a standard laptop with 1 test thread it takes ~7 minutes
to run these trivial tests.
2024-08-23 16:28:40 +02:00
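A sketch of the trivial-input testing idea (illustrative, not the actual test code: `create_trivial_radix` and `add_parallelized` are integer ServerKey methods, while `decrypt_trivial` is an assumed helper for reading back a trivial ciphertext without a ClientKey):

use tfhe::integer::{RadixCiphertext, ServerKey};

fn exhaustive_trivial_add_check(sk: &ServerKey) {
    let num_blocks = 4; // four 2-bit message blocks -> 8-bit values
    for lhs in 0u64..256 {
        for rhs in 0u64..256 {
            // Trivial "encryptions" carry no noise and skip the expensive
            // crypto, so every input pair can be covered.
            let a: RadixCiphertext = sk.create_trivial_radix(lhs, num_blocks);
            let b: RadixCiphertext = sk.create_trivial_radix(rhs, num_blocks);
            let sum = sk.add_parallelized(&a, &b);
            // Assumed helper: read back the trivial result in the clear.
            let clear: u64 = sum.decrypt_trivial().unwrap();
            assert_eq!(clear, (lhs + rhs) % 256);
        }
    }
}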
tmontaigu
358bcc9a22 feat(integer): implement sub_assign_with_borrow
To get the same kind of speed-ups for unsigned overflowing sub
as we got in previous commits that changed the carry propagation
algorithm.
2024-08-21 09:56:40 +02:00
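The borrow-based scheme, sketched on plain integers (each u8 holds one little-endian radix block; the real code performs the same propagation homomorphically):

fn sub_with_borrow(lhs: &[u8], rhs: &[u8], bits_per_block: u32) -> (Vec<u8>, bool) {
    let modulus = 1u16 << bits_per_block;
    let mut out = Vec::with_capacity(lhs.len());
    let mut borrow = 0u16;
    for (&l, &r) in lhs.iter().zip(rhs) {
        // Add the block modulus up front so the subtraction cannot underflow.
        let diff = (l as u16 + modulus) - r as u16 - borrow;
        out.push((diff % modulus) as u8);
        borrow = u16::from(diff < modulus); // borrow out of this block
    }
    // The final borrow is exactly the unsigned overflow flag.
    (out, borrow == 1)
}

fn main() {
    // 5 - 7 on 2-bit blocks (little-endian): wraps to 14 = 0b1110, overflow = true.
    let (res, overflow) = sub_with_borrow(&[1, 1], &[3, 1], 2);
    assert_eq!(res, &[2, 3]);
    assert!(overflow);
}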
Pedro Alves
27a4564d83 fix(gpu): fix compression benchmarking 2024-08-20 17:46:20 -03:00
Arthur Meyre
296e419f6c chore(ci): update tfhe-lints to more recent toolchain 2024-08-20 13:02:12 +02:00
Arthur Meyre
e1a25a10ac chore(docs): fix README link to getting started 2024-08-19 15:35:52 +02:00
Arthur Meyre
d9349b3357 chore(ci): update nightly toolchain 2024-08-19 15:35:52 +02:00
Arthur Meyre
68e4ac4896 chore(ci): fix lints for new nightly toolchain 2024-08-19 15:35:52 +02:00
tmontaigu
3f318a2046 feat(wasm): add missing push_u{512,1024,2048}
This adds the missing push functions for some big
uint types that the fhEVM needs
2024-08-19 10:12:53 +02:00
tmontaigu
d1380794ed chore(tfhe): bump version to 0.8.0-alpha.3 2024-08-19 10:12:53 +02:00
Pedro Alves
fe5641ef6d feat(gpu): implement CUDA-based Radix Integer compression and public functional packing keyswitch 2024-08-16 15:44:34 -03:00
Arthur Meyre
3397aa81d2 chore(ci): update node to 22.6 2024-08-14 13:42:14 +02:00
Arthur Meyre
8f10f8f8db chore(ci): reduce bench loops for WASM compressed server key
- excessive loops seemed to trigger a crash likely due to some memory
exhaustion/fragmentation
2024-08-14 13:42:14 +02:00
Arthur Meyre
92be95c6b8 chore(ci): fix parsing for integer benchmarks 2024-08-14 13:42:14 +02:00
Arthur Meyre
990c4d0380 chore(ci): do not run all steps on slow runners 2024-08-14 13:42:14 +02:00
Arthur Meyre
1d5abfd5ea chore(ci): do not run tests nightly, on push only if relevant files changed 2024-08-14 13:42:14 +02:00
Arthur Meyre
dfd1beeb47 chore(ci): avoid concurrency lock for PKE ZK benchmarks
- sharing a concurrency group on merge to main means two sequential merges
will lock the second one while it waits for the first to complete
2024-08-14 13:42:14 +02:00
Arthur Meyre
43a007a2fa chore(ci): make sure the newline linter runs 2024-08-14 13:42:14 +02:00
Arthur Meyre
54faf64ecd chore(tfhe): bump tfhe-versionable version to 0.2.1 2024-08-14 13:17:21 +02:00
Arthur Meyre
8fe7f9c3cb chore(ci): add workflow to publish tfhe-versionable 2024-08-14 13:17:21 +02:00
Arthur Meyre
9ed65db03d chore(ci): csprng release workflow misc fixes 2024-08-14 13:17:21 +02:00
tmontaigu
9413d3e722 feat(integer): improve {overflowing_}scalar_add/sub 2024-08-14 12:30:53 +02:00
283 changed files with 14896 additions and 5478 deletions

View File

@@ -56,7 +56,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
with:
since_last_remote_commit: true
files_yaml: |

View File

@@ -19,21 +19,53 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled ]
types: [labeled]
push:
branches:
- main
schedule:
# Nightly tests @ 3AM after each work day
- cron: "0 3 * * MON-FRI"
jobs:
should-run:
if:
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
github.event_name == 'workflow_dispatch'
runs-on: ubuntu-latest
permissions:
pull-requests: write
outputs:
integer_test: ${{ github.event_name == 'workflow_dispatch' ||
steps.changed-files.outputs.integer_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
persist-credentials: "false"
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
with:
since_last_remote_commit: true
files_yaml: |
integer:
- tfhe/Cargo.toml
- concrete-csprng/**
- tfhe-zk-pok/**
- tfhe/src/core_crypto/**
- tfhe/src/shortint/**
- tfhe/src/integer/**
setup-instance:
name: Setup instance (unsigned-integer-tests)
if: (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
github.event_name == 'workflow_dispatch'
needs: should-run
if:
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
github.event_name == 'workflow_dispatch'
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
@@ -60,7 +92,7 @@ jobs:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
persist-credentials: "false"
- name: Set up home
run: |
@@ -103,7 +135,7 @@ jobs:
teardown-instance:
name: Teardown instance (unsigned-integer-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, unsigned-integer-tests ]
needs: [setup-instance, unsigned-integer-tests]
runs-on: ubuntu-latest
steps:
- name: Stop instance

View File

@@ -19,21 +19,53 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled ]
types: [labeled]
push:
branches:
- main
schedule:
# Nightly tests @ 3AM after each work day
- cron: "0 3 * * MON-FRI"
jobs:
should-run:
if:
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
github.event_name == 'workflow_dispatch'
runs-on: ubuntu-latest
permissions:
pull-requests: write
outputs:
integer_test: ${{ github.event_name == 'workflow_dispatch' ||
steps.changed-files.outputs.integer_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
persist-credentials: "false"
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
with:
since_last_remote_commit: true
files_yaml: |
integer:
- tfhe/Cargo.toml
- concrete-csprng/**
- tfhe-zk-pok/**
- tfhe/src/core_crypto/**
- tfhe/src/shortint/**
- tfhe/src/integer/**
setup-instance:
name: Setup instance (signed-integer-tests)
if: (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
github.event_name == 'workflow_dispatch'
name: Setup instance (unsigned-integer-tests)
needs: should-run
if:
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
github.event_name == 'workflow_dispatch'
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
@@ -60,7 +92,7 @@ jobs:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
persist-credentials: "false"
- name: Set up home
run: |
@@ -107,7 +139,7 @@ jobs:
teardown-instance:
name: Teardown instance (signed-integer-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, signed-integer-tests ]
needs: [setup-instance, signed-integer-tests]
runs-on: ubuntu-latest
steps:
- name: Stop instance

View File

@@ -63,7 +63,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
with:
since_last_remote_commit: true
files_yaml: |

View File

@@ -71,6 +71,8 @@ jobs:
make test_nodejs_wasm_api_in_docker
- name: Run parallel wasm tests
# test timeouts are at 60 but if we want a log we need to give time to the step to log stuff
timeout-minutes: 65
run: |
make test_web_js_api_parallel_ci

View File

@@ -98,7 +98,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_boolean
path: ${{ env.RESULTS_FILENAME }}
@@ -113,16 +113,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
- name: Slack Notification
if: ${{ failure() }}

View File

@@ -86,7 +86,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
@@ -101,16 +101,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on downloaded artifact"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
- name: Slack Notification
if: ${{ failure() }}

View File

@@ -82,7 +82,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_integer_multi_bit_gpu_default
path: ${{ env.RESULTS_FILENAME }}
@@ -90,16 +90,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
- name: Slack Notification
if: ${{ always() }}
@@ -164,7 +156,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -128,7 +128,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
@@ -143,16 +143,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on downloaded artifact"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
slack-notify:
name: Slack Notification

View File

@@ -124,7 +124,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -144,7 +144,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
@@ -159,16 +159,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
slack-notify:
name: Slack Notification

View File

@@ -144,7 +144,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
@@ -152,16 +152,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
slack-notify:
name: Slack Notification

View File

@@ -147,7 +147,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -167,7 +167,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
@@ -182,17 +182,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
slack-notify:
name: Slack Notification

View File

@@ -164,7 +164,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
@@ -172,16 +172,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
slack-notify:
name: Slack Notification

View File

@@ -39,7 +39,7 @@ jobs:
profile: multi-h100
cuda-integer-full-multi-gpu-benchmarks:
name: Execute multi GPU integer benchmarks for all operations flavor
name: Execute multi GPU integer benchmarks
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
timeout-minutes: 1440 # 24 hours
@@ -48,8 +48,8 @@ jobs:
fail-fast: false
max-parallel: 1
matrix:
command: [integer, integer_multi_bit]
op_flavor: [default, unchecked]
command: [integer_multi_bit]
op_flavor: [default]
# explicit include-based build matrix, of known valid options
include:
- os: ubuntu-22.04
@@ -144,7 +144,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
@@ -152,16 +152,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
slack-notify:
name: Slack Notification

View File

@@ -139,7 +139,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
@@ -147,16 +147,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
- name: Slack Notification
if: ${{ failure() }}

View File

@@ -141,7 +141,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
@@ -149,16 +149,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
- name: Slack Notification
if: ${{ failure() }}

View File

@@ -139,7 +139,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
@@ -147,16 +147,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
- name: Slack Notification
if: ${{ failure() }}

View File

@@ -39,7 +39,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
with:
since_last_remote_commit: true
files_yaml: |
@@ -103,6 +103,8 @@ jobs:
toolchain: nightly
- name: Run benchmarks
# test timeouts are at 60 but if we want a log we need to give time to the step to log stuff
timeout-minutes: 65
run: |
make install_node
make bench_web_js_api_parallel_ci
@@ -130,7 +132,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_wasm
path: ${{ env.RESULTS_FILENAME }}
@@ -145,16 +147,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
- name: Slack Notification
if: ${{ failure() }}

View File

@@ -36,7 +36,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
with:
since_last_remote_commit: true
files_yaml: |
@@ -79,7 +79,7 @@ jobs:
if: needs.setup-instance.result != 'skipped'
needs: setup-instance
concurrency:
group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
@@ -138,7 +138,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_integer_zk
path: ${{ env.RESULTS_FILENAME }}
@@ -153,16 +153,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
- name: Slack Notification
if: ${{ !success() && !cancelled() }}

View File

@@ -19,14 +19,21 @@ jobs:
strategy:
matrix:
os: [large_ubuntu_16, macos-latest-large, large_windows_16_latest]
# GitHub macos-latest are now M1 macs, so use ours, we limit what runs so it will be fast
# even with a few PRs
os: [large_ubuntu_16, macos-latest, windows-latest]
fail-fast: false
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
with:
toolchain: stable
- name: Install and run newline linter checks
if: matrix.os == 'ubuntu-latest'
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
wget https://github.com/fernandrone/linelint/releases/download/0.0.6/linelint-linux-amd64
echo "16b70fb7b471d6f95cbdc0b4e5dc2b0ac9e84ba9ecdc488f7bdf13df823aca4b linelint-linux-amd64" > checksum
@@ -36,27 +43,33 @@ jobs:
make check_newline
- name: Run pcc checks
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make pcc
- name: Build concrete-csprng
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_concrete_csprng
- name: Build Release core
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_core AVX512_SUPPORT=ON
make build_core_experimental AVX512_SUPPORT=ON
- name: Build Release boolean
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_boolean
- name: Build Release shortint
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_shortint
- name: Build Release integer
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_integer
@@ -65,10 +78,12 @@ jobs:
make build_tfhe_full
- name: Build Release c_api
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_c_api
- name: Build coverage tests
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_tfhe_coverage

View File

@@ -57,7 +57,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
with:
files_yaml: |
tfhe:

View File

@@ -34,7 +34,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
with:
since_last_remote_commit: true
files_yaml: |
@@ -146,7 +146,8 @@ jobs:
- name: Run core crypto and internal CUDA backend tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
BIG_TESTS_INSTANCE=FALSE make test_core_crypto_gpu
BIG_TESTS_INSTANCE=FALSE make test_integer_compression_gpu
BIG_TESTS_INSTANCE=TRUE make test_cuda_backend
- name: Run user docs tests

View File

@@ -33,7 +33,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
with:
since_last_remote_commit: true
files_yaml: |
@@ -145,6 +145,7 @@ jobs:
- name: Run core crypto and internal CUDA backend tests
run: |
make test_core_crypto_gpu
make test_integer_compression_gpu
make test_cuda_backend
- name: Run user docs tests

View File

@@ -34,7 +34,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
with:
since_last_remote_commit: true
files_yaml: |
@@ -144,6 +144,10 @@ jobs:
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run multi-bit CUDA integer compression tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
# No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
- name: Run multi-bit CUDA integer tests
run: |

View File

@@ -34,7 +34,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
with:
since_last_remote_commit: true
files_yaml: |

View File

@@ -42,7 +42,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
with:
since_last_remote_commit: true
files_yaml: |

View File

@@ -34,7 +34,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
with:
since_last_remote_commit: true
files_yaml: |

View File

@@ -41,7 +41,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@e9772d140489982e0e3704fea5ee93d536f1e275
with:
since_last_remote_commit: true
files_yaml: |

View File

@@ -42,7 +42,7 @@ jobs:
- name: Prepare package
run: |
cargo package -p tfhe
- uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: crate
path: target/package/*.crate

View File

@@ -1,4 +1,3 @@
# Publish new release of tfhe-rs on various platform.
name: Publish concrete-csprng release
on:
@@ -37,6 +36,6 @@ jobs:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "concrete-csprng release failed: (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "concrete-csprng release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -0,0 +1,36 @@
name: Publish tfhe-versionable release
on:
workflow_dispatch:
env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
jobs:
publish_release:
name: Publish tfhe-versionable Release
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Publish crate.io package
env:
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
run: |
cargo publish -p tfhe-versionable-derive --token ${{ env.CRATES_TOKEN }}
cargo publish -p tfhe-versionable --token ${{ env.CRATES_TOKEN }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "tfhe-versionable release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -18,7 +18,7 @@ FAST_TESTS?=FALSE
FAST_BENCH?=FALSE
NIGHTLY_TESTS?=FALSE
BENCH_OP_FLAVOR?=DEFAULT
NODE_VERSION=22.4
NODE_VERSION=22.6
FORWARD_COMPAT?=OFF
BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
BACKWARD_COMPAT_DATA_BRANCH?=v0.1
@@ -284,6 +284,9 @@ clippy_c_api: install_rs_check_toolchain
.PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
clippy_js_wasm_api: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok \
-p $(TFHE_SPEC) -- --no-deps -D warnings
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api \
-p $(TFHE_SPEC) -- --no-deps -D warnings
@@ -481,6 +484,13 @@ test_integer_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
.PHONY: test_integer_compression_gpu
test_integer_compression_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compressed_ciphertext_list::tests::test_gpu_ciphertext_compression
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compress
.PHONY: test_integer_gpu_ci # Run the tests for integer ci on gpu backend
test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
@@ -883,6 +893,12 @@ bench_integer_gpu: install_rs_check_toolchain
--bench integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
.PHONY: bench_integer_compression_gpu
bench_integer_compression_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench glwe_packing_compression-integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) --
.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
bench_integer_multi_bit: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \

View File

@@ -159,7 +159,7 @@ To run this code, use the following command:
> Note that when running code that uses `TFHE-rs`, it is highly recommended
to run in release mode with cargo's `--release` flag to have the best performances possible.
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/getting-started/quick_start)*
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick_start)*
<p align="right">
<a href="#about" > ↑ Back to top </a>

View File

@@ -148,10 +148,9 @@ where
/// Computes one turn of the stream, updating registers and outputting the new bit.
pub fn next_bool(&mut self) -> T {
match &self.fhe_key {
Some(sk) => set_server_key(sk.clone()),
None => (),
};
if let Some(sk) = &self.fhe_key {
set_server_key(sk.clone());
}
let [o, a, b, c] = self.get_output_and_values(0);
@@ -226,18 +225,12 @@ where
/// Computes 64 turns of the stream, outputting the 64 bits all at once in a
/// Vec (first value is oldest, last is newest)
pub fn next_64(&mut self) -> Vec<T> {
match &self.fhe_key {
Some(sk) => {
rayon::broadcast(|_| set_server_key(sk.clone()));
}
None => (),
if let Some(sk) = &self.fhe_key {
rayon::broadcast(|_| set_server_key(sk.clone()));
}
let mut values = self.get_64_output_and_values();
match &self.fhe_key {
Some(_) => {
rayon::broadcast(|_| unset_server_key());
}
None => (),
if self.fhe_key.is_some() {
rayon::broadcast(|_| unset_server_key());
}
let mut ret = Vec::<T>::with_capacity(64);

View File

@@ -237,18 +237,12 @@ where
/// Computes 64 turns of the stream, outputting the 64 bits (in 8 bytes) all at once in a
/// Vec (first value is oldest, last is newest)
pub fn next_64(&mut self) -> Vec<T> {
match &self.fhe_key {
Some(sk) => {
rayon::broadcast(|_| set_server_key(sk.clone()));
}
None => (),
if let Some(sk) = &self.fhe_key {
rayon::broadcast(|_| set_server_key(sk.clone()));
}
let values = self.get_64_output_and_values();
match &self.fhe_key {
Some(_) => {
rayon::broadcast(|_| unset_server_key());
}
None => (),
if self.fhe_key.is_some() {
rayon::broadcast(|_| unset_server_key());
}
let mut bytes = Vec::<T>::with_capacity(8);

View File

@@ -1,3 +1,5 @@
#![allow(clippy::too_long_first_doc_paragraph)]
mod static_deque;
mod kreyvium;

View File

@@ -120,10 +120,9 @@ where
/// Computes one turn of the stream, updating registers and outputting the new bit.
pub fn next_bool(&mut self) -> T {
match &self.fhe_key {
Some(sk) => set_server_key(sk.clone()),
None => (),
};
if let Some(sk) = &self.fhe_key {
set_server_key(sk.clone());
}
let [o, a, b, c] = self.get_output_and_values(0);
@@ -196,18 +195,12 @@ where
/// Computes 64 turns of the stream, outputting the 64 bits all at once in a
/// Vec (first value is oldest, last is newest)
pub fn next_64(&mut self) -> Vec<T> {
match &self.fhe_key {
Some(sk) => {
rayon::broadcast(|_| set_server_key(sk.clone()));
}
None => (),
if let Some(sk) = &self.fhe_key {
rayon::broadcast(|_| set_server_key(sk.clone()));
}
let mut values = self.get_64_output_and_values();
match &self.fhe_key {
Some(_) => {
rayon::broadcast(|_| unset_server_key());
}
None => (),
if self.fhe_key.is_some() {
rayon::broadcast(|_| unset_server_key());
}
let mut ret = Vec::<T>::with_capacity(64);

View File

@@ -187,18 +187,12 @@ where
/// Computes 64 turns of the stream, outputting the 64 bits (in 8 bytes) all at once in a
/// Vec (first value is oldest, last is newest)
pub fn next_64(&mut self) -> Vec<T> {
match &self.fhe_key {
Some(sk) => {
rayon::broadcast(|_| set_server_key(sk.clone()));
}
None => (),
if let Some(sk) = &self.fhe_key {
rayon::broadcast(|_| set_server_key(sk.clone()));
}
let values = self.get_64_output_and_values();
match &self.fhe_key {
Some(_) => {
rayon::broadcast(|_| unset_server_key());
}
None => (),
if self.fhe_key.is_some() {
rayon::broadcast(|_| unset_server_key());
}
let mut bytes = Vec::<T>::with_capacity(8);

View File

@@ -67,9 +67,21 @@ endif()
add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})
# Check if the DEBUG flag is defined
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# Debug mode
message("Compiling in Debug mode")
add_definitions(-DDEBUG)
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -G -g")
else()
# Release mode
message("Compiling in Release mode")
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O3")
endif()
# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
set(CMAKE_CUDA_FLAGS
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 \
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
--use_fast_math -Xcompiler -fPIC")

View File

@@ -0,0 +1,159 @@
#ifndef CUDA_INTEGER_COMPRESSION_H
#define CUDA_INTEGER_COMPRESSION_H
#include "integer.h"
extern "C" {
void scratch_cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
bool allocate_gpu_memory);
void scratch_cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
bool allocate_gpu_memory);
void cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
int8_t *mem_ptr);
void cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *glwe_in, void *indexes_array,
uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr);
void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void cleanup_cuda_integer_decompress_radix_ciphertext_64(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
}
template <typename Torus> struct int_compression {
int_radix_params compression_params;
uint32_t storage_log_modulus;
uint32_t lwe_per_glwe;
uint32_t body_count;
// Compression
int8_t *fp_ks_buffer;
Torus *tmp_lwe;
Torus *tmp_glwe_array_out;
int_compression(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
this->compression_params = compression_params;
this->lwe_per_glwe = lwe_per_glwe;
this->storage_log_modulus = storage_log_modulus;
this->body_count = num_radix_blocks;
if (allocate_gpu_memory) {
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
tmp_lwe = (Torus *)cuda_malloc_async(
num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
tmp_glwe_array_out = (Torus *)cuda_malloc_async(
glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]);
scratch_packing_keyswitch_lwe_list_to_glwe_64(
streams[0], gpu_indexes[0], &fp_ks_buffer,
compression_params.glwe_dimension, compression_params.polynomial_size,
num_radix_blocks, true);
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_async(tmp_lwe, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_glwe_array_out, streams[0], gpu_indexes[0]);
cleanup_packing_keyswitch_lwe_list_to_glwe(streams[0], gpu_indexes[0],
&fp_ks_buffer);
}
};
template <typename Torus> struct int_decompression {
int_radix_params encryption_params;
int_radix_params compression_params;
uint32_t storage_log_modulus;
uint32_t num_lwes;
uint32_t body_count;
Torus *tmp_extracted_glwe;
Torus *tmp_extracted_lwe;
int_radix_lut<Torus> *carry_extract_lut;
int_decompression(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params encryption_params,
int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t body_count,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
this->encryption_params = encryption_params;
this->compression_params = compression_params;
this->storage_log_modulus = storage_log_modulus;
this->num_lwes = num_radix_blocks;
this->body_count = body_count;
if (allocate_gpu_memory) {
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
carry_extract_lut = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, 1,
num_radix_blocks, allocate_gpu_memory);
tmp_extracted_glwe = (Torus *)cuda_malloc_async(
glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]);
tmp_extracted_lwe = (Torus *)cuda_malloc_async(
num_radix_blocks *
(compression_params.glwe_dimension *
compression_params.polynomial_size +
1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
// Decompression
// Carry extract LUT
auto carry_extract_f = [encryption_params](Torus x) -> Torus {
return x / encryption_params.message_modulus;
};
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0],
carry_extract_lut->get_lut(gpu_indexes[0], 0),
encryption_params.glwe_dimension, encryption_params.polynomial_size,
encryption_params.message_modulus, encryption_params.carry_modulus,
carry_extract_f);
carry_extract_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_async(tmp_extracted_glwe, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]);
carry_extract_lut->release(streams, gpu_indexes, gpu_count);
delete (carry_extract_lut);
}
};
#endif

View File

@@ -39,10 +39,6 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
bool cuda_check_support_cooperative_groups();
bool cuda_check_support_thread_block_clusters();
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
@@ -62,9 +58,13 @@ void cuda_synchronize_device(uint32_t gpu_index);
void cuda_drop(void *ptr, uint32_t gpu_index);
void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
}
int cuda_get_max_shared_memory(uint32_t gpu_index);
}
bool cuda_check_support_cooperative_groups();
bool cuda_check_support_thread_block_clusters();
template <typename Torus>
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,

View File

@@ -8,7 +8,7 @@ extern std::mutex m;
extern bool p2p_enabled;
extern "C" {
int cuda_setup_multi_gpu();
int32_t cuda_setup_multi_gpu();
}
// Define a variant type that can be either a vector or a single pointer

View File

@@ -1,6 +1,7 @@
#ifndef CUDA_INTEGER_H
#define CUDA_INTEGER_H
#include "keyswitch.h"
#include "pbs/programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.h"
@@ -15,7 +16,6 @@ enum SHIFT_OR_ROTATE_TYPE {
LEFT_ROTATE = 2,
RIGHT_ROTATE = 3
};
enum LUT_TYPE { OPERATOR = 0, MAXVALUE = 1, ISNONZERO = 2, BLOCKSLEN = 3 };
enum BITOP_TYPE {
BITAND = 0,
BITOR = 1,
@@ -112,10 +112,11 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void);
void cuda_negate_integer_radix_ciphertext_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus);
void cuda_negate_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus);
void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
@@ -385,8 +386,8 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
void cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
void **bsks, uint32_t num_blocks, uint32_t shift);
void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
void **ksks, void **bsks, uint32_t num_blocks, uint32_t shift);
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -475,7 +476,8 @@ struct int_radix_params {
message_modulus(message_modulus), carry_modulus(carry_modulus){};
void print() {
printf("pbs_type: %u, glwe_dimension: %u, polynomial_size: %u, "
printf("pbs_type: %u, glwe_dimension: %u, "
"polynomial_size: %u, "
"big_lwe_dimension: %u, "
"small_lwe_dimension: %u, ks_level: %u, ks_base_log: %u, pbs_level: "
"%u, pbs_base_log: "
@@ -812,7 +814,6 @@ template <typename Torus> struct int_radix_lut {
}
}
};
template <typename Torus> struct int_bit_extract_luts_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
@@ -1356,6 +1357,7 @@ template <typename Torus> struct int_overflowing_sub_memory {
template <typename Torus> struct int_sum_ciphertexts_vec_memory {
Torus *new_blocks;
Torus *new_blocks_copy;
Torus *old_blocks;
Torus *small_lwe_vector;
int_radix_params params;
@@ -1383,6 +1385,9 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
new_blocks = (Torus *)cuda_malloc_async(
max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0]);
new_blocks_copy = (Torus *)cuda_malloc_async(
max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0]);
old_blocks = (Torus *)cuda_malloc_async(
max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0]);
@@ -1414,6 +1419,9 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
this->new_blocks = new_blocks;
this->old_blocks = old_blocks;
this->small_lwe_vector = small_lwe_vector;
new_blocks_copy = (Torus *)cuda_malloc_async(
max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0]);
d_smart_copy_in = (int32_t *)cuda_malloc_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]);
@@ -1432,8 +1440,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
cuda_drop_async(small_lwe_vector, streams[0], gpu_indexes[0]);
}
cuda_drop_async(new_blocks_copy, streams[0], gpu_indexes[0]);
scp_mem->release(streams, gpu_indexes, gpu_count);
delete scp_mem;
}
};

View File

@@ -16,6 +16,21 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
void scratch_packing_keyswitch_lwe_list_to_glwe_64(
void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
bool allocate_gpu_memory);
void cuda_packing_keyswitch_lwe_list_to_glwe_64(
void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_lwes);
void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
uint32_t gpu_index,
int8_t **fp_ks_buffer);
}
#endif // CNCRT_KS_H_

View File

@@ -81,14 +81,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
int8_t **pbs_buffer);
uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count);
uint64_t get_buffer_size_programmable_bootstrap_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count);
}
template <typename Torus>

View File

@@ -1,17 +1,3 @@
set(SOURCES
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h)
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)

View File

@@ -38,8 +38,8 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
auto lwe_out = lwe_array_out + input_id * lwe_output_size;
// We assume each GLWE will store the first polynomial_size inputs
uint32_t nth_per_glwe = params::degree;
auto glwe_in = glwe_array_in + (input_id / nth_per_glwe) * glwe_input_size;
uint32_t lwe_per_glwe = params::degree;
auto glwe_in = glwe_array_in + (input_id / lwe_per_glwe) * glwe_input_size;
auto nth = nth_array[input_id];
@@ -50,11 +50,11 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
template <typename Torus, class params>
__host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
Torus *lwe_array_out, Torus *glwe_array_in,
uint32_t *nth_array, uint32_t num_glwes,
uint32_t *nth_array, uint32_t num_nths,
uint32_t glwe_dimension) {
cudaSetDevice(gpu_index);
dim3 grid(num_glwes);
dim3 grid(num_nths);
dim3 thds(params::degree / params::opt);
sample_extract<Torus, params><<<grid, thds, 0, stream>>>(
lwe_array_out, glwe_array_in, nth_array, glwe_dimension);

View File

@@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
host_keyswitch_lwe_ciphertext_vector<uint32_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
@@ -40,7 +40,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
host_keyswitch_lwe_ciphertext_vector<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
@@ -48,3 +48,35 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
}
void scratch_packing_keyswitch_lwe_list_to_glwe_64(
void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
bool allocate_gpu_memory) {
scratch_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index, fp_ks_buffer,
glwe_dimension, polynomial_size, num_lwes, allocate_gpu_memory);
}
/* Perform functional packing keyswitch on a batch of 64-bit input LWE
* ciphertexts.
*/
void cuda_packing_keyswitch_lwe_list_to_glwe_64(
void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_lwes) {
host_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(glwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(fp_ksk_array), fp_ks_buffer, input_lwe_dimension,
output_glwe_dimension, output_polynomial_size, base_log, level_count,
num_lwes);
}
void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
uint32_t gpu_index,
int8_t **fp_ks_buffer) {
cuda_drop_async(*fp_ks_buffer, static_cast<cudaStream_t>(stream), gpu_index);
}

View File

@@ -7,6 +7,7 @@
#include "polynomial/functions.cuh"
#include "polynomial/polynomial_math.cuh"
#include "torus.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <thread>
#include <vector>
@@ -98,7 +99,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
}
template <typename Torus>
__host__ void cuda_keyswitch_lwe_ciphertext_vector(
__host__ void host_keyswitch_lwe_ciphertext_vector(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
@@ -146,7 +147,7 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
GET_VARIANT_ELEMENT(lwe_input_indexes, i);
// Compute Keyswitch
cuda_keyswitch_lwe_ciphertext_vector<Torus>(
host_keyswitch_lwe_ciphertext_vector<Torus>(
streams[i], gpu_indexes[i], current_lwe_array_out,
current_lwe_output_indexes, current_lwe_array_in,
current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out,
@@ -154,4 +155,154 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
}
}
template <typename Torus>
__host__ void scratch_packing_keyswitch_lwe_list_to_glwe(
cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
if (allocate_gpu_memory)
*fp_ks_buffer = (int8_t *)cuda_malloc_async(
2 * num_lwes * glwe_accumulator_size * sizeof(Torus), stream,
gpu_index);
}
// public functional packing keyswitch for a single LWE ciphertext
//
// Assumes there are (glwe_dimension+1) * polynomial_size threads split across
// thread blocks along the x-axis to work on that input.
template <typename Torus>
__device__ void packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
Torus *glwe_out, Torus *lwe_in, Torus *fp_ksk, uint32_t lwe_dimension_in,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
size_t glwe_size = (glwe_dimension + 1);
if (tid < glwe_size * polynomial_size) {
const int local_index = threadIdx.x;
// the output_glwe is split in polynomials and each x-block takes one of
// them
size_t poly_id = blockIdx.x;
size_t coef_per_block = blockDim.x;
// number of coefficients inside fp-ksk block for each lwe_input coefficient
size_t ksk_block_size = glwe_size * polynomial_size * level_count;
// initialize accumulator to 0 (the body coefficient is seeded with the LWE body)
glwe_out[tid] = SEL(0, lwe_in[lwe_dimension_in],
tid == glwe_dimension * polynomial_size);
// Iterate through all lwe elements
for (int i = 0; i < lwe_dimension_in; i++) {
// Round and prepare decomposition
Torus a_i = round_to_closest_multiple(lwe_in[i], base_log, level_count);
Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
Torus mod_b_mask = (1ll << base_log) - 1ll;
// block of key for current lwe coefficient (cur_input_lwe[i])
auto ksk_block = &fp_ksk[i * ksk_block_size];
for (int j = 0; j < level_count; j++) {
auto ksk_glwe = &ksk_block[j * glwe_size * polynomial_size];
// Iterate through each level and multiply by the ksk piece
auto ksk_glwe_chunk = &ksk_glwe[poly_id * coef_per_block];
Torus decomposed = decompose_one<Torus>(state, mod_b_mask, base_log);
glwe_out[tid] -= decomposed * ksk_glwe_chunk[local_index];
}
}
}
}
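// Illustrative sketch (not part of the diff): the digit layout consumed by the
// loop above, ignoring the balanced signed representation that the real
// decompose_one uses. After rounding, the top base_log * level_count bits of
// a_i are consumed base_log bits at a time, most significant level first, so
// a_i == sum_j digit_j * 2^(W - j * base_log):
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t base_log = 8, level_count = 3, W = 64;
  uint64_t a_i = 0xA1B2C3ull << (W - base_log * level_count); // rounded input
  uint64_t state = a_i >> (W - base_log * level_count);       // as in the kernel
  uint64_t mask = ((uint64_t)1 << base_log) - 1;              // mod_b_mask
  uint64_t reconstructed = 0;
  for (uint32_t j = 1; j <= level_count; j++) {
    uint64_t digit = (state >> (base_log * (level_count - j))) & mask;
    reconstructed += digit << (W - base_log * j); // digit_j * 2^(W - j*B)
  }
  assert(reconstructed == a_i);
  return 0;
}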
// public functional packing keyswitch for a batch of LWE ciphertexts
//
// Selects the input each thread is working on using the y-block index.
//
// Assumes there are (glwe_dimension+1) * polynomial_size threads split across
// thread blocks along the x-axis to work on that input.
template <typename Torus>
__global__ void
packing_keyswitch_lwe_list_to_glwe(Torus *glwe_array_out, Torus *lwe_array_in,
Torus *fp_ksk, uint32_t lwe_dimension_in,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, Torus *d_mem) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
const int lwe_size = (lwe_dimension_in + 1);
const int input_id = blockIdx.y;
const int degree = input_id;
// Select an input
auto lwe_in = lwe_array_in + input_id * lwe_size;
auto ks_glwe_out = d_mem + input_id * glwe_accumulator_size;
auto glwe_out = glwe_array_out + input_id * glwe_accumulator_size;
// KS LWE to GLWE
packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
ks_glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension,
polynomial_size, base_log, level_count);
// P * X^degree
auto in_poly = ks_glwe_out + (tid / polynomial_size) * polynomial_size;
auto out_result = glwe_out + (tid / polynomial_size) * polynomial_size;
polynomial_accumulate_monic_monomial_mul(out_result, in_poly, degree,
tid % polynomial_size,
polynomial_size, 1, true);
}
/// TODO: rewrite this kernel for efficiency
template <typename Torus>
__global__ void accumulate_glwes(Torus *glwe_out, Torus *glwe_array_in,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t num_lwes) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < (glwe_dimension + 1) * polynomial_size) {
glwe_out[tid] = glwe_array_in[tid];
// Accumulate
for (int i = 1; i < num_lwes; i++) {
auto glwe_in = glwe_array_in + i * (glwe_dimension + 1) * polynomial_size;
glwe_out[tid] += glwe_in[tid];
}
}
}
template <typename Torus>
__host__ void host_packing_keyswitch_lwe_list_to_glwe(
cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out,
Torus *lwe_array_in, Torus *fp_ksk_array, int8_t *fp_ks_buffer,
uint32_t lwe_dimension_in, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_lwes) {
cudaSetDevice(gpu_index);
int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(glwe_accumulator_size, 128, num_blocks, num_threads);
dim3 grid(num_blocks, num_lwes);
dim3 threads(num_threads);
auto d_mem = (Torus *)fp_ks_buffer;
auto d_tmp_glwe_array_out = d_mem + num_lwes * glwe_accumulator_size;
// individually keyswitch each lwe
packing_keyswitch_lwe_list_to_glwe<<<grid, threads, 0, stream>>>(
d_tmp_glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in,
glwe_dimension, polynomial_size, base_log, level_count, d_mem);
check_cuda_error(cudaGetLastError());
// accumulate to a single glwe
accumulate_glwes<<<num_blocks, threads, 0, stream>>>(
glwe_out, d_tmp_glwe_array_out, glwe_dimension, polynomial_size,
num_lwes);
check_cuda_error(cudaGetLastError());
}
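// Illustrative cleartext sketch (not part of the diff) of what the two kernels
// above achieve together: the first turns input i into a polynomial holding
// value_i * X^i, the second sums the partial results, so input i ends up as
// coefficient i of the output. Encryption and noise are ignored entirely, and
// values.size() <= polynomial_size is assumed:
#include <cstdint>
#include <vector>

std::vector<uint64_t> pack_into_polynomial(const std::vector<uint64_t> &values,
                                           size_t polynomial_size) {
  std::vector<uint64_t> out(polynomial_size, 0);
  for (size_t i = 0; i < values.size(); i++) {
    // packing_keyswitch_lwe_list_to_glwe: one partial result per input,
    // holding value_i * X^i (the monic monomial multiplication)
    std::vector<uint64_t> partial(polynomial_size, 0);
    partial[i] = values[i];
    // accumulate_glwes: add every partial polynomial into the result
    for (size_t j = 0; j < polynomial_size; j++)
      out[j] += partial[j]; // wrapping add models torus addition
  }
  return out;
}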
#endif

View File

@@ -2,6 +2,7 @@
#define CNCRT_TORUS_CUH
#include "types/int128.cuh"
#include "utils/kernel_dimensions.cuh"
#include <limits>
template <typename T>
@@ -29,20 +30,18 @@ __device__ inline void typecast_double_to_torus<uint64_t>(double x,
template <typename T>
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
uint32_t level_count) {
T shift = sizeof(T) * 8 - level_count * base_log;
T mask = 1ll << (shift - 1);
T b = (x & mask) >> (shift - 1);
const T non_rep_bit_count = sizeof(T) * 8 - level_count * base_log;
const T shift = non_rep_bit_count - 1;
T res = x >> shift;
res += b;
res <<= shift;
return res;
res += 1;
res &= (T)(-2);
return res << shift;
}
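// Host-side equivalence check (an illustrative sketch, not part of the diff):
// the rewritten rounding keeps one extra bit, bumps it, then clears it, which
// matches the previous "extract the rounding bit b and add it" formulation.
#include <cassert>
#include <cstdint>

uint64_t round_old(uint64_t x, uint32_t base_log, uint32_t level_count) {
  uint64_t shift = 64 - (uint64_t)level_count * base_log;
  uint64_t b = (x >> (shift - 1)) & 1; // rounding bit just below the cut
  return ((x >> shift) + b) << shift;
}

uint64_t round_new(uint64_t x, uint32_t base_log, uint32_t level_count) {
  uint64_t non_rep = 64 - (uint64_t)level_count * base_log;
  uint64_t shift = non_rep - 1;
  uint64_t res = (x >> shift) + 1; // keep the rounding bit as the LSB, bump it
  res &= ~(uint64_t)1; // clear it: carries into the result iff it was 1
  return res << shift;
}

int main() {
  for (uint64_t x :
       {0ull, 1ull << 40, 0x8000000000000000ull, 0xDEADBEEFCAFEBABEull})
    assert(round_old(x, 4, 5) == round_new(x, 4, 5));
  return 0;
}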
template <typename T>
__device__ __forceinline__ void modulus_switch(T input, T &output,
uint32_t log_modulus) {
constexpr uint32_t BITS = sizeof(T) * 8;
output = input + (((T)1) << (BITS - log_modulus - 1));
output >>= (BITS - log_modulus);
}
@@ -54,4 +53,27 @@ __device__ __forceinline__ T modulus_switch(T input, uint32_t log_modulus) {
return output;
}
template <typename Torus>
__global__ void modulus_switch_inplace(Torus *array, int size,
uint32_t log_modulus) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < size) {
array[tid] = modulus_switch(array[tid], log_modulus);
}
}
template <typename Torus>
__host__ void host_modulus_switch_inplace(cudaStream_t stream,
uint32_t gpu_index, Torus *array,
int size, uint32_t log_modulus) {
cudaSetDevice(gpu_index);
int num_threads = 0, num_blocks = 0;
getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);
modulus_switch_inplace<<<num_blocks, num_threads, 0, stream>>>(array, size,
log_modulus);
check_cuda_error(cudaGetLastError());
}
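// Illustrative numeric check (not part of the diff): the modulus switch above
// keeps the top log_modulus bits of a 64-bit torus element, rounding to
// nearest, i.e. output = round(input * 2^log_modulus / 2^64).
#include <cassert>
#include <cstdint>

uint64_t modulus_switch_u64(uint64_t input, uint32_t log_modulus) {
  constexpr uint32_t BITS = 64;
  uint64_t output = input + ((uint64_t)1 << (BITS - log_modulus - 1)); // + half
  return output >> (BITS - log_modulus);
}

int main() {
  // 0.75 * 2^64 switched to modulus 2^4 gives 12 (= 0.75 * 16)
  assert(modulus_switch_u64(3ull << 62, 4) == 12);
  // a value just below 1/16 of the torus rounds up to 1
  assert(modulus_switch_u64((1ull << 60) - 1, 4) == 1);
  return 0;
}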
#endif // CNCRT_TORUS_H

View File

@@ -177,8 +177,8 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
int num_blocks = (n + block_size - 1) / block_size;
// Launch the kernel
cuda_set_value_kernel<<<num_blocks, block_size, 0, stream>>>(d_array, value,
n);
cuda_set_value_kernel<Torus>
<<<num_blocks, block_size, 0, stream>>>(d_array, value, n);
check_cuda_error(cudaGetLastError());
}
}

View File

@@ -37,12 +37,12 @@ void host_resolve_signed_overflow(
streams[0], gpu_indexes[0], x, last_block_output_carry, d_clears,
mem->params.big_lwe_dimension, 1);
host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
last_block_inner_propagation, x, mem->params.big_lwe_dimension,
1);
host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
last_block_inner_propagation, last_block_input_carry,
mem->params.big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0], last_block_inner_propagation,
last_block_inner_propagation, x,
mem->params.big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0], last_block_inner_propagation,
last_block_inner_propagation, last_block_input_carry,
mem->params.big_lwe_dimension, 1);
host_apply_univariate_lut_kb<Torus>(streams, gpu_indexes, gpu_count, result,
last_block_inner_propagation,
@@ -94,14 +94,14 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
// phase 1
if (op == SIGNED_OPERATION::ADDITION) {
host_addition(streams[0], gpu_indexes[0], result, lhs, rhs,
big_lwe_dimension, num_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], result, lhs, rhs,
big_lwe_dimension, num_blocks);
} else {
host_integer_radix_negation(
host_integer_radix_negation<Torus>(
streams, gpu_indexes, gpu_count, neg_rhs, rhs, big_lwe_dimension,
num_blocks, radix_params.message_modulus, radix_params.carry_modulus);
host_addition(streams[0], gpu_indexes[0], result, lhs, neg_rhs,
big_lwe_dimension, num_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], result, lhs, neg_rhs,
big_lwe_dimension, num_blocks);
}
// phase 2
@@ -109,10 +109,10 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
host_propagate_single_carry(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
result, output_carry, input_carries,
mem_ptr->scp_mem, bsks, ksks, num_blocks);
host_generate_last_block_inner_propagation(
host_propagate_single_carry<Torus>(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, result, output_carry,
input_carries, mem_ptr->scp_mem, bsks, ksks, num_blocks);
host_generate_last_block_inner_propagation<Torus>(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size],
&rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem, bsks,
@@ -126,7 +126,7 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
// phase 3
auto input_carry = &input_carries[(num_blocks - 1) * big_lwe_size];
host_resolve_signed_overflow(
host_resolve_signed_overflow<Torus>(
streams, gpu_indexes, gpu_count, overflowed, last_block_inner_propagation,
input_carry, output_carry, mem_ptr->resolve_overflow_mem, bsks, ksks);

View File

@@ -17,7 +17,7 @@ void scratch_cuda_integer_radix_cmux_kb_64(
std::function<uint64_t(uint64_t)> predicate_lut_f =
[](uint64_t x) -> uint64_t { return x == 1; };
scratch_cuda_integer_radix_cmux_kb(
scratch_cuda_integer_radix_cmux_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
lwe_ciphertext_count, params, allocate_gpu_memory);

View File

@@ -27,10 +27,11 @@ __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size;
auto lwe_array_input_block = lwe_array_input + i * big_lwe_size;
device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
lwe_array_out_block, predicate->lwe_indexes_in, lwe_array_input_block,
lwe_condition, predicate->lwe_indexes_in, params.big_lwe_dimension,
params.message_modulus, 1);
device_pack_bivariate_blocks<Torus>
<<<num_blocks, num_threads, 0, streams[0]>>>(
lwe_array_out_block, predicate->lwe_indexes_in,
lwe_array_input_block, lwe_condition, predicate->lwe_indexes_in,
params.big_lwe_dimension, params.message_modulus, 1);
check_cuda_error(cudaGetLastError());
}
@@ -57,13 +58,15 @@ __host__ void host_integer_radix_cmux_kb(
}
auto mem_true = mem_ptr->zero_if_true_buffer;
zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
lwe_array_true, lwe_condition, mem_true,
mem_ptr->inverted_predicate_lut, bsks, ksks, num_radix_blocks);
zero_out_if<Torus>(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
lwe_array_true, lwe_condition, mem_true,
mem_ptr->inverted_predicate_lut, bsks, ksks,
num_radix_blocks);
auto mem_false = mem_ptr->zero_if_false_buffer;
zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
lwe_array_false, lwe_condition, mem_false, mem_ptr->predicate_lut,
bsks, ksks, num_radix_blocks);
zero_out_if<Torus>(false_streams, gpu_indexes, gpu_count,
mem_ptr->tmp_false_ct, lwe_array_false, lwe_condition,
mem_false, mem_ptr->predicate_lut, bsks, ksks,
num_radix_blocks);
for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
}
@@ -75,9 +78,9 @@ __host__ void host_integer_radix_cmux_kb(
// will be 0. If the condition was false, true_ct will be 0 and false_ct will
// have kept its value
auto added_cts = mem_ptr->tmp_true_ct;
host_addition(streams[0], gpu_indexes[0], added_cts, mem_ptr->tmp_true_ct,
mem_ptr->tmp_false_ct, params.big_lwe_dimension,
num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], added_cts,
mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct,
params.big_lwe_dimension, num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,

View File

@@ -43,7 +43,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
int num_entries = (lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
// Add all blocks and store in sum
device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream>>>(
device_accumulate_all_blocks<Torus><<<num_blocks, num_threads, 0, stream>>>(
output, input, lwe_dimension, num_radix_blocks);
check_cuda_error(cudaGetLastError());
}
@@ -62,7 +62,6 @@ __host__ void are_all_comparisons_block_true(
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;
@@ -96,8 +95,9 @@ __host__ void are_all_comparisons_block_true(
auto is_equal_to_num_blocks_map =
&are_all_block_true_buffer->is_equal_to_lut_map;
for (int i = 0; i < num_chunks; i++) {
accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
input_blocks, big_lwe_dimension, chunk_length);
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
input_blocks, big_lwe_dimension,
chunk_length);
accumulator += (big_lwe_dimension + 1);
remaining_blocks -= (chunk_length - 1);
@@ -165,7 +165,6 @@ __host__ void is_at_least_one_comparisons_block_true(
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
@@ -192,8 +191,9 @@ __host__ void is_at_least_one_comparisons_block_true(
auto input_blocks = mem_ptr->tmp_lwe_array_out;
auto accumulator = buffer->tmp_block_accumulated;
for (int i = 0; i < num_chunks; i++) {
accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
input_blocks, big_lwe_dimension, chunk_length);
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
input_blocks, big_lwe_dimension,
chunk_length);
accumulator += (big_lwe_dimension + 1);
remaining_blocks -= (chunk_length - 1);
@@ -280,8 +280,8 @@ __host__ void host_compare_with_zero_equality(
uint32_t chunk_size =
std::min(remainder_blocks, num_elements_to_fill_carry);
accumulate_all_blocks(streams[0], gpu_indexes[0], sum_i, chunk,
big_lwe_dimension, chunk_size);
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], sum_i, chunk,
big_lwe_dimension, chunk_size);
num_sum_blocks++;
remainder_blocks -= (chunk_size - 1);
@@ -295,8 +295,9 @@ __host__ void host_compare_with_zero_equality(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, sum, sum, bsks, ksks, num_sum_blocks,
zero_comparison);
are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
sum, mem_ptr, bsks, ksks, num_sum_blocks);
are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
lwe_array_out, sum, mem_ptr, bsks, ksks,
num_sum_blocks);
}
template <typename Torus>
@@ -310,7 +311,7 @@ __host__ void host_integer_radix_equality_check_kb(
// Applies the LUT for the comparison operation
auto comparisons = mem_ptr->tmp_block_comparisons;
integer_radix_apply_bivariate_lookup_table_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2,
bsks, ksks, num_radix_blocks, eq_buffer->operator_lut,
eq_buffer->operator_lut->params.message_modulus);
@@ -319,9 +320,9 @@ __host__ void host_integer_radix_equality_check_kb(
//
// It returns a block encrypting 1 if all input blocks are 1
// otherwise the block encrypts 0
are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
comparisons, mem_ptr, bsks, ksks,
num_radix_blocks);
are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
lwe_array_out, comparisons, mem_ptr,
bsks, ksks, num_radix_blocks);
}
template <typename Torus>
@@ -352,19 +353,20 @@ compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// Subtract
// Here we need the true lwe sub, not the one that comes from shortint.
host_subtraction(streams[0], gpu_indexes[0], lwe_array_out, lwe_array_left,
lwe_array_right, big_lwe_dimension, num_radix_blocks);
host_subtraction<Torus>(streams[0], gpu_indexes[0], lwe_array_out,
lwe_array_left, lwe_array_right, big_lwe_dimension,
num_radix_blocks);
// Apply LUT to compare to 0
auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks,
num_radix_blocks, is_non_zero_lut);
// Add one
// Here Lhs can have the following values: (-1) % (message modulus * carry
// modulus), 0, 1. So the output values after the addition will be: 0, 1, 2
host_integer_radix_add_scalar_one_inplace(
host_integer_radix_add_scalar_one_inplace<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
num_radix_blocks, message_modulus, carry_modulus);
}
@@ -406,8 +408,8 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
while (partial_block_count > 2) {
pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
partial_block_count, 4);
pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
partial_block_count, 4);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
@@ -433,8 +435,8 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
std::function<Torus(Torus)> f;
if (partial_block_count == 2) {
pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
partial_block_count, 4);
pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
partial_block_count, 4);
f = [block_selector_f, sign_handler_f](Torus x) -> Torus {
int msb = (x >> 2) & 3;
@@ -454,9 +456,9 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
last_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
// Last leaf
integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
gpu_count, lwe_array_out, y,
bsks, ksks, 1, last_lut);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks, 1,
last_lut);
}
template <typename Torus>
@@ -488,19 +490,21 @@ __host__ void host_integer_radix_difference_check_kb(
if (mem_ptr->is_signed) {
packed_num_radix_blocks -= 2;
}
pack_blocks(streams[0], gpu_indexes[0], packed_left, lwe_array_left,
big_lwe_dimension, packed_num_radix_blocks, message_modulus);
pack_blocks(streams[0], gpu_indexes[0], packed_right, lwe_array_right,
big_lwe_dimension, packed_num_radix_blocks, message_modulus);
pack_blocks<Torus>(streams[0], gpu_indexes[0], packed_left, lwe_array_left,
big_lwe_dimension, packed_num_radix_blocks,
message_modulus);
pack_blocks<Torus>(streams[0], gpu_indexes[0], packed_right,
lwe_array_right, big_lwe_dimension,
packed_num_radix_blocks, message_modulus);
// From this point on we have half the number of blocks
packed_num_radix_blocks /= 2;
// Clean noise
auto identity_lut = mem_ptr->identity_lut;
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks,
packed_num_radix_blocks, identity_lut);
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, packed_right, packed_right, bsks, ksks,
packed_num_radix_blocks, identity_lut);
@@ -517,16 +521,17 @@ __host__ void host_integer_radix_difference_check_kb(
if (!mem_ptr->is_signed) {
// Compare packed blocks, or simply the total number of radix blocks in the
// inputs
compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
rhs, mem_ptr, bsks, ksks, packed_num_radix_blocks);
compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count, comparisons,
lhs, rhs, mem_ptr, bsks, ksks,
packed_num_radix_blocks);
num_comparisons = packed_num_radix_blocks;
} else {
// Packing is possible
if (carry_modulus >= message_modulus) {
// Compare (num_radix_blocks - 2) / 2 packed blocks
compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
rhs, mem_ptr, bsks, ksks,
packed_num_radix_blocks);
compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count,
comparisons, lhs, rhs, mem_ptr, bsks, ksks,
packed_num_radix_blocks);
// Compare the last block before the sign block separately
auto identity_lut = mem_ptr->identity_lut;
@@ -535,21 +540,21 @@ __host__ void host_integer_radix_difference_check_kb(
Torus *last_right_block_before_sign_block =
diff_buffer->tmp_packed_right +
packed_num_radix_blocks * big_lwe_size;
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, last_left_block_before_sign_block,
lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1,
identity_lut);
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, last_right_block_before_sign_block,
lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks,
1, identity_lut);
compare_radix_blocks_kb(
compare_radix_blocks_kb<Torus>(
streams, gpu_indexes, gpu_count,
comparisons + packed_num_radix_blocks * big_lwe_size,
last_left_block_before_sign_block, last_right_block_before_sign_block,
mem_ptr, bsks, ksks, 1);
// Compare the sign block separately
integer_radix_apply_bivariate_lookup_table_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count,
comparisons + (packed_num_radix_blocks + 1) * big_lwe_size,
lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
@@ -558,11 +563,11 @@ __host__ void host_integer_radix_difference_check_kb(
num_comparisons = packed_num_radix_blocks + 2;
} else {
compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
lwe_array_left, lwe_array_right, mem_ptr, bsks,
ksks, num_radix_blocks - 1);
compare_radix_blocks_kb<Torus>(
streams, gpu_indexes, gpu_count, comparisons, lwe_array_left,
lwe_array_right, mem_ptr, bsks, ksks, num_radix_blocks - 1);
// Compare the sign block separately
integer_radix_apply_bivariate_lookup_table_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count,
comparisons + (num_radix_blocks - 1) * big_lwe_size,
lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
@@ -575,9 +580,9 @@ __host__ void host_integer_radix_difference_check_kb(
// Reduces a vec containing radix blocks that each encrypt a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
reduction_lut_f, bsks, ksks, num_comparisons);
tree_sign_reduction<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
reduction_lut_f, bsks, ksks, num_comparisons);
}
template <typename Torus>
@@ -601,16 +606,16 @@ host_integer_radix_maxmin_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
Torus **ksks, uint32_t total_num_radix_blocks) {
// Compute the sign
host_integer_radix_difference_check_kb(
host_integer_radix_difference_check_kb<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsks,
ksks, total_num_radix_blocks);
// Selector
host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks,
total_num_radix_blocks);
host_integer_radix_cmux_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks);
}
#endif

View File

@@ -0,0 +1,88 @@
#include "compression.cuh"
void scratch_cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
bool allocate_gpu_memory) {
int_radix_params compression_params(
pbs_type, compression_glwe_dimension, compression_polynomial_size,
(compression_glwe_dimension + 1) * compression_polynomial_size,
lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus,
carry_modulus);
scratch_cuda_compress_integer_radix_ciphertext<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_compression<uint64_t> **)mem_ptr, num_lwes, compression_params,
lwe_per_glwe, storage_log_modulus, allocate_gpu_memory);
}
void scratch_cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
bool allocate_gpu_memory) {
// Decompression doesn't keyswitch, so big and small dimensions are the same
int_radix_params encryption_params(
pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
lwe_dimension, lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0,
message_modulus, carry_modulus);
int_radix_params compression_params(
pbs_type, compression_glwe_dimension, compression_polynomial_size,
lwe_dimension, compression_glwe_dimension * compression_polynomial_size,
0, 0, pbs_level, pbs_base_log, 0, message_modulus, carry_modulus);
scratch_cuda_integer_decompress_radix_ciphertext<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_decompression<uint64_t> **)mem_ptr, num_lwes, body_count,
encryption_params, compression_params, storage_log_modulus,
allocate_gpu_memory);
}
void cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
int8_t *mem_ptr) {
host_integer_compress<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(glwe_array_out),
static_cast<uint64_t *>(lwe_array_in), (uint64_t **)(fp_ksk), num_nths,
(int_compression<uint64_t> *)mem_ptr);
}
void cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *glwe_in, void *indexes_array,
uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr) {
host_integer_decompress<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out), static_cast<uint64_t *>(glwe_in),
static_cast<uint32_t *>(indexes_array), indexes_array_size, bsks,
(int_decompression<uint64_t> *)mem_ptr);
}
void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_compression<uint64_t> *mem_ptr =
(int_compression<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}
void cleanup_cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_decompression<uint64_t> *mem_ptr =
(int_decompression<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}

View File

@@ -0,0 +1,277 @@
#ifndef CUDA_INTEGER_COMPRESSION_CUH
#define CUDA_INTEGER_COMPRESSION_CUH
#include "ciphertext.h"
#include "compression.h"
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer/integer.cuh"
#include "linearalgebra/multiplication.cuh"
#include "polynomial/functions.cuh"
#include "utils/kernel_dimensions.cuh"
template <typename Torus>
__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
uint32_t in_len, uint32_t len) {
auto nbits = sizeof(Torus) * 8;
auto i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < len) {
auto k = nbits * i / log_modulus;
auto j = k;
auto start_shift = i * nbits - j * log_modulus;
auto value = array_in[j] >> start_shift;
j++;
while (j * log_modulus < ((i + 1) * nbits) && j < in_len) {
auto shift = j * log_modulus - i * nbits;
value |= array_in[j] << shift;
j++;
}
array_out[i] = value;
}
}
template <typename Torus>
__host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
Torus *array_out, Torus *array_in, uint32_t num_inputs,
uint32_t body_count, int_compression<Torus> *mem_ptr) {
cudaSetDevice(gpu_index);
auto params = mem_ptr->compression_params;
auto log_modulus = mem_ptr->storage_log_modulus;
auto in_len = params.glwe_dimension * params.polynomial_size + body_count;
auto number_bits_to_pack = in_len * log_modulus;
auto nbits = sizeof(Torus) * 8;
// number_bits_to_pack.div_ceil(Scalar::BITS)
auto len = (number_bits_to_pack + nbits - 1) / nbits;
int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(len, 128, num_blocks, num_threads);
dim3 grid(num_blocks);
dim3 threads(num_threads);
pack<Torus><<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus,
in_len, len);
}
template <typename Torus>
__host__ void host_integer_compress(cudaStream_t *streams,
uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *glwe_array_out, Torus *lwe_array_in,
Torus **fp_ksk, uint32_t num_lwes,
int_compression<Torus> *mem_ptr) {
auto compression_params = mem_ptr->compression_params;
auto input_lwe_dimension = compression_params.small_lwe_dimension;
// Shift
auto lwe_shifted = mem_ptr->tmp_lwe;
host_cleartext_multiplication<Torus>(
streams[0], gpu_indexes[0], lwe_shifted, lwe_array_in,
(uint64_t)compression_params.message_modulus, input_lwe_dimension,
num_lwes);
uint32_t lwe_in_size = input_lwe_dimension + 1;
uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe + 1;
auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);
// Keyswitch LWEs to GLWE
auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
auto fp_ks_buffer = mem_ptr->fp_ks_buffer;
for (int i = 0; i < num_glwes; i++) {
auto lwe_subset = lwe_shifted + i * lwe_in_size;
auto glwe_out = tmp_glwe_array_out + i * glwe_out_size;
host_packing_keyswitch_lwe_list_to_glwe<Torus>(
streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
compression_params.polynomial_size, compression_params.ks_base_log,
compression_params.ks_level, body_count);
}
// Modulus switch
host_modulus_switch_inplace<Torus>(
streams[0], gpu_indexes[0], tmp_glwe_array_out,
num_glwes * (compression_params.glwe_dimension *
compression_params.polynomial_size +
body_count),
mem_ptr->storage_log_modulus);
check_cuda_error(cudaGetLastError());
host_pack<Torus>(streams[0], gpu_indexes[0], glwe_array_out,
tmp_glwe_array_out, num_glwes, body_count, mem_ptr);
}
template <typename Torus>
__global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index,
uint32_t log_modulus, uint32_t initial_out_len) {
auto nbits = sizeof(Torus) * 8;
auto i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < initial_out_len) {
// Unpack
Torus mask = ((Torus)1 << log_modulus) - 1;
auto start = i * log_modulus;
auto end = (i + 1) * log_modulus;
auto start_block = start / nbits;
auto start_remainder = start % nbits;
auto end_block_inclusive = (end - 1) / nbits;
Torus unpacked_i;
if (start_block == end_block_inclusive) {
auto single_part = array_in[start_block] >> start_remainder;
unpacked_i = single_part & mask;
} else {
auto first_part = array_in[start_block] >> start_remainder;
auto second_part = array_in[start_block + 1] << (nbits - start_remainder);
unpacked_i = (first_part | second_part) & mask;
}
// Extract
glwe_array_out[i] = unpacked_i << (nbits - log_modulus);
}
}
template <typename Torus>
__host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
Torus *glwe_array_out, Torus *array_in,
uint32_t glwe_index,
int_decompression<Torus> *mem_ptr) {
cudaSetDevice(gpu_index);
auto params = mem_ptr->compression_params;
auto log_modulus = mem_ptr->storage_log_modulus;
uint32_t body_count = mem_ptr->body_count;
auto initial_out_len =
params.glwe_dimension * params.polynomial_size + body_count;
// We ensure the tail of the GLWE is zeroed
auto zeroed_slice = glwe_array_out + initial_out_len;
cuda_memset_async(zeroed_slice, 0,
(params.polynomial_size - body_count) * sizeof(Torus),
stream, gpu_index);
int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
dim3 grid(num_blocks);
dim3 threads(num_threads);
extract<Torus><<<grid, threads, 0, stream>>>(
glwe_array_out, array_in, glwe_index, log_modulus, initial_out_len);
check_cuda_error(cudaGetLastError());
}
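// Illustrative CPU round-trip of the pack/extract bit layout above (a sketch,
// not part of the diff): pack concatenates the low log_modulus bits of each
// (already modulus-switched) coefficient; extract reads them back and
// re-aligns them to the torus MSBs.
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint32_t log_modulus = 12, nbits = 64;
  std::vector<uint64_t> coeffs = {0xABC, 0x123, 0xFFF, 0x001}; // < 2^log_modulus
  uint64_t number_bits = coeffs.size() * log_modulus;
  std::vector<uint64_t> packed((number_bits + nbits - 1) / nbits, 0);
  for (size_t i = 0; i < coeffs.size(); i++) {
    uint64_t start = i * log_modulus;
    packed[start / nbits] |= coeffs[i] << (start % nbits);
    if (start % nbits + log_modulus > nbits) // chunk straddles a word boundary
      packed[start / nbits + 1] |= coeffs[i] >> (nbits - start % nbits);
  }
  for (size_t i = 0; i < coeffs.size(); i++) {
    uint64_t start = i * log_modulus, mask = ((uint64_t)1 << log_modulus) - 1;
    uint64_t lo = packed[start / nbits] >> (start % nbits);
    uint64_t hi = (start % nbits + log_modulus > nbits)
                      ? packed[start / nbits + 1] << (nbits - start % nbits)
                      : 0;
    uint64_t unpacked = (lo | hi) & mask;
    assert(unpacked == coeffs[i]);
    uint64_t torus_value = unpacked << (nbits - log_modulus); // as in extract()
    (void)torus_value;
  }
  return 0;
}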
template <typename Torus>
__host__ void
host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
Torus *packed_glwe_in, uint32_t *indexes_array,
uint32_t indexes_array_size, void **bsks,
int_decompression<Torus> *mem_ptr) {
auto extracted_glwe = mem_ptr->tmp_extracted_glwe;
auto compression_params = mem_ptr->compression_params;
host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
packed_glwe_in, 0, mem_ptr);
auto num_lwes = mem_ptr->num_lwes;
// Sample extract
auto extracted_lwe = mem_ptr->tmp_extracted_lwe;
cuda_glwe_sample_extract_64(streams[0], gpu_indexes[0], extracted_lwe,
extracted_glwe, indexes_array, indexes_array_size,
compression_params.glwe_dimension,
compression_params.polynomial_size);
/// Apply a PBS evaluating the carry-extract LUT to reduce the noise and go
/// from the small LWE dimension to the big LWE dimension
auto encryption_params = mem_ptr->encryption_params;
auto lut = mem_ptr->carry_extract_lut;
auto active_gpu_count = get_active_gpu_count(num_lwes, gpu_count);
if (active_gpu_count == 1) {
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_out,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
lut->lwe_indexes_in, bsks, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_lwes, encryption_params.pbs_type);
} else {
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
/// With multiple GPUs we push to the vectors on each GPU; then, when we
/// gather the data back to GPU 0, we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_lwes,
compression_params.small_lwe_dimension + 1);
/// Apply PBS
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_lwes, encryption_params.pbs_type);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_array_out, lwe_after_pbs_vec,
lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_lwes,
encryption_params.big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
}
template <typename Torus>
__host__ void scratch_cuda_compress_integer_radix_ciphertext(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_compression<Torus> **mem_ptr, uint32_t num_lwes,
int_radix_params compression_params, uint32_t lwe_per_glwe,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
*mem_ptr = new int_compression<Torus>(
streams, gpu_indexes, gpu_count, compression_params, num_lwes,
lwe_per_glwe, storage_log_modulus, allocate_gpu_memory);
}
template <typename Torus>
__host__ void scratch_cuda_integer_decompress_radix_ciphertext(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_decompression<Torus> **mem_ptr, uint32_t num_lwes, uint32_t body_count,
int_radix_params encryption_params, int_radix_params compression_params,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
*mem_ptr = new int_decompression<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, compression_params,
num_lwes, body_count, storage_log_modulus, allocate_gpu_memory);
}
#endif

View File

@@ -282,7 +282,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// Shift the mask so that we only keep the bits we should
uint32_t shifted_mask = full_message_mask >> shift_amount;
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
interesting_divisor.last_block(), bsks, ksks, 1,
mem_ptr->masking_luts_1[shifted_mask]);
@@ -310,7 +310,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// the estimated degree of the output is < msg_modulus
shifted_mask = shifted_mask & full_message_mask;
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
divisor_ms_blocks.first_block(), bsks, ksks, 1,
mem_ptr->masking_luts_2[shifted_mask]);
@@ -334,7 +334,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
interesting_remainder1.insert(0, numerator_block_1.first_block(),
streams[0], gpu_indexes[0]);
host_integer_radix_logical_scalar_shift_kb_inplace(
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len);
@@ -342,7 +342,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
interesting_remainder1.len - 1, streams[0],
gpu_indexes[0]);
host_radix_blocks_rotate_left(
host_radix_blocks_rotate_left<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder1.data,
tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size);
@@ -363,7 +363,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
auto left_shift_interesting_remainder2 =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
host_integer_radix_logical_scalar_shift_kb_inplace(
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len);
}; // left_shift_interesting_remainder2
@@ -396,10 +396,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// but in that position, interesting_remainder2 always has a 0
auto &merged_interesting_remainder = interesting_remainder1;
host_addition(streams[0], gpu_indexes[0], merged_interesting_remainder.data,
merged_interesting_remainder.data,
interesting_remainder2.data, radix_params.big_lwe_dimension,
merged_interesting_remainder.len);
host_addition<Torus>(
streams[0], gpu_indexes[0], merged_interesting_remainder.data,
merged_interesting_remainder.data, interesting_remainder2.data,
radix_params.big_lwe_dimension, merged_interesting_remainder.len);
// after create_clean_version_of_merged_remainder
// `merged_interesting_remainder` will be reused as
@@ -439,7 +439,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// We could call unchecked_scalar_ne
// But we are in the special case where scalar == 0
// So we can skip some stuff
host_compare_with_zero_equality(
host_compare_with_zero_equality<Torus>(
streams, gpu_indexes, gpu_count, tmp_1.data, trivial_blocks.data,
mem_ptr->comparison_buffer, bsks, ksks, trivial_blocks.len,
mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
@@ -447,7 +447,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
tmp_1.len =
ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1);
is_at_least_one_comparisons_block_true(
is_at_least_one_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count,
at_least_one_upper_block_is_non_zero.data, tmp_1.data,
mem_ptr->comparison_buffer, bsks, ksks, tmp_1.len);
@@ -460,7 +460,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// `cleaned_merged_interesting_remainder` - radix ciphertext
auto create_clean_version_of_merged_remainder =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count,
cleaned_merged_interesting_remainder.data,
cleaned_merged_interesting_remainder.data, bsks, ksks,
@@ -486,10 +486,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
}
host_addition(streams[0], gpu_indexes[0], overflow_sum.data,
subtraction_overflowed.data,
at_least_one_upper_block_is_non_zero.data,
radix_params.big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0], overflow_sum.data,
subtraction_overflowed.data,
at_least_one_upper_block_is_non_zero.data,
radix_params.big_lwe_dimension, 1);
int factor = (i) ? 3 : 2;
int factor_lut_id = factor - 2;
@@ -528,10 +528,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
mem_ptr->merge_overflow_flags_luts[pos_in_block]
->params.message_modulus);
host_addition(streams[0], gpu_indexes[0],
&quotient[block_of_bit * big_lwe_size],
&quotient[block_of_bit * big_lwe_size],
did_not_overflow.data, radix_params.big_lwe_dimension, 1);
host_addition<Torus>(
streams[0], gpu_indexes[0], &quotient[block_of_bit * big_lwe_size],
&quotient[block_of_bit * big_lwe_size], did_not_overflow.data,
radix_params.big_lwe_dimension, 1);
};
for (uint j = 0; j < gpu_count; j++) {
@@ -564,17 +564,17 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// Clean the quotient and remainder
// as, even though they have no carries, they are not at nominal noise level
host_addition(streams[0], gpu_indexes[0], remainder, remainder1.data,
remainder2.data, radix_params.big_lwe_dimension,
remainder1.len);
host_addition<Torus>(streams[0], gpu_indexes[0], remainder, remainder1.data,
remainder2.data, radix_params.big_lwe_dimension,
remainder1.len);
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
ksks, num_blocks, mem_ptr->message_extract_lut_2);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {

View File

@@ -53,7 +53,7 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);
scratch_cuda_propagate_single_carry_kb_inplace(
scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
@@ -195,15 +195,15 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
void cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
void **bsks, uint32_t num_blocks, uint32_t shift) {
void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
void **ksks, void **bsks, uint32_t num_blocks, uint32_t shift) {
int_radix_params params = ((int_radix_lut<uint64_t> *)mem_ptr)->params;
host_compute_prefix_sum_hillis_steele<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(output_radix_lwe),
static_cast<uint64_t *>(input_radix_lwe), params,
static_cast<uint64_t *>(generates_or_propagates), params,
(int_radix_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
num_blocks);
}
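// Illustrative CPU sketch of a Hillis & Steele inclusive scan (not part of the
// diff; the GPU version combines blocks with a bivariate carry-propagation LUT
// instead of integer addition):
#include <cstdint>
#include <utility>
#include <vector>

std::vector<uint64_t> hillis_steele_scan(std::vector<uint64_t> v) {
  for (size_t stride = 1; stride < v.size(); stride *= 2) {
    std::vector<uint64_t> next = v; // double buffer, like the GPU ping-pong
    for (size_t i = stride; i < v.size(); i++)
      next[i] = v[i] + v[i - stride]; // combine with the element `stride` back
    v = std::move(next);
  }
  return v; // v[i] now holds the combination of elements 0..i
}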

View File

@@ -78,7 +78,7 @@ host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes,
"pointers should be different");
}
cudaSetDevice(gpu_indexes[0]);
radix_blocks_rotate_right<<<blocks_count, 1024, 0, streams[0]>>>(
radix_blocks_rotate_right<Torus><<<blocks_count, 1024, 0, streams[0]>>>(
dst, src, value, blocks_count, lwe_size);
}
@@ -95,7 +95,7 @@ host_radix_blocks_rotate_left(cudaStream_t *streams, uint32_t *gpu_indexes,
"pointers should be different");
}
cudaSetDevice(gpu_indexes[0]);
radix_blocks_rotate_left<<<blocks_count, 1024, 0, streams[0]>>>(
radix_blocks_rotate_left<Torus><<<blocks_count, 1024, 0, streams[0]>>>(
dst, src, value, blocks_count, lwe_size);
}
@@ -124,8 +124,8 @@ host_radix_blocks_reverse_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t lwe_size) {
cudaSetDevice(gpu_indexes[0]);
int num_blocks = blocks_count / 2, num_threads = 1024;
radix_blocks_reverse_lwe_inplace<<<num_blocks, num_threads, 0, streams[0]>>>(
src, blocks_count, lwe_size);
radix_blocks_reverse_lwe_inplace<Torus>
<<<num_blocks, num_threads, 0, streams[0]>>>(src, blocks_count, lwe_size);
}
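
On plain data, these rotation helpers implement: a right rotation by value sends block i of the source to block (i + value) % blocks_count of the destination, with left rotation as the inverse. A sketch under that index convention (an assumption on my part, though it is the one the shift implementations further down rely on); real blocks are lwe_size torus elements, not single ints:

#include <cstdio>

int main() {
  const int blocks_count = 4, lwe_size = 2;
  // four blocks of two elements each, block i filled with the value i
  int src[blocks_count * lwe_size] = {0, 0, 1, 1, 2, 2, 3, 3};
  int dst[blocks_count * lwe_size];
  int value = 1; // rotate right by one block
  for (int i = 0; i < blocks_count; i++)
    for (int j = 0; j < lwe_size; j++)
      dst[((i + value) % blocks_count) * lwe_size + j] = src[i * lwe_size + j];
  for (int k = 0; k < blocks_count * lwe_size; k++)
    printf("%d ", dst[k]); // 3 3 0 0 1 1 2 2
  printf("\n");
}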
// polynomial_size threads
@@ -164,9 +164,10 @@ __host__ void pack_bivariate_blocks(cudaStream_t *streams,
int num_blocks = 0, num_threads = 0;
int num_entries = num_radix_blocks * (lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2, lwe_indexes_in,
lwe_dimension, shift, num_radix_blocks);
device_pack_bivariate_blocks<Torus>
<<<num_blocks, num_threads, 0, streams[0]>>>(
lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2,
lwe_indexes_in, lwe_dimension, shift, num_radix_blocks);
check_cuda_error(cudaGetLastError());
}
@@ -273,10 +274,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
// Left message is shifted
auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
pack_bivariate_blocks(streams, gpu_indexes, gpu_count, lwe_array_pbs_in,
lut->lwe_trivial_indexes, lwe_array_1, lwe_array_2,
lut->lwe_indexes_in, big_lwe_dimension, shift,
num_radix_blocks);
pack_bivariate_blocks<Torus>(streams, gpu_indexes, gpu_count,
lwe_array_pbs_in, lut->lwe_trivial_indexes,
lwe_array_1, lwe_array_2, lut->lwe_indexes_in,
big_lwe_dimension, shift, num_radix_blocks);
check_cuda_error(cudaGetLastError());
/// For multi GPU execution we create vectors of pointers for inputs and
@@ -380,7 +381,7 @@ void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
body[i] = -body[i];
}
rotate_left(body, half_box_size, polynomial_size);
rotate_left<Torus>(body, half_box_size, polynomial_size);
}
template <typename Torus>
@@ -590,13 +591,13 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
ksks, num_blocks, luts_array);
// compute prefix sum with hillis&steele
host_compute_prefix_sum_hillis_steele(
host_compute_prefix_sum_hillis_steele<Torus>(
streams, gpu_indexes, gpu_count, step_output, generates_or_propagates,
params, luts_carry_propagation_sum, bsks, ksks, num_blocks);
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
generates_or_propagates, 1, num_blocks,
big_lwe_size);
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
step_output, generates_or_propagates, 1,
num_blocks, big_lwe_size);
if (carry_out != nullptr) {
cuda_memcpy_async_gpu_to_gpu(carry_out, step_output, big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
@@ -610,8 +611,9 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
gpu_indexes[0]);
}
host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, step_output,
glwe_dimension * polynomial_size, num_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
step_output, glwe_dimension * polynomial_size,
num_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
@@ -664,14 +666,15 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
overflowed, &generates_or_propagates[big_lwe_size * (num_blocks - 1)],
big_lwe_size_bytes, streams[0], gpu_indexes[0]);
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
generates_or_propagates, 1, num_blocks,
big_lwe_size);
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
step_output, generates_or_propagates, 1,
num_blocks, big_lwe_size);
cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
host_subtraction(streams[0], gpu_indexes[0], lwe_array, lwe_array,
step_output, glwe_dimension * polynomial_size, num_blocks);
host_subtraction<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
step_output, glwe_dimension * polynomial_size,
num_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
@@ -727,10 +730,10 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
if (i < num_blocks - 1) {
auto next_input_block = &input_blocks[(i + 1) * big_lwe_size];
host_addition(streams[0], gpu_indexes[0], next_input_block,
next_input_block,
&mem_ptr->tmp_big_lwe_vector[big_lwe_size],
params.big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0], next_input_block,
next_input_block,
&mem_ptr->tmp_big_lwe_vector[big_lwe_size],
params.big_lwe_dimension, 1);
}
}
}
@@ -765,7 +768,7 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
}
if (num_radix_blocks % 2 == 1) {
// We couldn't pack the last block, so we just copy it
// We couldn't host_pack the last block, so we just copy it
Torus *lsb_block =
lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
Torus *last_block =
@@ -794,7 +797,7 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
int num_blocks = 0, num_threads = 0;
int num_entries = (lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads);
device_pack_blocks<<<num_blocks, num_threads, 0, stream>>>(
device_pack_blocks<Torus><<<num_blocks, num_threads, 0, stream>>>(
lwe_array_out, lwe_array_in, lwe_dimension, num_radix_blocks, factor);
}
@@ -840,7 +843,7 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_create_trivial_radix<<<grid, thds, 0, stream>>>(
device_create_trivial_radix<Torus><<<grid, thds, 0, stream>>>(
lwe_array_out, scalar_array, num_scalar_blocks, lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
}
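
The delta computed here (and in the other kernels below) is the scaling factor that places a plaintext in the top bits of a 64-bit torus element while keeping one bit of padding free, hence 2^63 rather than 2^64 in the numerator. A minimal host-side sketch of the encode/decode round trip with this scaling:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t message_modulus = 4, carry_modulus = 4;
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
  uint64_t msg = 3;
  uint64_t encoded = msg * delta;                   // trivial, noiseless encoding
  uint64_t decoded = (encoded + delta / 2) / delta; // round to the nearest step
  printf("delta=%llu msg=%llu decoded=%llu\n", (unsigned long long)delta,
         (unsigned long long)msg, (unsigned long long)decoded);
}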
@@ -857,7 +860,7 @@ __host__ void extract_n_bits(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t num_radix_blocks, uint32_t bits_per_block,
int_bit_extract_luts_buffer<Torus> *bit_extract) {
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks, ksks,
num_radix_blocks * bits_per_block, bit_extract->lut);
}
@@ -870,7 +873,6 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
std::function<Torus(Torus)> sign_handler_f, void **bsks,
Torus **ksks, uint32_t num_sign_blocks) {
cudaSetDevice(gpu_indexes[0]);
auto diff_buffer = mem_ptr->diff_buffer;
auto params = mem_ptr->params;
@@ -904,9 +906,9 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
while (num_sign_blocks > 2) {
pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a,
big_lwe_dimension, num_sign_blocks, 4);
integer_radix_apply_univariate_lookup_table_kb(
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
big_lwe_dimension, num_sign_blocks, 4);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, signs_a, signs_b, bsks, ksks,
num_sign_blocks / 2, lut);
@@ -937,11 +939,11 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
final_lut_f);
lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a, big_lwe_dimension,
2, 4);
integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
gpu_count, signs_array_out,
signs_b, bsks, ksks, 1, lut);
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
big_lwe_dimension, 2, 4);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, signs_array_out, signs_b, bsks, ksks,
1, lut);
} else {
@@ -957,9 +959,9 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
final_lut_f);
lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
gpu_count, signs_array_out,
signs_a, bsks, ksks, 1, lut);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, signs_array_out, signs_a, bsks, ksks,
1, lut);
}
}
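
The loop above packs two sign blocks at a time with factor 4 and lets a LUT merge each pair, halving the block count per pass until a single sign remains. A plaintext sketch of that reduction, assuming the usual encoding (0 = inferior, 1 = equal, 2 = superior) and a merge rule where the more significant sign wins unless it is "equal" (the exact LUT function upstream may differ):

#include <cstdio>
#include <vector>

static int merge(int msb_sign, int lsb_sign) {
  return msb_sign == 1 ? lsb_sign : msb_sign;
}

int main() {
  std::vector<int> signs = {2, 0, 1, 1}; // per-block signs, LSB first
  while (signs.size() > 1) {
    std::vector<int> next;
    for (size_t i = 0; i + 1 < signs.size(); i += 2) {
      int packed = 4 * signs[i + 1] + signs[i];      // pack_blocks, factor 4
      next.push_back(merge(packed / 4, packed % 4)); // what the LUT computes
    }
    if (signs.size() % 2 == 1)
      next.push_back(signs.back()); // odd block rides along to the next pass
    signs = next;
  }
  printf("final sign: %d\n", signs[0]); // 0, i.e. lhs < rhs
}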

View File

@@ -241,7 +241,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
nullptr);
break;
case 1024:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -249,7 +250,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
nullptr);
break;
case 2048:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -257,7 +259,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
nullptr);
break;
case 4096:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -265,7 +268,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
nullptr);
break;
case 8192:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -273,7 +277,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
nullptr);
break;
case 16384:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -281,7 +286,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
nullptr);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "

View File

@@ -186,9 +186,10 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
Torus *radix_lwe_out, Torus *terms, int *terms_degree, void **bsks,
uint64_t **ksks, int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec,
int_radix_lut<Torus> *reused_lut = nullptr) {
int_radix_lut<Torus> *reused_lut) {
auto new_blocks = mem_ptr->new_blocks;
auto new_blocks_copy = mem_ptr->new_blocks_copy;
auto old_blocks = mem_ptr->old_blocks;
auto small_lwe_vector = mem_ptr->small_lwe_vector;
@@ -205,12 +206,27 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
auto small_lwe_size = small_lwe_dimension + 1;
if (num_radix_in_vec == 0)
return;
if (num_radix_in_vec == 1) {
cuda_memcpy_async_gpu_to_gpu(radix_lwe_out, terms,
num_blocks_in_radix * big_lwe_size *
sizeof(Torus),
streams[0], gpu_indexes[0]);
return;
}
if (old_blocks != terms) {
cuda_memcpy_async_gpu_to_gpu(old_blocks, terms,
num_blocks_in_radix * num_radix_in_vec *
big_lwe_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
}
if (num_radix_in_vec == 2) {
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
&old_blocks[num_blocks * big_lwe_size],
big_lwe_dimension, num_blocks);
return;
}
size_t r = num_radix_in_vec;
size_t total_modulus = message_modulus * carry_modulus;
@@ -271,7 +287,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
if (!ch_amount)
ch_amount++;
dim3 add_grid(ch_amount, num_blocks, 1);
size_t sm_size = big_lwe_size * sizeof(Torus);
cudaSetDevice(gpu_indexes[0]);
tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
@@ -288,7 +303,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
terms_degree, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
h_smart_copy_out, ch_amount, r, num_blocks, chunk_size, message_max,
total_count, message_count, carry_count, sm_copy_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
@@ -303,8 +317,11 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
// inside d_smart_copy_in there are only -1 values
// it's fine to call smart_copy with same pointer
// as source and destination
smart_copy<<<sm_copy_count, 1024, 0, streams[0]>>>(
new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in,
cuda_memcpy_async_gpu_to_gpu(new_blocks_copy, new_blocks,
r * num_blocks * big_lwe_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
smart_copy<Torus><<<sm_copy_count, 1024, 0, streams[0]>>>(
new_blocks, new_blocks_copy, d_smart_copy_out, d_smart_copy_in,
big_lwe_size);
check_cuda_error(cudaGetLastError());
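
The new copy into new_blocks_copy is the "pass different pointers to smart copy" fix from the commit list: a block-gather that reads and writes the same buffer lets one thread block overwrite a source block before another has read it, so the kernel now reads from an immutable copy. A sketch of the safe pattern on plain data (gather_blocks and its index layout are hypothetical, for illustration only):

#include <cstdio>
#include <cuda_runtime.h>

// Thread block b copies source block idx_in[b] to destination block
// idx_out[b]. Only safe when dst and src do not alias.
__global__ void gather_blocks(int *dst, const int *src, const int *idx_out,
                              const int *idx_in, int block_size) {
  int b = blockIdx.x;
  for (int j = threadIdx.x; j < block_size; j += blockDim.x)
    dst[idx_out[b] * block_size + j] = src[idx_in[b] * block_size + j];
}

int main() {
  const int n = 2, block_size = 4;
  int h[n * block_size] = {0, 0, 0, 0, 1, 1, 1, 1};
  int h_out[n] = {0, 1}, h_in[n] = {1, 0}; // swap the two blocks
  int *buf, *scratch, *d_out, *d_in;
  cudaMalloc(&buf, sizeof(h));
  cudaMalloc(&scratch, sizeof(h));
  cudaMalloc(&d_out, sizeof(h_out));
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMemcpy(buf, h, sizeof(h), cudaMemcpyHostToDevice);
  cudaMemcpy(d_out, h_out, sizeof(h_out), cudaMemcpyHostToDevice);
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  // the fix: gather from an immutable copy instead of in place
  cudaMemcpy(scratch, buf, sizeof(h), cudaMemcpyDeviceToDevice);
  gather_blocks<<<n, 32>>>(buf, scratch, d_out, d_in, block_size);
  cudaMemcpy(h, buf, sizeof(h), cudaMemcpyDeviceToHost);
  for (int k = 0; k < n * block_size; k++)
    printf("%d ", h[k]); // 1 1 1 1 0 0 0 0
  printf("\n");
}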
@@ -422,9 +439,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
luts_message_carry->release(streams, gpu_indexes, gpu_count);
delete (luts_message_carry);
host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
&old_blocks[num_blocks * big_lwe_size], big_lwe_dimension,
num_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
&old_blocks[num_blocks * big_lwe_size],
big_lwe_dimension, num_blocks);
}
template <typename Torus, class params>

View File

@@ -1,14 +1,16 @@
#include "integer/negation.cuh"
void cuda_negate_integer_radix_ciphertext_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
void cuda_negate_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {
host_integer_radix_negation(
host_integer_radix_negation<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(lwe_array),
lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus);
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in), lwe_dimension,
lwe_ciphertext_count, message_modulus, carry_modulus);
}
void scratch_cuda_integer_radix_overflowing_sub_kb_64(

View File

@@ -25,14 +25,13 @@ template <typename Torus>
__global__ void
device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
uint64_t lwe_dimension, uint64_t message_modulus,
uint64_t carry_modulus, uint64_t delta) {
uint64_t delta) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < lwe_dimension + 1) {
bool is_body = (tid == lwe_dimension);
// z = ceil( degree / 2^p ) * 2^p
uint64_t z = (2 * message_modulus - 1) / message_modulus;
__syncthreads();
z *= message_modulus;
// (0,Delta*z) - ct
@@ -47,12 +46,9 @@ device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
uint64_t encoded_zb = zb * delta;
__syncthreads();
// (0,Delta*z) - ct
output[tid] =
(is_body ? z * delta - (input[tid] + encoded_zb) : -input[tid]);
__syncthreads();
}
}
}
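
With the superfluous shared memory and __syncthreads gone, the kernel is a pure per-coefficient map: each block becomes (0, Delta*z) - ct, where z is a multiple of the message modulus at least as large as the block's degree, so the subtraction cannot underflow the message, and zb forwards the correction into the next block. A minimal host-side check of the single-block case with trivial, noiseless encodings:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t message_modulus = 4, carry_modulus = 4;
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
  // z = ceil(degree / message_modulus) * message_modulus for a fresh block,
  // computed exactly as in the kernel
  uint64_t z = (2 * message_modulus - 1) / message_modulus;
  z *= message_modulus;
  uint64_t msg = 3;
  uint64_t ct = msg * delta;     // trivial encoding of msg
  uint64_t neg = z * delta - ct; // (0, delta*z) - ct, wrapping mod 2^64
  uint64_t decoded = ((neg + delta / 2) / delta) % message_modulus;
  printf("-%llu mod %llu = %llu\n", (unsigned long long)msg,
         (unsigned long long)message_modulus, (unsigned long long)decoded);
  // prints: -3 mod 4 = 1
}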
@@ -75,16 +71,15 @@ host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes,
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
uint64_t shared_mem = input_lwe_ciphertext_count * sizeof(uint32_t);
// Value of the shift we multiply our messages by
// If message_modulus and carry_modulus are always powers of 2 we can simplify
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_negation<<<grid, thds, shared_mem, streams[0]>>>(
device_integer_radix_negation<<<grid, thds, 0, streams[0]>>>(
output, input, input_lwe_ciphertext_count, lwe_dimension, message_modulus,
carry_modulus, delta);
delta);
check_cuda_error(cudaGetLastError());
}
@@ -107,7 +102,7 @@ __host__ void host_integer_overflowing_sub_kb(
auto radix_params = mem_ptr->params;
host_unchecked_sub_with_correcting_term(
host_unchecked_sub_with_correcting_term<Torus>(
streams[0], gpu_indexes[0], radix_lwe_out, radix_lwe_left,
radix_lwe_right, radix_params.big_lwe_dimension, num_blocks,
radix_params.message_modulus, radix_params.carry_modulus,

View File

@@ -5,7 +5,7 @@ void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
void *scalar_input, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
host_integer_radix_scalar_addition_inplace(
host_integer_radix_scalar_addition_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(scalar_input),
lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus);

View File

@@ -45,9 +45,10 @@ __host__ void host_integer_radix_scalar_addition_inplace(
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_scalar_addition_inplace<<<grid, thds, 0, streams[0]>>>(
lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
delta);
device_integer_radix_scalar_addition_inplace<Torus>
<<<grid, thds, 0, streams[0]>>>(lwe_array, scalar_input,
input_lwe_ciphertext_count, lwe_dimension,
delta);
check_cuda_error(cudaGetLastError());
}
@@ -83,8 +84,9 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_add_scalar_one_inplace<<<grid, thds, 0, streams[0]>>>(
lwe_array, input_lwe_ciphertext_count, lwe_dimension, delta);
device_integer_radix_add_scalar_one_inplace<Torus>
<<<grid, thds, 0, streams[0]>>>(lwe_array, input_lwe_ciphertext_count,
lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
}
@@ -122,10 +124,10 @@ __host__ void host_integer_radix_scalar_subtraction_inplace(
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_scalar_subtraction_inplace<<<grid, thds, 0,
streams[0]>>>(
lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
delta);
device_integer_radix_scalar_subtraction_inplace<Torus>
<<<grid, thds, 0, streams[0]>>>(lwe_array, scalar_input,
input_lwe_ciphertext_count, lwe_dimension,
delta);
check_cuda_error(cudaGetLastError());
}
#endif

View File

@@ -3,6 +3,58 @@
#include "integer/comparison.cuh"
template <typename Torus>
__host__ void scalar_compare_radix_blocks_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t num_radix_blocks) {
if (num_radix_blocks == 0)
return;
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
// When rhs > lhs, the subtraction will overflow, and the bit of padding will
// be set to 1
// meaning that the output of the pbs will be the negative (modulo message
// space)
//
// Example:
// lhs: 1, rhs: 3, message modulus: 4, carry modulus 4
// lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
// Since there was an overflow the bit of padding is 1 and not 0.
// When applying the LUT for an input value of 14 we would expect 1,
// but since the bit of padding is 1, we will get -1 modulo our message
// space, so (-1) % (4 * 4) = 15 = 1|1111. We then add one and get 0 = 0|0000
auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
cuda_memcpy_async_gpu_to_gpu(subtracted_blocks, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
// Subtract
// Here we need the true lwe sub, not the one that comes from shortint.
host_integer_radix_scalar_subtraction_inplace<Torus>(
streams, gpu_indexes, gpu_count, subtracted_blocks, scalar_blocks,
big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);
// Apply LUT to compare to 0
auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks,
ksks, num_radix_blocks, sign_lut);
// Add one
// Here lhs can have the following values: (-1) % (message modulus * carry
// modulus), 0, 1. So the output values after the addition will be: 0, 1, 2
host_integer_radix_add_scalar_one_inplace<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
num_radix_blocks, message_modulus, carry_modulus);
}
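
The worked example in the comment can be reproduced with plain 64-bit arithmetic, because trivial encodings wrap modulo 2^64 exactly like torus elements. A minimal sketch with the comment's parameters (message modulus 4, carry modulus 4):

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t message_modulus = 4, carry_modulus = 4;
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
  uint64_t lhs = 1, rhs = 3;
  uint64_t diff = lhs * delta - rhs * delta;  // underflows, wraps mod 2^64
  uint64_t body = (diff + delta / 2) / delta; // rounded plaintext + padding
  printf("padding|value = %llu|%llu\n",
         (unsigned long long)(body >> 4),  // padding bit: 1
         (unsigned long long)(body & 15)); // 14, i.e. 1|1110 as in the comment
}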
template <typename Torus>
__host__ void integer_radix_unsigned_scalar_difference_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -45,10 +97,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
if (total_num_scalar_blocks == 0) {
// We only have to compare blocks with zero
// means scalar is zero
host_compare_with_zero_equality(streams, gpu_indexes, gpu_count,
mem_ptr->tmp_lwe_array_out, lwe_array_in,
mem_ptr, bsks, ksks, total_num_radix_blocks,
mem_ptr->is_zero_lut);
host_compare_with_zero_equality<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
lwe_array_in, mem_ptr, bsks, ksks, total_num_radix_blocks,
mem_ptr->is_zero_lut);
auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
x = (x == 1 ? IS_EQUAL : IS_SUPERIOR);
@@ -91,10 +143,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
total_num_scalar_blocks, message_modulus);
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks,
message_modulus);
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
total_num_scalar_blocks, message_modulus);
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
@@ -106,22 +159,22 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
comparisons, lhs, rhs, mem_ptr, bsks, ksks,
num_lsb_radix_blocks);
scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
comparisons, lhs, rhs, mem_ptr, bsks,
ksks, num_lsb_radix_blocks);
// Reduces a vec containing radix blocks that encrypt a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
mem_ptr->identity_lut_f, bsks, ksks,
num_lsb_radix_blocks);
tree_sign_reduction<Torus>(
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks,
num_lsb_radix_blocks);
//////////////
// msb
host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
lwe_array_msb_out, msb, mem_ptr, bsks, ksks,
num_msb_radix_blocks, mem_ptr->is_zero_lut);
host_compare_with_zero_equality<Torus>(
msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb, mem_ptr,
bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
@@ -145,7 +198,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
scalar_bivariate_last_leaf_lut_f);
lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
integer_radix_apply_bivariate_lookup_table_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
lwe_array_msb_out, bsks, ksks, 1, lut, lut->params.message_modulus);
@@ -159,10 +212,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;
pack_blocks(streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
num_scalar_blocks, message_modulus);
pack_blocks<Torus>(streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks,
message_modulus);
pack_blocks<Torus>(streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
num_scalar_blocks, message_modulus);
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
@@ -173,16 +227,17 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
// - 1 if lhs == rhs
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_lwe_array_out;
scalar_compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
lhs, rhs, mem_ptr, bsks, ksks,
num_lsb_radix_blocks);
scalar_compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count,
comparisons, lhs, rhs, mem_ptr, bsks,
ksks, num_lsb_radix_blocks);
// Reduces a vec containing radix blocks that encrypt a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
sign_handler_f, bsks, ksks, num_lsb_radix_blocks);
tree_sign_reduction<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
sign_handler_f, bsks, ksks,
num_lsb_radix_blocks);
}
}
@@ -229,7 +284,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// We only have to compare blocks with zero
// means scalar is zero
Torus *are_all_msb_zeros = mem_ptr->tmp_lwe_array_out;
host_compare_with_zero_equality(
host_compare_with_zero_equality<Torus>(
streams, gpu_indexes, gpu_count, are_all_msb_zeros, lwe_array_in,
mem_ptr, bsks, ksks, total_num_radix_blocks, mem_ptr->is_zero_lut);
Torus *sign_block =
@@ -277,7 +332,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
scalar_bivariate_last_leaf_lut_f);
lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
integer_radix_apply_bivariate_lookup_table_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
sign_block, bsks, ksks, 1, lut, lut->params.message_modulus);
@@ -304,10 +359,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
total_num_scalar_blocks, message_modulus);
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks,
message_modulus);
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
total_num_scalar_blocks, message_modulus);
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
@@ -319,24 +375,24 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
comparisons, lhs, rhs, mem_ptr, bsks, ksks,
num_lsb_radix_blocks);
scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
comparisons, lhs, rhs, mem_ptr, bsks,
ksks, num_lsb_radix_blocks);
// Reduces a vec containing radix blocks that encrypt a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
mem_ptr->identity_lut_f, bsks, ksks,
num_lsb_radix_blocks);
tree_sign_reduction<Torus>(
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks,
num_lsb_radix_blocks);
//////////////
// msb
// We remove the last block (which is the sign)
Torus *are_all_msb_zeros = lwe_array_msb_out;
host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
are_all_msb_zeros, msb, mem_ptr, bsks, ksks,
num_msb_radix_blocks, mem_ptr->is_zero_lut);
host_compare_with_zero_equality<Torus>(
msb_streams, gpu_indexes, gpu_count, are_all_msb_zeros, msb, mem_ptr,
bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
auto sign_bit_pos = (int)log2(message_modulus) - 1;
@@ -371,7 +427,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
integer_radix_apply_bivariate_lookup_table_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
signed_msb_lut->params.message_modulus);
@@ -382,8 +438,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
//////////////
// Reduce the two blocks into one final
reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks, 2);
reduce_signs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks,
2);
} else {
// We only have to do the regular comparison
@@ -403,10 +460,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks - 1, message_modulus);
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
num_lsb_radix_blocks - 1, message_modulus);
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks - 1,
message_modulus);
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
num_lsb_radix_blocks - 1, message_modulus);
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
@@ -415,19 +473,19 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// - 0 if lhs < rhs
// - 1 if lhs == rhs
// - 2 if lhs > rhs
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
lwe_array_ct_out, lhs, rhs, mem_ptr, bsks,
ksks, num_lsb_radix_blocks);
scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
lwe_array_ct_out, lhs, rhs, mem_ptr,
bsks, ksks, num_lsb_radix_blocks);
Torus *encrypted_sign_block =
lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
Torus *scalar_sign_block = scalar_blocks + (total_num_scalar_blocks - 1);
auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
create_trivial_radix(msb_streams[0], gpu_indexes[0], trivial_sign_block,
scalar_sign_block, big_lwe_dimension, 1, 1,
message_modulus, carry_modulus);
create_trivial_radix<Torus>(
msb_streams[0], gpu_indexes[0], trivial_sign_block, scalar_sign_block,
big_lwe_dimension, 1, 1, message_modulus, carry_modulus);
integer_radix_apply_bivariate_lookup_table_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out,
encrypted_sign_block, trivial_sign_block, bsks, ksks, 1,
mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
@@ -439,9 +497,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// Reduces a vec containing radix blocks that encrypt a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks,
num_lsb_radix_blocks + 1);
reduce_signs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks,
num_lsb_radix_blocks + 1);
}
}
@@ -452,14 +510,13 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cudaSetDevice(gpu_indexes[0]);
auto params = mem_ptr->params;
// Calculates the difference sign between the ciphertext and the scalar
// - 0 if lhs < rhs
// - 1 if lhs == rhs
// - 2 if lhs > rhs
auto sign = mem_ptr->tmp_lwe_array_out;
integer_radix_signed_scalar_difference_check_kb(
integer_radix_signed_scalar_difference_check_kb<Torus>(
streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks,
total_num_scalar_blocks);
@@ -469,17 +526,17 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(
auto lwe_array_left = lwe_array_in;
auto lwe_array_right = mem_ptr->tmp_block_comparisons;
create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right,
scalar_blocks, params.big_lwe_dimension,
total_num_radix_blocks, total_num_scalar_blocks,
params.message_modulus, params.carry_modulus);
create_trivial_radix<Torus>(streams[0], gpu_indexes[0], lwe_array_right,
scalar_blocks, params.big_lwe_dimension,
total_num_radix_blocks, total_num_scalar_blocks,
params.message_modulus, params.carry_modulus);
// Selector
// CMUX for Max or Min
host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
sign, lwe_array_left, lwe_array_right,
mem_ptr->cmux_buffer, bsks, ksks,
total_num_radix_blocks);
host_integer_radix_cmux_kb<Torus>(streams, gpu_indexes, gpu_count,
lwe_array_out, sign, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsks,
ksks, total_num_radix_blocks);
}
template <typename Torus>
@@ -492,12 +549,12 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
if (mem_ptr->is_signed) {
// is signed and scalar is positive
integer_radix_signed_scalar_difference_check_kb(
integer_radix_signed_scalar_difference_check_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
total_num_radix_blocks, total_num_scalar_blocks);
} else {
integer_radix_unsigned_scalar_difference_check_kb(
integer_radix_unsigned_scalar_difference_check_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
total_num_radix_blocks, total_num_scalar_blocks);
@@ -513,70 +570,16 @@ __host__ void host_integer_radix_signed_scalar_maxmin_kb(
if (mem_ptr->is_signed) {
// is signed and scalar is positive
integer_radix_signed_scalar_maxmin_kb(
integer_radix_signed_scalar_maxmin_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks,
total_num_scalar_blocks);
} else {
integer_radix_unsigned_scalar_maxmin_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks,
total_num_scalar_blocks);
PANIC("Cuda error: only signed scalar maxmin can be called in signed "
"scalar comparison")
}
}
template <typename Torus>
__host__ void scalar_compare_radix_blocks_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t num_radix_blocks) {
if (num_radix_blocks == 0)
return;
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
// When rhs > lhs, the subtraction will overflow, and the bit of padding will
// be set to 1
// meaning that the output of the pbs will be the negative (modulo message
// space)
//
// Example:
// lhs: 1, rhs: 3, message modulus: 4, carry modulus 4
// lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
// Since there was an overflow the bit of padding is 1 and not 0.
// When applying the LUT for an input value of 14 we would expect 1,
// but since the bit of padding is 1, we will get -1 modulo our message
// space, so (-1) % (4 * 4) = 15 = 1|1111. We then add one and get 0 = 0|0000
auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
cuda_memcpy_async_gpu_to_gpu(subtracted_blocks, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
// Subtract
// Here we need the true lwe sub, not the one that comes from shortint.
host_integer_radix_scalar_subtraction_inplace(
streams, gpu_indexes, gpu_count, subtracted_blocks, scalar_blocks,
big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);
// Apply LUT to compare to 0
auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks,
ksks, num_radix_blocks, sign_lut);
// Add one
// Here lhs can have the following values: (-1) % (message modulus * carry
// modulus), 0, 1. So the output values after the addition will be: 0, 1, 2
host_integer_radix_add_scalar_one_inplace(
streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
num_radix_blocks, message_modulus, carry_modulus);
}
template <typename Torus>
__host__ void host_integer_radix_scalar_maxmin_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -591,7 +594,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
// - 1 if lhs == rhs
// - 2 if lhs > rhs
auto sign = mem_ptr->tmp_lwe_array_out;
host_integer_radix_scalar_difference_check_kb(
host_integer_radix_scalar_difference_check_kb<Torus>(
streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks,
total_num_scalar_blocks);
@@ -601,17 +604,17 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
auto lwe_array_left = lwe_array_in;
auto lwe_array_right = mem_ptr->tmp_block_comparisons;
create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right,
scalar_blocks, params.big_lwe_dimension,
total_num_radix_blocks, total_num_scalar_blocks,
params.message_modulus, params.carry_modulus);
create_trivial_radix<Torus>(streams[0], gpu_indexes[0], lwe_array_right,
scalar_blocks, params.big_lwe_dimension,
total_num_radix_blocks, total_num_scalar_blocks,
params.message_modulus, params.carry_modulus);
// Selector
// CMUX for Max or Min
host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks,
total_num_radix_blocks);
host_integer_radix_cmux_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks);
}
template <typename Torus>
@@ -659,10 +662,11 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
auto packed_scalar =
packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;
pack_blocks(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_streams[0], gpu_indexes[0], packed_scalar, scalar_blocks, 0,
num_scalar_blocks, message_modulus);
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
big_lwe_dimension, num_lsb_radix_blocks,
message_modulus);
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], packed_scalar,
scalar_blocks, 0, num_scalar_blocks, message_modulus);
cuda_memcpy_async_gpu_to_gpu(
scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0),
@@ -670,7 +674,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
gpu_indexes[0]);
scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, packed_blocks,
bsks, ksks, num_halved_lsb_radix_blocks, scalar_comparison_luts);
}
@@ -689,9 +693,9 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
PANIC("Cuda error: integer operation not supported")
}
host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
lwe_array_msb_out, msb, mem_ptr, bsks, ksks,
num_msb_radix_blocks, msb_lut);
host_compare_with_zero_equality<Torus>(
msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb, mem_ptr,
bsks, ksks, num_msb_radix_blocks, msb_lut);
}
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
@@ -701,13 +705,13 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
switch (mem_ptr->op) {
case COMPARISON_TYPE::EQ:
are_all_comparisons_block_true(
are_all_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
mem_ptr, bsks, ksks,
num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
break;
case COMPARISON_TYPE::NE:
is_at_least_one_comparisons_block_true(
is_at_least_one_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
mem_ptr, bsks, ksks,
num_halved_scalar_blocks + (num_msb_radix_blocks > 0));

View File

@@ -65,7 +65,7 @@ __host__ void host_integer_scalar_mul_radix(
cuda_memcpy_async_gpu_to_gpu(ptr, lwe_array,
lwe_size_bytes * num_radix_blocks,
streams[0], gpu_indexes[0]);
host_integer_radix_logical_scalar_shift_kb_inplace(
host_integer_radix_logical_scalar_shift_kb_inplace<T>(
streams, gpu_indexes, gpu_count, ptr, shift_amount,
mem->logical_scalar_shift_buffer, bsks, ksks, num_radix_blocks);
} else {
@@ -82,9 +82,9 @@ __host__ void host_integer_scalar_mul_radix(
preshifted_buffer + (i % msg_bits) * num_radix_blocks * lwe_size;
T *block_shift_buffer =
all_shifted_buffer + j * num_radix_blocks * lwe_size;
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
block_shift_buffer, preshifted_radix_ct,
i / msg_bits, num_radix_blocks, lwe_size);
host_radix_blocks_rotate_right<T>(
streams, gpu_indexes, gpu_count, block_shift_buffer,
preshifted_radix_ct, i / msg_bits, num_radix_blocks, lwe_size);
// create trivial assign for value = 0
cuda_memset_async(block_shift_buffer, 0, (i / msg_bits) * lwe_size_bytes,
streams[0], gpu_indexes[0]);
@@ -108,7 +108,7 @@ __host__ void host_integer_scalar_mul_radix(
host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer,
terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem,
num_radix_blocks, j);
num_radix_blocks, j, nullptr);
auto scp_mem_ptr = mem->sum_ciphertexts_vec_mem->scp_mem;
host_propagate_single_carry<T>(streams, gpu_indexes, gpu_count, lwe_array,

View File

@@ -56,9 +56,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
// one block is responsible to process single lwe ciphertext
if (mem->shift_type == LEFT_SHIFT) {
// rotate right as the blocks are from LSB to MSB
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
@@ -70,9 +70,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
auto receiver_blocks = lwe_array;
auto giver_blocks = rotated_buffer;
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
giver_blocks, lwe_array, 1, num_blocks,
big_lwe_size);
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
giver_blocks, lwe_array, 1,
num_blocks, big_lwe_size);
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
@@ -83,9 +83,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
} else {
// rotate left as the blocks are from LSB to MSB
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
@@ -97,8 +97,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
auto receiver_blocks = lwe_array;
auto giver_blocks = rotated_buffer;
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, giver_blocks,
lwe_array, 1, num_blocks, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
giver_blocks, lwe_array, 1, num_blocks,
big_lwe_size);
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

View File

@@ -53,9 +53,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
if (mem->shift_type == LEFT_SHIFT) {
// rotate right as the blocks are from LSB to MSB
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
// create trivial assign for value = 0
cuda_memset_async(rotated_buffer, 0, rotations * big_lwe_size_bytes,
@@ -83,9 +83,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
} else {
// right shift
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
// rotate left as the blocks are from LSB to MSB
// create trivial assign for value = 0
@@ -156,9 +156,9 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
Torus *last_block_copy = &padding_block[big_lwe_size];
if (mem->shift_type == RIGHT_SHIFT) {
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
@@ -213,7 +213,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
}
auto lut_univariate_padding_block =
mem->lut_buffers_univariate[num_bits_in_block - 1];
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
// Replace blocks 'pulled' from the left with the correct padding
@@ -227,7 +227,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
if (shift_within_block != 0) {
auto lut_univariate_shift_last_block =
mem->lut_buffers_univariate[shift_within_block - 1];
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->local_streams_2, gpu_indexes, gpu_count, last_block,
last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block);
}
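
At the block level, a logical shift is the rotation plus memset seen above: rotate the blocks, then zero the rotations blocks that wrapped around (the sub-block part of the shift is handled afterwards by the LUTs). A plaintext sketch of the left-shift case:

#include <cstdio>
#include <cstring>

int main() {
  const int num_blocks = 4;
  int blocks[num_blocks] = {1, 2, 3, 2}; // radix digits, LSB first
  int rotations = 1;                     // shift left by one whole block
  int rotated[num_blocks];
  // rotate right at the block level, as the blocks run from LSB to MSB
  for (int i = 0; i < num_blocks; i++)
    rotated[(i + rotations) % num_blocks] = blocks[i];
  // zero the wrapped-around blocks: they become the new low-order digits
  memset(rotated, 0, rotations * sizeof(int));
  for (int i = 0; i < num_blocks; i++)
    printf("%d ", rotated[i]); // 0 1 2 3: digits moved up, top digit dropped
  printf("\n");
}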

View File

@@ -88,9 +88,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
switch (mem->shift_type) {
case LEFT_SHIFT:
// rotate right as the blocks are from LSB to MSB
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b, rotations,
total_nb_bits, big_lwe_size);
host_radix_blocks_rotate_right<Torus>(
streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
rotations, total_nb_bits, big_lwe_size);
if (mem->is_signed && mem->shift_type == RIGHT_SHIFT)
for (int i = 0; i < rotations; i++)
@@ -103,9 +103,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
break;
case RIGHT_SHIFT:
// rotate left as the blocks are from LSB to MSB
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b, rotations,
total_nb_bits, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(
streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
rotations, total_nb_bits, big_lwe_size);
if (mem->is_signed)
for (int i = 0; i < rotations; i++)
@@ -119,38 +119,39 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
break;
case LEFT_ROTATE:
// rotate right as the blocks are from LSB to MSB
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b, rotations,
total_nb_bits, big_lwe_size);
host_radix_blocks_rotate_right<Torus>(
streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
rotations, total_nb_bits, big_lwe_size);
break;
case RIGHT_ROTATE:
// rotate left as the blocks are from LSB to MSB
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b, rotations,
total_nb_bits, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(
streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
rotations, total_nb_bits, big_lwe_size);
break;
default:
PANIC("Unknown operation")
}
// pack bits into one block so that we have
// host_pack bits into one block so that we have
// control_bit|b|a
cuda_memset_async(mux_inputs, 0, total_nb_bits * big_lwe_size_bytes,
streams[0], gpu_indexes[0]); // Do we need this?
pack_bivariate_blocks(streams, gpu_indexes, gpu_count, mux_inputs,
mux_lut->lwe_indexes_out, rotated_input, input_bits_a,
mux_lut->lwe_indexes_in, big_lwe_dimension, 2,
total_nb_bits);
pack_bivariate_blocks<Torus>(streams, gpu_indexes, gpu_count, mux_inputs,
mux_lut->lwe_indexes_out, rotated_input,
input_bits_a, mux_lut->lwe_indexes_in,
big_lwe_dimension, 2, total_nb_bits);
// The shift bit is already properly aligned/positioned
for (int i = 0; i < total_nb_bits; i++)
host_addition(streams[0], gpu_indexes[0], mux_inputs + i * big_lwe_size,
mux_inputs + i * big_lwe_size, shift_bit,
mem->params.big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0],
mux_inputs + i * big_lwe_size,
mux_inputs + i * big_lwe_size, shift_bit,
mem->params.big_lwe_dimension, 1);
// we have
// control_bit|b|a
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, input_bits_a, mux_inputs, bsks, ksks,
total_nb_bits, mux_lut);
}
@@ -179,8 +180,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
auto bit_to_add = input_bits_a + i * big_lwe_size;
for (int j = 0; j < num_radix_blocks; j++) {
host_addition(streams[0], gpu_indexes[0], block, block, bit_to_add,
big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0], block, block, bit_to_add,
big_lwe_dimension, 1);
block += big_lwe_size;
bit_to_add += bits_per_block * big_lwe_size;
@@ -188,7 +189,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// To give back a clean ciphertext
auto cleaning_lut = mem->cleaning_lut;
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, bsks, ksks,
num_radix_blocks, cleaning_lut);
}
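
The barrel-shifter core above selects, bit by bit, between the rotated candidate b and the original bit a, with the shift bit as the control: the three values are packed into control_bit|b|a and a single LUT evaluates the mux. A plaintext sketch of that truth table (mux_lut is an illustrative plaintext counterpart of the LUT, not the actual table):

#include <cstdio>

static int mux_lut(int packed) {
  int a = packed & 1, b = (packed >> 1) & 1, control = (packed >> 2) & 1;
  return control ? b : a;
}

int main() {
  int a = 1, b = 0; // original bit and rotated candidate
  for (int control = 0; control <= 1; control++) {
    int packed = (control << 2) | (b << 1) | a; // control_bit|b|a
    printf("control=%d -> %d\n", control, mux_lut(packed));
  }
  // control=0 -> 1 (keep a), control=1 -> 0 (take the rotated bit b)
}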

View File

@@ -11,11 +11,11 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in_1),
static_cast<uint32_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
host_addition<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in_1),
static_cast<uint32_t *>(lwe_array_in_2),
input_lwe_dimension, input_lwe_ciphertext_count);
}
/*
@@ -51,11 +51,11 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in_1),
static_cast<uint64_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
host_addition<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in_1),
static_cast<uint64_t *>(lwe_array_in_2),
input_lwe_dimension, input_lwe_ciphertext_count);
}
/*
* Perform the addition of a u32 input LWE ciphertext vector with a u32
@@ -66,11 +66,12 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition_plaintext(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(plaintext_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
host_addition_plaintext<uint32_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(plaintext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the addition of a u64 input LWE ciphertext vector with a u64 input
@@ -105,9 +106,10 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition_plaintext(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(plaintext_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
host_addition_plaintext<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(plaintext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}

View File

@@ -43,7 +43,7 @@ host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
cuda_memcpy_async_gpu_to_gpu(output, lwe_input,
(lwe_dimension + 1) * lwe_ciphertext_count,
stream, gpu_index);
plaintext_addition<<<grid, thds, 0, stream>>>(
plaintext_addition<T><<<grid, thds, 0, stream>>>(
output, lwe_input, plaintext_input, lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
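
Adding a plaintext to an LWE ciphertext only touches the body coefficient: if ct = (a_0, ..., a_{n-1}, b), then ct + pt = (a_0, ..., a_{n-1}, b + pt), which is why the host function can copy the input and patch one coefficient per ciphertext. A host-side sketch, assuming one plaintext per ciphertext:

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t lwe_dimension = 3, count = 2, lwe_size = lwe_dimension + 1;
  // two toy ciphertexts, flattened as mask..., body
  uint64_t lwe[count * lwe_size] = {5, 6, 7, 100, 8, 9, 10, 200};
  uint64_t plaintexts[count] = {1000, 2000};
  for (uint32_t i = 0; i < count; i++)
    lwe[i * lwe_size + lwe_dimension] += plaintexts[i]; // body only
  printf("bodies: %llu %llu\n",
         (unsigned long long)lwe[lwe_dimension],
         (unsigned long long)lwe[lwe_size + lwe_dimension]); // 1100 2200
}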
@@ -78,7 +78,7 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
addition<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
addition<T><<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
check_cuda_error(cudaGetLastError());
}
@@ -112,7 +112,8 @@ __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
subtraction<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
subtraction<T>
<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
check_cuda_error(cudaGetLastError());
}
@@ -150,7 +151,7 @@ __host__ void host_subtraction_plaintext(cudaStream_t stream,
(input_lwe_dimension + 1) * sizeof(T),
stream, gpu_index);
radix_body_subtraction_inplace<<<grid, thds, 0, stream>>>(
radix_body_subtraction_inplace<T><<<grid, thds, 0, stream>>>(
output, plaintext_input, input_lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
@@ -176,7 +177,6 @@ __global__ void unchecked_sub_with_correcting_term(
}
}
template <typename T>
__host__ void host_unchecked_sub_with_correcting_term(
cudaStream_t stream, uint32_t gpu_index, T *output, T *input_1, T *input_2,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count,
@@ -193,7 +193,7 @@ __host__ void host_unchecked_sub_with_correcting_term(
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
unchecked_sub_with_correcting_term<<<grid, thds, 0, stream>>>(
unchecked_sub_with_correcting_term<T><<<grid, thds, 0, stream>>>(
output, input_1, input_2, num_entries, lwe_size, message_modulus,
carry_modulus, degree);
check_cuda_error(cudaGetLastError());

View File

@@ -9,12 +9,12 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(cleartext_array_in),
input_lwe_dimension,
input_lwe_ciphertext_count);
host_cleartext_vec_multiplication<uint32_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(cleartext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the multiplication of a u64 input LWE ciphertext vector with a u64
@@ -49,10 +49,10 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(cleartext_array_in),
input_lwe_dimension,
input_lwe_ciphertext_count);
host_cleartext_vec_multiplication<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(cleartext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}

View File

@@ -14,9 +14,10 @@
#include <vector>
template <typename T>
__global__ void
cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,
uint32_t input_lwe_dimension, uint32_t num_entries) {
__global__ void cleartext_vec_multiplication(T *output, T *lwe_input,
T *cleartext_input,
uint32_t input_lwe_dimension,
uint32_t num_entries) {
int tid = threadIdx.x;
int index = blockIdx.x * blockDim.x + tid;
@@ -27,10 +28,46 @@ cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,
}
}
template <typename T>
__host__ void
host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index,
T *output, T *lwe_input, T *cleartext_input,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count * lwe_size;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
cleartext_vec_multiplication<T><<<grid, thds, 0, stream>>>(
output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
template <typename T>
__global__ void
cleartext_multiplication(T *output, T *lwe_input, T cleartext_input,
uint32_t input_lwe_dimension, uint32_t num_entries) {
int tid = threadIdx.x;
int index = blockIdx.x * blockDim.x + tid;
if (index < num_entries) {
// Here we take advantage of the wrapping behaviour of uint
output[index] = lwe_input[index] * cleartext_input;
}
}
template <typename T>
__host__ void
host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
T *output, T *lwe_input, T *cleartext_input,
T *output, T *lwe_input, T cleartext_input,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
@@ -45,7 +82,7 @@ host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
cleartext_multiplication<<<grid, thds, 0, stream>>>(
cleartext_multiplication<T><<<grid, thds, 0, stream>>>(
output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
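
The hunks above distinguish two cleartext-multiplication flavors: a vector variant with one cleartext per input ciphertext, and a scalar variant that broadcasts a single cleartext, relying, as the kernel comment notes, on wrapping unsigned arithmetic. A hedged Rust sketch of the intended semantics; the per-ciphertext indexing in the vector case is an assumption, since that kernel body is cut off in the diff:

fn cleartext_vec_mul(out: &mut [u64], lwe_in: &[u64], cleartexts: &[u64], lwe_dimension: usize) {
    let lwe_size = lwe_dimension + 1;
    for i in 0..lwe_in.len() {
        // assumed indexing: ciphertext j = i / lwe_size uses cleartexts[j]
        out[i] = lwe_in[i].wrapping_mul(cleartexts[i / lwe_size]);
    }
}

fn cleartext_scalar_mul(out: &mut [u64], lwe_in: &[u64], cleartext: u64) {
    for i in 0..lwe_in.len() {
        // a single cleartext broadcast across every coefficient
        out[i] = lwe_in[i].wrapping_mul(cleartext);
    }
}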

View File

@@ -10,10 +10,10 @@ void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_negation(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
host_negation<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
}
/*
@@ -44,8 +44,8 @@ void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_negation(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
host_negation<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
}

View File

@@ -37,7 +37,7 @@ __host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
negation<<<grid, thds, 0, stream>>>(output, input, num_entries);
negation<T><<<grid, thds, 0, stream>>>(output, input, num_entries);
check_cuda_error(cudaGetLastError());
}

View File

@@ -1,15 +1,5 @@
#include "programmable_bootstrap_amortized.cuh"
/*
* Returns the buffer size for 64 bits executions
*/
uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count) {
return get_buffer_size_programmable_bootstrap_amortized<uint64_t>(
glwe_dimension, polynomial_size, input_lwe_ciphertext_count);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the amortized PBS on 32 bits inputs, into `buffer`. It also

View File

@@ -256,7 +256,7 @@ __host__ void execute_cg_external_product_loop(
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t lwe_chunk_size, int lwe_offset) {
uint32_t lwe_chunk_size, uint32_t lwe_offset) {
uint64_t full_dm =
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
@@ -275,6 +275,8 @@ __host__ void execute_cg_external_product_loop(
uint32_t chunk_size =
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
if (chunk_size == 0)
return;
auto d_mem = buffer->d_mem_acc_cg;
auto keybundle_fft = buffer->keybundle_fft;

View File

@@ -182,25 +182,6 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
}
#endif
/*
* Returns the buffer size for 64 bits executions
*/
uint64_t get_buffer_size_programmable_bootstrap_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count))
return get_buffer_size_programmable_bootstrap_cg<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count);
else
return get_buffer_size_programmable_bootstrap_cg<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count);
}
template <typename Torus>
void scratch_cuda_programmable_bootstrap_cg(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,

View File

@@ -18,9 +18,9 @@
#include <vector>
template <typename Torus, class params>
__device__ Torus calculates_monomial_degree(const Torus *lwe_array_group,
uint32_t ggsw_idx,
uint32_t grouping_factor) {
__device__ uint32_t calculates_monomial_degree(const Torus *lwe_array_group,
uint32_t ggsw_idx,
uint32_t grouping_factor) {
Torus x = 0;
for (int i = 0; i < grouping_factor; i++) {
uint32_t mask_position = grouping_factor - (i + 1);
@@ -31,6 +31,13 @@ __device__ Torus calculates_monomial_degree(const Torus *lwe_array_group,
return modulus_switch(x, params::log2_degree + 1);
}
__device__ __forceinline__ int
get_start_ith_ggsw_offset(uint32_t polynomial_size, int glwe_dimension,
uint32_t level_count) {
return polynomial_size * (glwe_dimension + 1) * (glwe_dimension + 1) *
level_count;
}
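
get_start_ith_ggsw_offset is the stride, in Torus elements, between consecutive GGSW groups of the multi-bit bootstrapping key: each group holds (glwe_dimension + 1)^2 * level_count polynomials of polynomial_size coefficients, so bsk_poly_ini + g * offset lands on group g. A quick Rust restatement with a worked example (parameter values are illustrative):

fn ggsw_group_stride(polynomial_size: usize, glwe_dimension: usize, level_count: usize) -> usize {
    polynomial_size * (glwe_dimension + 1) * (glwe_dimension + 1) * level_count
}
// e.g. polynomial_size = 2048, glwe_dimension = 1, level_count = 2:
// 2048 * 2 * 2 * 2 = 16384 elements between group g and group g + 1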
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_multi_bit_programmable_bootstrap_keybundle(
const Torus *__restrict__ lwe_array_in,
@@ -60,8 +67,6 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
uint32_t input_idx = blockIdx.x / lwe_chunk_size;
if (lwe_iteration < (lwe_dimension / grouping_factor)) {
//
Torus *accumulator = (Torus *)selected_memory;
const Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
@@ -81,56 +86,52 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
const Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
bootstrapping_key, 0, rev_lwe_iteration, glwe_id, level_id,
grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
const Torus *bsk_poly = bsk_slice + poly_id * params::degree;
const Torus *bsk_poly_ini = bsk_slice + poly_id * params::degree;
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
bsk_poly, accumulator);
Torus reg_acc[params::opt];
copy_polynomial_in_regs<Torus, params::opt, params::degree / params::opt>(
bsk_poly_ini, reg_acc);
int offset =
get_start_ith_ggsw_offset(polynomial_size, glwe_dimension, level_count);
// Precalculate the monomial degrees and store them in shared memory
uint32_t *monomial_degrees = (uint32_t *)selected_memory;
if (threadIdx.x < (1 << grouping_factor)) {
const Torus *lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
lwe_array_group, threadIdx.x, grouping_factor);
}
synchronize_threads_in_block();
// Accumulate the other terms
for (int g = 1; g < (1 << grouping_factor); g++) {
const Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
bootstrapping_key, g, rev_lwe_iteration, glwe_id, level_id,
grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
const Torus *bsk_poly = bsk_slice + poly_id * params::degree;
uint32_t monomial_degree = monomial_degrees[g];
// Calculates the monomial degree
const Torus *lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
uint32_t monomial_degree = calculates_monomial_degree<Torus, params>(
lwe_array_group, g, grouping_factor);
synchronize_threads_in_block();
const Torus *bsk_poly = bsk_poly_ini + g * offset;
// Multiply by the bsk element
polynomial_product_accumulate_by_monomial<Torus, params>(
accumulator, bsk_poly, monomial_degree, false);
polynomial_product_accumulate_by_monomial_nosync<Torus, params>(
reg_acc, bsk_poly, monomial_degree);
}
synchronize_threads_in_block(); // needed because we are going to reuse the
// shared memory for the fft
synchronize_threads_in_block();
// Move accumulator to local memory
double2 temp[params::opt / 2];
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
temp[i].x = __ll2double_rn((int64_t)accumulator[tid]);
temp[i].y =
__ll2double_rn((int64_t)accumulator[tid + params::degree / 2]);
temp[i].x /= (double)std::numeric_limits<Torus>::max();
temp[i].y /= (double)std::numeric_limits<Torus>::max();
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// Move from local memory back to shared memory but as complex
tid = threadIdx.x;
int tid = threadIdx.x;
double2 *fft = (double2 *)selected_memory;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = temp[i];
fft[tid] =
make_double2(__ll2double_rn((int64_t)reg_acc[i]) /
(double)std::numeric_limits<Torus>::max(),
__ll2double_rn((int64_t)reg_acc[i + params::opt / 2]) /
(double)std::numeric_limits<Torus>::max());
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
NSMFFT_direct<HalfDegree<params>>(fft);
// lwe iteration
@@ -464,7 +465,7 @@ __host__ void execute_compute_keybundle(
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t lwe_chunk_size, int lwe_offset) {
uint32_t lwe_chunk_size, uint32_t lwe_offset) {
uint32_t chunk_size =
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
@@ -505,14 +506,12 @@ __host__ void execute_compute_keybundle(
}
template <typename Torus, class params>
__host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes,
pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int j, int lwe_offset) {
__host__ void execute_step_one(
cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t j, uint32_t lwe_offset) {
uint64_t full_sm_accumulate_step_one =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
@@ -561,14 +560,12 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
}
template <typename Torus, class params>
__host__ void execute_step_two(cudaStream_t stream, uint32_t gpu_index,
Torus *lwe_array_out, Torus *lwe_output_indexes,
pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size,
int32_t grouping_factor, uint32_t level_count,
int j, int lwe_offset, uint32_t lwe_chunk_size) {
__host__ void execute_step_two(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, int32_t grouping_factor, uint32_t level_count,
uint32_t j, uint32_t lwe_offset, uint32_t lwe_chunk_size) {
uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
@@ -626,7 +623,7 @@ __host__ void host_multi_bit_programmable_bootstrap(
// Accumulate
uint32_t chunk_size = std::min(
lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
for (int j = 0; j < chunk_size; j++) {
for (uint32_t j = 0; j < chunk_size; j++) {
execute_step_one<Torus, params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension,
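
Before the forward FFT, the register accumulator moves from the integer torus to doubles through a round-to-nearest signed conversion scaled by the type's maximum, which places values roughly in [-0.5, 0.5]. A hedged Rust sketch of that normalization for a 64-bit torus:

fn torus_to_double(x: u64) -> f64 {
    // reinterpret as signed so large torus values become negative fractions,
    // mirroring __ll2double_rn((int64_t)x) / numeric_limits<Torus>::max()
    (x as i64) as f64 / u64::MAX as f64
}
// 0 maps to 0.0, 1 << 63 maps to about -0.5,
// u64::MAX (that is, -1 as i64) maps to about -5.4e-20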

View File

@@ -267,7 +267,7 @@ __host__ void execute_tbc_external_product_loop(
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t lwe_chunk_size, int lwe_offset) {
uint32_t lwe_chunk_size, uint32_t lwe_offset) {
auto supports_dsm =
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
@@ -294,6 +294,8 @@ __host__ void execute_tbc_external_product_loop(
uint32_t chunk_size =
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
if (chunk_size == 0)
return;
auto d_mem = buffer->d_mem_acc_tbc;
auto keybundle_fft = buffer->keybundle_fft;

View File

@@ -31,6 +31,13 @@ __device__ void copy_polynomial(const T *__restrict__ source, T *dst) {
tid = tid + block_size;
}
}
template <typename T, int elems_per_thread, int block_size>
__device__ void copy_polynomial_in_regs(const T *__restrict__ source, T *dst) {
#pragma unroll
for (int i = 0; i < elems_per_thread; i++) {
dst[i] = source[threadIdx.x + i * block_size];
}
}
/*
* Receives num_poly concatenated polynomials of type T. For each:
@@ -215,6 +222,8 @@ __device__ void sample_extract_mask(Torus *lwe_array_out, Torus *glwe,
Torus result[params::opt];
#pragma unroll
for (int i = 0; i < params::opt; i++) {
// params::degree - tid - 1 can't be negative, tid goes from 0 to
// params::degree - 1
auto x = glwe_slice[params::degree - tid - 1];
result[i] = SEL(-x, x, tid >= params::degree - nth);
tid = tid + params::degree / params::opt;

View File

@@ -55,21 +55,22 @@ __device__ void polynomial_product_accumulate_in_fourier_domain(
}
}
// If init_accumulator is set, assumes that result was not initialized and does
// that with the outcome of first * second
template <typename T, class params>
__device__ void
polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly,
uint64_t monomial_degree,
bool init_accumulator = false) {
// monomial_degree \in [0, 2 * params::degree)
int full_cycles_count = monomial_degree / params::degree;
int remainder_degrees = monomial_degree % params::degree;
// This method expects to work with polynomial_size / compression_params::opt
// threads in the x-block. If init_accumulator is set, assumes that result was
// not initialized and does that with the outcome of first * second
template <typename T>
__device__ void polynomial_accumulate_monic_monomial_mul(
T *result, const T *__restrict__ poly, uint64_t monomial_degree,
uint32_t tid, uint32_t polynomial_size, int coeff_per_thread,
bool init_accumulator = false) {
// monomial_degree \in [0, 2 * compression_params::degree)
int full_cycles_count = monomial_degree / polynomial_size;
int remainder_degrees = monomial_degree % polynomial_size;
int pos = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
int pos = tid;
for (int i = 0; i < coeff_per_thread; i++) {
T element = poly[pos];
int new_pos = (pos + monomial_degree) % params::degree;
int new_pos = (pos + monomial_degree) % polynomial_size;
T x = SEL(element, -element, full_cycles_count % 2); // monomial coefficient
x = SEL(-x, x, new_pos >= remainder_degrees);
@@ -78,7 +79,32 @@ polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly,
result[new_pos] = x;
else
result[new_pos] += x;
pos += params::degree / params::opt;
pos += polynomial_size / coeff_per_thread;
}
}
template <typename T, class params>
__device__ void polynomial_product_accumulate_by_monomial_nosync(
T *result, const T *__restrict__ poly, uint32_t monomial_degree) {
// monomial_degree \in [0, 2 * params::degree)
int full_cycles_count = monomial_degree / params::degree;
int remainder_degrees = monomial_degree % params::degree;
// Every thread has a fixed position to track instead of "chasing" the
// position
#pragma unroll
for (int i = 0; i < params::opt; i++) {
int pos =
(threadIdx.x + i * (params::degree / params::opt) - monomial_degree) &
(params::degree - 1);
T element = poly[pos];
T x = SEL(element, -element, full_cycles_count % 2);
x = SEL(-x, x,
threadIdx.x + i * (params::degree / params::opt) >=
remainder_degrees);
result[i] += x;
}
}
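
Both variants above implement multiplication by a monic monomial X^degree in the negacyclic ring Z[X]/(X^N + 1): a coefficient shifted past position N reappears at the start with its sign flipped, and the full_cycles_count / remainder_degrees tests are a thread-parallel way of deciding that sign. A single-threaded Rust reference of the same semantics (a sketch for intuition, not the kernel code):

fn negacyclic_monomial_mul(poly: &[i64], monomial_degree: usize) -> Vec<i64> {
    let n = poly.len();
    let d = monomial_degree % (2 * n); // the degree lives in [0, 2N)
    let mut out = vec![0i64; n];
    for (pos, &coeff) in poly.iter().enumerate() {
        let shifted = pos + d;
        // every full wrap around N positions negates the coefficient
        let sign = if (shifted / n) % 2 == 1 { -1 } else { 1 };
        out[shifted % n] = sign * coeff;
    }
    out
}
// e.g. (1 + 2X) * X over Z[X]/(X^2 + 1) is X + 2X^2 = -2 + X:
// negacyclic_monomial_mul(&[1, 2], 1) == [-2, 1]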

View File

@@ -6,7 +6,7 @@
std::mutex m;
bool p2p_enabled = false;
int cuda_setup_multi_gpu() {
int32_t cuda_setup_multi_gpu() {
int num_gpus = cuda_get_number_of_gpus();
if (num_gpus == 0)
PANIC("GPU error: the number of GPUs should be > 0.")
@@ -32,7 +32,7 @@ int cuda_setup_multi_gpu() {
}
m.unlock();
}
return num_used_gpus;
return (int32_t)(num_used_gpus);
}
int get_active_gpu_count(int num_inputs, int gpu_count) {

View File

@@ -176,22 +176,22 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
}
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_level, grouping_factor,
input_lwe_ciphertext_count, true);
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_level,
grouping_factor, input_lwe_ciphertext_count, true);
for (auto _ : st) {
// Execute PBS
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, d_lwe_ct_out_array, d_lwe_output_indexes, d_lut_pbs_identity,
d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk,
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
pbs_level, input_lwe_ciphertext_count);
cuda_synchronize_stream(stream);
stream, gpu_index, d_lwe_ct_out_array, d_lwe_output_indexes,
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
d_lwe_input_indexes, d_bsk, (pbs_buffer<uint64_t, MULTI_BIT> *)buffer,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
pbs_base_log, pbs_level, input_lwe_ciphertext_count);
cuda_synchronize_stream(stream, gpu_index);
}
cleanup_cuda_multi_bit_programmable_bootstrap(stream, &buffer);
cleanup_cuda_multi_bit_programmable_bootstrap(stream, gpu_index, &buffer);
}
#endif
@@ -255,23 +255,24 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
}
scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
stream, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer, glwe_dimension,
polynomial_size, pbs_level, input_lwe_ciphertext_count, true);
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer,
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
true);
for (auto _ : st) {
// Execute PBS
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
stream, (uint64_t *)d_lwe_ct_out_array,
stream, gpu_index, (uint64_t *)d_lwe_ct_out_array,
(uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity,
(uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array,
(uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
input_lwe_ciphertext_count);
cuda_synchronize_stream(stream);
cuda_synchronize_stream(stream, gpu_index);
}
cleanup_cuda_programmable_bootstrap(stream, &buffer);
cleanup_cuda_programmable_bootstrap(stream, gpu_index, &buffer);
}
#endif

File diff suppressed because it is too large

View File

@@ -20,7 +20,7 @@ def main(args):
bench_function_id = bench_data["function_id"]
split = bench_function_id.split("::")
if split.len() == 5: # Signed integers
if len(split) == 5: # Signed integers
(_, _, function_name, parameter_set, bits) = split
else: # Unsigned integers
(_, function_name, parameter_set, bits) = split
@@ -53,7 +53,8 @@ def main(args):
estimate_upper_bound_ms,
)
)
except:
except Exception as e:
print(e)
pass
if len(data) == 0:

View File

@@ -58,7 +58,7 @@ flavor_name = "n3-A100x8-NVLink"
[backend.hyperstack.multi-gpu-test]
environment_name = "canada"
image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"
flavor_name = "n3-A100x4"
flavor_name = "n3-RTX-A6000x4"
[command.signed_integer_full_bench]
workflow = "signed_integer_full_benchmark.yml"

View File

@@ -133,8 +133,8 @@ if [[ "${backend}" == "gpu" ]]; then
test_threads=8
doctest_threads=8
else
test_threads=3
doctest_threads=3
test_threads=1
doctest_threads=1
fi
fi

View File

@@ -1,6 +1,6 @@
[package]
name = "tfhe-zk-pok"
version = "0.3.0-alpha.0"
version = "0.3.0-alpha.1"
edition = "2021"
keywords = ["zero", "knowledge", "proof", "vector-commitments"]
homepage = "https://zama.ai/"
@@ -15,13 +15,17 @@ description = "tfhe-zk-pok: An implementation of zero-knowledge proofs of encryp
ark-bls12-381 = { package = "tfhe-ark-bls12-381", version = "0.4.0" }
ark-ec = { package = "tfhe-ark-ec", version = "0.4.2", features = ["parallel"] }
ark-ff = { package = "tfhe-ark-ff", version = "0.4.3", features = ["parallel"] }
ark-poly = { package = "tfhe-ark-poly", version = "0.4.2", features = ["parallel"] }
ark-poly = { package = "tfhe-ark-poly", version = "0.4.2", features = [
"parallel",
] }
ark-serialize = { version = "0.4.2" }
rand = "0.8.5"
rayon = "1.8.0"
sha3 = "0.10.8"
serde = { version = "~1.0", features = ["derive"] }
zeroize = "1.7.0"
num-bigint = "0.4.5"
[dev-dependencies]
serde_json = "~1.0"
itertools = "0.11.0"

View File

@@ -210,9 +210,14 @@ impl CurveGroupOps<bls12_381::Zp> for bls12_381::G1 {
}
fn mul_scalar(self, scalar: bls12_381::Zp) -> Self {
self.mul_scalar(scalar)
if scalar.inner == MontFp!("2") {
self.double()
} else {
self.mul_scalar(scalar)
}
}
#[track_caller]
fn multi_mul_scalar(bases: &[Self::Affine], scalars: &[bls12_381::Zp]) -> Self {
Self::Affine::multi_mul_scalar(bases, scalars)
}
@@ -245,9 +250,14 @@ impl CurveGroupOps<bls12_381::Zp> for bls12_381::G2 {
}
fn mul_scalar(self, scalar: bls12_381::Zp) -> Self {
self.mul_scalar(scalar)
if scalar.inner == MontFp!("2") {
self.double()
} else {
self.mul_scalar(scalar)
}
}
#[track_caller]
fn multi_mul_scalar(bases: &[Self::Affine], scalars: &[bls12_381::Zp]) -> Self {
Self::Affine::multi_mul_scalar(bases, scalars)
}
@@ -273,6 +283,9 @@ impl PairingGroupOps<bls12_381::Zp, bls12_381::G1, bls12_381::G2> for bls12_381:
}
fn pairing(x: bls12_381::G1, y: bls12_381::G2) -> Self {
if x == bls12_381::G1::ZERO || y == bls12_381::G2::ZERO {
return Self::pairing(bls12_381::G1::ZERO, bls12_381::G2::GENERATOR);
}
Self::pairing(x, y)
}
}
@@ -329,12 +342,21 @@ impl CurveGroupOps<bls12_446::Zp> for bls12_446::G1 {
}
fn mul_scalar(self, scalar: bls12_446::Zp) -> Self {
self.mul_scalar(scalar)
if scalar.inner == MontFp!("2") {
self.double()
} else {
self.mul_scalar(scalar)
}
}
#[track_caller]
fn multi_mul_scalar(bases: &[Self::Affine], scalars: &[bls12_446::Zp]) -> Self {
msm::msm_wnaf_g1_446(bases, scalars)
// Self::Affine::multi_mul_scalar(bases, scalars)
// overhead seems to not be worth it outside of wasm
if cfg!(target_family = "wasm") {
msm::msm_wnaf_g1_446(bases, scalars)
} else {
Self::Affine::multi_mul_scalar(bases, scalars)
}
}
fn to_bytes(self) -> impl AsRef<[u8]> {
@@ -365,9 +387,14 @@ impl CurveGroupOps<bls12_446::Zp> for bls12_446::G2 {
}
fn mul_scalar(self, scalar: bls12_446::Zp) -> Self {
self.mul_scalar(scalar)
if scalar.inner == MontFp!("2") {
self.double()
} else {
self.mul_scalar(scalar)
}
}
#[track_caller]
fn multi_mul_scalar(bases: &[Self::Affine], scalars: &[bls12_446::Zp]) -> Self {
Self::Affine::multi_mul_scalar(bases, scalars)
}
@@ -393,13 +420,16 @@ impl PairingGroupOps<bls12_446::Zp, bls12_446::G1, bls12_446::G2> for bls12_446:
}
fn pairing(x: bls12_446::G1, y: bls12_446::G2) -> Self {
if x == bls12_446::G1::ZERO || y == bls12_446::G2::ZERO {
return Self::pairing(bls12_446::G1::ZERO, bls12_446::G2::GENERATOR);
}
Self::pairing(x, y)
}
}
#[derive(Copy, Clone, serde::Serialize, serde::Deserialize)]
#[derive(Debug, Copy, Clone, serde::Serialize, serde::Deserialize)]
pub struct Bls12_381;
#[derive(Copy, Clone, serde::Serialize, serde::Deserialize)]
#[derive(Debug, Copy, Clone, serde::Serialize, serde::Deserialize)]
pub struct Bls12_446;
impl Curve for Bls12_381 {
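
The recurring mul_scalar change in this file is a small dispatch: a scalar equal to 2 is recognized and routed to point doubling, which costs one group operation instead of a full double-and-add ladder. A generic sketch of the pattern (the trait and method names are illustrative stand-ins, not the crate's types):

trait GroupElement: Copy {
    fn double(self) -> Self;
    fn generic_mul_scalar(self, k: u64) -> Self;
}

fn mul_scalar<G: GroupElement>(point: G, scalar: u64) -> G {
    if scalar == 2 {
        point.double() // one doubling instead of a scalar-multiplication ladder
    } else {
        point.generic_mul_scalar(scalar)
    }
}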

View File

@@ -55,6 +55,7 @@ mod g1 {
}
impl G1Affine {
#[track_caller]
pub fn multi_mul_scalar(bases: &[Self], scalars: &[Zp]) -> G1 {
// SAFETY: interpreting a `repr(transparent)` pointer as its contents.
G1 {
@@ -124,6 +125,7 @@ mod g1 {
}
}
#[track_caller]
pub fn multi_mul_scalar(bases: &[Self], scalars: &[Zp]) -> Self {
use rayon::prelude::*;
let bases = bases
@@ -230,6 +232,7 @@ mod g2 {
}
impl G2Affine {
#[track_caller]
pub fn multi_mul_scalar(bases: &[Self], scalars: &[Zp]) -> G2 {
// SAFETY: interpreting a `repr(transparent)` pointer as its contents.
G2 {
@@ -247,10 +250,10 @@ mod g2 {
// functions. we cache it since it requires a Zp division
// https://hackmd.io/@tazAymRSQCGXTUKkbh1BAg/Sk27liTW9#Math-Formula-for-Point-Addition
pub(crate) fn compute_m(self, other: G2Affine) -> Option<crate::curve_446::Fq2> {
let zero = crate::curve_446::Fq2::ZERO;
// in the context of elliptic curves, the point at infinity is the zero element of the
// group
let zero = crate::curve_446::Fq2::ZERO;
if self.inner.infinity || other.inner.infinity {
return None;
}

View File

@@ -1,6 +1,6 @@
use ark_ec::short_weierstrass::Affine;
use ark_ec::AffineRepr;
use ark_ff::{AdditiveGroup, BigInt, BigInteger, Field, Fp, PrimeField};
use ark_ff::{AdditiveGroup, BigInteger, Field, Fp, PrimeField};
use rayon::prelude::*;
fn make_digits(a: &impl BigInteger, w: usize, num_bits: usize) -> impl Iterator<Item = i64> + '_ {
@@ -46,6 +46,7 @@ fn make_digits(a: &impl BigInteger, w: usize, num_bits: usize) -> impl Iterator<
}
// Compute msm using windowed non-adjacent form
#[track_caller]
pub fn msm_wnaf_g1_446(
bases: &[super::bls12_446::G1Affine],
scalars: &[super::bls12_446::Zp],
@@ -236,207 +237,3 @@ pub fn msm_wnaf_g1_446(
total
})
}
// Compute msm using windowed non-adjacent form
pub fn msm_wnaf_g1_446_extended(
bases: &[super::bls12_446::G1Affine],
scalars: &[super::bls12_446::Zp],
) -> super::bls12_446::G1 {
use super::bls12_446::*;
type BaseField = Fp<ark_ff::MontBackend<crate::curve_446::FqConfig, 7>, 7>;
// let num_bits = 75usize;
// let mask = BigInt([!0, (1 << 11) - 1, 0, 0, 0]);
// let scalars = &*scalars
// .par_iter()
// .map(|x| x.inner.into_bigint())
// .flat_map_iter(|x| (0..4).map(move |i| (x >> (75 * i)) & mask))
// .collect::<Vec<_>>();
let num_bits = 150usize;
let mask = BigInt([!0, !0, (1 << 22) - 1, 0, 0]);
let scalars = &*scalars
.par_iter()
.map(|x| x.inner.into_bigint())
.flat_map_iter(|x| (0..2).map(move |i| (x >> (150 * i)) & mask))
.collect::<Vec<_>>();
assert_eq!(bases.len(), scalars.len());
let size = bases.len();
let c = if size < 32 {
3
} else {
// natural log approx
(size.ilog2() as usize * 69 / 100) + 2
};
let c = c - 3;
let digits_count = (num_bits + c - 1) / c;
let scalar_digits = scalars
.into_par_iter()
.flat_map_iter(|s| make_digits(s, c, num_bits))
.collect::<Vec<_>>();
let zero = G1Affine {
inner: Affine::zero(),
};
let window_sums: Vec<_> = (0..digits_count)
.into_par_iter()
.map(|i| {
let n = 1 << c;
let mut indices = vec![vec![]; n];
let mut d = vec![BaseField::ZERO; n + 1];
let mut e = vec![BaseField::ZERO; n + 1];
for (idx, digits) in scalar_digits.chunks(digits_count).enumerate() {
use core::cmp::Ordering;
// digits is the digits thing of the first scalar?
let scalar = digits[i];
match 0.cmp(&scalar) {
Ordering::Less => indices[(scalar - 1) as usize].push(idx),
Ordering::Greater => indices[(-scalar - 1) as usize].push(!idx),
Ordering::Equal => (),
}
}
let mut buckets = vec![zero; 1 << c];
loop {
d[0] = BaseField::ONE;
for (k, (bucket, idx)) in core::iter::zip(&mut buckets, &mut indices).enumerate() {
if let Some(idx) = idx.last().copied() {
let value = if idx >> (usize::BITS - 1) == 1 {
let mut val = bases[!idx];
val.inner.y = -val.inner.y;
val
} else {
bases[idx]
};
if !bucket.inner.infinity {
let a = value.inner.x - bucket.inner.x;
if a != BaseField::ZERO {
d[k + 1] = d[k] * a;
} else if value.inner.y == bucket.inner.y {
d[k + 1] = d[k] * value.inner.y.double();
} else {
d[k + 1] = d[k];
}
continue;
}
}
d[k + 1] = d[k];
}
e[n] = d[n].inverse().unwrap();
for (k, (bucket, idx)) in core::iter::zip(&mut buckets, &mut indices)
.enumerate()
.rev()
{
if let Some(idx) = idx.last().copied() {
let value = if idx >> (usize::BITS - 1) == 1 {
let mut val = bases[!idx];
val.inner.y = -val.inner.y;
val
} else {
bases[idx]
};
if !bucket.inner.infinity {
let a = value.inner.x - bucket.inner.x;
if a != BaseField::ZERO {
e[k] = e[k + 1] * a;
} else if value.inner.y == bucket.inner.y {
e[k] = e[k + 1] * value.inner.y.double();
} else {
e[k] = e[k + 1];
}
continue;
}
}
e[k] = e[k + 1];
}
let d = &d[..n];
let e = &e[1..];
let mut empty = true;
for ((&d, &e), (bucket, idx)) in core::iter::zip(
core::iter::zip(d, e),
core::iter::zip(&mut buckets, &mut indices),
) {
empty &= idx.len() <= 1;
if let Some(idx) = idx.pop() {
let value = if idx >> (usize::BITS - 1) == 1 {
let mut val = bases[!idx];
val.inner.y = -val.inner.y;
val
} else {
bases[idx]
};
if !bucket.inner.infinity {
let x1 = bucket.inner.x;
let x2 = value.inner.x;
let y1 = bucket.inner.y;
let y2 = value.inner.y;
let eq_x = x1 == x2;
if eq_x && y1 != y2 {
bucket.inner.infinity = true;
} else {
let r = d * e;
let m = if eq_x {
let x1 = x1.square();
x1 + x1.double()
} else {
y2 - y1
};
let m = m * r;
let x3 = m.square() - x1 - x2;
let y3 = m * (x1 - x3) - y1;
bucket.inner.x = x3;
bucket.inner.y = y3;
}
} else {
*bucket = value;
}
}
}
if empty {
break;
}
}
let mut running_sum = G1::ZERO;
let mut res = G1::ZERO;
buckets.into_iter().rev().for_each(|b| {
running_sum.inner += b.inner;
res += running_sum;
});
res
})
.collect();
// We store the sum for the lowest window.
let lowest = *window_sums.first().unwrap();
// We're traversing windows from high to low.
lowest
+ window_sums[1..]
.iter()
.rev()
.fold(G1::ZERO, |mut total, &sum_i| {
total += sum_i;
for _ in 0..c {
total = total.double();
}
total
})
}

View File

@@ -0,0 +1,308 @@
use ark_ff::biginteger::arithmetic::widening_mul;
use rand::prelude::*;
pub fn sqr<T: Copy + core::ops::Mul>(x: T) -> T::Output {
x * x
}
// copied from the standard library
// since isqrt is unstable at the moment
pub fn isqrt(this: u128) -> u128 {
if this < 2 {
return this;
}
// The algorithm is based on the one presented in
// <https://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Binary_numeral_system_(base_2)>
// which cites as source the following C code:
// <https://web.archive.org/web/20120306040058/http://medialab.freaknet.org/martin/src/sqrt/sqrt.c>.
let mut op = this;
let mut res = 0;
let mut one = 1 << (this.ilog2() & !1);
while one != 0 {
if op >= res + one {
op -= res + one;
res = (res >> 1) + one;
} else {
res >>= 1;
}
one >>= 2;
}
res
}
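
A few sanity checks for the floor square root above (an illustrative test, not part of the module):

#[test]
fn isqrt_sanity() {
    assert_eq!(isqrt(0), 0);
    assert_eq!(isqrt(15), 3);
    assert_eq!(isqrt(16), 4);
    // floor(sqrt(2^64 - 1)) = 2^32 - 1
    assert_eq!(isqrt(u128::from(u64::MAX)), (1u128 << 32) - 1);
}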
fn half_gcd(p: u128, s: u128) -> u128 {
let sq_p = isqrt(p as _);
let mut a = p;
let mut b = s;
while b > sq_p {
let r = a % b;
a = b;
b = r;
}
b
}
fn modular_inv_2_64(p: u64) -> u64 {
assert_eq!(p % 2, 1);
let mut old_r = p as u128;
let mut r = 1u128 << 64;
let mut old_s = 1u64;
let mut s = 0u64;
while r != 0 {
let q = old_r / r;
(old_r, r) = (r, old_r - q * r);
let q = q as u64;
(old_s, s) = (s, old_s.wrapping_sub(q.wrapping_mul(s)));
}
assert_eq!(u64::wrapping_mul(old_s, p), 1);
old_s
}
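
modular_inv_2_64 runs the extended Euclidean algorithm against 2^64 and checks its own output with the final assert. One concrete value for intuition (an illustrative test, not part of the module):

#[test]
fn inverse_of_three_mod_2_64() {
    // 3 * 0xAAAA_AAAA_AAAA_AAAB = 2^65 + 1, which is 1 modulo 2^64
    assert_eq!(modular_inv_2_64(3), 0xAAAA_AAAA_AAAA_AAAB);
    assert_eq!(3u64.wrapping_mul(0xAAAA_AAAA_AAAA_AAAB), 1);
}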
#[derive(Copy, Clone, Debug)]
struct Montgomery {
p: u128,
r2: u128,
p_prime: u64,
}
impl Montgomery {
fn new(p: u128) -> Self {
assert_ne!(p, 0);
assert_eq!(p % 2, 1);
// r = 2^128
// we want to compute r^2 mod p
let r = p.wrapping_neg() % p;
let r = num_bigint::BigUint::from(r);
let r2 = &r * &r;
let r2 = r2 % p;
let r2_digits = &*r2.to_u64_digits();
let r2 = match *r2_digits {
[] => 0u128,
[a] => a as u128,
[a, b] => a as u128 | ((b as u128) << 64),
_ => unreachable!("value modulo 128 bit integer should have at most two u64 digits"),
};
let p_prime = modular_inv_2_64(p as u64).wrapping_neg();
Self { p, r2, p_prime }
}
fn redc(self, lo: u128, hi: u128) -> u128 {
let p0 = self.p as u64;
let p1 = (self.p >> 64) as u64;
let t0 = lo as u64;
let mut t1 = (lo >> 64) as u64;
let mut t2 = hi as u64;
let mut t3 = (hi >> 64) as u64;
let mut t4 = 0u64;
{
let m = u64::wrapping_mul(t0, self.p_prime);
let mut c = 0u64;
let x = c as u128 + t0 as u128 + widening_mul(m, p0);
// t0 = x as u64;
c = (x >> 64) as u64;
let x = c as u128 + t1 as u128 + widening_mul(m, p1);
t1 = x as u64;
c = (x >> 64) as u64;
let x = c as u128 + t2 as u128;
t2 = x as u64;
c = (x >> 64) as u64;
let x = c as u128 + t3 as u128;
t3 = x as u64;
c = (x >> 64) as u64;
t4 += c;
}
{
let m = u64::wrapping_mul(t1, self.p_prime);
let mut c = 0u64;
let x = c as u128 + t1 as u128 + widening_mul(m, p0);
// t1 = x as u64;
c = (x >> 64) as u64;
let x = c as u128 + t2 as u128 + widening_mul(m, p1);
t2 = x as u64;
c = (x >> 64) as u64;
let x = c as u128 + t3 as u128;
t3 = x as u64;
c = (x >> 64) as u64;
t4 += c;
}
let mut s0 = t2;
let mut s1 = t3;
let s2 = t4;
if !(s2 == 0 && (s1, s0) < (p1, p0)) {
let borrow;
(s0, borrow) = u64::overflowing_sub(s0, p0);
s1 = s1.wrapping_sub(p1).wrapping_sub(borrow as u64);
}
s0 as u128 | ((s1 as u128) << 64)
}
fn mont_from_natural(self, x: u128) -> u128 {
self.mul(x, self.r2)
}
fn natural_from_mont(self, x: u128) -> u128 {
self.redc(x, 0)
}
fn mul(self, x: u128, y: u128) -> u128 {
let x0 = x as u64;
let x1 = (x >> 64) as u64;
let y0 = y as u64;
let y1 = (y >> 64) as u64;
let lolo = widening_mul(x0, y0);
let lohi = widening_mul(x0, y1);
let hilo = widening_mul(x1, y0);
let hihi = widening_mul(x1, y1);
let lo = lolo;
let (lo, o0) = u128::overflowing_add(lo, lohi << 64);
let (lo, o1) = u128::overflowing_add(lo, hilo << 64);
let hi = hihi + (lohi >> 64) + (hilo >> 64) + (o0 as u128 + o1 as u128);
self.redc(lo, hi)
}
fn exp(self, x: u128, n: u128) -> u128 {
if n == 0 {
return 1;
}
let mut y = self.mont_from_natural(1);
let mut x = x;
let mut n = n;
while n > 1 {
if n % 2 == 1 {
y = self.mul(x, y);
}
x = self.mul(x, x);
n /= 2;
}
self.mul(x, y)
}
}
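
Putting the pieces above together: values enter Montgomery form once, are multiplied or exponentiated there, and are converted back at the end, so each operation pays only a redc-style reduction instead of a full division. A hedged usage sketch against the methods shown above:

#[test]
fn montgomery_roundtrip() {
    let p: u128 = 1_000_000_007; // any odd modulus satisfies the constructor's asserts
    let mont = Montgomery::new(p);
    let (a, b) = (123_456_789u128, 987_654_321u128);
    let (am, bm) = (mont.mont_from_natural(a), mont.mont_from_natural(b));
    // multiply in Montgomery form, then convert back to a natural residue
    assert_eq!(mont.natural_from_mont(mont.mul(am, bm)), a * b % p);
    // exp takes a Montgomery-form base and a natural exponent
    assert_eq!(mont.natural_from_mont(mont.exp(am, 3)), a * a % p * a % p);
}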
pub fn four_squares(v: u128) -> [u64; 4] {
let rng = &mut StdRng::seed_from_u64(0);
let f = v % 4;
if f == 2 {
let b = isqrt(v as _) as u64;
'main_loop: loop {
let x = 2 + rng.gen::<u64>() % (b - 2);
let y = 2 + rng.gen::<u64>() % (b - 2);
let (sum, o) = u128::overflowing_add(sqr(x as u128), sqr(y as u128));
if o || sum > v {
continue 'main_loop;
}
let p = v - sum;
if p == 0 || p == 1 {
return [0, p as u64, x, y];
}
if p % 4 != 1 {
continue 'main_loop;
}
let mut d = p - 1;
let mut s = 0u32;
while d % 2 == 0 {
d /= 2;
s += 1;
}
let d = d;
let s = s;
let mont = Montgomery::new(p);
let a = 2 + (rng.gen::<u128>() % (p - 3));
let mut sqrt = 0;
{
let a = mont.mont_from_natural(a);
let one = mont.mont_from_natural(1);
let neg_one = p - one;
let mut x = mont.exp(a, d);
let mut y = 0;
for _ in 0..s {
y = mont.mul(x, x);
if y == one && x != one && x != neg_one {
continue 'main_loop;
}
if y == neg_one {
sqrt = x;
}
x = y;
}
if y != one {
continue 'main_loop;
}
}
if sqrt == 0 {
continue 'main_loop;
}
let i = mont.natural_from_mont(sqrt);
let i = if i <= p / 2 { p - i } else { i };
let z = half_gcd(p, i) as u64;
let w = isqrt(p - sqr(z as u128)) as u64;
if p != sqr(z as u128) + sqr(w as u128) {
continue 'main_loop;
}
return [x, y, z, w];
}
} else if f == 0 {
four_squares(v / 4).map(|x| x + x)
} else {
let mut r = four_squares(2 * v);
r.sort_by_key(|&x| {
if x % 2 == 0 {
-1 - ((x / 2) as i64)
} else {
(x / 2) as i64
}
});
[
(r[0] + r[1]) / 2,
(r[0] - r[1]) / 2,
(r[3] + r[2]) / 2,
(r[3] - r[2]) / 2,
]
}
}
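
By Lagrange's four-square theorem every natural number is a sum of four squares; four_squares searches for one decomposition with a randomized, Rabin-Shallit-style procedure, the three branches handling v mod 4. A usage sketch (an illustrative test, not part of the module):

#[test]
fn four_squares_roundtrip() {
    let v: u128 = 123_456_789_123_456_789;
    let [x, y, z, w] = four_squares(v);
    let sum: u128 = [x, y, z, w].iter().map(|&t| u128::from(t) * u128::from(t)).sum();
    assert_eq!(sum, v);
}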

View File

@@ -3,3 +3,5 @@ pub use ark_serialize::{CanonicalDeserialize, CanonicalSerialize, Compress, Vali
pub mod curve_446;
pub mod curve_api;
pub mod proofs;
mod four_squares;

View File

@@ -141,5 +141,6 @@ pub const HASH_METADATA_LEN_BYTES: usize = 256;
pub mod binary;
pub mod index;
pub mod pke;
pub mod pke_v2;
pub mod range;
pub mod rlwe;

View File

@@ -101,7 +101,7 @@ impl<G: Curve> PublicCommit<G> {
b,
c1,
c2,
__marker: Default::default(),
__marker: PhantomData,
}
}
}
@@ -194,6 +194,7 @@ pub fn commit<G: Curve>(
pub fn prove<G: Curve>(
public: (&PublicParams<G>, &PublicCommit<G>),
private_commit: &PrivateCommit<G>,
metadata: &[u8],
load: ComputeLoad,
rng: &mut dyn RngCore,
) -> Proof<G> {
@@ -347,7 +348,10 @@ pub fn prove<G: Curve>(
.collect::<Box<_>>();
let mut y = vec![G::Zp::ZERO; n];
G::Zp::hash(&mut y, &[hash, x_bytes, c_hat.to_bytes().as_ref()]);
G::Zp::hash(
&mut y,
&[hash, metadata, x_bytes, c_hat.to_bytes().as_ref()],
);
let y = OneBased(y);
let scalars = (n + 1 - big_d..n + 1)
@@ -360,6 +364,7 @@ pub fn prove<G: Curve>(
&mut theta,
&[
hash_lmap,
metadata,
x_bytes,
c_hat.to_bytes().as_ref(),
c_y.to_bytes().as_ref(),
@@ -379,6 +384,7 @@ pub fn prove<G: Curve>(
&mut t,
&[
hash_t,
metadata,
&(1..n + 1)
.flat_map(|i| y[i].to_bytes().as_ref().to_vec())
.collect::<Box<_>>(),
@@ -394,6 +400,7 @@ pub fn prove<G: Curve>(
&mut delta,
&[
hash_agg,
metadata,
x_bytes,
c_hat.to_bytes().as_ref(),
c_y.to_bytes().as_ref(),
@@ -472,6 +479,7 @@ pub fn prove<G: Curve>(
core::array::from_mut(&mut z),
&[
hash_z,
metadata,
x_bytes,
c_hat.to_bytes().as_ref(),
c_y.to_bytes().as_ref(),
@@ -512,6 +520,7 @@ pub fn prove<G: Curve>(
core::array::from_mut(&mut w),
&[
hash_w,
metadata,
x_bytes,
c_hat.to_bytes().as_ref(),
c_y.to_bytes().as_ref(),
@@ -698,6 +707,7 @@ fn compute_a_theta<G: Curve>(
pub fn verify<G: Curve>(
proof: &Proof<G>,
public: (&PublicParams<G>, &PublicCommit<G>),
metadata: &[u8],
) -> Result<(), ()> {
let &Proof {
c_hat,
@@ -760,7 +770,10 @@ pub fn verify<G: Curve>(
.collect::<Box<_>>();
let mut y = vec![G::Zp::ZERO; n];
G::Zp::hash(&mut y, &[hash, x_bytes, c_hat.to_bytes().as_ref()]);
G::Zp::hash(
&mut y,
&[hash, metadata, x_bytes, c_hat.to_bytes().as_ref()],
);
let y = OneBased(y);
let mut theta = vec![G::Zp::ZERO; d + k + 1];
@@ -768,6 +781,7 @@ pub fn verify<G: Curve>(
&mut theta,
&[
hash_lmap,
metadata,
x_bytes,
c_hat.to_bytes().as_ref(),
c_y.to_bytes().as_ref(),
@@ -792,6 +806,7 @@ pub fn verify<G: Curve>(
&mut t,
&[
hash_t,
metadata,
&(1..n + 1)
.flat_map(|i| y[i].to_bytes().as_ref().to_vec())
.collect::<Box<_>>(),
@@ -807,6 +822,7 @@ pub fn verify<G: Curve>(
&mut delta,
&[
hash_agg,
metadata,
x_bytes,
c_hat.to_bytes().as_ref(),
c_y.to_bytes().as_ref(),
@@ -821,6 +837,7 @@ pub fn verify<G: Curve>(
core::array::from_mut(&mut z),
&[
hash_z,
metadata,
x_bytes,
c_hat.to_bytes().as_ref(),
c_y.to_bytes().as_ref(),
@@ -873,6 +890,7 @@ pub fn verify<G: Curve>(
core::array::from_mut(&mut w),
&[
hash_w,
metadata,
x_bytes,
c_hat.to_bytes().as_ref(),
c_y.to_bytes().as_ref(),
@@ -1053,6 +1071,15 @@ mod tests {
.wrapping_add((delta * m[i] as u64) as i64);
}
// One of our use cases uses 320 bits of additional metadata
const METADATA_LEN: usize = (320 / u8::BITS) as usize;
let mut metadata = [0u8; METADATA_LEN];
metadata.fill_with(|| rng.gen::<u8>());
let mut fake_metadata = [255u8; METADATA_LEN];
fake_metadata.fill_with(|| rng.gen::<u8>());
let mut m_roundtrip = vec![0i64; k];
for i in 0..k {
let mut dot = 0i128;
@@ -1093,60 +1120,77 @@ mod tests {
let public_param_that_was_not_compressed =
serialize_then_deserialize(&original_public_param, Compress::Yes).unwrap();
for public_param in [
original_public_param,
public_param_that_was_compressed,
public_param_that_was_not_compressed,
] {
for use_fake_e1 in [false, true] {
for use_fake_e2 in [false, true] {
for use_fake_m in [false, true] {
for use_fake_r in [false, true] {
let (public_commit, private_commit) = commit(
a.clone(),
b.clone(),
c1.clone(),
c2.clone(),
if use_fake_r {
fake_r.clone()
} else {
r.clone()
},
if use_fake_e1 {
fake_e1.clone()
} else {
e1.clone()
},
if use_fake_m {
fake_m.clone()
} else {
m.clone()
},
if use_fake_e2 {
fake_e2.clone()
} else {
e2.clone()
},
&public_param,
rng,
);
for (
public_param,
use_fake_e1,
use_fake_e2,
use_fake_m,
use_fake_r,
use_fake_metadata_verify,
) in itertools::iproduct!(
[
original_public_param,
public_param_that_was_compressed,
public_param_that_was_not_compressed,
],
[false, true],
[false, true],
[false, true],
[false, true],
[false, true]
) {
let (public_commit, private_commit) = commit(
a.clone(),
b.clone(),
c1.clone(),
c2.clone(),
if use_fake_r {
fake_r.clone()
} else {
r.clone()
},
if use_fake_e1 {
fake_e1.clone()
} else {
e1.clone()
},
if use_fake_m {
fake_m.clone()
} else {
m.clone()
},
if use_fake_e2 {
fake_e2.clone()
} else {
e2.clone()
},
&public_param,
rng,
);
for load in [ComputeLoad::Proof, ComputeLoad::Verify] {
let proof = prove(
(&public_param, &public_commit),
&private_commit,
load,
rng,
);
for load in [ComputeLoad::Proof, ComputeLoad::Verify] {
let proof = prove(
(&public_param, &public_commit),
&private_commit,
&metadata,
load,
rng,
);
assert_eq!(
verify(&proof, (&public_param, &public_commit)).is_err(),
use_fake_e1 || use_fake_e2 || use_fake_r || use_fake_m
);
}
}
}
}
let verify_metadata = if use_fake_metadata_verify {
&fake_metadata
} else {
&metadata
};
assert_eq!(
verify(&proof, (&public_param, &public_commit), verify_metadata).is_err(),
use_fake_e1
|| use_fake_e2
|| use_fake_r
|| use_fake_m
|| use_fake_metadata_verify
);
}
}
}
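
The net effect of the metadata plumbing above: each challenge-derivation hash in prove and verify now absorbs the caller's metadata bytes, so a proof only verifies against the exact byte slice it was produced with. A hedged roundtrip sketch using the signatures from this diff:

fn metadata_roundtrip<G: Curve>(
    public: (&PublicParams<G>, &PublicCommit<G>),
    private: &PrivateCommit<G>,
    rng: &mut dyn RngCore,
) {
    let metadata = b"320 bits of ciphertext context";
    let proof = prove(public, private, metadata, ComputeLoad::Proof, rng);
    // same metadata: accepted
    assert!(verify(&proof, public, metadata).is_ok());
    // any other metadata: rejected, because the derived challenges differ
    assert!(verify(&proof, public, b"some other context").is_err());
}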

File diff suppressed because it is too large

View File

@@ -1,6 +1,6 @@
[package]
name = "tfhe"
version = "0.8.0-alpha.2"
version = "0.8.0-alpha.5"
edition = "2021"
readme = "../README.md"
keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
@@ -62,12 +62,12 @@ lazy_static = { version = "1.4.0", optional = true }
serde = { version = "1.0", features = ["derive"] }
rayon = { version = "1.5.0" }
bincode = "1.3.3"
concrete-fft = { version = "0.4.1", features = ["serde", "fft128"] }
concrete-ntt = { version = "0.1.2" }
pulp = "0.18.8"
concrete-fft = { version = "0.5.1", features = ["serde", "fft128"] }
concrete-ntt = { version = "0.2.0" }
pulp = "0.18.22"
tfhe-cuda-backend = { version = "0.4.0-alpha.0", path = "../backends/tfhe-cuda-backend", optional = true }
aligned-vec = { version = "0.5", features = ["serde"] }
dyn-stack = { version = "0.9" }
dyn-stack = { version = "0.10" }
paste = "1.0.7"
fs2 = { version = "0.4.3", optional = true }
# Used for OPRF in shortint
@@ -75,8 +75,8 @@ sha3 = { version = "0.10", optional = true }
# While we wait for repeat_n in rust standard library
itertools = "0.11.0"
rand_core = { version = "0.6.4", features = ["std"] }
tfhe-zk-pok = { version = "0.3.0-alpha.0", path = "../tfhe-zk-pok", optional = true }
tfhe-versionable = { version = "0.2.0", path = "../utils/tfhe-versionable" }
tfhe-zk-pok = { version = "0.3.0-alpha.1", path = "../tfhe-zk-pok", optional = true }
tfhe-versionable = { version = "0.2.1", path = "../utils/tfhe-versionable" }
# wasm deps
wasm-bindgen = { version = "0.2.86", features = [

Some files were not shown because too many files have changed in this diff