chore(gpu): add valgrind and fix leaks

Andrei Stoian
2025-07-31 10:18:43 +02:00
committed by Andrei Stoian
parent 677da3855e
commit c06b513182
27 changed files with 340 additions and 49 deletions

View File

@@ -1,5 +1,5 @@
# Compile and test tfhe-cuda-backend on an AWS instance
-name: Cuda - Code Validation
+name: Cuda - CPU Memory Checks
env:
CARGO_TERM_COLOR: always
@@ -22,8 +22,9 @@ env:
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
-pull_request:
-types: [ labeled ]
+schedule:
+# every 3 months
+- cron: "0 0 1 */3 *"
permissions:
contents: read
@@ -57,7 +58,7 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT" echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
cuda-tests-linux: cuda-tests-linux:
name: CUDA Code Validation tests name: CUDA Memory Checks tests
needs: [ setup-instance ] needs: [ setup-instance ]
if: github.event_name != 'pull_request' || if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped') (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
@@ -65,6 +66,7 @@ jobs:
group: ${{ github.workflow_ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+timeout-minutes: 5760
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
@@ -89,7 +91,9 @@ jobs:
- name: Find tools
run: |
+sudo apt update && sudo apt install -y valgrind
find /usr -executable -name "compute-sanitizer"
+which valgrind
- name: Install latest stable
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
@@ -98,7 +102,7 @@ jobs:
- name: Run memory sanitizer
run: |
-make test_high_level_api_gpu_debug
+make test_high_level_api_gpu_valgrind
slack-notify:
name: Slack Notification
@@ -120,7 +124,7 @@ jobs:
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-SLACK_MESSAGE: "GPU code validation tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
+SLACK_MESSAGE: "GPU Memory Checks tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (cuda-tests)

View File

@@ -127,9 +127,11 @@ jobs:
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
- name: Enable nvidia multi-process service
run: |
nvidia-cuda-mps-control -d
- name: Run core crypto and internal CUDA backend tests
run: |
make test_core_crypto_gpu

View File

@@ -0,0 +1,149 @@
# Compile and test tfhe-cuda-backend on an AWS instance
name: Cuda - GPU Memory Checks
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
SLACKIFY_MARKDOWN: true
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
PULL_REQUEST_MD_LINK: ""
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
# Secrets will be available only to zama-ai organization members
SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
pull_request:
types: [ labeled ]
workflow_dispatch:
permissions:
contents: read
jobs:
setup-instance:
name: Setup instance (cuda-tests)
runs-on: ubuntu-latest
if: github.event_name != 'pull_request' ||
(github.event.action == 'labeled' && github.event.label.name == 'approved')
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
steps:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: gpu-test
# This instance will be spawned especially for pull-request from forked repository
- name: Start GitHub instance
id: start-github-instance
if: env.SECRETS_AVAILABLE == 'false'
run: |
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
cuda-tests-linux:
name: CUDA Memory Checks tests
needs: [ setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
concurrency:
group: ${{ github.workflow_ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
timeout-minutes: 240
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.8"
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Setup Hyperstack dependencies
uses: ./.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
gcc-version: ${{ matrix.gcc }}
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Find tools
run: |
find /usr -executable -name "compute-sanitizer"
- name: Install latest stable
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
- name: Run memory sanitizer
run: |
make test_high_level_api_gpu_sanitizer
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
continue-on-error: true
steps:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
SLACK_MESSAGE: "GPU Memory Checks tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: Teardown instance (cuda-tests)
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -702,12 +702,19 @@ test_integer_gpu_debug: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile release_lto_off \ RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile release_lto_off \
--features=integer,gpu-debug -p tfhe -- integer::gpu::server_key:: --features=integer,gpu-debug -p tfhe -- integer::gpu::server_key::
.PHONY: test_high_level_api_gpu_debug # Run the tests of the integer module with Debug flags for CUDA .PHONY: test_high_level_api_gpu_valgrind # Run the tests of the integer module with Debug flags for CUDA
test_high_level_api_gpu_debug: install_rs_build_toolchain install_cargo_nextest test_high_level_api_gpu_valgrind: install_rs_build_toolchain install_cargo_nextest
export RUSTFLAGS="$(RUSTFLAGS)" && \ export RUSTFLAGS="-C target-cpu=x86-64" && \
export CARGO_RS_BUILD_TOOLCHAIN="$(CARGO_RS_BUILD_TOOLCHAIN)" && \ export CARGO_RS_BUILD_TOOLCHAIN="$(CARGO_RS_BUILD_TOOLCHAIN)" && \
export TFHE_SPEC="tfhe" && \ export TFHE_SPEC="tfhe" && \
export CARGO_PROFILE="$(CARGO_PROFILE)" && scripts/check_memory_errors.sh export CARGO_PROFILE="$(CARGO_PROFILE)" && scripts/check_memory_errors.sh --cpu
.PHONY: test_high_level_api_gpu_sanitizer # Run the tests of the integer module with Debug flags for CUDA
test_high_level_api_gpu_sanitizer: install_rs_build_toolchain install_cargo_nextest
export RUSTFLAGS="-C target-cpu=x86-64" && \
export CARGO_RS_BUILD_TOOLCHAIN="$(CARGO_RS_BUILD_TOOLCHAIN)" && \
export TFHE_SPEC="tfhe" && \
export CARGO_PROFILE="$(CARGO_PROFILE)" && scripts/check_memory_errors.sh --gpu
.PHONY: test_integer_hl_test_gpu_check_warnings .PHONY: test_integer_hl_test_gpu_check_warnings
test_integer_hl_test_gpu_check_warnings: install_rs_build_toolchain test_integer_hl_test_gpu_check_warnings: install_rs_build_toolchain

View File

@@ -12,6 +12,8 @@
#include <functional>
#include <queue>
+#include <stdio.h>
class NoiseLevel {
public:
// Constants equivalent to the Rust code
@@ -761,22 +763,20 @@ template <typename Torus> struct int_radix_lut {
if (!mem_reuse) {
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
tmp_lwe_before_ks, gpu_memory_allocated);
-if (gpu_memory_allocated) {
-for (int i = 0; i < buffer.size(); i++) {
-switch (params.pbs_type) {
-case MULTI_BIT:
-cleanup_cuda_multi_bit_programmable_bootstrap(
-streams[i], gpu_indexes[i], &buffer[i]);
-break;
-case CLASSICAL:
-cleanup_cuda_programmable_bootstrap(streams[i], gpu_indexes[i],
-&buffer[i]);
-break;
-default:
-PANIC("Cuda error (PBS): unknown PBS type. ")
-}
-cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-}
-}
+for (int i = 0; i < buffer.size(); i++) {
+switch (params.pbs_type) {
+case MULTI_BIT:
+cleanup_cuda_multi_bit_programmable_bootstrap(
+streams[i], gpu_indexes[i], &buffer[i]);
+break;
+case CLASSICAL:
+cleanup_cuda_programmable_bootstrap(streams[i], gpu_indexes[i],
+&buffer[i]);
+break;
+default:
+PANIC("Cuda error (PBS): unknown PBS type. ")
+}
+cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+}
delete tmp_lwe_before_ks;
buffer.clear();
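The hunk above drops the `if (gpu_memory_allocated)` guard around the per-GPU cleanup loop; since the `cleanup_cuda_*_programmable_bootstrap` functions now also free the host-side buffer object (see further down in this commit), the loop apparently has to run even when no device memory was ever allocated, or the host wrappers would leak. Below is a minimal standalone sketch of that idea; `DeviceBuffer`, `free_device` and `cleanup_buffer` are invented names, not the tfhe-cuda-backend API, and plain `malloc`/`free` stand in for the CUDA allocations.

#include <cstdlib>

// Hypothetical per-GPU scratch buffer: the host object always exists,
// device memory only when `device_allocated` is true.
struct DeviceBuffer {
  void *device_ptr = nullptr;
  bool device_allocated = false;
};

// Stand-in for cudaFree / cuda_drop_async.
static void free_device(void *ptr) { std::free(ptr); }

// Cleanup in the style of this commit: always delete the host wrapper,
// and only gate the device free on the allocation flag.
static void cleanup_buffer(DeviceBuffer *&buf) {
  if (buf == nullptr)
    return;
  if (buf->device_allocated) {
    free_device(buf->device_ptr);
    buf->device_ptr = nullptr;
    buf->device_allocated = false;
  }
  delete buf;    // must happen even when nothing was allocated on the device
  buf = nullptr; // a repeated cleanup call becomes a harmless no-op
}

int main() {
  auto *with_device = new DeviceBuffer();
  with_device->device_ptr = std::malloc(64);
  with_device->device_allocated = true;
  auto *host_only = new DeviceBuffer(); // wrapper exists, no device allocation

  cleanup_buffer(with_device); // frees device memory and the host wrapper
  cleanup_buffer(host_only);   // must still run, otherwise the wrapper leaks
  cleanup_buffer(host_only);   // second call is safe
  return 0;
}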
@@ -4910,12 +4910,22 @@ template <typename Torus> struct int_scalar_mul_buffer {
sc_prop_mem->release(streams, gpu_indexes, gpu_count);
delete sc_prop_mem;
delete all_shifted_buffer;
-if (!anticipated_buffers_drop) {
-release_radix_ciphertext_async(streams[0], gpu_indexes[0],
-preshifted_buffer, gpu_memory_allocated);
-logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
-delete logical_scalar_shift_buffer;
-delete preshifted_buffer;
+release_buffers(streams, gpu_indexes, gpu_count);
+}
+void release_buffers(cudaStream_t const *streams, uint32_t const *gpu_indexes,
+uint32_t gpu_count) {
+if (preshifted_buffer) {
+release_radix_ciphertext_async(streams[0], gpu_indexes[0],
+preshifted_buffer, gpu_memory_allocated);
+delete preshifted_buffer;
+preshifted_buffer = nullptr;
+}
+if (logical_scalar_shift_buffer) {
+logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
+delete logical_scalar_shift_buffer;
+logical_scalar_shift_buffer = nullptr;
}
}
};
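For context on the new `release_buffers()` member above: every pointer it frees is reset to `nullptr`, so it can be called once early (the anticipated-drop path in scalar_mul.cu, later in this diff) and then again from `release()` without a double free. The following is a small self-contained sketch of that split-release pattern; the struct and member names are placeholders, not the actual int_scalar_mul_buffer fields.

// Placeholder state object illustrating the split-release pattern.
struct ScratchState {
  int *preshifted = nullptr;     // intermediate, may be dropped early
  int *shift_scratch = nullptr;  // intermediate, may be dropped early
  int *result_scratch = nullptr; // needed until the end of the computation

  ScratchState()
      : preshifted(new int[16]), shift_scratch(new int[16]),
        result_scratch(new int[16]) {}

  // Frees only the intermediates; the null checks and resets make it safe
  // to call once early and then again from release().
  void release_buffers() {
    if (preshifted) {
      delete[] preshifted;
      preshifted = nullptr;
    }
    if (shift_scratch) {
      delete[] shift_scratch;
      shift_scratch = nullptr;
    }
  }

  // Full teardown: reuses release_buffers() instead of duplicating the frees.
  void release() {
    release_buffers();
    delete[] result_scratch;
    result_scratch = nullptr;
  }
};

int main() {
  ScratchState s;
  s.release_buffers(); // early drop once the intermediates are no longer used
  s.release();         // later full release; already-freed members are skipped
  return 0;
}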
@@ -5197,6 +5207,7 @@ template <typename Torus> struct int_scalar_mul_high_buffer {
scalar_mul_mem->release(streams, gpu_indexes, gpu_count);
delete scalar_mul_mem;
+scalar_mul_mem = nullptr;
release_radix_ciphertext_async(streams[0], gpu_indexes[0], tmp,
allocate_gpu_memory);
@@ -5718,23 +5729,29 @@ template <typename Torus> struct int_signed_scalar_div_rem_buffer {
release_radix_ciphertext_async(streams[0], gpu_indexes[0], numerator_ct,
allocate_gpu_memory);
delete numerator_ct;
+numerator_ct = nullptr;
signed_div_mem->release(streams, gpu_indexes, gpu_count);
delete signed_div_mem;
+signed_div_mem = nullptr;
scp_mem->release(streams, gpu_indexes, gpu_count);
delete scp_mem;
+scp_mem = nullptr;
if (logical_scalar_shift_mem != nullptr) {
logical_scalar_shift_mem->release(streams, gpu_indexes, gpu_count);
delete logical_scalar_shift_mem;
+logical_scalar_shift_mem = nullptr;
}
if (scalar_mul_mem != nullptr) {
scalar_mul_mem->release(streams, gpu_indexes, gpu_count);
delete scalar_mul_mem;
+scalar_mul_mem = nullptr;
}
sub_and_propagate_mem->release(streams, gpu_indexes, gpu_count);
delete sub_and_propagate_mem;
+sub_and_propagate_mem = nullptr;
}
};

View File

@@ -40,4 +40,6 @@ void cleanup_cuda_integer_abs_inplace(void *const *streams,
int_abs_buffer<uint64_t> *mem_ptr =
(int_abs_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
}
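Most of the remaining files in this commit apply the same two-line fix to their `cleanup_cuda_*` entry points: after releasing device memory, the host-side buffer object is now deleted (previously it leaked), and the caller's opaque handle is nulled so it cannot be reused or freed twice. A rough standalone sketch of that shape follows; `OpaqueBuffer` and `cleanup_opaque` are invented names standing in for the real buffer structs and CUDA frees.

#include <cstdint>

// Hypothetical buffer type standing in for int_abs_buffer & friends; release()
// stands in for the device-side frees the real code performs on its streams.
struct OpaqueBuffer {
  int *scratch;
  OpaqueBuffer() : scratch(new int[32]) {}
  void release() {
    delete[] scratch;
    scratch = nullptr;
  }
};

// Shape shared by the cleanup_cuda_* entry points after this commit: recover
// the typed pointer from the opaque handle, release device memory, delete the
// host object (the part that used to leak), and null the caller's handle.
void cleanup_opaque(int8_t **handle) {
  auto *buf = reinterpret_cast<OpaqueBuffer *>(*handle);
  buf->release();
  delete buf;        // new: the host-side object is freed as well
  *handle = nullptr; // new: callers cannot reuse or double-free the handle
}

int main() {
  auto *buf = new OpaqueBuffer();
  int8_t *handle = reinterpret_cast<int8_t *>(buf);
  cleanup_opaque(&handle);
  return handle == nullptr ? 0 : 1;
}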

View File

@@ -41,6 +41,8 @@ void cleanup_cuda_integer_bitop(void *const *streams,
int_bitop_buffer<uint64_t> *mem_ptr =
(int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
}
void update_degrees_after_bitand(uint64_t *output_degrees,

View File

@@ -61,4 +61,6 @@ void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
+delete mem_ptr;
+*mem_ptr_void = nullptr;
}

View File

@@ -50,5 +50,7 @@ void cleanup_cuda_integer_radix_cmux(void *const *streams,
int_cmux_buffer<uint64_t> *mem_ptr =
(int_cmux_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
POP_RANGE()
}

View File

@@ -99,6 +99,8 @@ void cleanup_cuda_integer_comparison(void *const *streams,
int_comparison_buffer<uint64_t> *mem_ptr =
(int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
POP_RANGE()
}
@@ -146,6 +148,8 @@ void cleanup_cuda_integer_are_all_comparisons_block_true(
int_comparison_buffer<uint64_t> *mem_ptr =
(int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
}
uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
@@ -192,4 +196,6 @@ void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
int_comparison_buffer<uint64_t> *mem_ptr =
(int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
}

View File

@@ -75,6 +75,8 @@ void cleanup_cuda_integer_compress_radix_ciphertext_64(
int_compression<uint64_t> *mem_ptr =
(int_compression<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
}
void cleanup_cuda_integer_decompress_radix_ciphertext_64(
@@ -84,4 +86,6 @@ void cleanup_cuda_integer_decompress_radix_ciphertext_64(
int_decompression<uint64_t> *mem_ptr =
(int_decompression<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
}

View File

@@ -46,5 +46,7 @@ void cleanup_cuda_integer_div_rem(void *const *streams,
(int_div_rem_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
POP_RANGE()
}

View File

@@ -42,6 +42,8 @@ void cleanup_cuda_full_propagation(void *const *streams,
(int_fullprop_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
}
uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
@@ -155,6 +157,8 @@ void cleanup_cuda_propagate_single_carry(void *const *streams,
int_sc_prop_memory<uint64_t> *mem_ptr =
(int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
POP_RANGE()
}
@@ -166,6 +170,8 @@ void cleanup_cuda_add_and_propagate_single_carry(void *const *streams,
int_sc_prop_memory<uint64_t> *mem_ptr =
(int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
POP_RANGE()
}
void cleanup_cuda_integer_overflowing_sub(void *const *streams,
@@ -176,6 +182,8 @@ void cleanup_cuda_integer_overflowing_sub(void *const *streams,
int_borrow_prop_memory<uint64_t> *mem_ptr =
(int_borrow_prop_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
POP_RANGE()
}
@@ -245,6 +253,8 @@ void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
PUSH_RANGE("cleanup univar lut") PUSH_RANGE("cleanup univar lut")
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void); int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
delete mem_ptr;
*mem_ptr_void = nullptr;
POP_RANGE() POP_RANGE()
} }
@@ -307,6 +317,8 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
PUSH_RANGE("cleanup bivar lut") PUSH_RANGE("cleanup bivar lut")
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void); int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
delete mem_ptr;
*mem_ptr_void = nullptr;
POP_RANGE() POP_RANGE()
} }
@@ -351,6 +363,8 @@ void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
int8_t **mem_ptr_void) {
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
}
void cuda_integer_reverse_blocks_64_inplace(void *const *streams,
@@ -442,5 +456,7 @@ void cleanup_cuda_apply_noise_squashing_kb(void *const *streams,
int_noise_squashing_lut<uint64_t> *mem_ptr =
(int_noise_squashing_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
POP_RANGE()
}

View File

@@ -198,6 +198,8 @@ void cleanup_cuda_integer_mult(void *const *streams,
(int_mul_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
POP_RANGE()
}
@@ -248,4 +250,6 @@ void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
(int_sum_ciphertexts_vec_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
}

View File

@@ -46,4 +46,6 @@ void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,
(int_scalar_mul_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
}

View File

@@ -103,12 +103,7 @@ __host__ void host_integer_scalar_mul_radix(
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
if (mem->anticipated_buffers_drop) {
-release_radix_ciphertext_async(streams[0], gpu_indexes[0],
-mem->preshifted_buffer,
-mem->gpu_memory_allocated);
-delete mem->preshifted_buffer;
-mem->logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
-delete (mem->logical_scalar_shift_buffer);
+mem->release_buffers(streams, gpu_indexes, gpu_count);
}
if (j == 0) {
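This host-side change pairs with the new `release_buffers()` member shown earlier: instead of the driver freeing `mem`'s members itself and coordinating with the struct's own `release()` through the `anticipated_buffers_drop` flag, the early drop is delegated to the state object, whose null-guarded frees keep the later full release safe. Below is a simplified sketch of that call order only; the names and control flow are illustrative, not the real host_integer_scalar_mul_radix logic.

#include <vector>

// Illustrative state object; compare with the split-release sketch above.
struct State {
  std::vector<int> *tmp = nullptr; // intermediate, droppable early
  std::vector<int> *acc = nullptr; // kept until the end
  bool anticipated_buffers_drop = true;

  State() : tmp(new std::vector<int>(64)), acc(new std::vector<int>(64)) {}

  void release_buffers() { // early, partial drop; idempotent
    delete tmp;            // delete on nullptr is a no-op on later calls
    tmp = nullptr;
  }
  void release() { // final teardown
    release_buffers();
    delete acc;
    acc = nullptr;
  }
};

void host_compute(State *mem, int iterations) {
  for (int j = 0; j < iterations; ++j) {
    // ... work that only needs mem->tmp during the first iteration ...
    if (j == 0 && mem->anticipated_buffers_drop)
      mem->release_buffers(); // was: ad-hoc deletes of mem's members here
  }
}

int main() {
  State mem;
  host_compute(&mem, 3);
  mem.release(); // safe even though release_buffers() already ran
  return 0;
}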

View File

@@ -41,4 +41,6 @@ void cleanup_cuda_integer_radix_scalar_rotate(void *const *streams,
(int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
}

View File

@@ -83,6 +83,8 @@ void cleanup_cuda_integer_radix_logical_scalar_shift(
(int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
}
void cleanup_cuda_integer_radix_arithmetic_scalar_shift(
@@ -93,4 +95,6 @@ void cleanup_cuda_integer_radix_arithmetic_scalar_shift(
(int_arithmetic_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
}

View File

@@ -40,4 +40,6 @@ void cleanup_cuda_integer_radix_shift_and_rotate(void *const *streams,
(int_shift_and_rotate_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
}

View File

@@ -45,4 +45,6 @@ void cleanup_cuda_sub_and_propagate_single_carry(void *const *streams,
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
POP_RANGE()
+delete mem_ptr;
+*mem_ptr_void = nullptr;
}

View File

@@ -372,4 +372,5 @@ void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
// Free memory
cuda_drop_async(*pbs_buffer, static_cast<cudaStream_t>(stream), gpu_index);
+*pbs_buffer = nullptr;
}

View File

@@ -5,6 +5,8 @@
#endif
#include "ciphertext.h"
+#include <stdio.h>
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
uint32_t polynomial_size,
@@ -731,6 +733,8 @@ void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
int8_t **buffer) {
auto x = (pbs_buffer<uint64_t, CLASSICAL> *)(*buffer);
x->release(static_cast<cudaStream_t>(stream), gpu_index);
+delete x;
+*buffer = nullptr;
}
template bool has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(

View File

@@ -258,4 +258,6 @@ void cleanup_cuda_programmable_bootstrap_128(void *stream, uint32_t gpu_index,
int8_t **buffer) {
auto x = (pbs_buffer_128<__uint128_t, PBS_TYPE::CLASSICAL> *)(*buffer);
x->release(static_cast<cudaStream_t>(stream), gpu_index);
+delete x;
+*buffer = nullptr;
}

View File

@@ -440,6 +440,8 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
int8_t **buffer) {
auto x = (pbs_buffer<uint64_t, MULTI_BIT> *)(*buffer);
x->release(static_cast<cudaStream_t>(stream), gpu_index);
+delete x;
+*buffer = nullptr;
}
/**

View File

@@ -294,6 +294,8 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
const auto x =
reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(*buffer);
x->release(static_cast<cudaStream_t>(stream), gpu_index);
+delete x;
+*buffer = nullptr;
}
/**

View File

@@ -64,4 +64,6 @@ void cleanup_expand_without_verification_64(void *const *streams,
zk_expand_mem<uint64_t> *mem_ptr =
reinterpret_cast<zk_expand_mem<uint64_t> *>(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+delete mem_ptr;
+*mem_ptr_void = nullptr;
}

View File

@@ -1,27 +1,79 @@
#!/usr/bin/env bash
+RUN_VALGRIND=0
+RUN_COMPUTE_SANITIZER=0
+while [ -n "$1" ]
+do
+case "$1" in
+"--cpu" )
+RUN_VALGRIND=1
+;;
+"--gpu" )
+RUN_COMPUTE_SANITIZER=1
+;;
+*)
+echo "Unknown param : $1"
+exit 1
+;;
+esac
+shift
+done
+if [[ "${RUN_VALGRIND}" == "0" && "${RUN_COMPUTE_SANITIZER}" == "0" ]]; then
+echo "Usage: check_memory_errors.sh [--gpu] [--cpu]"
+exit 1
+fi
# Build the tests but don't run them
RUSTFLAGS="$RUSTFLAGS" cargo "${CARGO_RS_BUILD_TOOLCHAIN}" test --no-run --profile "${CARGO_PROFILE}" \
---features=integer,internal-keycache,gpu,zk-pok -p "${TFHE_SPEC}"
-# Find the test executable
-EXECUTABLE=$(find target/release/deps/ -type f -executable -name "tfhe-*")
+--features=integer,internal-keycache,gpu-debug,zk-pok -p "${TFHE_SPEC}"
+# Find the test executable -> last one to have been modified
+EXECUTABLE=target/release/deps/$(find target/release/deps/ -type f -executable -name "tfhe-*" -printf "%T@ %f\n" |sort -nr|sed 's/^.* //; q;')
# List the tests into a temporary file
RUSTFLAGS="$RUSTFLAGS" cargo "${CARGO_RS_BUILD_TOOLCHAIN}" nextest list --cargo-profile "${CARGO_PROFILE}" \
---features=integer,internal-keycache,gpu,zk-pok -p "${TFHE_SPEC}" &> /tmp/test_list.txt
+--features=integer,internal-keycache,gpu-debug,zk-pok -p "${TFHE_SPEC}" &> /tmp/test_list.txt
# Filter the tests to get only the HL ones
-TESTS_HL=$(sed -e $'s/\x1b\[[0-9;]*m//g' < /tmp/test_list.txt | grep 'high_level_api::.*gpu.*')
-# Run compute sanitizer on each test individually
+TESTS_HL=$(sed -e $'s/\x1b\[[0-9;]*m//g' < /tmp/test_list.txt | grep 'high_level_api::.*gpu.*' )
+if [[ "${RUN_VALGRIND}" == "1" ]]; then
# shellcheck disable=SC2181
RESULT=0 && \
while read -r t; do \
-echo compute-sanitizer --target-processes=all "$(pwd)"/"${EXECUTABLE}" -- "${t}" && \
-compute-sanitizer --leak-check=full --error-exitcode=1 --target-processes=all "$(pwd)"/"${EXECUTABLE}" -- "${t}" && \
+echo valgrind --leak-check=full --show-leak-kinds=definite "$(pwd)"/"${EXECUTABLE}" -- "${t}" && \
+valgrind --leak-check=full --show-leak-kinds=definite "$(pwd)"/"${EXECUTABLE}" -- "${t}" && \
if [[ $? != "0" ]]; then \
RESULT=1; \
fi; \
done <<< "${TESTS_HL}"
-exit $RESULT
+if [ $RESULT -ne 0 ]; then \
+exit $RESULT; \
+fi;
+fi
+TESTS_HL=$(sed -e $'s/\x1b\[[0-9;]*m//g' < /tmp/test_list.txt | grep 'high_level_api::.*gpu.*' )
+if [[ "${RUN_COMPUTE_SANITIZER}" == "1" ]]; then
+# Run compute sanitizer on each test individually
+# shellcheck disable=SC2181
+RESULT=0 && \
+while read -r t; do \
+echo compute-sanitizer --tool memcheck --target-processes=all "$(pwd)"/"${EXECUTABLE}" -- "${t}" && \
+compute-sanitizer --tool memcheck --leak-check=full --error-exitcode=1 --target-processes=all "$(pwd)"/"${EXECUTABLE}" -- "${t}" && \
+if [[ $? != "0" ]]; then \
+RESULT=1; \
+fi; \
+done <<< "${TESTS_HL}"
+if [ $RESULT -ne 0 ]; then \
+exit $RESULT; \
+fi;
+fi
+exit 0