fix(gpu): define CUDA_ARCH in tfhe-cuda-common CMakeLists

device.cu uses CUDA_ARCH in preprocessor conditionals (e.g., cuda_check_support_thread_block_clusters relies on #if CUDA_ARCH >= 900). Without this define, CUDA_ARCH defaults to 0, silently disabling thread block cluster support on Hopper GPUs.
chore(gpu): bump tfhe-cuda-backend to 0.14.1 and zk-cuda-backend to 0.1.1
2026-04-28 03:01:21 -04:00 · 2026-04-23 20:35:27 -03:00 · 2026-04-23 16:42:55 -03:00 · 2026-04-23 16:35:30 -03:00
75 changed files with 450 additions and 412 deletions
--- a/.github/workflows/aws_data_tests.yml
+++ b/.github/workflows/aws_data_tests.yml
@@ -54,7 +54,7 @@ jobs:

      - name: Retrieve data from cache
        id: retrieve-data-cache
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
@@ -89,7 +89,7 @@ jobs:
      - name: Store data in cache
        if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
        continue-on-error: true
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -69,7 +69,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            dependencies:
@@ -200,7 +200,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            ~/.nvm
@@ -213,7 +213,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -56,7 +56,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            integer:
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -57,7 +57,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            integer:
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -78,7 +78,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            dependencies:
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -45,7 +45,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            wasm:
@@ -92,7 +92,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            ~/.nvm
@@ -105,7 +105,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/backward_compat_pr_change_report.yml
+++ b/.github/workflows/backward_compat_pr_change_report.yml
@@ -34,7 +34,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            backward:
--- a/.github/workflows/benchmark_gpu_coprocessor.yml
+++ b/.github/workflows/benchmark_gpu_coprocessor.yml
@@ -204,7 +204,7 @@ jobs:
        uses: foundry-rs/foundry-toolchain@8789b3e21e6c11b2697f5eb56eddae542f746c10

      - name: Cache cargo
-        uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
        with:
          path: |
            ~/.cargo/registry
@@ -232,7 +232,7 @@ jobs:
        working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker

      - name: Use Node.js
-        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+        uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # v6.3.0
        with:
          node-version: 20.x

--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -46,7 +46,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            wasm_bench:
--- a/.github/workflows/benchmark_wasm_client_common.yml
+++ b/.github/workflows/benchmark_wasm_client_common.yml
@@ -124,7 +124,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            ~/.nvm
@@ -137,7 +137,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/cargo_build_common.yml
+++ b/.github/workflows/cargo_build_common.yml
@@ -138,7 +138,7 @@ jobs:
      - name: Node cache restoration
        if: inputs.run-pcc-cpu-batch == 'pcc_batch_2'
        id: node-cache
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: |
            ~/.nvm
@@ -151,7 +151,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        if: inputs.run-pcc-cpu-batch == 'pcc_batch_2' && steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/cargo_test_fft.yml
+++ b/.github/workflows/cargo_test_fft.yml
@@ -40,7 +40,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            fft:
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -42,7 +42,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            ntt:
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -43,7 +43,7 @@ jobs:
          echo "version=$(make zizmor_version)" >> "${GITHUB_OUTPUT}"

      - name: Check workflows security
-        uses: zizmorcore/zizmor-action@b1d7e1fb5de872772f31590499237e7cce841e8e # v0.5.3
+        uses: zizmorcore/zizmor-action@71321a20a9ded102f6e9ce5718a2fcec2c4f70d8 # v0.5.2
        with:
          advanced-security: 'false' # Print results directly in logs
          persona: pedantic
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -44,7 +44,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            tfhe:
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -46,7 +46,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            csprng:
--- a/.github/workflows/gpu_core_h100_tests.yml
+++ b/.github/workflows/gpu_core_h100_tests.yml
@@ -49,7 +49,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            gpu:
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -47,7 +47,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            gpu:
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -48,7 +48,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            gpu:
--- a/.github/workflows/gpu_hlapi_h100_tests.yml
+++ b/.github/workflows/gpu_hlapi_h100_tests.yml
@@ -49,7 +49,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            gpu:
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -45,7 +45,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            gpu:
--- a/.github/workflows/gpu_memory_sanitizer.yml
+++ b/.github/workflows/gpu_memory_sanitizer.yml
@@ -48,7 +48,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            gpu:
--- a/.github/workflows/gpu_memory_sanitizer_h100.yml
+++ b/.github/workflows/gpu_memory_sanitizer_h100.yml
@@ -48,7 +48,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            gpu:
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -48,7 +48,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            gpu:
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -49,7 +49,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            gpu:
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -49,7 +49,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            gpu:
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -48,7 +48,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            gpu:
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -49,7 +49,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            gpu:
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -49,7 +49,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            gpu:
--- a/.github/workflows/gpu_zk_tests.yml
+++ b/.github/workflows/gpu_zk_tests.yml
@@ -47,7 +47,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            gpu:
--- a/.github/workflows/hpu_hlapi_tests.yml
+++ b/.github/workflows/hpu_hlapi_tests.yml
@@ -41,7 +41,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
        with:
          files_yaml: |
            hpu:
--- a/.github/workflows/make_release_tfhe.yml
+++ b/.github/workflows/make_release_tfhe.yml
@@ -89,7 +89,7 @@ jobs:
          make build_web_js_api_parallel

      - name: Authenticate on NPM
-        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+        uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # v6.3.0
        with:
          node-version: '24'
          registry-url: 'https://registry.npmjs.org'
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -53,7 +53,7 @@ jobs:

      - name: Restore Sagemath image from cache
        id: docker-cache
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+        uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: /tmp/sagemath_image
          key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}
@@ -76,7 +76,7 @@ jobs:
      - name: Store Sagemath image in cache
        if: steps.docker-cache.outputs.cache-hit != 'true'
        continue-on-error: true
-        uses: actions/cache/save@27d5ce7f107fe9357f9df03efb73ab90386fccae #v5.0.5
+        uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
        with:
          path: /tmp/sagemath_image
          key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,6 +2,7 @@
 resolver = "3"
 members = [
    "apps/test-vectors",
+    "backends/tfhe-cuda-common",
    "backends/tfhe-cuda-backend",
    "backends/tfhe-hpu-backend",
    "backends/zk-cuda-backend",
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.14.0"
+version = "0.14.1"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
@@ -11,6 +11,9 @@ repository = "https://github.com/zama-ai/tfhe-rs"
 readme = "README.md"
 keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]

+[dependencies]
+tfhe-cuda-common = { version = "0.1.0", path = "../tfhe-cuda-common" }
+
 [build-dependencies]
 cmake.workspace = true
 pkg-config.workspace = true
@@ -18,6 +21,6 @@ bindgen.workspace = true

 [features]
 experimental-multi-arch = []
-profile = []
-debug = []
+profile = ["tfhe-cuda-common/profile"]
+debug = ["tfhe-cuda-common/debug"]
 debug-fake-multi-gpu = []
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -1,15 +1,5 @@
 use std::path::PathBuf;

-fn get_linux_distribution_name() -> Option<String> {
-    let content = std::fs::read_to_string("/etc/os-release").ok()?;
-    for line in content.lines() {
-        if let Some(value) = line.strip_prefix("NAME=") {
-            return Some(value.trim_matches('"').to_string());
-        }
-    }
-    None
-}
-
 fn main() {
    if let Ok(val) = std::env::var("DOCS_RS") {
        if val.parse::<u32>() == Ok(1) {
@@ -36,14 +26,9 @@ fn main() {
    println!("cargo::rerun-if-changed=cuda/CMakeLists.txt");
    println!("cargo::rerun-if-changed=src");

+    // Platform/distro check is performed by tfhe-cuda-common's build.rs, which
+    // Cargo builds first as a dependency.
    if std::env::consts::OS == "linux" {
-        if get_linux_distribution_name().as_deref() != Some("Ubuntu") {
-            println!(
-                "cargo:warning=This Linux distribution is not officially supported. \
-                Only Ubuntu is supported by tfhe-cuda-backend at this time. Build may fail\n"
-            );
-        }
-
        let mut cmake_config = cmake::Config::new("cuda");

        // Conditionally pass the "MULTI_ARCH" variable to CMake if the feature is enabled
@@ -67,6 +52,10 @@ fn main() {
            cmake_config.define("FAKE_MULTI_GPU", "ON");
        }

+        if let Ok(common_include) = std::env::var("DEP_TFHE_CUDA_COMMON_INCLUDE") {
+            cmake_config.define("TFHE_CUDA_COMMON_INCLUDE_DIR", &common_include);
+        }
+
        // Build the CMake project
        let dest = cmake_config.build();
        println!("cargo:rustc-link-search=native={}", dest.display());
--- a/backends/tfhe-cuda-backend/cuda/.semgrep/release-ordering.yaml
+++ b/backends/tfhe-cuda-backend/cuda/.semgrep/release-ordering.yaml
@@ -75,9 +75,7 @@ rules:
        - "*.cuh"
        - "*.cpp"
        - "*.h"
-      exclude:
-        - backends/tfhe-cuda-backend/cuda/check_cuda.cu # contains cuda checking functions
-        - backends/tfhe-cuda-backend/cuda/include/device.h # contains the cuda_check_error macro (and others)
+      exclude: []
    patterns:
      - pattern: $FUNC(...)
      - metavariable-regex:
--- a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
@@ -18,9 +18,9 @@ endif()
 if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS ${MINIMUM_SUPPORTED_CUDA_VERSION})
  message(FATAL_ERROR "CUDA ${MINIMUM_SUPPORTED_CUDA_VERSION} or greater is required for compilation.")
 endif()
-# Get CUDA compute capability
+# Get CUDA compute capability (check_cuda.cu lives in tfhe-cuda-common)
 set(OUTPUTFILE ${CMAKE_CURRENT_SOURCE_DIR}/cuda_script) # No suffix required
-set(CUDAFILE ${CMAKE_CURRENT_SOURCE_DIR}/check_cuda.cu)
+set(CUDAFILE ${CMAKE_CURRENT_SOURCE_DIR}/../../tfhe-cuda-common/cuda/check_cuda.cu)
 execute_process(COMMAND nvcc -lcuda ${CUDAFILE} -o ${OUTPUTFILE})
 execute_process(
  COMMAND ${OUTPUTFILE}
@@ -116,10 +116,21 @@ set(CMAKE_CUDA_FLAGS

 set(INCLUDE_DIR include)

+set(TFHE_CUDA_COMMON_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../tfhe-cuda-common/cuda")
+if(NOT TFHE_CUDA_COMMON_INCLUDE_DIR)
+  set(TFHE_CUDA_COMMON_INCLUDE_DIR "${TFHE_CUDA_COMMON_DIR}/include")
+endif()
+
+if(NOT TARGET tfhe_cuda_common)
+  add_subdirectory("${TFHE_CUDA_COMMON_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/tfhe_cuda_common")
+endif()
+
 add_subdirectory(src)
 enable_testing()
 add_subdirectory(tests_and_benchmarks)
 target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})
+target_include_directories(tfhe_cuda_backend PRIVATE ${TFHE_CUDA_COMMON_INCLUDE_DIR})
+target_link_libraries(tfhe_cuda_backend PUBLIC tfhe_cuda_common)

 # This is required for rust cargo build
 install(TARGETS tfhe_cuda_backend DESTINATION .)
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -5,6 +5,7 @@
 #include "crypto/keyswitch.cuh"
 #include "device.h"
 #include "helper_multi_gpu.h"
+#include "helper_profile.cuh"
 #include "integer/comparison.h"
 #include "integer/integer_utilities.h"
 #include "integer/scalar_addition.cuh"
@@ -14,7 +15,6 @@
 #include "polynomial/functions.cuh"
 #include "utils/helper.cuh"
 #include "utils/helper_multi_gpu.cuh"
-#include "utils/helper_profile.cuh"
 #include <algorithm>
 #include <functional>

--- a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
@@ -3,10 +3,10 @@

 #include "checked_arithmetic.h"
 #include "device.h"
+#include "helper_profile.cuh"
 #include "integer/integer.h"
 #include "integer/radix_ciphertext.h"
 #include "utils/helper.cuh"
-#include "utils/helper_profile.cuh"

 inline CudaLweCiphertextListFFI
 to_lwe_ciphertext_list(CudaRadixCiphertextFFI *radix) {
--- a/backends/tfhe-cuda-backend/cuda/src/integer/rerand.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/rerand.cuh
@@ -1,12 +1,12 @@
 #pragma once

 #include "device.h"
+#include "helper_profile.cuh"
 #include "integer/integer.h"
 #include "integer/radix_ciphertext.h"
 #include "integer/rerand.h"
 #include "integer/rerand_utilities.h"
 #include "utils/helper.cuh"
-#include "utils/helper_profile.cuh"
 #include "zk/zk_utilities.h"

 template <typename Torus, class params>
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper.cuh
@@ -1,7 +1,7 @@
 #ifndef HELPER_CUH
 #define HELPER_CUH

-#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
+#include "device.h"

 inline int nextPow2(int x) {
  --x;
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/CMakeLists.txt
@@ -55,6 +55,7 @@ endif()
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)
 include_directories(${CONCRETE_CUDA_SOURCE_DIR}/include)
 include_directories(${CONCRETE_CUDA_SOURCE_DIR}/src)
+include_directories(${TFHE_CUDA_COMMON_INCLUDE_DIR})
 include_directories(${TFHE_RS_BINARY_DIR})
 include_directories(${TFHE_RS_BINARY_DIR}/deps)
 include_directories("${CMAKE_CURRENT_SOURCE_DIR}")
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/CMakeLists.txt
@@ -46,6 +46,7 @@ endif()

 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)
 include_directories(${CONCRETE_CUDA_SOURCE_DIR}/include)
+include_directories(${TFHE_CUDA_COMMON_INCLUDE_DIR})
 include_directories(${TFHE_RS_BINARY_DIR})
 include_directories(${TFHE_RS_BINARY_DIR}/deps)
 include_directories("${CMAKE_CURRENT_SOURCE_DIR}")
--- a/backends/tfhe-cuda-backend/src/cuda_bind.rs
+++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -1,6 +1,9 @@
 use std::ffi::c_void;

-#[link(name = "tfhe_cuda_backend", kind = "static")]
+// Declarations forwarded from tfhe-cuda-common. cargo-semver-checks cannot trace
+// `pub use` re-exports of extern "C" functions, so the signatures must be repeated
+// here to keep the public API stable across patch versions.
+#[link(name = "tfhe_cuda_common", kind = "static")]
 extern "C" {

    pub fn cuda_create_stream(gpu_index: u32) -> *mut c_void;
--- a/backends/tfhe-cuda-common/Cargo.toml
+++ b/backends/tfhe-cuda-common/Cargo.toml
@@ -0,0 +1,21 @@
+[package]
+name = "tfhe-cuda-common"
+version = "0.1.0"
+edition = "2021"
+authors = ["Zama team"]
+license = "BSD-3-Clause-Clear"
+description = "Shared CUDA device utilities for tfhe-rs backends."
+homepage = "https://www.zama.org/"
+documentation = "https://docs.zama.org/tfhe-rs"
+repository = "https://github.com/zama-ai/tfhe-rs"
+keywords = ["cuda", "gpu", "fhe", "cryptography"]
+links = "tfhe_cuda_common"
+rust-version.workspace = true
+
+[build-dependencies]
+cmake.workspace = true
+pkg-config.workspace = true
+
+[features]
+profile = []
+debug = []
--- a/backends/tfhe-cuda-common/build.rs
+++ b/backends/tfhe-cuda-common/build.rs
@@ -0,0 +1,79 @@
+fn get_linux_distribution_name() -> Option<String> {
+    let content = std::fs::read_to_string("/etc/os-release").ok()?;
+    for line in content.lines() {
+        if let Some(value) = line.strip_prefix("NAME=") {
+            return Some(value.trim_matches('"').to_string());
+        }
+    }
+    None
+}
+
+fn main() {
+    if let Ok(val) = std::env::var("DOCS_RS") {
+        if val.parse::<u32>() == Ok(1) {
+            return;
+        }
+    }
+
+    if std::env::var("_CBINDGEN_IS_RUNNING").is_ok() {
+        return;
+    }
+
+    println!("cargo::rerun-if-changed=cuda/include");
+    println!("cargo::rerun-if-changed=cuda/src");
+    println!("cargo::rerun-if-changed=cuda/CMakeLists.txt");
+    println!("cargo::rerun-if-changed=src");
+
+    if std::env::consts::OS == "linux" {
+        let manifest_dir = std::env::var("CARGO_MANIFEST_DIR")
+            .expect("CARGO_MANIFEST_DIR must be set by cargo during build");
+
+        if get_linux_distribution_name().as_deref() != Some("Ubuntu") {
+            println!(
+                "cargo:warning=This Linux distribution is not officially supported. \
+                Only Ubuntu is supported by tfhe-cuda-common at this time. Build may fail\n"
+            );
+        }
+
+        let mut cmake_config = cmake::Config::new("cuda");
+
+        if cfg!(feature = "profile") {
+            cmake_config.define("USE_NVTOOLS", "ON");
+        } else {
+            cmake_config.define("USE_NVTOOLS", "OFF");
+        }
+
+        if cfg!(feature = "debug") {
+            cmake_config.define("CMAKE_BUILD_TYPE", "Debug");
+        }
+
+        let dest = cmake_config.build();
+
+        println!(
+            "cargo:rustc-link-search=native={}",
+            dest.join("lib").display()
+        );
+        println!("cargo:rustc-link-lib=static=tfhe_cuda_common");
+
+        if pkg_config::Config::new()
+            .atleast_version("10")
+            .probe("cuda")
+            .is_err()
+        {
+            println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
+        }
+
+        println!("cargo:rustc-link-lib=cudart");
+        println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
+        println!("cargo:rustc-link-lib=stdc++");
+
+        // Expose the include path so dependent crates can access headers via
+        // DEP_TFHE_CUDA_COMMON_INCLUDE
+        let include_dir = std::path::PathBuf::from(&manifest_dir).join("cuda/include");
+        println!("cargo:include={}", include_dir.display());
+    } else {
+        panic!(
+            "Error: platform not supported, tfhe-cuda-common not built (only Linux is supported)"
+        );
+    }
+}
--- a/backends/tfhe-cuda-common/cuda/CMakeLists.txt
+++ b/backends/tfhe-cuda-common/cuda/CMakeLists.txt
@@ -0,0 +1,93 @@
+cmake_minimum_required(VERSION 3.18)
+project(
+  CommonCudaBackend
+  VERSION 1.0.0
+  LANGUAGES CXX)
+
+set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
+include(CheckLanguage)
+check_language(CUDA)
+if(CMAKE_CUDA_COMPILER)
+  enable_language(CUDA)
+else()
+  message(FATAL_ERROR "CUDA compiler not found.")
+endif()
+
+if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS ${MINIMUM_SUPPORTED_CUDA_VERSION})
+  message(FATAL_ERROR "CUDA ${MINIMUM_SUPPORTED_CUDA_VERSION} or greater is required for compilation.")
+endif()
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+
+# Auto-detect CUDA compute capability
+set(OUTPUTFILE ${CMAKE_CURRENT_SOURCE_DIR}/cuda_script)
+set(CUDAFILE ${CMAKE_CURRENT_SOURCE_DIR}/check_cuda.cu)
+execute_process(COMMAND nvcc -lcuda ${CUDAFILE} -o ${OUTPUTFILE})
+execute_process(
+  COMMAND ${OUTPUTFILE}
+  RESULT_VARIABLE CUDA_RETURN_CODE
+  OUTPUT_VARIABLE ARCH)
+file(REMOVE ${OUTPUTFILE})
+
+if(${CUDA_RETURN_CODE} EQUAL 0)
+  set(CUDA_SUCCESS "TRUE")
+else()
+  set(CUDA_SUCCESS "FALSE")
+endif()
+
+if(${CUDA_SUCCESS})
+  set(CMAKE_CUDA_ARCHITECTURES native)
+  string(REPLACE "-arch=sm_" "" CUDA_ARCH "${ARCH}")
+  set(CUDA_ARCH "${CUDA_ARCH}0")
+else()
+  set(CMAKE_CUDA_ARCHITECTURES 70)
+  set(CUDA_ARCH "700")
+endif()
+
+add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})
+
+set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
+
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE
+      Release
+      CACHE STRING "Build type" FORCE)
+endif()
+
+set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
+set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG")
+set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")
+set(CMAKE_CUDA_FLAGS_DEBUG "-g -O0 -G")
+
+if(${USE_NVTOOLS})
+  add_definitions(-DUSE_NVTOOLS)
+endif()
+
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall -Xcompiler -Wextra --use_fast_math --expt-relaxed-constexpr")
+
+set(COMMON_SOURCES src/device.cu src/helper_profile.cu)
+
+add_library(tfhe_cuda_common STATIC ${COMMON_SOURCES})
+
+set_target_properties(
+  tfhe_cuda_common
+  PROPERTIES CUDA_SEPARABLE_COMPILATION ON
+             POSITION_INDEPENDENT_CODE ON
+             CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+
+if(CMAKE_BUILD_TYPE STREQUAL "Release")
+  set_target_properties(tfhe_cuda_common PROPERTIES CUDA_OPTIMIZE_DEPENDENCIES ON)
+elseif(CMAKE_BUILD_TYPE STREQUAL "Debug")
+  target_compile_options(tfhe_cuda_common PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-g -O0 -G>)
+endif()
+
+target_include_directories(tfhe_cuda_common PUBLIC include)
+target_link_libraries(tfhe_cuda_common PUBLIC cudart)
+
+install(
+  TARGETS tfhe_cuda_common
+  ARCHIVE DESTINATION lib
+  LIBRARY DESTINATION lib)
--- a/backends/tfhe-cuda-backend/cuda/check_cuda.cu
+++ b/backends/tfhe-cuda-backend/cuda/check_cuda.cu
--- a/backends/tfhe-cuda-backend/cuda/include/checked_arithmetic.h
+++ b/backends/tfhe-cuda-backend/cuda/include/checked_arithmetic.h
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -6,6 +6,8 @@
 #include <cstdlib>
 #include <cuda_runtime.h>

+#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
+
 extern "C" {

 #define check_cuda_error(ans)                                                  \
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cuh
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -1,5 +1,4 @@
 #include "device.h"
-#include "utils/helper.cuh"
 #include <cstdint>
 #include <cuda_runtime.h>
 #include <mutex>
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_profile.cu
--- a/backends/tfhe-cuda-common/src/cuda_bind.rs
+++ b/backends/tfhe-cuda-common/src/cuda_bind.rs
@@ -0,0 +1,62 @@
+use std::ffi::c_void;
+
+#[link(name = "tfhe_cuda_common", kind = "static")]
+extern "C" {
+
+    pub fn cuda_create_stream(gpu_index: u32) -> *mut c_void;
+
+    pub fn cuda_destroy_stream(stream: *mut c_void, gpu_index: u32);
+
+    pub fn cuda_synchronize_stream(stream: *mut c_void, gpu_index: u32);
+
+    pub fn cuda_is_available() -> u32;
+
+    pub fn cuda_malloc(size: u64, gpu_index: u32) -> *mut c_void;
+
+    pub fn cuda_malloc_async(size: u64, stream: *mut c_void, gpu_index: u32) -> *mut c_void;
+    pub fn cuda_check_valid_malloc(size: u64, gpu_index: u32) -> bool;
+    pub fn cuda_device_total_memory(gpu_index: u32) -> u64;
+
+    pub fn cuda_memcpy_async_to_gpu(
+        dest: *mut c_void,
+        src: *const c_void,
+        size: u64,
+        stream: *mut c_void,
+        gpu_index: u32,
+    );
+
+    pub fn cuda_memcpy_gpu_to_gpu(dest: *mut c_void, src: *const c_void, size: u64, gpu_index: u32);
+
+    pub fn cuda_memcpy_async_gpu_to_gpu(
+        dest: *mut c_void,
+        src: *const c_void,
+        size: u64,
+        stream: *mut c_void,
+        gpu_index: u32,
+    );
+
+    pub fn cuda_memcpy_async_to_cpu(
+        dest: *mut c_void,
+        src: *const c_void,
+        size: u64,
+        stream: *mut c_void,
+        gpu_index: u32,
+    );
+
+    pub fn cuda_memset_async(
+        dest: *mut c_void,
+        val: u64,
+        size: u64,
+        stream: *mut c_void,
+        gpu_index: u32,
+    );
+
+    pub fn cuda_get_number_of_gpus() -> i32;
+
+    pub fn cuda_get_number_of_sms() -> i32;
+
+    pub fn cuda_synchronize_device(gpu_index: u32);
+
+    pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32);
+
+} // extern "C"
--- a/backends/tfhe-cuda-common/src/lib.rs
+++ b/backends/tfhe-cuda-common/src/lib.rs
@@ -0,0 +1 @@
+pub mod cuda_bind;
--- a/backends/zk-cuda-backend/Cargo.toml
+++ b/backends/zk-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "zk-cuda-backend"
-version = "0.1.0"
+version = "0.1.1"
 edition = "2021"
 rust-version.workspace = true
 authors = ["Zama team"]
@@ -24,7 +24,7 @@ bindgen.workspace = true
 [dependencies]
 ark-ec.workspace = true
 ark-ff.workspace = true
-tfhe-cuda-backend = { version = "0.14.0", path = "../tfhe-cuda-backend" }
+tfhe-cuda-common = { version = "0.1.0", path = "../tfhe-cuda-common" }

 [features]
 default = []
--- a/backends/zk-cuda-backend/build.rs
+++ b/backends/zk-cuda-backend/build.rs
@@ -1,27 +1,12 @@
 use std::path::PathBuf;

-fn get_linux_distribution_name() -> Option<String> {
-    let content = std::fs::read_to_string("/etc/os-release").ok()?;
-    for line in content.lines() {
-        if let Some(value) = line.strip_prefix("NAME=") {
-            return Some(value.trim_matches('"').to_string());
-        }
-    }
-    None
-}
-
 fn main() {
-    // Handle docs.rs builds (no CUDA available)
    if let Ok(val) = std::env::var("DOCS_RS") {
        if val.parse::<u32>() == Ok(1) {
            return;
        }
    }

-    // Workaround for cbindgen running during builds: cbindgen can trigger a second
-    // compilation pass that may forward incorrect arguments to cmake, crashing builds
-    // on make < 4.4. Since zk-cuda-backend has no macro expansions for cbindgen to
-    // inspect, skipping this compilation also speeds up C API builds.
    if std::env::var("_CBINDGEN_IS_RUNNING").is_ok() {
        return;
    }
@@ -32,24 +17,18 @@ fn main() {
    println!("cargo::rerun-if-changed=cuda/CMakeLists.txt");
    println!("cargo::rerun-if-changed=src");

+    // Platform/distro check is performed by tfhe-cuda-common's build.rs, which
+    // Cargo builds first as a dependency.
    if std::env::consts::OS == "linux" {
-        // GNU linker flags for handling duplicate symbols between tfhe-cuda-backend
-        // and zk-cuda-backend (e.g., shared device utilities)
-        println!("cargo:rustc-link-arg=-Wl,--allow-multiple-definition");
-        println!("cargo:rustc-link-arg=-Wl,--no-as-needed");
-
        let manifest_dir = std::env::var("CARGO_MANIFEST_DIR")
            .expect("CARGO_MANIFEST_DIR must be set by cargo during build");

-        if get_linux_distribution_name().as_deref() != Some("Ubuntu") {
-            println!(
-                "cargo:warning=This Linux distribution is not officially supported. \
-                Only Ubuntu is supported by zk-cuda-backend at this time. Build may fail\n"
-            );
+        let mut cmake_config = cmake::Config::new("cuda");
+
+        if let Ok(common_include) = std::env::var("DEP_TFHE_CUDA_COMMON_INCLUDE") {
+            cmake_config.define("TFHE_CUDA_COMMON_INCLUDE_DIR", &common_include);
        }

-        // Build CUDA library using cmake crate
-        let mut cmake_config = cmake::Config::new("cuda");
        let dest = cmake_config.build();

        // cmake crate installs to dest/lib subdirectory
--- a/backends/zk-cuda-backend/cuda/CMakeLists.txt
+++ b/backends/zk-cuda-backend/cuda/CMakeLists.txt
@@ -25,9 +25,9 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CUDA_STANDARD 17)
 set(CMAKE_CUDA_STANDARD_REQUIRED ON)

-# Auto-detect CUDA compute capability using the same mechanism as tfhe-cuda-backend
+# Auto-detect CUDA compute capability (check_cuda.cu lives in tfhe-cuda-common)
 set(OUTPUTFILE ${CMAKE_CURRENT_SOURCE_DIR}/cuda_script)
-set(CUDAFILE ${CMAKE_CURRENT_SOURCE_DIR}/check_cuda.cu)
+set(CUDAFILE ${CMAKE_CURRENT_SOURCE_DIR}/../../tfhe-cuda-common/cuda/check_cuda.cu)
 execute_process(COMMAND nvcc -lcuda ${CUDAFILE} -o ${OUTPUTFILE})
 execute_process(
  COMMAND ${OUTPUTFILE}
@@ -71,7 +71,7 @@ set(CMAKE_CUDA_FLAGS_DEBUG "-g -O0 -G")
 # Additional CUDA flags (aligned with tfhe-cuda-backend)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall -Xcompiler -Wextra --use_fast_math --expt-relaxed-constexpr")

-# Core source files (without device utilities) Device utilities come from tfhe-cuda-backend.
+# Core source files
 set(FP_CORE_SOURCES src/primitives/fp.cu src/primitives/fp2.cu src/curve.cu src/msm/pippenger/msm_pippenger.cu
                    src/msm/msm.cu)

@@ -81,12 +81,10 @@ set(FP_MSM_HEADERS src/msm/common.cuh)
 # C wrapper source for FFI bindings (CUDA for CUDA headers)
 set(C_WRAPPER_SOURCES ../src/c_wrapper.cu)

-# Headers (device.h comes from tfhe-cuda-backend)
 set(FP_HEADERS include/fp.h include/fp_kernels.h include/fp2.h include/fp2_kernels.h include/curve.h)

 # =============================================================================
-# zk_cuda_backend: For Rust/Cargo builds (WITHOUT device.cu) Device utilities are provided by tfhe-cuda-backend when
-# linking with tfhe.
+# zk_cuda_backend: Static library for Rust/Cargo builds
 # =============================================================================
 add_library(zk_cuda_backend STATIC ${FP_CORE_SOURCES} ${FP_HEADERS} ${C_WRAPPER_SOURCES})

@@ -106,8 +104,11 @@ endif()

 target_link_libraries(zk_cuda_backend PUBLIC cudart)

-# Include both local headers and tfhe-cuda-backend headers (for device.h)
 target_include_directories(zk_cuda_backend PUBLIC include ../src/include)
+if(NOT TFHE_CUDA_COMMON_INCLUDE_DIR)
+  set(TFHE_CUDA_COMMON_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../tfhe-cuda-common/cuda/include)
+endif()
+target_include_directories(zk_cuda_backend PUBLIC ${TFHE_CUDA_COMMON_INCLUDE_DIR})

 # =============================================================================
 # Tests and Benchmarks (optional, controlled by ZK_CUDA_BACKEND_BUILD_TESTS/BENCHMARKS)
--- a/backends/zk-cuda-backend/cuda/check_cuda.cu
+++ b/backends/zk-cuda-backend/cuda/check_cuda.cu
@@ -1,22 +0,0 @@
-#include <stdio.h>
-
-int main(int argc, char **argv) {
-  cudaDeviceProp dP;
-  float min_cc = 3.0;
-
-  int rc = cudaGetDeviceProperties(&dP, 0);
-  if (rc != cudaSuccess) {
-    cudaError_t error = cudaGetLastError();
-    printf("CUDA error: %s", cudaGetErrorString(error));
-    return rc; /* Failure */
-  }
-  if ((dP.major + (dP.minor / 10)) < min_cc) {
-    printf("Min Compute Capability of %2.1f required:  %d.%d found\n Not "
-           "Building CUDA Code",
-           min_cc, dP.major, dP.minor);
-    return 1; /* Failure */
-  } else {
-    printf("-arch=sm_%d%d", dP.major, dP.minor);
-    return 0; /* Success */
-  }
-}
--- a/backends/zk-cuda-backend/cuda/include/checked_arithmetic.h
+++ b/backends/zk-cuda-backend/cuda/include/checked_arithmetic.h
@@ -1,35 +0,0 @@
-#pragma once
-
-#include <cstddef>
-#include <cstdio>
-
-#include "device.h"
-
-// Variadic checked multiplication of size_t values.
-// Folds left-to-right using __builtin_mul_overflow, returning true on overflow.
-// On overflow the value written to *out is unspecified.
-template <typename... Args>
-inline bool checked_mul(size_t *out, size_t first, Args... rest) {
-  size_t result = first;
-  for (size_t value : {static_cast<size_t>(rest)...}) {
-    if (__builtin_mul_overflow(result, value, &result))
-      return true;
-  }
-  *out = result;
-  return false;
-}
-
-// Variadic safe multiplication: computes the product and panics on overflow.
-template <typename... Args> inline size_t safe_mul(size_t first, Args... rest) {
-  size_t result;
-  bool overflow = checked_mul(&result, first, rest...);
-  PANIC_IF_FALSE(!overflow, "multiplication overflow wraps size_t");
-  return result;
-}
-
-// Variadic safe multiplication with an appended sizeof(T) factor.
-// Computes (args... * sizeof(T)) with overflow checking.
-template <typename T, typename... Args>
-inline size_t safe_mul_sizeof(Args... args) {
-  return safe_mul(args..., sizeof(T));
-}
--- a/backends/zk-cuda-backend/cuda/include/device.h
+++ b/backends/zk-cuda-backend/cuda/include/device.h
@@ -1,145 +0,0 @@
-#ifndef DEVICE_H
-#define DEVICE_H
-
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-#include <cuda_runtime.h>
-
-extern "C" {
-
-#define check_cuda_error(ans)                                                  \
-  { cuda_error((ans), __FILE__, __LINE__); }
-inline void cuda_error(cudaError_t code, const char *file, int line) {
-  if (code != cudaSuccess) {
-    std::fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code),
-                 file, line);
-    std::abort();
-  }
-}
-
-// The PANIC macro should be used to validate user-inputs to GPU functions
-// it will execute in all targets, including production settings
-// e.g., cudaMemCopy to the device should check that the destination pointer is
-// a device pointer
-#define PANIC(format, ...)                                                     \
-  {                                                                            \
-    std::fprintf(stderr, "%s::%d::%s: panic.\n" format "\n", __FILE__,         \
-                 __LINE__, __func__, ##__VA_ARGS__);                           \
-    std::abort();                                                              \
-  }
-
-// This is a generic assertion checking macro with user defined printf-style
-// message
-#define PANIC_IF_FALSE(cond, format, ...)                                      \
-  do {                                                                         \
-    if (!(cond)) {                                                             \
-      PANIC(format "\n\n %s\n", ##__VA_ARGS__, #cond);                         \
-    }                                                                          \
-  } while (0)
-
-#ifndef GPU_ASSERTS_DISABLE
-// The GPU assert should be used to validate assumptions in algorithms,
-// for example, checking that two user-provided quantities have a certain
-// relationship or that the size of the buffer  provided to a function is
-// sufficient when it is filled with some algorithm that depends on
-// user-provided inputs e.g., OPRF corrections buffer should not have a size
-// higher than the number of blocks in the datatype that is generated
-#define GPU_ASSERT(cond, format, ...)                                          \
-  PANIC_IF_FALSE(cond, format, ##__VA_ARGS__)
-#else
-#define GPU_ASSERT(cond)                                                       \
-  do {                                                                         \
-  } while (0)
-#endif
-
-uint32_t cuda_get_device();
-void cuda_set_device(uint32_t gpu_index);
-
-cudaEvent_t cuda_create_event(uint32_t gpu_index);
-
-void cuda_event_record(cudaEvent_t event, cudaStream_t stream,
-                       uint32_t gpu_index);
-void cuda_stream_wait_event(cudaStream_t stream, cudaEvent_t event,
-                            uint32_t gpu_index);
-
-void cuda_event_destroy(cudaEvent_t event, uint32_t gpu_index);
-
-cudaStream_t cuda_create_stream(uint32_t gpu_index);
-
-void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index);
-
-void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index);
-
-uint32_t cuda_is_available();
-
-void *cuda_malloc(uint64_t size, uint32_t gpu_index);
-
-void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
-                                           uint32_t gpu_index,
-                                           uint64_t &size_tracker,
-                                           bool allocate_gpu_memory);
-
-void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
-
-bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
-uint64_t cuda_device_total_memory(uint32_t gpu_index);
-
-void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
-                                                 uint64_t size,
-                                                 cudaStream_t stream,
-                                                 uint32_t gpu_index,
-                                                 bool gpu_memory_allocated);
-
-void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
-                              cudaStream_t stream, uint32_t gpu_index);
-
-void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
-    void *dest, void const *src, uint64_t size, cudaStream_t stream,
-    uint32_t gpu_index, bool gpu_memory_allocated);
-
-void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
-                                  cudaStream_t stream, uint32_t gpu_index);
-
-void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
-                            uint32_t gpu_index);
-
-void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
-                              cudaStream_t stream, uint32_t gpu_index);
-
-void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
-                                          uint64_t size, cudaStream_t stream,
-                                          uint32_t gpu_index,
-                                          bool gpu_memory_allocated);
-
-void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
-                       cudaStream_t stream, uint32_t gpu_index);
-
-int cuda_get_number_of_gpus();
-
-int cuda_get_number_of_sms();
-
-void cuda_synchronize_device(uint32_t gpu_index);
-
-void cuda_drop(void *ptr, uint32_t gpu_index);
-
-void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
-                                        uint32_t gpu_index,
-                                        bool gpu_memory_allocated);
-
-void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
-}
-
-uint32_t cuda_get_max_shared_memory(uint32_t gpu_index);
-
-uint32_t cuda_get_max_shared_memory_per_block(uint32_t gpu_index);
-
-bool cuda_check_support_cooperative_groups();
-
-bool cuda_check_support_thread_block_clusters();
-
-template <typename Torus>
-void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
-                          Torus *d_array, Torus value, Torus n);
-
-#endif
--- a/backends/zk-cuda-backend/cuda/include/fp.h
+++ b/backends/zk-cuda-backend/cuda/include/fp.h
@@ -3,8 +3,7 @@
 #include <cstdint>
 #include <cuda_runtime.h>

-// Ceiling division: computes (M + N - 1) / N
-#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
+#include "device.h"

 // CUDA architecture constant
 #define CUDA_WARP_SIZE 32 // NVIDIA warp size (threads per warp)
--- a/backends/zk-cuda-backend/cuda/include/helper_profile.cuh
+++ b/backends/zk-cuda-backend/cuda/include/helper_profile.cuh
@@ -1,16 +0,0 @@
-#ifndef HELPER_PROFILE
-#define HELPER_PROFILE
-
-#ifdef USE_NVTOOLS
-#include <nvtx3/nvToolsExt.h>
-#endif
-
-void cuda_nvtx_label_with_color(const char *name);
-void cuda_nvtx_pop();
-
-#define PUSH_RANGE(name)                                                       \
-  { cuda_nvtx_label_with_color(name); }
-#define POP_RANGE()                                                            \
-  { cuda_nvtx_pop(); }
-
-#endif
--- a/backends/zk-cuda-backend/cuda/src/helper_profile.cu
+++ b/backends/zk-cuda-backend/cuda/src/helper_profile.cu
@@ -1,43 +0,0 @@
-#include "helper_profile.cuh"
-#include <stdint.h>
-
-uint32_t adler32(const unsigned char *data) {
-  const uint32_t MOD_ADLER = 65521;
-  uint32_t a = 1, b = 0;
-  size_t index;
-  for (index = 0; data[index] != 0; ++index) {
-    a = (a + data[index] * 2) % MOD_ADLER;
-    b = (b + a) % MOD_ADLER;
-  }
-  return (b << 16) | a;
-}
-
-void cuda_nvtx_label_with_color(const char *name) {
-#ifdef USE_NVTOOLS
-  int color_id = adler32((const unsigned char *)name);
-  int r, g, b;
-  r = color_id & 0x000000ff;
-  g = (color_id & 0x000ff000) >> 12;
-  b = (color_id & 0x0ff00000) >> 20;
-  if (r < 64 & g < 64 & b < 64) {
-    r = r * 3;
-    g = g * 3 + 64;
-    b = b * 4;
-  }
-
-  color_id = 0xff000000 | (r << 16) | (g << 8) | (b);
-  nvtxEventAttributes_t eventAttrib = {0};
-  eventAttrib.version = NVTX_VERSION;
-  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
-  eventAttrib.colorType = NVTX_COLOR_ARGB;
-  eventAttrib.color = color_id;
-  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
-  eventAttrib.message.ascii = name;
-  nvtxRangePushEx(&eventAttrib);
-#endif
-}
-void cuda_nvtx_pop() {
-#ifdef USE_NVTOOLS
-  nvtxRangePop();
-#endif
-}
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/benchmarks/CMakeLists.txt
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/benchmarks/CMakeLists.txt
@@ -15,29 +15,27 @@ set(BENCHMARK_ENABLE_GTEST_TESTS
    CACHE BOOL "" FORCE)
 FetchContent_MakeAvailable(googlebenchmark)

-# Path to tfhe-cuda-backend for device utilities
-set(TFHE_CUDA_BACKEND_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../tfhe-cuda-backend/cuda)
+set(TFHE_CUDA_COMMON_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../tfhe-cuda-common/cuda)

 # Path to main source (needed for CUDA device linking)
 set(ZK_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../src)
 set(ZK_PRIMITIVES_DIR ${ZK_SRC_DIR}/primitives)

-# Build device library from tfhe-cuda-backend
-add_library(tfhe_device_bench STATIC ${TFHE_CUDA_BACKEND_DIR}/src/device.cu
-                                     ${TFHE_CUDA_BACKEND_DIR}/src/utils/helper_profile.cu)
+add_library(common_device_bench STATIC ${TFHE_CUDA_COMMON_DIR}/src/device.cu
+                                       ${TFHE_CUDA_COMMON_DIR}/src/helper_profile.cu)
 set_target_properties(
-  tfhe_device_bench
+  common_device_bench
  PROPERTIES CUDA_SEPARABLE_COMPILATION ON
             POSITION_INDEPENDENT_CODE ON
             CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_include_directories(tfhe_device_bench PUBLIC ${TFHE_CUDA_BACKEND_DIR}/include)
-target_link_libraries(tfhe_device_bench PUBLIC cudart)
+target_include_directories(common_device_bench PUBLIC ${TFHE_CUDA_COMMON_DIR}/include)
+target_link_libraries(common_device_bench PUBLIC cudart)

 # Benchmark executable for Fp Include fp.cu, fp2.cu and curve.cu directly to enable proper CUDA device linking with
 # benchmark kernels (curve.cu depends on both fp and fp2)
 add_executable(benchmark_fp benchmark_fp.cu ../tests/primitives/fp_helpers.cu ${ZK_PRIMITIVES_DIR}/fp.cu
                            ${ZK_PRIMITIVES_DIR}/fp2.cu ${ZK_SRC_DIR}/curve.cu)
-target_link_libraries(benchmark_fp tfhe_device_bench benchmark::benchmark benchmark::benchmark_main)
+target_link_libraries(benchmark_fp common_device_bench benchmark::benchmark benchmark::benchmark_main)
 target_include_directories(benchmark_fp PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include)
 set_target_properties(
  benchmark_fp
@@ -49,7 +47,7 @@ set_target_properties(
 # benchmark kernels
 add_executable(benchmark_fp2 benchmark_fp2.cu ../tests/primitives/fp2_helpers.cu ${ZK_PRIMITIVES_DIR}/fp.cu
                             ${ZK_PRIMITIVES_DIR}/fp2.cu ${ZK_SRC_DIR}/curve.cu)
-target_link_libraries(benchmark_fp2 tfhe_device_bench benchmark::benchmark benchmark::benchmark_main)
+target_link_libraries(benchmark_fp2 common_device_bench benchmark::benchmark benchmark::benchmark_main)
 target_include_directories(benchmark_fp2 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include)
 set_target_properties(
  benchmark_fp2
@@ -59,7 +57,7 @@ set_target_properties(

 # Benchmark executable for MSM
 add_executable(benchmark_msm benchmark_msm.cu)
-target_link_libraries(benchmark_msm zk_cuda_backend tfhe_device_bench benchmark::benchmark benchmark::benchmark_main)
+target_link_libraries(benchmark_msm zk_cuda_backend common_device_bench benchmark::benchmark benchmark::benchmark_main)
 set_target_properties(
  benchmark_msm
  PROPERTIES CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/CMakeLists.txt
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/CMakeLists.txt
@@ -12,29 +12,26 @@ set(gtest_force_shared_crt
    CACHE BOOL "" FORCE)
 FetchContent_MakeAvailable(googletest)

-# Path to tfhe-cuda-backend for device utilities
-set(TFHE_CUDA_BACKEND_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../tfhe-cuda-backend/cuda)
+set(TFHE_CUDA_COMMON_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../tfhe-cuda-common/cuda)

 # Path to main source (needed for CUDA device linking)
 set(ZK_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../src)
 set(ZK_PRIMITIVES_DIR ${ZK_SRC_DIR}/primitives)

-# Build device library from tfhe-cuda-backend
-add_library(tfhe_device STATIC ${TFHE_CUDA_BACKEND_DIR}/src/device.cu
-                               ${TFHE_CUDA_BACKEND_DIR}/src/utils/helper_profile.cu)
+add_library(common_device STATIC ${TFHE_CUDA_COMMON_DIR}/src/device.cu ${TFHE_CUDA_COMMON_DIR}/src/helper_profile.cu)
 set_target_properties(
-  tfhe_device
+  common_device
  PROPERTIES CUDA_SEPARABLE_COMPILATION ON
             POSITION_INDEPENDENT_CODE ON
             CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_include_directories(tfhe_device PUBLIC ${TFHE_CUDA_BACKEND_DIR}/include)
-target_link_libraries(tfhe_device PUBLIC cudart)
+target_include_directories(common_device PUBLIC ${TFHE_CUDA_COMMON_DIR}/include)
+target_link_libraries(common_device PUBLIC cudart)

 # Test executable for Fp Include fp.cu, fp2.cu and curve.cu directly to enable proper CUDA device linking with test
 # kernels (curve.cu depends on both fp and fp2)
 add_executable(test_fp primitives/test_fp.cu primitives/test_fp_gpu_helpers.cu primitives/fp_helpers.cu
                       ${ZK_PRIMITIVES_DIR}/fp.cu ${ZK_PRIMITIVES_DIR}/fp2.cu ${ZK_SRC_DIR}/curve.cu)
-target_link_libraries(test_fp tfhe_device GTest::gtest_main)
+target_link_libraries(test_fp common_device GTest::gtest_main)
 target_include_directories(test_fp PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include)
 set_target_properties(
  test_fp
@@ -46,7 +43,7 @@ set_target_properties(
 # kernels
 add_executable(test_fp2 primitives/test_fp2.cu primitives/test_fp2_gpu_helpers.cu primitives/fp2_helpers.cu
                        ${ZK_PRIMITIVES_DIR}/fp.cu ${ZK_PRIMITIVES_DIR}/fp2.cu ${ZK_SRC_DIR}/curve.cu)
-target_link_libraries(test_fp2 tfhe_device GTest::gtest_main)
+target_link_libraries(test_fp2 common_device GTest::gtest_main)
 target_include_directories(test_fp2 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include)
 set_target_properties(
  test_fp2
@@ -56,7 +53,7 @@ set_target_properties(

 # Test executable for MSM
 add_executable(test_msm test_msm.cu)
-target_link_libraries(test_msm zk_cuda_backend tfhe_device GTest::gtest_main)
+target_link_libraries(test_msm zk_cuda_backend common_device GTest::gtest_main)
 set_target_properties(
  test_msm
  PROPERTIES CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}
@@ -65,7 +62,7 @@ set_target_properties(

 # Test executable for point operations
 add_executable(test_point_ops test_point_ops.cu)
-target_link_libraries(test_point_ops zk_cuda_backend tfhe_device GTest::gtest_main)
+target_link_libraries(test_point_ops zk_cuda_backend common_device GTest::gtest_main)
 set_target_properties(
  test_point_ops
  PROPERTIES CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}
--- a/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/CMakeLists.txt
+++ b/backends/zk-cuda-backend/cuda/tests_and_benchmarks/tests/basic/CMakeLists.txt
@@ -5,7 +5,7 @@ set(ZK_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../include)

 # basic_fp_ops: host-side Fp field arithmetic
 add_executable(basic_fp_ops basic_fp_ops.cu)
-target_link_libraries(basic_fp_ops zk_cuda_backend tfhe_device)
+target_link_libraries(basic_fp_ops zk_cuda_backend common_device)
 target_include_directories(basic_fp_ops PRIVATE ${ZK_INCLUDE_DIR})
 set_target_properties(
  basic_fp_ops
@@ -15,7 +15,7 @@ set_target_properties(

 # basic_curve_ops: host-side G1 elliptic curve operations
 add_executable(basic_curve_ops basic_curve_ops.cu)
-target_link_libraries(basic_curve_ops zk_cuda_backend tfhe_device)
+target_link_libraries(basic_curve_ops zk_cuda_backend common_device)
 target_include_directories(basic_curve_ops PRIVATE ${ZK_INCLUDE_DIR})
 set_target_properties(
  basic_curve_ops
@@ -25,7 +25,7 @@ set_target_properties(

 # basic_msm: GPU-accelerated multi-scalar multiplication
 add_executable(basic_msm basic_msm.cu)
-target_link_libraries(basic_msm zk_cuda_backend tfhe_device)
+target_link_libraries(basic_msm zk_cuda_backend common_device)
 target_include_directories(basic_msm PRIVATE ${ZK_INCLUDE_DIR})
 set_target_properties(
  basic_msm
--- a/backends/zk-cuda-backend/src/lib.rs
+++ b/backends/zk-cuda-backend/src/lib.rs
@@ -44,9 +44,7 @@
 //! let g1_affine_again = g1_proj.to_affine();
 //! ```

-// Force linking of tfhe-cuda-backend which provides device utilities (cuda_malloc, cuda_set_device,
-// etc.)
-extern crate tfhe_cuda_backend;
+extern crate tfhe_cuda_common;

 // Auto-generated bindgen bindings (matching tfhe-cuda-backend pattern)
 #[allow(warnings)]
--- a/backends/zk-cuda-backend/src/types/mod.rs
+++ b/backends/zk-cuda-backend/src/types/mod.rs
@@ -193,9 +193,9 @@ mod tests {
        let gen = G1Affine::new(Fp::new(G1_GENERATOR_X), Fp::new(G1_GENERATOR_Y), false);
        let one = Scalar::from_u64(1);

-        let stream = unsafe { tfhe_cuda_backend::cuda_bind::cuda_create_stream(0) };
+        let stream = unsafe { tfhe_cuda_common::cuda_bind::cuda_create_stream(0) };
        let result = G1Projective::msm(&[gen], &[one], stream, 0, false).unwrap();
-        unsafe { tfhe_cuda_backend::cuda_bind::cuda_destroy_stream(stream, 0) };
+        unsafe { tfhe_cuda_common::cuda_bind::cuda_destroy_stream(stream, 0) };

        // from_montgomery_normalized() normalizes (divides by Z in Montgomery form)
        // then converts X, Y to normal form and sets Z = 1 (normal). The resulting
@@ -218,9 +218,9 @@ mod tests {
        let gen = G2Affine::new(x, y, false);
        let one = Scalar::from_u64(1);

-        let stream = unsafe { tfhe_cuda_backend::cuda_bind::cuda_create_stream(0) };
+        let stream = unsafe { tfhe_cuda_common::cuda_bind::cuda_create_stream(0) };
        let result = G2Projective::msm(&[gen], &[one], stream, 0, false).unwrap();
-        unsafe { tfhe_cuda_backend::cuda_bind::cuda_destroy_stream(stream, 0) };
+        unsafe { tfhe_cuda_common::cuda_bind::cuda_destroy_stream(stream, 0) };

        // Same approach: extract affine coordinates directly from normalized projective
        let normalized = result.from_montgomery_normalized();
--- a/tfhe-zk-pok/Cargo.toml
+++ b/tfhe-zk-pok/Cargo.toml
@@ -25,8 +25,8 @@ zeroize = "1.7.0"
 num-bigint = "0.4.5"
 tfhe-versionable = { version = "0.7.0", path = "../utils/tfhe-versionable" }
 tfhe-safe-serialize = { version = "0.1.0", path = "../utils/tfhe-safe-serialize" }
-zk-cuda-backend = { version = "0.1.0", path = "../backends/zk-cuda-backend", optional = true }
-tfhe-cuda-backend = { version = "=0.14.0", path = "../backends/tfhe-cuda-backend", optional = true }
+zk-cuda-backend = { version = "0.1.1", path = "../backends/zk-cuda-backend", optional = true }
+tfhe-cuda-common = { version = "0.1.0", path = "../backends/tfhe-cuda-common", optional = true }
 itertools.workspace = true
 [target.'cfg(target_family = "wasm")'.dependencies]
 getrandom = { workspace = true, features = ["js"] }
@@ -37,7 +37,7 @@ wasm-par-mq = { version = "0.1.0", path = "../utils/wasm-par-mq", features = [
 [features]
 experimental = []
 cross-origin-wasm = ["dep:wasm-par-mq"]
-gpu-experimental = ["dep:zk-cuda-backend", "dep:tfhe-cuda-backend"]
+gpu-experimental = ["dep:zk-cuda-backend", "dep:tfhe-cuda-common"]

 [dev-dependencies]
 serde_json = "~1.0"
--- a/tfhe-zk-pok/src/gpu/mod.rs
+++ b/tfhe-zk-pok/src/gpu/mod.rs
@@ -14,7 +14,7 @@ use crate::curve_api::bls12_446::{G1Affine, G2Affine, Zp, G1, G2};
 use crate::curve_api::CurveGroupOps;
 use ark_ec::CurveGroup;
 use ark_ff::{BigInt, MontFp, PrimeField};
-use tfhe_cuda_backend::cuda_bind::{
+use tfhe_cuda_common::cuda_bind::{
    cuda_create_stream, cuda_destroy_stream, cuda_get_number_of_gpus,
 };
 use zk_cuda_backend::{G1Affine as CudaG1Affine, G2Affine as CudaG2Affine, Scalar as CudaScalar};
--- a/tfhe-zk-pok/src/gpu/tests/zk_cuda_backend.rs
+++ b/tfhe-zk-pok/src/gpu/tests/zk_cuda_backend.rs
@@ -3,7 +3,7 @@
 use crate::curve_api::bls12_446::{Zp, G1, G2};
 use crate::curve_api::CurveGroupOps;
 use crate::gpu::{g1_affine_from_cuda, g1_affine_to_cuda, g2_affine_from_cuda, g2_affine_to_cuda};
-use tfhe_cuda_backend::cuda_bind::{cuda_create_stream, cuda_destroy_stream};
+use tfhe_cuda_common::cuda_bind::{cuda_create_stream, cuda_destroy_stream};
 use zk_cuda_backend::conversions::{g1_affine_from_montgomery, g2_affine_from_montgomery};
 use zk_cuda_backend::{
    G1Affine as CudaG1Affine, G1Projective as CudaG1Projective, G2Affine as CudaG2Affine,
--- a/tfhe/Cargo.toml
+++ b/tfhe/Cargo.toml
@@ -64,9 +64,9 @@ tfhe-fft = { version = "0.10.1", path = "../tfhe-fft", features = [
    "serde",
    "fft128",
 ] }
-tfhe-ntt = { version = "0.7.1", path = "../tfhe-ntt" }
+tfhe-ntt = { version = "0.7.0", path = "../tfhe-ntt" }
 pulp = { workspace = true, features = ["default"] }
-tfhe-cuda-backend = { version = "0.14.0", path = "../backends/tfhe-cuda-backend", optional = true }
+tfhe-cuda-backend = { version = "0.14.1", path = "../backends/tfhe-cuda-backend", optional = true }
 aligned-vec = { workspace = true, features = ["default", "serde"] }
 dyn-stack = { workspace = true, features = ["default"] }
 paste = { workspace = true }
--- a/tfhe/src/shortint/oprf.rs
+++ b/tfhe/src/shortint/oprf.rs
@@ -1477,4 +1477,67 @@ pub(crate) mod test {
            }
        }
    }
+
+    #[test]
+    fn oprf_test_uniformity_bits_ci_run_filter() {
+        let sample_count: usize = 100_000;
+
+        let p_value_limit: f64 = 0.000_01;
+
+        use crate::shortint::gen_keys;
+        use crate::shortint::parameters::test_params::{
+            TEST_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_TUNIFORM_2M128,
+            TEST_PARAM_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+        };
+        use crate::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
+
+        for params in [
+            ShortintParameterSet::from(
+                TEST_PARAM_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+            ),
+            ShortintParameterSet::from(PARAM_MESSAGE_2_CARRY_2_KS_PBS),
+            ShortintParameterSet::from(TEST_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_TUNIFORM_2M128),
+        ] {
+            let (ck, sk) = gen_keys(params);
+            let oprf_ck = OprfPrivateKey::new(&ck);
+            let oprf_sk = OprfServerKey::new(&oprf_ck, &ck).unwrap();
+
+            let random_bits_per_block = sk.message_modulus.0.ilog2() as u64;
+
+            for random_bits_count in [3u64, 4] {
+                let expected_num_blocks =
+                    random_bits_count.div_ceil(random_bits_per_block) as usize;
+
+                test_uniformity(
+                    sample_count,
+                    p_value_limit,
+                    1 << random_bits_count,
+                    |seed| {
+                        let seed = (seed as u128).to_le_bytes();
+                        let blocks = oprf_sk.generate_oblivious_pseudo_random_bits(
+                            seed.as_slice(),
+                            random_bits_count,
+                            &sk,
+                        );
+
+                        let mut combined: u64 = 0;
+                        let mut shift = 0u64;
+                        for (i, block) in blocks.iter().enumerate() {
+                            let decrypted = ck.decrypt_message_and_carry(block);
+                            let block_bits = bits_in_block(
+                                i,
+                                expected_num_blocks,
+                                random_bits_count,
+                                random_bits_per_block,
+                            );
+                            combined |= decrypted << shift;
+                            shift += block_bits;
+                        }
+
+                        combined
+                    },
+                );
+            }
+        }
+    }
 }
Author	SHA1	Message	Date
Pedro Alves	eff405bab1	fix(gpu): define CUDA_ARCH in tfhe-cuda-common CMakeLists device.cu uses CUDA_ARCH in preprocessor conditionals (e.g., cuda_check_support_thread_block_clusters relies on #if CUDA_ARCH >= 900). Without this define, CUDA_ARCH defaults to 0, silently disabling thread block cluster support on Hopper GPUs.	2026-04-23 20:35:27 -03:00
Pedro Alves	6932138917	chore(gpu): bump tfhe-cuda-backend to 0.14.1 and zk-cuda-backend to 0.1.1	2026-04-23 16:42:55 -03:00
Pedro Alves	a1cc8c4883	chore(gpu): extract shared CUDA device utilities into tfhe-rs-cuda-common	2026-04-23 16:35:30 -03:00