feat(gpu): optimize BLS12-446 field arithmetic for MSM performance

- Replace 64-bit CIOS Montgomery multiplication with 32-bit MAD chains (mad.lo.cc/madc.hi.cc), exploiting native 2x throughput of 32-bit ops on NVIDIA GPUs via even/odd accumulator separation - Add fp_mont_sqr using a triangular MAD chain (upper triangle computed once and doubled, diagonal added separately), saving of the multiplications versus treating squaring as a general multiplication - Add fp_add_lazy/fp_sub_lazy (and Fp2 variants): skip the final conditional subtraction when the result feeds fp_mont_mul, which accepts inputs in [0, 2p). Wired into fp2_mont_mul, fp2_mont_square, and G1/G2 projective_point_double - Replace all fp_mont_mul(c, a, a) squaring patterns with fp_mont_sqr across curve.cu and fp2.cu (point addition, doubling, inversion)
chore(gpu): optimize CI
2026-04-28 03:01:21 -04:00 · 2026-04-15 15:24:52 +04:00 · 2026-04-15 12:48:31 +02:00 · 2026-04-14 16:24:33 +02:00
43 changed files with 771 additions and 135 deletions
--- a/.github/workflows/gpu_core_h100_tests.yml
+++ b/.github/workflows/gpu_core_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [ labeled, opened, synchronize ]

 permissions:
  contents: read
@@ -38,6 +38,7 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -62,15 +63,16 @@ jobs:
              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
              - tfhe/src/shortint/parameters/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_core_h100_tests.yml'
+            core_crypto:
+              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_core_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
--- a/.github/workflows/gpu_hlapi_h100_tests.yml
+++ b/.github/workflows/gpu_hlapi_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [ labeled, opened, synchronize ]

 permissions:
  contents: read
@@ -38,6 +38,7 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -65,13 +66,15 @@ jobs:
              - tfhe/src/c_api/**
              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_hlapi_h100_tests.yml'
+            core_crypto:
+              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_hlapi_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -17,8 +17,8 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  schedule:
-    # Nightly tests will be triggered each evening 8p.m.
-    - cron: "0 20 * * *"
+    # Weekly tests will be triggered every Monday at 8p.m.
+    - cron: "0 20 * * 1"
  pull_request:


@@ -28,10 +28,41 @@ permissions:
 # zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning

 jobs:
+  should-run:
+    name: gpu_integer_long_run_tests/should-run
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: read  # Needed to check for file change
+    outputs:
+      is_needed_in_gpu_ci: ${{ env.IS_PR == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
+        with:
+          files_yaml: |
+            gpu:
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/tfhe-cuda-backend/**
+              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/gpu/**
+              - tfhe/src/shortint/parameters/**
+              - '.github/workflows/gpu_integer_long_run_tests.yml'
+
  setup-instance:
    name: gpu_integer_long_run_tests/setup-instance
-    if: github.event_name != 'schedule' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
+    needs: [should-run]
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      needs.should-run.outputs.is_needed_in_gpu_ci == 'true'
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -131,6 +131,10 @@ jobs:
        env:
          GCC_VERSION: ${{ matrix.gcc }}

+      - name: Run semgrep and lint checks on CUDA code
+        run: |
+          make semgrep_and_lint_gpu_code
+
      - name: Run fmt checks
        run: |
          make check_fmt_gpu
@@ -139,10 +143,6 @@ jobs:
        run: |
          make pcc_gpu

-      - name: Run semgrep and lint checks on CUDA code
-        run: |
-          make semgrep_and_lint_gpu_code
-
      - name: Run semver checks on tfhe-cuda-backend
        run: |
          make semver_check_cuda_backend
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -63,7 +63,6 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_classic_tests.yml'
              - scripts/integer-tests.sh

--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [ labeled, opened, synchronize ]

 permissions:
  contents: read
@@ -38,6 +38,7 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -63,16 +64,17 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_h100_tests.yml'
              - scripts/integer-tests.sh
+            core_crypto:
+              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_signed_integer_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -64,7 +64,6 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_signed_integer_tests.yml'
              - scripts/integer-tests.sh

--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -63,7 +63,6 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_classic_tests.yml'
              - scripts/integer-tests.sh

--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -23,7 +23,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [ labeled, opened, synchronize ]

 permissions:
  contents: read
@@ -38,6 +38,7 @@ jobs:
      pull-requests: read  # Needed to check for file change
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
+      core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -63,16 +64,17 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_h100_tests.yml'
              - scripts/integer-tests.sh
+            core_crypto:
+              - tfhe/src/core_crypto/gpu/**

  setup-instance:
    name: gpu_unsigned_integer_h100_tests/setup-instance
    needs: should-run
    if: github.event_name == 'workflow_dispatch' ||
-      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -64,7 +64,6 @@ jobs:
              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_unsigned_integer_tests.yml'
              - scripts/integer-tests.sh

--- a/.github/workflows/gpu_zk_tests.yml
+++ b/.github/workflows/gpu_zk_tests.yml
@@ -55,12 +55,9 @@ jobs:
              - tfhe/build.rs
              - backends/tfhe-cuda-backend/**
              - backends/zk-cuda-backend/**
-              - tfhe/src/core_crypto/gpu/**
-              - tfhe/src/integer/gpu/**
              - tfhe/src/shortint/parameters/**
              - tfhe/src/zk/**
              - tfhe-zk-pok/**
-              - 'tfhe/docs/**/**.md'
              - '.github/workflows/gpu_zk_tests.yml'
              - ci/slab.toml

--- a/backends/zk-cuda-backend/cuda/include/curve.h
+++ b/backends/zk-cuda-backend/cuda/include/curve.h
@@ -17,7 +17,13 @@ __host__ __device__ void fp2_zero(Fp2 &a);

 // G1 point: (x, y) coordinates in Fp
 // Curve equation: y^2 = x^3 + b (short Weierstrass form with a = 0)
-struct G1Affine {
+//
+// alignas(sizeof(uint64_t)): The bool infinity field causes the struct to be
+// padded to the largest field alignment (4 bytes in 32-bit limb mode, 8 bytes
+// in 64-bit). Forcing alignment to sizeof(uint64_t) ensures
+// sizeof(G1Affine)==120 in both modes, matching the Rust FFI bindings which
+// are always generated from the 64-bit layout regardless of LIMB_BITS_CONFIG.
+struct alignas(sizeof(uint64_t)) G1Affine {
  Fp x;
  Fp y;
  bool infinity; // true if point at infinity (identity element)
@@ -36,7 +42,9 @@ struct G1Affine {

 // G2 point: (x, y) coordinates in Fp2
 // Curve equation: y^2 = x^3 + b' (twisted curve over Fp2)
-struct G2Affine {
+//
+// alignas(sizeof(uint64_t)): same ABI-stability reason as G1Affine above.
+struct alignas(sizeof(uint64_t)) G2Affine {
  Fp2 x;
  Fp2 y;
  bool infinity; // true if point at infinity (identity element)
--- a/backends/zk-cuda-backend/cuda/include/fp.h
+++ b/backends/zk-cuda-backend/cuda/include/fp.h
@@ -18,7 +18,7 @@
 // Supported values: 32, 64.
 // ============================================================================
 #ifndef LIMB_BITS_CONFIG
-#define LIMB_BITS_CONFIG 64
+#define LIMB_BITS_CONFIG 32
 #endif

 #if LIMB_BITS_CONFIG == 64
@@ -209,6 +209,17 @@ __host__ __device__ void fp_add(Fp &c, const Fp &a, const Fp &b);
 // MONTGOMERY: Both inputs and output must be in Montgomery form
 __host__ __device__ void fp_sub(Fp &c, const Fp &a, const Fp &b);

+// Lazy addition: c = a + b, output in [0, 2p) for inputs in [0, p).
+// Skips the final conditional subtraction of fp_add.
+// Safe as input to fp_mont_mul (CIOS accepts [0, 2p)); NOT safe for final
+// results or as input to fp_sub/fp_neg which require [0, p) inputs.
+__host__ __device__ void fp_add_lazy(Fp &c, const Fp &a, const Fp &b);
+
+// Lazy subtraction: c ≡ a - b (mod p), output in [0, 2p) for inputs in [0, p).
+// Adds p unconditionally, skipping the borrow-select of fp_sub.
+// Same safety concerns as fp_add_lazy.
+__host__ __device__ void fp_sub_lazy(Fp &c, const Fp &a, const Fp &b);
+
 // Multiplication: c = a * b (without reduction)
 // "Raw" means the operation is performed without modular reduction modulo p.
 // The result is stored in double-width (2*FP_LIMBS limbs) and may be >= p.
@@ -225,6 +236,11 @@ __host__ __device__ void fp_mont_reduce(Fp &c, const UNSIGNED_LIMB *a);
 // Both a and b are in Montgomery form, result is in Montgomery form
 __host__ __device__ void fp_mont_mul(Fp &c, const Fp &a, const Fp &b);

+// Montgomery squaring: c = (a^2 * R_INV) mod p
+// Both input and output in Montgomery form.
+// On device uses a triangular MAD chain (fewer multiplications).
+__host__ __device__ void fp_mont_sqr(Fp &c, const Fp &a);
+
 // CONVERSION: Input is normal form, output is Montgomery form
 __host__ __device__ void fp_to_montgomery(Fp &c, const Fp &a);

--- a/backends/zk-cuda-backend/cuda/include/fp2.h
+++ b/backends/zk-cuda-backend/cuda/include/fp2.h
@@ -72,6 +72,11 @@ __host__ __device__ void fp2_add(Fp2 &c, const Fp2 &a, const Fp2 &b);
 // Subtraction: c = a - b
 __host__ __device__ void fp2_sub(Fp2 &c, const Fp2 &a, const Fp2 &b);

+// Lazy add/sub: each component output in [0, 2p) for inputs in [0, p).
+// Safe as input to fp2_mont_mul; same contract as fp_add_lazy / fp_sub_lazy.
+__host__ __device__ void fp2_add_lazy(Fp2 &c, const Fp2 &a, const Fp2 &b);
+__host__ __device__ void fp2_sub_lazy(Fp2 &c, const Fp2 &a, const Fp2 &b);
+
 // Multiplication: c = a * b
 // (a0 + a1*i) * (b0 + b1*i) = (a0*b0 - a1*b1) + (a0*b1 + a1*b0)*i
 // NOTE: Assumes inputs are in normal form and converts to/from Montgomery
@@ -84,7 +89,7 @@ __host__ __device__ void fp2_mont_mul(Fp2 &c, const Fp2 &a, const Fp2 &b);
 // Montgomery squaring: c = a^2 (all in Montgomery form)
 // Uses the complex-squaring identity: c0 = (a0+a1)(a0-a1), c1 = 2*a0*a1
 // Only 2 Fp multiplications vs 3 for fp2_mont_mul(c, a, a).
-// NOTE: All inputs and outputs are in Montgomery form (no conversions)
+// NOTE: All inputs should be in Montgomery form
 __host__ __device__ void fp2_mont_square(Fp2 &c, const Fp2 &a);

 // Squaring: c = a^2
--- a/backends/zk-cuda-backend/cuda/src/curve.cu
+++ b/backends/zk-cuda-backend/cuda/src/curve.cu
@@ -1413,7 +1413,7 @@ __host__ __device__ void projective_point_add(G1Projective &result,
  u = Y2Z1 - Y1Z2;

  // uu = u^2
-  fp_mont_mul(uu, u, u);
+  fp_mont_sqr(uu, u);

  // v = X2 * Z1 - X1 * Z2 = X2*Z1 - X1Z2
  Fp X2Z1;
@@ -1428,7 +1428,7 @@ __host__ __device__ void projective_point_add(G1Projective &result,
  }

  // vv = v^2
-  fp_mont_mul(vv, v, v);
+  fp_mont_sqr(vv, v);
  // vvv = v * vv
  fp_mont_mul(vvv, v, vv);

@@ -1568,9 +1568,9 @@ __host__ __device__ void projective_mixed_add(G1Projective &result,
  }

  // uu = u^2
-  fp_mont_mul(uu, u, u);
+  fp_mont_sqr(uu, u);
  // vv = v^2
-  fp_mont_mul(vv, v, v);
+  fp_mont_sqr(vv, v);
  // vvv = v * vv
  fp_mont_mul(vvv, v, vv);

@@ -1692,7 +1692,7 @@ __host__ __device__ void projective_point_double(G1Projective &result,

  // A = 3 * X^2
  Fp X_sq, A;
-  fp_mont_mul(X_sq, p.X, p.X);
+  fp_mont_sqr(X_sq, p.X);
  fp_mul3(A, X_sq);

  // B = Y * Z
@@ -1706,7 +1706,7 @@ __host__ __device__ void projective_point_double(G1Projective &result,

  // D = A^2 - 8*C
  Fp A_sq, eight_C;
-  fp_mont_mul(A_sq, A, A);
+  fp_mont_sqr(A_sq, A);
  fp_mul8(eight_C, C);
  Fp D = A_sq - eight_C;

@@ -1716,14 +1716,16 @@ __host__ __device__ void projective_point_double(G1Projective &result,
  fp_double(result.X, BD);

  // Y3 = A * (4*C - D) - 8 * Y^2 * B^2
-  Fp four_C, A_times_diff;
+  Fp four_C, four_C_minus_D, A_times_diff;
  fp_mul4(four_C, C);
-  Fp four_C_minus_D = four_C - D;
+  // Lazy sub: four_C_minus_D feeds fp_mont_mul, so skip the conditional
+  // subtract and output in [0, 2p) instead of [0, p).
+  fp_sub_lazy(four_C_minus_D, four_C, D);
  fp_mont_mul(A_times_diff, A, four_C_minus_D);

  Fp Y_sq, B_sq, Y_sq_B_sq, eight_Y_sq_B_sq;
-  fp_mont_mul(Y_sq, p.Y, p.Y);
-  fp_mont_mul(B_sq, B, B);
+  fp_mont_sqr(Y_sq, p.Y);
+  fp_mont_sqr(B_sq, B);
  fp_mont_mul(Y_sq_B_sq, Y_sq, B_sq);
  fp_mul8(eight_Y_sq_B_sq, Y_sq_B_sq);
  result.Y = A_times_diff - eight_Y_sq_B_sq;
@@ -1773,9 +1775,13 @@ __host__ __device__ void projective_point_double(G2Projective &result,
  fp2_double(result.X, BD);

  // Y3 = A * (4*C - D) - 8 * Y^2 * B^2
-  Fp2 four_C, A_times_diff;
+  Fp2 four_C, four_C_minus_D, A_times_diff;
  fp2_mul4(four_C, C);
-  Fp2 four_C_minus_D = four_C - D;
+
+  // we can't use lazy sub here because for fp2 with Karatsuba path we will end
+  // up with values in [0, 4p) instead of [0, 2p), which would break the final
+  // result
+  fp2_sub(four_C_minus_D, four_C, D);
  fp2_mont_mul(A_times_diff, A, four_C_minus_D);

  Fp2 Y_sq, B_sq, Y_sq_B_sq, eight_Y_sq_B_sq;
--- a/backends/zk-cuda-backend/cuda/src/primitives/fp.cu
+++ b/backends/zk-cuda-backend/cuda/src/primitives/fp.cu
@@ -7,8 +7,6 @@
 #include <cuda_runtime.h>

 // For CUDA device code, we use __constant__ memory
-// Constants are hardcoded at compile time (like sppark) to avoid
-// cudaMemcpyToSymbol
 // Note: DEVICE_MODULUS is in normal form (not Montgomery)
 __constant__ const Fp DEVICE_MODULUS = {BLS12_446_MODULUS_LIMBS};

@@ -104,7 +102,7 @@ __host__ __device__ ComparisonType fp_cmp(const Fp &a, const Fp &b) {

 __host__ __device__ bool fp_is_zero(const Fp &a) {
  // By doing this way we avoid branching
-  uint64_t acc = 0;
+  UNSIGNED_LIMB acc = 0;
  for (int i = 0; i < FP_LIMBS; i++) {
    acc |= a.limb[i];
  }
@@ -114,8 +112,8 @@ __host__ __device__ bool fp_is_zero(const Fp &a) {
 __host__ __device__ bool fp_is_one(const Fp &a) {
  if (a.limb[0] != 1)
    return false;
-  // By doing this way we avoid branching
-  uint64_t acc = 0;
+  // All higher limbs must be zero.
+  UNSIGNED_LIMB acc = 0;
  for (int i = 1; i < FP_LIMBS; i++) {
    acc |= a.limb[i];
  }
@@ -207,6 +205,40 @@ __host__ __device__ UNSIGNED_LIMB fp_add_raw(Fp &c, const Fp &a, const Fp &b) {
        "l"(b.limb[1]), "l"(b.limb[2]), "l"(b.limb[3]), "l"(b.limb[4]),
        "l"(b.limb[5]), "l"(b.limb[6]));
  return carry_out;
+#elif defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 32
+  // 32-bit PTX carry chain: add.cc.u32 sets the hardware carry flag,
+  // addc.cc.u32 propagates it. Eliminates software carry-detect comparisons
+  // across all 14 limbs.
+  // Operand map: %0..%13 = c[0..13], %14 = carry_out,
+  //              %15..%28 = a[0..13], %29..%42 = b[0..13].
+  uint32_t carry_out;
+  asm("add.cc.u32   %0,  %15, %29;\n\t" // c[0]  = a[0]  + b[0],  set CF
+      "addc.cc.u32  %1,  %16, %30;\n\t" // c[1]  = a[1]  + b[1]  + CF
+      "addc.cc.u32  %2,  %17, %31;\n\t" // c[2]  = a[2]  + b[2]  + CF
+      "addc.cc.u32  %3,  %18, %32;\n\t" // c[3]  = a[3]  + b[3]  + CF
+      "addc.cc.u32  %4,  %19, %33;\n\t" // c[4]  = a[4]  + b[4]  + CF
+      "addc.cc.u32  %5,  %20, %34;\n\t" // c[5]  = a[5]  + b[5]  + CF
+      "addc.cc.u32  %6,  %21, %35;\n\t" // c[6]  = a[6]  + b[6]  + CF
+      "addc.cc.u32  %7,  %22, %36;\n\t" // c[7]  = a[7]  + b[7]  + CF
+      "addc.cc.u32  %8,  %23, %37;\n\t" // c[8]  = a[8]  + b[8]  + CF
+      "addc.cc.u32  %9,  %24, %38;\n\t" // c[9]  = a[9]  + b[9]  + CF
+      "addc.cc.u32  %10, %25, %39;\n\t" // c[10] = a[10] + b[10] + CF
+      "addc.cc.u32  %11, %26, %40;\n\t" // c[11] = a[11] + b[11] + CF
+      "addc.cc.u32  %12, %27, %41;\n\t" // c[12] = a[12] + b[12] + CF
+      "addc.cc.u32  %13, %28, %42;\n\t" // c[13] = a[13] + b[13] + CF
+      "addc.u32     %14, 0,   0;\n\t"   // carry_out = 0 + 0 + CF (0 or 1)
+      : "=r"(c.limb[0]), "=r"(c.limb[1]), "=r"(c.limb[2]), "=r"(c.limb[3]),
+        "=r"(c.limb[4]), "=r"(c.limb[5]), "=r"(c.limb[6]), "=r"(c.limb[7]),
+        "=r"(c.limb[8]), "=r"(c.limb[9]), "=r"(c.limb[10]), "=r"(c.limb[11]),
+        "=r"(c.limb[12]), "=r"(c.limb[13]), "=r"(carry_out)
+      : "r"(a.limb[0]), "r"(a.limb[1]), "r"(a.limb[2]), "r"(a.limb[3]),
+        "r"(a.limb[4]), "r"(a.limb[5]), "r"(a.limb[6]), "r"(a.limb[7]),
+        "r"(a.limb[8]), "r"(a.limb[9]), "r"(a.limb[10]), "r"(a.limb[11]),
+        "r"(a.limb[12]), "r"(a.limb[13]), "r"(b.limb[0]), "r"(b.limb[1]),
+        "r"(b.limb[2]), "r"(b.limb[3]), "r"(b.limb[4]), "r"(b.limb[5]),
+        "r"(b.limb[6]), "r"(b.limb[7]), "r"(b.limb[8]), "r"(b.limb[9]),
+        "r"(b.limb[10]), "r"(b.limb[11]), "r"(b.limb[12]), "r"(b.limb[13]));
+  return static_cast<UNSIGNED_LIMB>(carry_out);
 #else
  // Host path: portable software carry detection
  UNSIGNED_LIMB carry = 0;
@@ -248,6 +280,41 @@ __host__ __device__ UNSIGNED_LIMB fp_sub_raw(Fp &c, const Fp &a, const Fp &b) {
  // subc.u64 with 0-0-CF produces 0 if no borrow, or 0xFFFFFFFFFFFFFFFF if
  // borrow. Normalize to 0/1 for callers that check (borrow != 0) or add it.
  return borrow_out & 1;
+#elif defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 32
+  // 32-bit PTX borrow chain: sub.cc.u32 sets the hardware borrow flag,
+  // subc.cc.u32 propagates it across all 14 limbs.
+  // subc.u32 with 0-0-BF gives 0xFFFFFFFF on borrow; normalize to 0/1.
+  // Operand map: %0..%13 = c[0..13], %14 = borrow_out,
+  //              %15..%28 = a[0..13], %29..%42 = b[0..13].
+  uint32_t borrow_out;
+  asm("sub.cc.u32   %0,  %15, %29;\n\t" // c[0]  = a[0]  - b[0],  set BF
+      "subc.cc.u32  %1,  %16, %30;\n\t" // c[1]  = a[1]  - b[1]  - BF
+      "subc.cc.u32  %2,  %17, %31;\n\t" // c[2]  = a[2]  - b[2]  - BF
+      "subc.cc.u32  %3,  %18, %32;\n\t" // c[3]  = a[3]  - b[3]  - BF
+      "subc.cc.u32  %4,  %19, %33;\n\t" // c[4]  = a[4]  - b[4]  - BF
+      "subc.cc.u32  %5,  %20, %34;\n\t" // c[5]  = a[5]  - b[5]  - BF
+      "subc.cc.u32  %6,  %21, %35;\n\t" // c[6]  = a[6]  - b[6]  - BF
+      "subc.cc.u32  %7,  %22, %36;\n\t" // c[7]  = a[7]  - b[7]  - BF
+      "subc.cc.u32  %8,  %23, %37;\n\t" // c[8]  = a[8]  - b[8]  - BF
+      "subc.cc.u32  %9,  %24, %38;\n\t" // c[9]  = a[9]  - b[9]  - BF
+      "subc.cc.u32  %10, %25, %39;\n\t" // c[10] = a[10] - b[10] - BF
+      "subc.cc.u32  %11, %26, %40;\n\t" // c[11] = a[11] - b[11] - BF
+      "subc.cc.u32  %12, %27, %41;\n\t" // c[12] = a[12] - b[12] - BF
+      "subc.cc.u32  %13, %28, %42;\n\t" // c[13] = a[13] - b[13] - BF
+      "subc.u32     %14, 0,   0;\n\t"   // borrow_out = 0 - 0 - BF (0 or
+                                        // 0xFFFFFFFF)
+      : "=r"(c.limb[0]), "=r"(c.limb[1]), "=r"(c.limb[2]), "=r"(c.limb[3]),
+        "=r"(c.limb[4]), "=r"(c.limb[5]), "=r"(c.limb[6]), "=r"(c.limb[7]),
+        "=r"(c.limb[8]), "=r"(c.limb[9]), "=r"(c.limb[10]), "=r"(c.limb[11]),
+        "=r"(c.limb[12]), "=r"(c.limb[13]), "=r"(borrow_out)
+      : "r"(a.limb[0]), "r"(a.limb[1]), "r"(a.limb[2]), "r"(a.limb[3]),
+        "r"(a.limb[4]), "r"(a.limb[5]), "r"(a.limb[6]), "r"(a.limb[7]),
+        "r"(a.limb[8]), "r"(a.limb[9]), "r"(a.limb[10]), "r"(a.limb[11]),
+        "r"(a.limb[12]), "r"(a.limb[13]), "r"(b.limb[0]), "r"(b.limb[1]),
+        "r"(b.limb[2]), "r"(b.limb[3]), "r"(b.limb[4]), "r"(b.limb[5]),
+        "r"(b.limb[6]), "r"(b.limb[7]), "r"(b.limb[8]), "r"(b.limb[9]),
+        "r"(b.limb[10]), "r"(b.limb[11]), "r"(b.limb[12]), "r"(b.limb[13]));
+  return static_cast<UNSIGNED_LIMB>(borrow_out & 1u);
 #else
  // Host path: portable software borrow detection
  UNSIGNED_LIMB borrow = 0;
@@ -287,6 +354,17 @@ __host__ __device__ void fp_add(Fp &c, const Fp &a, const Fp &b) {
  UNSIGNED_LIMB mask =
      -use_original; // all-ones if keep sum, all-zeros if keep reduced

+  for (int i = 0; i < FP_LIMBS; i++) {
+    c.limb[i] = (sum.limb[i] & mask) | (reduced.limb[i] & ~mask);
+  }
+#elif defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 32
+  // Same branchless logic as the 64-bit path; mask arithmetic is identical
+  // since UNSIGNED_LIMB is uint32_t: -1u == 0xFFFFFFFF (all-ones).
+  Fp reduced;
+  UNSIGNED_LIMB borrow = fp_sub_raw(reduced, sum, fp_modulus());
+  UNSIGNED_LIMB use_original = ((carry ^ 1u) & borrow);
+  UNSIGNED_LIMB mask = -use_original;
+
  for (int i = 0; i < FP_LIMBS; i++) {
    c.limb[i] = (sum.limb[i] & mask) | (reduced.limb[i] & ~mask);
  }
@@ -319,6 +397,15 @@ __host__ __device__ void fp_sub(Fp &c, const Fp &a, const Fp &b) {
  UNSIGNED_LIMB mask =
      -borrow; // all-ones if borrow (use corrected), all-zeros if not

+  for (int i = 0; i < FP_LIMBS; i++) {
+    c.limb[i] = (corrected.limb[i] & mask) | (diff.limb[i] & ~mask);
+  }
+#elif defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 32
+  // Same branchless logic as the 64-bit path; -1u == 0xFFFFFFFF for uint32_t.
+  Fp corrected;
+  fp_add_raw(corrected, diff, fp_modulus());
+  UNSIGNED_LIMB mask = -borrow;
+
  for (int i = 0; i < FP_LIMBS; i++) {
    c.limb[i] = (corrected.limb[i] & mask) | (diff.limb[i] & ~mask);
  }
@@ -333,6 +420,26 @@ __host__ __device__ void fp_sub(Fp &c, const Fp &a, const Fp &b) {
 #endif
 }

+// Lazy addition: c = a + b, result in [0, 2p) for inputs in [0, p).
+// Skips the conditional subtraction of fp_add; valid as input to fp_mont_mul
+// since CIOS accepts operands in [0, 2p).
+__host__ __device__ void fp_add_lazy(Fp &c, const Fp &a, const Fp &b) {
+  fp_add_raw(c, a, b);
+}
+
+// Lazy subtraction: c ≡ a - b (mod p), result in [0, 2p) for inputs in [0, p).
+// Adds p unconditionally (no borrow-select), saving one conditional branch.
+// Valid as input to fp_mont_mul; must NOT be used where [0, p) is
+// required (e.g. final results, inputs to fp_sub/fp_neg).
+__host__ __device__ void fp_sub_lazy(Fp &c, const Fp &a, const Fp &b) {
+  Fp diff;
+  fp_sub_raw(diff, a, b);            // a - b, borrow absorbed into bit pattern
+  fp_add_raw(c, diff, fp_modulus()); // always add p; carry discarded
+  // For a >= b (no borrow): diff = a-b ∈ [0,p), result = a-b+p ∈ [p,2p)  ✓
+  // For a < b  (borrow=1):  diff wraps, result = a-b+2^N+p mod 2^N = a-b+p ∈
+  // [0,p) ✓
+}
+
 // Small-constant multiplication via addition chains.
 // These replace full Montgomery multiplications by 2, 3, 4, 8 with a few
 // modular additions, each ~25 instructions vs ~200+ for CIOS Montgomery mul.
@@ -483,14 +590,32 @@ __host__ __device__ void fp_mont_reduce(Fp &c, const UNSIGNED_LIMB *a) {
  for (int i = 0; i < FP_LIMBS; i++) {
    UNSIGNED_LIMB u = t[i] * p_prime; // u = t[i] * p' mod 2^LIMB_BITS

-    // Add u * p to t, starting at position i
+    // Add u * p to t, starting at position i.
+    // Use uint64_t accumulator in 32-bit mode to avoid carry overflow:
+    // hi + carry1 + carry2 can reach 2^32 which overflows uint32_t.
+#if LIMB_BITS_CONFIG == 32
+    uint64_t carry = 0;
+    for (int j = 0; j < FP_LIMBS; j++) {
+      uint64_t acc =
+          (uint64_t)t[i + j] + (uint64_t)u * (uint64_t)p.limb[j] + carry;
+      t[i + j] = (UNSIGNED_LIMB)acc;
+      carry = acc >> LIMB_BITS;
+    }
+    // Propagate remaining carry (carry ≤ 2^32-1 at this point)
+    int idx = i + FP_LIMBS;
+    while (carry != 0 && idx <= 2 * FP_LIMBS) {
+      uint64_t acc = (uint64_t)t[idx] + carry;
+      t[idx] = (UNSIGNED_LIMB)acc;
+      carry = acc >> LIMB_BITS;
+      idx++;
+    }
+#else
    UNSIGNED_LIMB carry = 0;
    for (int j = 0; j < FP_LIMBS; j++) {
      UNSIGNED_LIMB hi, lo;
      mul_limbs(u, p.limb[j], hi, lo);

      // Three-way addition: t[i+j] + lo + carry
-      // Do it in two steps to handle carries properly
      UNSIGNED_LIMB temp = t[i + j] + lo;
      UNSIGNED_LIMB carry1 = (temp < t[i + j]) ? 1 : 0;

@@ -499,7 +624,6 @@ __host__ __device__ void fp_mont_reduce(Fp &c, const UNSIGNED_LIMB *a) {

      t[i + j] = sum;

-      // Next carry is hi + carry1 + carry2
      carry = hi + carry1 + carry2;
    }

@@ -511,6 +635,7 @@ __host__ __device__ void fp_mont_reduce(Fp &c, const UNSIGNED_LIMB *a) {
      t[idx] = sum;
      idx++;
    }
+#endif
  }

  // Result is in t[FP_LIMBS..2*FP_LIMBS-1] (high half)
@@ -534,29 +659,7 @@ __host__ __device__ void fp_mont_reduce(Fp &c, const UNSIGNED_LIMB *a) {
  }
 }

-// ============================================================================
 // PTX-accelerated CIOS Montgomery multiplication (device path)
-// ============================================================================
-// The CIOS algorithm for 7 x 64-bit limbs executes 98 multiply-accumulate
-// steps across 7 outer iterations. Each step computes:
-//   (carry, t[j]) = t[j] + a[j] * b_i + carry
-// which is a 64x64->128 multiply plus a three-operand addition with carry.
-//
-// The C++ path uses software carry detection: carry = (sum < old) ? 1 : 0.
-// The PTX path below uses hardware carry flags via the .cc suffix:
-//   - mul.lo.u64 / mul.hi.u64 : 64x64->128 wide multiply
-//   - add.cc.u64 / addc.u64   : addition chain with hardware carry flag
-//
-// Each multiply-accumulate step uses 6 PTX instructions instead of ~10+ in
-// the software-carry version. The 7 outer iterations are fully unrolled, and
-// the limb-shift loop (t[j] = t[j+1]) is eliminated by register renaming.
-//
-// REGISTER ALIASING NOTE: All PTX temporaries (_lo, _hi) are declared as
-// .reg inside the asm block. This prevents nvcc's register allocator from
-// aliasing them with C operands (t_j, carry), which was the root cause of
-// previous correctness bugs where "+l" outputs could share registers with
-// "l" inputs in the same asm statement.
-// ============================================================================

 #ifdef __CUDA_ARCH__
 #if LIMB_BITS_CONFIG == 64
@@ -735,14 +838,408 @@ __device__ __noinline__ void fp_mont_mul_cios_ptx(Fp &c, const Fp &a,
 #endif // LIMB_BITS_CONFIG == 64
 #endif // __CUDA_ARCH__

+// 32-bit dual MAD-chain Montgomery multiplication (device path)
+
+#ifdef __CUDA_ARCH__
+
+// PTX carry-chain primitives for 32-bit Montgomery arithmetic.
+//
+// These are macros rather than __forceinline__ functions because the hardware
+// carry flag (CC register) does not survive a function-call boundary
+// (lo, hi) = a * b : 64-bit product, no carry in or out.
+// Initialises a fresh wide accumulator slot.
+#define FP_MUL_WIDE_32(lo, hi, a, b)                                           \
+  asm("mul.lo.u32 %0, %2, %3; mul.hi.u32 %1, %2, %3;"                          \
+      : "=r"(lo), "=r"(hi)                                                     \
+      : "r"(a), "r"(b))
+
+// lo += lo(a*b); hi += hi(a*b) + CC.  Sets CC.
+// Opens a carry chain (mad.lo.cc / madc.hi.cc).
+#define FP_MAD_WIDE_CC_32(lo, hi, a, b)                                        \
+  asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.cc.u32 %1, %2, %3, %1;"           \
+      : "+r"(lo), "+r"(hi)                                                     \
+      : "r"(a), "r"(b))
+
+// lo += lo(a*b) + CC; hi += hi(a*b) + CC.  Sets CC.
+// Continues a carry chain (madc.lo.cc / madc.hi.cc).
+#define FP_MADC_WIDE_CC_32(lo, hi, a, b)                                       \
+  asm("madc.lo.cc.u32 %0, %2, %3, %0; madc.hi.cc.u32 %1, %2, %3, %1;"          \
+      : "+r"(lo), "+r"(hi)                                                     \
+      : "r"(a), "r"(b))
+
+// r += CC.  No carry out terminates a carry chain.
+#define FP_ADDC_32(r) asm("addc.u32 %0, %0, 0;" : "+r"(r))
+
+// dst = src + CC.  No carry out
+#define FP_ADDC_INTO_32(dst, src)                                              \
+  asm("addc.u32 %0, %1, 0;" : "=r"(dst) : "r"(src))
+
+// r = CC (capture carry flag as 0 or 1).  No carry out.
+#define FP_CARRY_32(r) asm("addc.u32 %0, 0, 0;" : "=r"(r))
+
+// dst = src + src.  Sets CC opens a left-shift doubling chain.
+#define FP_DBL_CC_32(dst, src)                                                 \
+  asm("add.cc.u32 %0, %1, %1;" : "=r"(dst) : "r"(src))
+
+// r = r + r + CC.  Sets CC continues a left-shift doubling chain.
+#define FP_DBLC_CC_32(r) asm("addc.cc.u32 %0, %0, %0;" : "+r"(r))
+
+/// dst = lo32 | (hi32 << 32): pack two 32-bit halves into one 64-bit register.
+#define FP_PACK_U64(dst, lo32, hi32)                                           \
+  asm("mov.b64 %0, {%1, %2};" : "=l"(dst) : "r"(lo32), "r"(hi32))
+
+// Initialize acc[0..n-1] with products of every other element of a and bi.
+// For each j (step 2): acc[j] = lo(a[j]*bi), acc[j+1] = hi(a[j]*bi).
+static __device__ __forceinline__ void
+fp_mul_n_32(uint32_t *acc, const uint32_t *a, uint32_t bi, int n) {
+#pragma unroll
+  for (int j = 0; j < n; j += 2) {
+    asm("mul.lo.u32 %0, %1, %2;" : "=r"(acc[j]) : "r"(a[j]), "r"(bi));
+    asm("mul.hi.u32 %0, %1, %2;" : "=r"(acc[j + 1]) : "r"(a[j]), "r"(bi));
+  }
+}
+
+// Multiply-accumulate across n limbs with a hardware carry chain.
+// First pair uses mad.lo.cc + madc.hi.cc (initiates the chain).
+// Remaining pairs continue with madc.lo.cc + madc.hi.cc.
+// Carry flag exits in CC on return; caller must consume it.
+static __device__ __forceinline__ void
+fp_cmad_n_32(uint32_t *acc, const uint32_t *a, uint32_t bi, int n) {
+  asm("mad.lo.cc.u32 %0, %2, %3, %0; madc.hi.cc.u32 %1, %2, %3, %1;"
+      : "+r"(acc[0]), "+r"(acc[1])
+      : "r"(a[0]), "r"(bi));
+#pragma unroll
+  for (int j = 2; j < n; j += 2)
+    asm("madc.lo.cc.u32 %0, %2, %3, %0; madc.hi.cc.u32 %1, %2, %3, %1;"
+        : "+r"(acc[j]), "+r"(acc[j + 1])
+        : "r"(a[j]), "r"(bi));
+  // CC holds the final carry on return
+}
+
+// Multiply-accumulate with implicit right-shift of odd by two positions.
+// Each pair: odd[j] = lo/hi(a[j]*bi) + old_odd[j+2] + CC.
+// Reads are always two positions ahead of writes so forward iteration is safe.
+// Final pair terminates the chain with addend=0 and no carry-out (.hi only).
+static __device__ __forceinline__ void
+fp_madc_n_rshift_32(uint32_t *odd, const uint32_t *a, uint32_t bi, int n) {
+#pragma unroll
+  for (int j = 0; j < n - 2; j += 2)
+    asm("madc.lo.cc.u32 %0, %2, %3, %4; madc.hi.cc.u32 %1, %2, %3, %5;"
+        : "=r"(odd[j]), "=r"(odd[j + 1])
+        : "r"(a[j]), "r"(bi), "r"(odd[j + 2]), "r"(odd[j + 3]));
+  asm("madc.lo.cc.u32 %0, %2, %3, 0; madc.hi.u32 %1, %2, %3, 0;"
+      : "=r"(odd[n - 2]), "=r"(odd[n - 1])
+      : "r"(a[n - 2]), "r"(bi));
+  // Note: final madc.hi.u32 has no .cc so CC is clear on return
+}
+
+// After the call even[0] == 0 (by the Montgomery invariant), so the next
+// iteration's right-shift effectively advances the window by one limb.
+static __device__ __forceinline__ void
+fp_mad_n_redc_32(uint32_t *even, uint32_t *odd, const uint32_t *a,
+                 const uint32_t *p, uint32_t bi, uint32_t M0, bool first) {
+  constexpr int n = 14; // 32-bit limbs for BLS12-446 (446 bits → 14 × 32-bit)
+
+  if (first) {
+    // Fresh initialization: no carry from previous iteration.
+    // even[2j]   = lo(a[2j]   * bi),  even[2j+1] = hi(a[2j]   * bi)
+    // odd[2j]    = lo(a[2j+1] * bi),  odd[2j+1]  = hi(a[2j+1] * bi)
+    fp_mul_n_32(even, a, bi, n);
+    fp_mul_n_32(odd, a + 1, bi, n);
+  } else {
+    // Merge carry from previous iteration and advance both accumulators.
+    asm("add.cc.u32 %0, %0, %1;" : "+r"(even[0]) : "r"(odd[1]));
+    fp_madc_n_rshift_32(odd, a + 1, bi, n);
+    fp_cmad_n_32(even, a, bi, n);
+    asm("addc.u32 %0, %0, 0;" : "+r"(odd[n - 1]));
+  }
+
+  // Montgomery reduction: choose mi so that even[0] + lo(p[0]*mi) = 0 mod 2^32
+  uint32_t mi = even[0] * M0;
+  fp_cmad_n_32(odd, p + 1, mi, n);
+  fp_cmad_n_32(even, p, mi, n);
+  asm("addc.u32 %0, %0, 0;" : "+r"(odd[n - 1]));
+}
+
+// Carry-add: acc[i] += a[i] for i = 0..n-1 with PTX carry chain.
+// Starts with add.cc (initiates chain); all subsequent adds use addc.cc.
+// Carry flag is left set in CC on return for the caller to consume.
+static __device__ __forceinline__ void fp_cadd_n_32(uint32_t *acc,
+                                                    const uint32_t *a, int n) {
+  asm("add.cc.u32 %0, %0, %1;" : "+r"(acc[0]) : "r"(a[0]));
+#pragma unroll
+  for (int i = 1; i < n; i++)
+    asm("addc.cc.u32 %0, %0, %1;" : "+r"(acc[i]) : "r"(a[i]));
+}
+
+// Even row of the upper-triangle squaring pass.
+// Adds a[1..n-2]*bi into odd[0..n-3] (cmad chain), places a[n-1]*bi into
+// odd[n-2..n-1] fresh (terminates carry), then adds a[0..n-1]*bi into
+// even[0..n-1] (independent cmad chain), folding the even carry into odd[n-1].
+static __device__ __forceinline__ void fp_mad_row_32(uint32_t *odd,
+                                                     uint32_t *even,
+                                                     const uint32_t *a,
+                                                     uint32_t bi, int n) {
+  fp_cmad_n_32(odd, a + 1, bi, n - 2);
+  asm("madc.lo.cc.u32 %0, %2, %3, 0; madc.hi.u32 %1, %2, %3, 0;"
+      : "=r"(odd[n - 2]), "=r"(odd[n - 1])
+      : "r"(a[n - 1]), "r"(bi));
+  fp_cmad_n_32(even, a, bi, n);
+  asm("addc.u32 %0, %0, 0;" : "+r"(odd[n - 1]));
+}
+
+// Odd row of the upper-triangle squaring pass.
+// Adds a[0..n-3]*bi into odd[0..n-3] (cmad chain), places a[n-2]*bi into
+// odd[n-2..n-1] fresh, then adds a[1..n-2]*bi into even[0..n-3] (n-2 terms),
+// folding the even carry into odd[n-1].
+static __device__ __forceinline__ void fp_qad_row_32(uint32_t *odd,
+                                                     uint32_t *even,
+                                                     const uint32_t *a,
+                                                     uint32_t bi, int n) {
+  fp_cmad_n_32(odd, a, bi, n - 2);
+  asm("madc.lo.cc.u32 %0, %2, %3, 0; madc.hi.u32 %1, %2, %3, 0;"
+      : "=r"(odd[n - 2]), "=r"(odd[n - 1])
+      : "r"(a[n - 2]), "r"(bi));
+  fp_cmad_n_32(even, a + 1, bi, n - 2);
+  asm("addc.u32 %0, %0, 0;" : "+r"(odd[n - 1]));
+}
+
+// One Montgomery-reduction row without a multiply step (b_i = 0).
+// Used by fp_mont_sqr_mad32 to reduce the lower n words of the wide product.
+// Mirrors fp_mad_n_redc_32 but omits the initial product accumulation, leaving
+// only the annihilation step that drives even[0] to zero.
+static __device__ __forceinline__ void
+fp_mul_by_1_row_32(uint32_t *even, uint32_t *odd, const uint32_t *p,
+                   uint32_t M0, bool first) {
+  constexpr int n = 14;
+  // mi removes even[0]: even[0] + lo(p[0]*mi) == 0 mod 2^32.
+  // IMPORTANT: mi must be computed from even[0] *after* any add.cc that
+  // modifies it. Plain integer multiply does not touch CC.
+  uint32_t mi;
+  if (first) {
+    mi = even[0] * M0;
+    fp_mul_n_32(odd, p + 1, mi, n);
+    fp_cmad_n_32(even, p, mi, n);
+    asm("addc.u32 %0, %0, 0;" : "+r"(odd[n - 1]));
+  } else {
+    // Absorb the shifted carry word from the previous step, then reduce.
+    asm("add.cc.u32 %0, %0, %1;" : "+r"(even[0]) : "r"(odd[1]));
+    // Use PTX mul explicitly: a plain C multiply after add.cc could in theory
+    // let the compiler insert an instruction that clobbers CC before
+    // madc_n_rshift.
+    asm("mul.lo.u32 %0, %1, %2;" : "=r"(mi) : "r"(even[0]), "r"(M0));
+    fp_madc_n_rshift_32(odd, p + 1, mi, n);
+    fp_cmad_n_32(even, p, mi, n);
+    asm("addc.u32 %0, %0, 0;" : "+r"(odd[n - 1]));
+  }
+}
+
+// Montgomery squaring using CIOS with triangular 32-bit MAD chains.
+// See fp_mont_mul_mad32 for the algorithm reference (Koç et al., 1996).
+//
+// Computes c = a^2 * R^{-1} mod p (input and output in Montgomery form).
+__device__ __noinline__ void fp_mont_sqr_mad32(Fp &c, const Fp &a) {
+  constexpr int n = 14;
+
+  const uint32_t *a32 = reinterpret_cast<const uint32_t *>(a.limb);
+  const uint32_t *p32 = reinterpret_cast<const uint32_t *>(DEVICE_MODULUS.limb);
+  const uint32_t M0 = static_cast<uint32_t>(DEVICE_P_PRIME);
+
+  uint32_t wide[2 * n], wtemp[2 * n - 2];
+  // Phase 1: upper triangle a[i]*a[j] for j > i
+  fp_mul_n_32(wtemp, a32 + 1, a32[0], n);
+  fp_mul_n_32(wide + 2, a32 + 2, a32[0], n - 2);
+
+#pragma unroll
+  for (int i = 2; i <= n - 4; i += 2) {
+    fp_mad_row_32(&wide[2 * i], &wtemp[2 * i - 2], &a32[i], a32[i - 1], n - i);
+    fp_qad_row_32(&wtemp[2 * i], &wide[2 * i + 2], &a32[i + 1], a32[i], n - i);
+  }
+
+  FP_MUL_WIDE_32(wide[2 * n - 4], wide[2 * n - 3], a32[n - 1], a32[n - 3]);
+  FP_MAD_WIDE_CC_32(wtemp[2 * n - 6], wtemp[2 * n - 5], a32[n - 2], a32[n - 3]);
+  FP_ADDC_32(wide[2 * n - 3]);
+  FP_MUL_WIDE_32(wtemp[2 * n - 4], wtemp[2 * n - 3], a32[n - 1], a32[n - 2]);
+
+  fp_cadd_n_32(&wide[2], &wtemp[1], 2 * n - 4);
+  FP_ADDC_INTO_32(wide[2 * n - 2], wtemp[2 * n - 3]);
+
+  // Phase 2: double the upper-triangle sum (left-shift the 2n-bit value by 1)
+  wide[0] = 0;
+  FP_DBL_CC_32(wide[1], wtemp[0]);
+#pragma unroll
+  for (int j = 2; j < 2 * n - 1; j++)
+    FP_DBLC_CC_32(wide[j]);
+  FP_CARRY_32(wide[2 * n - 1]);
+
+  // Phase 3: add diagonal a[i]^2 terms (squares of each limb)
+  FP_MAD_WIDE_CC_32(wide[0], wide[1], a32[0], a32[0]);
+#pragma unroll
+  for (int i = 1; i < n; i++)
+    FP_MADC_WIDE_CC_32(wide[2 * i], wide[2 * i + 1], a32[i], a32[i]);
+
+  // Phase 4: Montgomery reduction
+  uint32_t red_odd[n];
+#pragma unroll
+  for (int i = 0; i < n; i += 2) {
+    fp_mul_by_1_row_32(&wide[0], &red_odd[0], p32, M0, i == 0);
+    fp_mul_by_1_row_32(&red_odd[0], &wide[0], p32, M0, false);
+  }
+  // Merge the final red_odd word into wide[0..n-1].
+  fp_cadd_n_32(&wide[0], &red_odd[1], n - 1);
+  FP_ADDC_32(wide[n - 1]);
+
+  // Add reduced lower half into upper half wide[n..2n-1]; the result lives
+  // in wide[n..2n-1] and is in [0, 2p).
+  fp_cadd_n_32(&wide[n], &wide[0], n);
+  FP_CARRY_32(wide[0]); // discard overflow (always 0 for p<2^446)
+
+#if LIMB_BITS_CONFIG == 64
+  // Pack uint32_t pairs back into uint64_t limbs.
+#pragma unroll
+  for (int j = 0; j < 7; j++)
+    FP_PACK_U64(c.limb[j], wide[n + 2 * j], wide[n + 2 * j + 1]);
+
+  const uint64_t p0 = DEVICE_MODULUS.limb[0], p1 = DEVICE_MODULUS.limb[1],
+                 p2 = DEVICE_MODULUS.limb[2], p3 = DEVICE_MODULUS.limb[3],
+                 p4 = DEVICE_MODULUS.limb[4], p5 = DEVICE_MODULUS.limb[5],
+                 p6 = DEVICE_MODULUS.limb[6];
+  uint64_t r0, r1, r2, r3, r4, r5, r6, mask64;
+  asm("sub.cc.u64   %0, %8,  %15;\n\t"
+      "subc.cc.u64  %1, %9,  %16;\n\t"
+      "subc.cc.u64  %2, %10, %17;\n\t"
+      "subc.cc.u64  %3, %11, %18;\n\t"
+      "subc.cc.u64  %4, %12, %19;\n\t"
+      "subc.cc.u64  %5, %13, %20;\n\t"
+      "subc.cc.u64  %6, %14, %21;\n\t"
+      "subc.u64     %7, 0,   0;\n\t"
+      "shr.s64      %7, %7,  63;\n\t"
+      : "=l"(r0), "=l"(r1), "=l"(r2), "=l"(r3), "=l"(r4), "=l"(r5), "=l"(r6),
+        "=l"(mask64)
+      : "l"(c.limb[0]), "l"(c.limb[1]), "l"(c.limb[2]), "l"(c.limb[3]),
+        "l"(c.limb[4]), "l"(c.limb[5]), "l"(c.limb[6]), "l"(p0), "l"(p1),
+        "l"(p2), "l"(p3), "l"(p4), "l"(p5), "l"(p6));
+  c.limb[0] = (c.limb[0] & mask64) | (r0 & ~mask64);
+  c.limb[1] = (c.limb[1] & mask64) | (r1 & ~mask64);
+  c.limb[2] = (c.limb[2] & mask64) | (r2 & ~mask64);
+  c.limb[3] = (c.limb[3] & mask64) | (r3 & ~mask64);
+  c.limb[4] = (c.limb[4] & mask64) | (r4 & ~mask64);
+  c.limb[5] = (c.limb[5] & mask64) | (r5 & ~mask64);
+  c.limb[6] = (c.limb[6] & mask64) | (r6 & ~mask64);
+#else
+#pragma unroll
+  for (int j = 0; j < n; j++)
+    c.limb[j] = wide[n + j];
+  Fp reduced;
+  UNSIGNED_LIMB borrow = fp_sub_raw(reduced, c, fp_modulus());
+  UNSIGNED_LIMB mask32 = -borrow;
+#pragma unroll
+  for (int j = 0; j < n; j++)
+    c.limb[j] = (c.limb[j] & mask32) | (reduced.limb[j] & ~mask32);
+#endif
+}
+
+// Montgomery multiplication using CIOS (Coarsely Integrated Operand Scanning):
+// Computes c = a * b * R^{-1} mod p (all operands in Montgomery form).
+// Inputs are stored as uint64_t[7]; they are reinterpreted as uint32_t[14]
+// (little-endian: a64[j] == a32[2j] | (a32[2j+1] << 32)).
+__device__ __noinline__ void fp_mont_mul_mad32(Fp &c, const Fp &a,
+                                               const Fp &b) {
+  constexpr int n = 14;
+
+  // Reinterpret 64-bit limb arrays as 32-bit on little-endian hardware.
+  const uint32_t *a32 = reinterpret_cast<const uint32_t *>(a.limb);
+  const uint32_t *b32 = reinterpret_cast<const uint32_t *>(b.limb);
+  const uint32_t *p32 = reinterpret_cast<const uint32_t *>(DEVICE_MODULUS.limb);
+
+  // 32-bit Montgomery constant: low 32 bits of DEVICE_P_PRIME.
+  // Correct because -p^{-1} mod 2^32 == (-p^{-1} mod 2^64) mod 2^32.
+  const uint32_t M0 = static_cast<uint32_t>(DEVICE_P_PRIME);
+
+  uint32_t even[n], odd[n];
+
+  // Process every 32-bit limb of b in pairs, alternating primary accumulator.
+#pragma unroll
+  for (int i = 0; i < n; i += 2) {
+    fp_mad_n_redc_32(even, odd, a32, p32, b32[i], M0, i == 0);
+    fp_mad_n_redc_32(odd, even, a32, p32, b32[i + 1], M0, false);
+  }
+
+  // Merge: even[0..n-2] += odd[1..n-1], propagate final carry into even[n-1].
+  fp_cadd_n_32(even, odd + 1, n - 1);
+  FP_ADDC_32(even[n - 1]);
+
+  // Pack and final reduction layout depends on LIMB_BITS_CONFIG.
+  // In both cases UNSIGNED_LIMB* and uint32_t* point to the same 56-byte block.
+#if LIMB_BITS_CONFIG == 64
+  // 64-bit limbs: pack pairs into uint64_t with PTX mov.b64, then do a
+  // branchless 7-limb 64-bit conditional subtraction.
+#pragma unroll
+  for (int j = 0; j < 7; j++)
+    FP_PACK_U64(c.limb[j], even[2 * j], even[2 * j + 1]);
+
+  // subc.u64 0-0-borrow gives 0xFFFF... when c<p (keep), 0 when c>=p (reduce).
+  // shr.s64 sign-extends to a per-bit selection mask.
+  const uint64_t p0 = DEVICE_MODULUS.limb[0], p1 = DEVICE_MODULUS.limb[1],
+                 p2 = DEVICE_MODULUS.limb[2], p3 = DEVICE_MODULUS.limb[3],
+                 p4 = DEVICE_MODULUS.limb[4], p5 = DEVICE_MODULUS.limb[5],
+                 p6 = DEVICE_MODULUS.limb[6];
+  uint64_t r0, r1, r2, r3, r4, r5, r6, mask64;
+  asm("sub.cc.u64   %0, %8,  %15;\n\t"
+      "subc.cc.u64  %1, %9,  %16;\n\t"
+      "subc.cc.u64  %2, %10, %17;\n\t"
+      "subc.cc.u64  %3, %11, %18;\n\t"
+      "subc.cc.u64  %4, %12, %19;\n\t"
+      "subc.cc.u64  %5, %13, %20;\n\t"
+      "subc.cc.u64  %6, %14, %21;\n\t"
+      "subc.u64     %7, 0,   0;\n\t"
+      "shr.s64      %7, %7,  63;\n\t"
+      : "=l"(r0), "=l"(r1), "=l"(r2), "=l"(r3), "=l"(r4), "=l"(r5), "=l"(r6),
+        "=l"(mask64)
+      : "l"(c.limb[0]), "l"(c.limb[1]), "l"(c.limb[2]), "l"(c.limb[3]),
+        "l"(c.limb[4]), "l"(c.limb[5]), "l"(c.limb[6]), "l"(p0), "l"(p1),
+        "l"(p2), "l"(p3), "l"(p4), "l"(p5), "l"(p6));
+  c.limb[0] = (c.limb[0] & mask64) | (r0 & ~mask64);
+  c.limb[1] = (c.limb[1] & mask64) | (r1 & ~mask64);
+  c.limb[2] = (c.limb[2] & mask64) | (r2 & ~mask64);
+  c.limb[3] = (c.limb[3] & mask64) | (r3 & ~mask64);
+  c.limb[4] = (c.limb[4] & mask64) | (r4 & ~mask64);
+  c.limb[5] = (c.limb[5] & mask64) | (r5 & ~mask64);
+  c.limb[6] = (c.limb[6] & mask64) | (r6 & ~mask64);
+#else
+#pragma unroll
+  for (int j = 0; j < n; j++)
+    c.limb[j] = even[j];
+
+  Fp reduced;
+  UNSIGNED_LIMB borrow = fp_sub_raw(reduced, c, fp_modulus());
+  UNSIGNED_LIMB mask32 = -borrow; // all-ones if c<p (keep), all-zeros if c>=p
+#pragma unroll
+  for (int j = 0; j < n; j++)
+    c.limb[j] = (c.limb[j] & mask32) | (reduced.limb[j] & ~mask32);
+#endif
+}
+
+#undef FP_MUL_WIDE_32
+#undef FP_MAD_WIDE_CC_32
+#undef FP_MADC_WIDE_CC_32
+#undef FP_ADDC_32
+#undef FP_ADDC_INTO_32
+#undef FP_CARRY_32
+#undef FP_DBL_CC_32
+#undef FP_DBLC_CC_32
+#undef FP_PACK_U64
+
+#endif // __CUDA_ARCH__
+
 // CIOS (Coarsely Integrated Operand Scanning) Montgomery multiplication
 // Fuses multiplication and reduction in a single pass for better efficiency.
 // Uses only FP_LIMBS+1 limbs of working space instead of 2*FP_LIMBS.
 // Both a and b are in Montgomery form, result is in Montgomery form.
 __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
-#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
-  // Device path: fully unrolled PTX with hardware carry flags
-  fp_mont_mul_cios_ptx(c, a, b);
+#ifdef __CUDA_ARCH__
+  // Device path: 32-bit dual MAD chain
+  fp_mont_mul_mad32(c, a, b);
 #else
  // Host path: portable C++ implementation
  const Fp &p = fp_modulus();
@@ -750,11 +1247,31 @@ __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {

  // Working array: only n+1 limbs needed (vs 2n for separate mul+reduce)
  UNSIGNED_LIMB t[FP_LIMBS + 1];
+  // memset is not guaranteed available in all device compilation contexts;
+  // use an explicit loop which the compiler will unroll anyway.
+#ifdef __CUDA_ARCH__
+  for (int i = 0; i <= FP_LIMBS; i++) {
+    t[i] = 0;
+  }
+#else
  memset(t, 0, (FP_LIMBS + 1) * sizeof(UNSIGNED_LIMB));
+#endif

  // Main CIOS loop: for each limb of b
  for (int i = 0; i < FP_LIMBS; i++) {
    // Step 1: Multiply-accumulate t += a * b[i]
+#if LIMB_BITS_CONFIG == 32
+    uint64_t carry64 = 0;
+    for (int j = 0; j < FP_LIMBS; j++) {
+      uint64_t acc =
+          (uint64_t)t[j] + (uint64_t)a.limb[j] * (uint64_t)b.limb[i] + carry64;
+      t[j] = (UNSIGNED_LIMB)acc;
+      carry64 = acc >> LIMB_BITS;
+    }
+    uint64_t sum64 = (uint64_t)t[FP_LIMBS] + carry64;
+    UNSIGNED_LIMB overflow = (UNSIGNED_LIMB)(sum64 >> LIMB_BITS);
+    t[FP_LIMBS] = (UNSIGNED_LIMB)sum64;
+#else
    UNSIGNED_LIMB carry = 0;
    for (int j = 0; j < FP_LIMBS; j++) {
      UNSIGNED_LIMB hi, lo;
@@ -767,18 +1284,31 @@ __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
      UNSIGNED_LIMB c2 = (sum2 < sum1) ? 1 : 0;
      t[j] = sum2;

-      // carry = hi + c1 + c2
      carry = hi + c1 + c2;
    }
-    // Add carry to t[n]
    UNSIGNED_LIMB sum = t[FP_LIMBS] + carry;
    UNSIGNED_LIMB overflow = (sum < t[FP_LIMBS]) ? 1 : 0;
    t[FP_LIMBS] = sum;
+#endif

    // Step 2: Reduction - compute m = t[0] * p' mod 2^LIMB_BITS
    UNSIGNED_LIMB m = t[0] * p_prime;

    // Add m * p to t (this zeros out t[0])
+#if LIMB_BITS_CONFIG == 32
+    carry64 = 0;
+    for (int j = 0; j < FP_LIMBS; j++) {
+      uint64_t acc =
+          (uint64_t)t[j] + (uint64_t)m * (uint64_t)p.limb[j] + carry64;
+      t[j] = (UNSIGNED_LIMB)acc;
+      carry64 = acc >> LIMB_BITS;
+    }
+    // Merge carry from reduction with the overflow from step 1.
+    // sum64 ≤ (2^32-1) + (2^32-1) + 1 = 2^33-1, so the new overflow is 0 or 1.
+    uint64_t s64 = (uint64_t)t[FP_LIMBS] + carry64 + (uint64_t)overflow;
+    t[FP_LIMBS] = (UNSIGNED_LIMB)s64;
+    overflow = (UNSIGNED_LIMB)(s64 >> LIMB_BITS);
+#else
    carry = 0;
    for (int j = 0; j < FP_LIMBS; j++) {
      UNSIGNED_LIMB hi, lo;
@@ -800,6 +1330,7 @@ __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
    UNSIGNED_LIMB c2 = (s2 < s1) ? 1 : 0;
    t[FP_LIMBS] = s2;
    overflow = c1 + c2; // Track overflow for final reduction
+#endif

    // Step 3: Shift right by one limb (divide by 2^LIMB_BITS)
    // t[0..n-1] = t[1..n], t[n] = overflow
@@ -810,7 +1341,13 @@ __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
  }

  // Copy result to output
+#ifdef __CUDA_ARCH__
+  for (int i = 0; i < FP_LIMBS; i++) {
+    c.limb[i] = t[i];
+  }
+#else
  memcpy(&c.limb[0], t, FP_LIMBS * sizeof(UNSIGNED_LIMB));
+#endif

  // Final reduction: if result >= p or there's overflow, subtract p
  if (t[FP_LIMBS] != 0 || fp_cmp(c, p) != ComparisonType::Less) {
@@ -829,6 +1366,19 @@ __host__ __device__ void fp_mont_mul(Fp &c, const Fp &a, const Fp &b) {
  fp_mont_mul_cios(c, a, b);
 }

+// Montgomery squaring: c = (a^2 * R_INV) mod p
+// Input and output in Montgomery form.
+// On device: uses fp_mont_sqr_mad32 (triangular MAD chain, ~30-40% fewer
+// multiplications than fp_mont_mul(c, a, a)).
+// On host: delegates to fp_mont_mul_cios(c, a, a).
+__host__ __device__ void fp_mont_sqr(Fp &c, const Fp &a) {
+#ifdef __CUDA_ARCH__
+  fp_mont_sqr_mad32(c, a);
+#else
+  fp_mont_mul_cios(c, a, a);
+#endif
+}
+
 // CONVERSION: Convert from normal form to Montgomery form
 // Input a is in normal form, output c is in Montgomery form
 // Uses CIOS: c = a * R^2 * R^-1 mod p = a * R mod p
@@ -900,9 +1450,9 @@ __host__ __device__ static void fp_pow_internal_mont(Fp &result,
    int start_bit = (limb_idx == msb_idx) ? bit_pos : LIMB_BITS - 1;

    for (int bit = start_bit; bit >= 0; bit--) {
-      // Square result
+      // Square result using the optimised squaring path
      Fp temp;
-      fp_mont_mul(temp, result, result);
+      fp_mont_sqr(temp, result);
      fp_copy(result, temp);

      // Multiply by base if current bit is set
@@ -1081,7 +1631,7 @@ __host__ __device__ bool fp_sqrt(Fp &c, const Fp &a) {
  // Verify: c^2 should equal a (mod p) - using Montgomery form
  Fp c_mont, c_squared_mont;
  fp_to_montgomery(c_mont, c);
-  fp_mont_mul(c_squared_mont, c_mont, c_mont);
+  fp_mont_sqr(c_squared_mont, c_mont);

  if (fp_cmp(c_squared_mont, a_mont) == ComparisonType::Equal) {
    return true;
@@ -1091,7 +1641,7 @@ __host__ __device__ bool fp_sqrt(Fp &c, const Fp &a) {
  Fp alt_c, alt_c_mont;
  fp_sub(alt_c, p, c);
  fp_to_montgomery(alt_c_mont, alt_c);
-  fp_mont_mul(c_squared_mont, alt_c_mont, alt_c_mont);
+  fp_mont_sqr(c_squared_mont, alt_c_mont);
  if (fp_cmp(c_squared_mont, a_mont) == ComparisonType::Equal) {
    fp_copy(c, alt_c);
    return true;
@@ -1103,7 +1653,7 @@ __host__ __device__ bool fp_sqrt(Fp &c, const Fp &a) {
    fp_sub(reduced_c, c, p);
    fp_copy(c, reduced_c);
    fp_to_montgomery(reduced_c_mont, reduced_c);
-    fp_mont_mul(c_squared_mont, reduced_c_mont, reduced_c_mont);
+    fp_mont_sqr(c_squared_mont, reduced_c_mont);
    if (fp_cmp(c_squared_mont, a_mont) == ComparisonType::Equal) {
      return true;
    }
--- a/backends/zk-cuda-backend/cuda/src/primitives/fp2.cu
+++ b/backends/zk-cuda-backend/cuda/src/primitives/fp2.cu
@@ -74,6 +74,18 @@ __host__ __device__ void fp2_sub(Fp2 &c, const Fp2 &a, const Fp2 &b) {
  fp_sub(c.c1, a.c1, b.c1);
 }

+// Lazy add/sub for Fp2: component-wise fp_add_lazy / fp_sub_lazy.
+// Outputs each component in [0, 2p); safe as input to fp2_mont_mul.
+__host__ __device__ void fp2_add_lazy(Fp2 &c, const Fp2 &a, const Fp2 &b) {
+  fp_add_lazy(c.c0, a.c0, b.c0);
+  fp_add_lazy(c.c1, a.c1, b.c1);
+}
+
+__host__ __device__ void fp2_sub_lazy(Fp2 &c, const Fp2 &a, const Fp2 &b) {
+  fp_sub_lazy(c.c0, a.c0, b.c0);
+  fp_sub_lazy(c.c1, a.c1, b.c1);
+}
+
 // Small-constant multiplication via addition chains.
 // These replace full Fp2 Montgomery multiplications by 2, 3, 4, 8 with
 // modular additions on each component.
@@ -158,8 +170,10 @@ __host__ __device__ void fp2_mont_mul(Fp2 &c, const Fp2 &a, const Fp2 &b) {

  fp_mont_mul(t0, a.c0, b.c0);
  fp_mont_mul(t1, a.c1, b.c1);
-  fp_add(t2, a.c0, a.c1);
-  fp_add(t3, b.c0, b.c1);
+  // Lazy add: skip the conditional subtraction since t2, t3 feed fp_mont_mul
+  // which accepts inputs in [0, 2p). Saves 2 conditional subtractions.
+  fp_add_lazy(t2, a.c0, a.c1);
+  fp_add_lazy(t3, b.c0, b.c1);
  fp_mont_mul(t2, t2, t3);
  fp_sub(c.c0, t0, t1);
  fp_sub(c.c1, t2, t0);
@@ -176,8 +190,10 @@ __host__ __device__ void fp2_mont_mul(Fp2 &c, const Fp2 &a, const Fp2 &b) {
 __host__ __device__ void fp2_mont_square(Fp2 &c, const Fp2 &a) {
  Fp sum, diff, c0_tmp, prod;

-  fp_add(sum, a.c0, a.c1);
-  fp_sub(diff, a.c0, a.c1);
+  // Lazy add/sub: sum and diff feed fp_mont_mul (accepts [0, 2p)).
+  // Saves 2 conditional subtractions vs canonical fp_add + fp_sub.
+  fp_add_lazy(sum, a.c0, a.c1);
+  fp_sub_lazy(diff, a.c0, a.c1);
  fp_mont_mul(c0_tmp, sum, diff);

  fp_mont_mul(prod, a.c0, a.c1);
@@ -242,7 +258,7 @@ __host__ __device__ void fp_inv_fermat(Fp &result, const Fp &a) {
      if (found_first_bit || ((p_minus_2.limb[limb] >> bit) & 1)) {
        found_first_bit = true;
        Fp temp;
-        fp_mont_mul(temp, result_mont, result_mont);
+        fp_mont_sqr(temp, result_mont);
        fp_copy(result_mont, temp);

        if ((p_minus_2.limb[limb] >> bit) & 1) {
@@ -267,8 +283,8 @@ __host__ __device__ void fp2_inv(Fp2 &c, const Fp2 &a) {

  // Compute norm = a0^2 + a1^2 in Montgomery form
  Fp t0, t1, norm_m;
-  fp_mont_mul(t0, a0_m, a0_m);
-  fp_mont_mul(t1, a1_m, a1_m);
+  fp_mont_sqr(t0, a0_m);
+  fp_mont_sqr(t1, a1_m);
  fp_add(norm_m, t0, t1);

  // Convert norm to normal form for inversion, then back to Montgomery
@@ -295,8 +311,8 @@ __host__ __device__ void fp2_inv(Fp2 &c, const Fp2 &a) {
 __host__ __device__ void fp2_mont_inv(Fp2 &c, const Fp2 &a) {
  Fp t0, t1, norm, norm_inv;

-  fp_mont_mul(t0, a.c0, a.c0);
-  fp_mont_mul(t1, a.c1, a.c1);
+  fp_mont_sqr(t0, a.c0);
+  fp_mont_sqr(t1, a.c1);
  fp_add(norm, t0, t1);
  fp_mont_inv(norm_inv, norm);
  fp_mont_mul(c.c0, a.c0, norm_inv);
--- a/tfhe/docs/README.md
+++ b/tfhe/docs/README.md
@@ -29,7 +29,7 @@ Learn the basics of TFHE-rs, set it up, and make it run with ease.

 Start building with TFHE-rs by exploring its core features, discovering essential guides, and learning more with user-friendly tutorials.

-<table data-view="cards"><thead><tr><th></th><th></th><th></th><th data-hidden data-card-cover data-type="files"></th></tr></thead><tbody><tr><td><strong>FHE Computations</strong></td><td>Run FHE computation on encrypted data.</td><td><ul><li><a href="fhe-computation/types/">Types </a></li><li><a href="fhe-computation/operations/">Operations</a></li></ul></td><td><a href=".gitbook/assets/bronze-gradient.png">bronze-gradient.png</a></td></tr><tr><td><strong>Configuration</strong></td><td>Advanced configuration for better performance.</td><td><ul><li><a href="configuration/rust-configuration.md">Advanced Rust </a></li><li><a href="configuration/gpu-acceleration/run-on-gpu.md">GPU acceleration</a></li><li><a href="configuration/hpu-acceleration/run-on-hpu.md">HPU acceleration</a></li></ul></td><td><a href=".gitbook/assets/yellow-gradient.png">yellow-gradient.png</a></td></tr><tr><td><strong>Integration</strong></td><td>Use TFHE-rs in different contexts or platforms.</td><td><ul><li><a href="integration/c-api.md">C API</a></li><li><a href="integration/js-on-wasm-api.md">JS on WASM API</a></li></ul></td><td><a href=".gitbook/assets/orange-gradient.png">orange-gradient.png</a></td></tr></tbody></table>
+<table data-view="cards"><thead><tr><th></th><th></th><th></th><th data-hidden data-card-cover data-type="files"></th></tr></thead><tbody><tr><td><strong>FHE Computations</strong></td><td>Run FHE computation on encrypted data.</td><td><ul><li><a href="fhe-computation/types/">Types </a></li><li><a href="fhe-computation/operations/">Operations</a></li></ul></td><td><a href=".gitbook/assets/bronze-gradient.png">bronze-gradient.png</a></td></tr><tr><td><strong>Configuration</strong></td><td>Advanced configuration for better performance.</td><td><ul><li><a href="configuration/rust-configuration.md">Advanced Rust </a></li><li><a href="configuration/gpu-acceleration/run-on-gpu.md">GPU acceleration</a></li><li><a href="configuration/hpu-acceleration/run-on-hpu.md">HPU acceleration</a></li></ul></td><td><a href=".gitbook/assets/yellow-gradient.png">yellow-gradient.png</a></td></tr><tr><td><strong>Integration</strong></td><td>Use TFHE-rs in different contexts or platforms..</td><td><ul><li><a href="integration/c-api.md">C API</a></li><li><a href="integration/js-on-wasm-api.md">JS on WASM API</a></li></ul></td><td><a href=".gitbook/assets/orange-gradient.png">orange-gradient.png</a></td></tr></tbody></table>

 ## Explore more

@@ -42,7 +42,7 @@ Explore step-by-step guides that walk you through real-world uses of TFHE-rs.&#x
 * [Homomorphic parity bit](tutorials/parity-bit.md): Learn how to implement a parity bit calculation over encrypted data
 * [Homomorphic case changing on ASCII string](tutorials/ascii-fhe-string.md): See how to process string data securely by changing cases while keeping the data encrypted.
 * [SHA256 with Boolean API](tutorials/sha256-bool.md): Delve into a more complex example: implementing the SHA256 hash function entirely on encrypted boolean values.
-* [All tutorials](tutorials/see-all-tutorials.md): A complete list of all available tutorials in one place.
+* [All tutorials](tutorials/see-all-tutorials.md): A complete list of all available tutorials in one place.tutorials: A complete list of all available tutorials in one place.

 ### References & Explanations

--- a/tfhe/docs/configuration/gpu-acceleration/run-on-gpu.md
+++ b/tfhe/docs/configuration/gpu-acceleration/run-on-gpu.md
@@ -18,7 +18,7 @@ To reproduce TFHE-rs GPU benchmarks, see [this dedicated page](../../getting-sta

 ## GPU TFHE-rs features

-By default, the GPU backend uses specific cryptographic parameters. When calling the [`tfhe::ConfigBuilder::default()`](https://doc.rust-lang.org/nightly/core/default/trait.Default.html#tymethod.default) function, the cryptographic parameters for PBS will be:
+By default, the GPU backend uses specific cryptographic parameters. When calling the [`tfhe::ConfigBuilder::default()`](https://doc.rust-lang.org/nightly/core/default/trait.Default.html#tymethod.default) function, the cryptographic for PBS will be:
 - PBS parameters: [`PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS`](https://docs.rs/tfhe/latest/tfhe/shortint/parameters/aliases/constant.PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS.html)

 These PBS parameters are accompanied by the following compression parameters: 
@@ -62,7 +62,7 @@ The key differences between the CPU API and the GPU API are:

 To compile and execute GPU TFHE-rs programs, make sure your system has the following software installed.

-* CUDA version >= 10
+* Cuda version >= 10
 * Compute Capability >= 3.0
 * [gcc](https://gcc.gnu.org/) >= 8.0 - check this [page](https://gist.github.com/ax3l/9489132) for more details about nvcc/gcc compatible versions
 * [cmake](https://cmake.org/) >= 3.24
--- a/tfhe/docs/fhe-computation/advanced-features/encrypted-prf.md
+++ b/tfhe/docs/fhe-computation/advanced-features/encrypted-prf.md
@@ -5,7 +5,7 @@ This document explains the mechanism and steps to generate an oblivious encrypte
 The goal is to give to the server the possibility to generate a random value, which will be obtained in an encrypted format and will remain unknown to the server.

 The main method for this is `FheUint::generate_oblivious_pseudo_random_custom_range` which returns an integer in the given range.
-Currently the range can only be in the form `[0, excluded_upper_bound)` with any `excluded_upper_bound` in `[1, 2^64)`
+Currently the range can only be in the form `[0, excluded_upper_bound[` with any `excluded_upper_bound` in `[1, 2^64[`
 It follows a distribution close to the uniform.

 This function guarantees the norm-1 distance (defined as ∆(P,Q) := 1/2 Sum[ω∈Ω] |P(ω) − Q(ω)|)
@@ -20,11 +20,11 @@ If the range is a power of 2, the distribution is uniform (for any `max_distance


 For powers of 2 specifically there are two methods on `FheUint` and `FheInt` (based on [this article](https://eprint.iacr.org/2024/665)): 
- `generate_oblivious_pseudo_random` which return an integer taken uniformly in the full integer range (`[0; 2^N[` for a `FheUintN` and `[-2^(N-1); 2^(N-1))` for a `FheIntN`).
+- `generate_oblivious_pseudo_random` which return an integer taken uniformly in the full integer range (`[0; 2^N[` for a `FheUintN` and `[-2^(N-1); 2^(N-1)[` for a `FheIntN`).
 - `generate_oblivious_pseudo_random_bounded` which return an integer taken uniformly in `[0; 2^random_bits_count[`. For a `FheUintN`, we must have  `random_bits_count <= N`. For a `FheIntN`, we must have  `random_bits_count <= N - 1`.


-These methods take a seed `Seed` as input, which could be any `u128` value.
+These method functions take a seed `Seed` as input, which could be any `u128` value.
 They rely on the use of the usual server key.
 The output is reproducible, i.e., the function is deterministic from the inputs: assuming the same hardware, seed and server key, this function outputs the same random encrypted value.

--- a/tfhe/docs/fhe-computation/advanced-features/noise-squashing.md
+++ b/tfhe/docs/fhe-computation/advanced-features/noise-squashing.md
@@ -44,7 +44,7 @@ pub fn main() {

    // FheInt16 case
    let clear: i16 = -42;
-    let enc = FheInt16::encrypt(clear, &cks);
+    let enc = FheInt10::encrypt(clear, &cks);
    let bitand = &enc & &enc;
    let squashed = bitand.squash_noise().unwrap();

--- a/tfhe/docs/fhe-computation/advanced-features/overflow-operations.md
+++ b/tfhe/docs/fhe-computation/advanced-features/overflow-operations.md
@@ -8,11 +8,11 @@ These operations might be slower than their non-overflow-detecting equivalent, s

 Here's the list of operations supported along with their symbol:

-| name                                                    | symbol            | type   |
-| ------------------------------------------------------- | ----------------- | ------ |
-| [Add](https://doc.rust-lang.org/std/ops/trait.Add.html) | `overflowing_add` | Binary |
-| [Sub](https://doc.rust-lang.org/std/ops/trait.Sub.html) | `overflowing_sub` | Binary |
-| [Mul](https://doc.rust-lang.org/std/ops/trait.Mul.html) | `overflowing_mul` | Binary |
+| name                                                    | symbol         | type   |
+| ------------------------------------------------------- | -------------- | ------ |
+| [Add](https://doc.rust-lang.org/std/ops/trait.Add.html) | `overflow_add` | Binary |
+| [Sub](https://doc.rust-lang.org/std/ops/trait.Sub.html) | `overflow_sub` | Binary |
+| [Mul](https://doc.rust-lang.org/std/ops/trait.Mul.html) | `overflow_mul` | Binary |

 The usage of these operations is similar to the standard ones. The key difference is in the decryption process, as shown in following example:

--- a/tfhe/docs/fhe-computation/advanced-features/rerand.md
+++ b/tfhe/docs/fhe-computation/advanced-features/rerand.md
@@ -1,6 +1,8 @@
 # Ciphertext Re-Randomization

-This document explains the ciphertext re-randomization feature in TFHE-rs, designed to protect FHE computations against attacks under the sIND-CPA^D security model (`s` stands for strong), described in the paper [Drifting Towards Better Error Probabilities in Fully Homomorphic Encryption Schemes](https://eprint.iacr.org/2024/1718).
+In the paper [Drifting Towards Better Error Probabilities in Fully Homomorphic Encryption Schemes](https://eprint.iacr.org/2024/1718), Bernard et al. introduced the sIND-CPA^D security model (`s` stands for strong here).
+
+This document explains the ciphertext re-randomization feature in TFHE-rs, designed to protect FHE computations against attacks under the sIND-CPA^D security model.

 To be secure under that model, **TFHE-rs** provides a re-randomization primitive that allows users to re-randomize ciphertexts before they are used as inputs to a predefined FHE program `F`. In this context, `F` should be understood as any FHE computation that must remain secure under the sIND-CPA^D model. All encrypted inputs to `F` must be re-randomized prior to execution.

--- a/tfhe/docs/fhe-computation/advanced-features/trivial-ciphertext.md
+++ b/tfhe/docs/fhe-computation/advanced-features/trivial-ciphertext.md
@@ -4,7 +4,7 @@ This document describes how to use trivial encryption in **TFHE-rs** to initiali

 Sometimes, the server side needs to initialize a value. For example, when computing the sum of a list of ciphertexts, you typically initialize the `sum` variable to `0`.

-Instead of asking the client to send an actual encrypted zero, the server can use a trivial encryption. A trivial encryption creates a ciphertext that contains the desired value but isn't securely encrypted - essentially anyone with any key can decrypt it.
+Instead of asking the client to send an actual encrypted zero, the server can use a trivial encryption. A trivial encryption creates a ciphertext that contains the desired value but isn't securely encrypted - essentially anyone, any key can decrypt it.

 ```rust
 use tfhe::prelude::*;
--- a/tfhe/docs/fhe-computation/advanced-features/upgrade-key-chain.md
+++ b/tfhe/docs/fhe-computation/advanced-features/upgrade-key-chain.md
@@ -5,7 +5,7 @@ easily upgrade a ciphertext that is under older parameters to newer parameters.

 It is different and complementary to the data versioning feature, as the
 data versioning feature allows loading ciphertexts generated
-with a previous TFHE-rs version if the ciphertext structure changed.
+with a previous TFHE-rs version if the ciphertext structurally changed.


 The `UpgradeKeyChain` first needs to know about possible parameters, for that,
--- a/tfhe/docs/fhe-computation/compute/parameters.md
+++ b/tfhe/docs/fhe-computation/compute/parameters.md
@@ -25,14 +25,14 @@ fn main() {
 ```

 {% hint style="info" %}
-These default parameters may be updated in future releases of **TFHE-rs**, potentially causing incompatibilities between versions. For production systems, it is therefore recommended to specify a fixed parameter set.
+These default parameters may be updated with in future releases of **TFHE-rs**, potentially causing incompatibilities between versions. For production systems, it is therefore recommended to specify a fixed parameter set.
 {% endhint %}

 ## Parameters versioning and naming scheme

 Parameter sets are versioned for backward compatibility. This means that each set of parameters can be tied to a specific version of **TFHE-rs**, so that they remain unchanged and compatible after an upgrade.

-All parameter sets are stored as variables inside the `tfhe::shortint::parameters` module, with submodules named after the versions of **TFHE-rs** in which these parameters were added. For example, parameters added in **TFHE-rs** v1.0 can be found inside `tfhe::shortint::parameters::v1_0`.
+All parameter sets are stored as variables inside the `tfhe::shortint::parameters` module, with submodules named after the versions of **TFHE-rs** in which these parameters where added. For example, parameters added in **TFHE-rs** v1.0 can be found inside `tfhe::shortint::parameters::v1_0`.

 The naming convention of these parameters indicates their capabilities. Taking `tfhe::parameters::v1_0::V1_0_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128` as an example:

--- a/tfhe/docs/fhe-computation/data-handling/compress.md
+++ b/tfhe/docs/fhe-computation/data-handling/compress.md
@@ -41,7 +41,7 @@ fn main() {
 }
 ```

-### Compressing ciphertexts after some homomorphic computation
+### Compression ciphertexts after some homomorphic computation

 You can compress ciphertexts at any time, even after performing multiple homomorphic operations.

--- a/tfhe/docs/fhe-computation/data-handling/serialization.md
+++ b/tfhe/docs/fhe-computation/data-handling/serialization.md
@@ -4,7 +4,7 @@ This document explains the `serialization` and `deserialization` features that a

 ## Safe serialization/deserialization

-When dealing with sensitive types, it's important to implement safe serialization and safe deserialization functions to prevent runtime errors and enhance security. **TFHE-rs** provides easy to use functions for this purpose, such as `safe_serialize`, `safe_deserialize` and `safe_deserialize_conformant`.
+When dealing with sensitive types, it's important to implement safe serialization and safe deserialization functions to prevent runtime errors and enhance security. **TFHE-rs** provide easy to use functions for this purpose, such as `safe_serialize`, `safe_deserialize` and `safe_deserialize_conformant`.

 Here is a basic example on how to use it:

--- a/tfhe/docs/fhe-computation/operations/arithmetic-operations.md
+++ b/tfhe/docs/fhe-computation/operations/arithmetic-operations.md
@@ -45,7 +45,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
    a *= &b;     // Clear equivalent computations: 15 * 27 mod 256 = 149
    b = &b + &c;    // Clear equivalent computations: 27 + 43 mod 256 = 70
    b -= 76u8;   // Clear equivalent computations: 70 - 76 mod 256 = 250
-    d -= 13i8;   // Clear equivalent computations: -87 - 13 = -100 in [-128, 127]
+    d -= 13i8;   // Clear equivalent computations: -87 - 13 = 100 in [-128, 128[

    let dec_a: u8 = a.decrypt(&keys);
    let dec_b: u8 = b.decrypt(&keys);
--- a/tfhe/docs/fhe-computation/operations/comparison-operations.md
+++ b/tfhe/docs/fhe-computation/operations/comparison-operations.md
@@ -15,8 +15,8 @@ Supported operations:
 | [Not Equal](https://doc.rust-lang.org/std/cmp/trait.PartialEq.html)         | `ne`   | Binary |
 | [Greater Than](https://doc.rust-lang.org/std/cmp/trait.PartialOrd.html)     | `gt`   | Binary |
 | [Greater or Equal](https://doc.rust-lang.org/std/cmp/trait.PartialOrd.html) | `ge`   | Binary |
-| [Less Than](https://doc.rust-lang.org/std/cmp/trait.PartialOrd.html)            | `lt`   | Binary |
-| [Less Than or Equal](https://doc.rust-lang.org/std/cmp/trait.PartialOrd.html)   | `le`   | Binary |
+| [Lower](https://doc.rust-lang.org/std/cmp/trait.PartialOrd.html)            | `lt`   | Binary |
+| [Lower or Equal](https://doc.rust-lang.org/std/cmp/trait.PartialOrd.html)   | `le`   | Binary |

 The following example shows how to perform comparison operations:

--- a/tfhe/docs/fhe-computation/operations/string-operations.md
+++ b/tfhe/docs/fhe-computation/operations/string-operations.md
@@ -21,7 +21,7 @@ This document details the string operations supported by **TFHE-rs**.
 | [find](https://doc.rust-lang.org/stable/std/primitive.str.html#method.find)                                     |find                  | FheAsciiString | FheAsciiString or ClearString                  |
 | [rfind](https://doc.rust-lang.org/stable/std/primitive.str.html#method.rfind)                                   |rfind                 | FheAsciiString | FheAsciiString or ClearString                  |
 | [strip_prefix](https://doc.rust-lang.org/stable/std/primitive.str.html#method.strip_prefix)                     |strip_prefix          | FheAsciiString | FheAsciiString or ClearString                  |
-| [strip_suffix](https://doc.rust-lang.org/stable/std/primitive.str.html#method.strip_suffix)                     |strip_suffix          | FheAsciiString | FheAsciiString or ClearString                  |
+| [strip_suffix](https://doc.rust-lang.org/stable/std/primitive.str.html#method.strip_suffix)                     |strip_suffix          | FheAsciiString | FheAsci---iString or ClearString               |
 | [concat](https://doc.rust-lang.org/stable/std/primitive.str.html#method.concat)                                 |concat                | FheAsciiString | FheAsciiString                                 |
 | [repeat](https://doc.rust-lang.org/stable/std/primitive.str.html#method.repeat)                                 |repeat                | FheAsciiString | u16 or u32 or i32 or usize or (FheUint16, u16) |
 | [trim_end](https://doc.rust-lang.org/stable/std/primitive.str.html#method.trim_end)                             |trim_end              | FheAsciiString |                                                |
--- a/tfhe/docs/fhe-computation/operations/ternary-conditional-operations.md
+++ b/tfhe/docs/fhe-computation/operations/ternary-conditional-operations.md
@@ -2,7 +2,7 @@

 This document details the ternary operations supported by **TFHE-rs**.

-The ternary conditional operator executes conditional instructions in the form `if cond { choice_if_true } else { choice_if_false }`.
+The ternary conditional operator execute conditional instructions in the form `if cond { choice_if_true } else { choice_if_false }`.

 | name             | symbol   | type    |
 | ---------------- | -------- | ------- |
--- a/tfhe/docs/fhe-computation/tooling/debug.md
+++ b/tfhe/docs/fhe-computation/tooling/debug.md
@@ -54,7 +54,7 @@ a: Ok(1234), b: Ok(4567), c: Ok(89101112)
 a * b = Ok(5635678)
 ```

-If any input to `mul_all` is not a trivial ciphertext, the computations will be done 100% in FHE, and the program will output:
+If any input to `mul_all` is not a trivial ciphertexts, the computations will be done 100% in FHE, and the program will output:

 ```console
 a: Err(NotTrivialCiphertextError), b: Err(NotTrivialCiphertextError), c: Err(NotTrivialCiphertextError)
--- a/tfhe/docs/fhe-computation/types/array.md
+++ b/tfhe/docs/fhe-computation/types/array.md
@@ -2,7 +2,7 @@

 This document describes the array types provided by the High-level API.

-These new encrypted types allow you to easily perform array and tensor operations on encrypted data, taking care of the iteration and shape logic for you.
+This new encrypted types allow you to easily perform array and tensor operations on encrypted data, taking care of the iteration and shape logic for you.

 It also implements efficient algorithms in some cases, like summing elements of an array.

--- a/tfhe/docs/fhe-computation/types/strings.md
+++ b/tfhe/docs/fhe-computation/types/strings.md
@@ -2,7 +2,7 @@

 This document explains the FheAsciiString type for handling encrypted strings in TFHE-rs.

-TFHE-rs has support for ASCII strings with the type FheAsciiString.
+TFHE-rs has supports for ASCII strings with the type FheAsciiString.
 You can enable this feature using the flag: --features=strings

 {% hint style="warning" %}
--- a/tfhe/docs/getting-started/README.md
+++ b/tfhe/docs/getting-started/README.md
@@ -16,7 +16,7 @@

 TFHE is a Fully Homomorphic Encryption (FHE) scheme based on Learning With Errors (LWE), which is a secure cryptographic primitive against even quantum computers. The **TFHE-rs** library implements Zama’s variant of TFHE.

-### Homomorphic Encryption Basics
+#### Homomorphic Encryption Basics

 The basic elements of cryptography:

@@ -30,7 +30,7 @@ FHE allows to compute on ciphertexts without revealing the content of the messag
 * **Homomorphic addition:** $$E[x] + E[y] = E[x + y]$$
 * **Homomorphic multiplication:** $$E[x] * E[y] = E[x * y]$$

-### Zama's variant of TFHE
+## Zama's variant of TFHE

 Zama's variant of TFHE is a fully homomorphic scheme that takes fixed-precision numbers as messages. It implements all homomorphic operations needed, such as addition and function evaluation via Programmable Bootstrapping.

--- a/tfhe/docs/getting-started/installation.md
+++ b/tfhe/docs/getting-started/installation.md
@@ -32,7 +32,7 @@ By default, **TFHE-rs** makes the assumption that hardware AES features are enab
 - x86_64: sse2, aesni
 - aarch64: aes, neon

-To add support for older CPUs, import **TFHE-rs** with the `software-prng` feature in your `Cargo.toml`:
+To add support for older CPU, import **TFHE-rs** with the `software-prng` feature in your `Cargo.toml`:

 ```toml
 tfhe = { version = "~1.6.0", features = ["boolean", "shortint", "integer", "software-prng"] }
--- a/tfhe/docs/getting-started/quick-start.md
+++ b/tfhe/docs/getting-started/quick-start.md
@@ -104,11 +104,11 @@ fn main() {
    let a = FheUint8::encrypt(clear_a, &client_key);
    let b = FheUint8::encrypt(clear_b, &client_key);

-    // Server-side
+    //Server-side
    set_server_key(server_key);
    let result = a + b;

-    // Client-side
+    //Client-side
    let decrypted_result: u8 = result.decrypt(&client_key);

    let clear_result = clear_a + clear_b;
--- a/tfhe/docs/getting-started/security-and-cryptography.md
+++ b/tfhe/docs/getting-started/security-and-cryptography.md
@@ -103,7 +103,7 @@ params_lwe = LWE.Parameters(n=879, q=2**64, Xs=ND.Binary, Xe=ND.TUniform(46))
 LWE.estimate(params_lwe, deny_list=("arora-gb", "bkw"))
 ```

-The output corresponds to a selection of attack costs (`usvp`, `bdd`, etc), each with running time `rop`. The security level is the `log2` of the smallest `rop` value (in this case `dual_hybrid` with `2^134.8`). Therefore, the security level of this parameter set is ~134 bits. The same technique can be applied to the GLWE parameters by replacing the LWE dimension `879` by `k*N = 4*512`, i.e. `n=2048` and `Xe=ND.TUniform(46)` by `Xe = ND.TUniform(17)`, that is:
+The output corresponds to a selection of attack costs (`usvp`, `bdd`, etc), each with running time `rop`. The security level is the `log2` of the smallest `rop` value (in this case `dual_hybrid` with `2^134.8`). Therefore, the security level of this parameter set is ~134 bits. The same technique can be applied to the GLWE parameters by replacing the LWE dimension `879` by `k*N = 512*4`, i.e. `n=2048` and `Xe=ND.TUniform(46)` by `Xe = ND.TUniform(17)`, that is:

 ```
 from estimator import *
@@ -124,6 +124,6 @@ The parameter sets for the x86 CPU backend with a $$p_{error} \le 2^{-128}$$ are



-## Classical public key encryption
+## Classical public key encryption.

 In classical public key encryption, the public key contains a given number of ciphertexts all encrypting the value 0. By setting the number of encryptions to 0 in the public key at $$m = \lceil (n+1) \log(q) \rceil + \lambda$$, where $$n$$ is the LWE dimension, $$q$$ is the ciphertext modulus, and $$\lambda$$ is the number of security bits. This construction is secure due to the leftover hash lemma, which relates to the impossibility of breaking the underlying multiple subset sum problem. This guarantees both a high-density subset sum and an exponentially large number of possible associated random vectors per LWE sample $$(a,b)$$.
--- a/tfhe/docs/integration/c-api.md
+++ b/tfhe/docs/integration/c-api.md
@@ -2,7 +2,7 @@

 This document describes the C bindings to the **TFHE-rs** high-level primitives for creating Fully Homomorphic Encryption (FHE) programs.

-## Setting up TFHE-rs C API for C programming
+## Setting up TFHE-rs C API for C programming.

 You can build **TFHE-rs** C API using the following command:

@@ -19,7 +19,7 @@ Locate files in the right path:
  * The `tfhe-c-api-dynamic-buffer.h` header
  * The static (.a) and dynamic (.so) libraries

-Ensure your build system links the C or C++ program against **TFHE-rs** C API binaries and the dynamic buffer library.
+Ensure your build system configures the C or C++ program links against **TFHE-rs** C API binaries and the dynamic buffer library.

 The following is a minimal `CMakeLists.txt` configuration example:

--- a/tfhe/docs/references/fine-grained-apis/quick-start.md
+++ b/tfhe/docs/references/fine-grained-apis/quick-start.md
@@ -4,7 +4,7 @@ This library makes it possible to execute **homomorphic operations over encrypte

 The server, however, has to know the circuit to be evaluated. At the end of the computation, the server returns the encryption of the result to the user. Then the user can decrypt it with the `secret key`.

-## General method to write a homomorphic circuit program
+## General method to write an homomorphic circuit program

 The overall process to write an homomorphic program is the same for all types. The basic steps for using the TFHE-rs library are the following:

--- a/tfhe/docs/tutorials/parity-bit.md
+++ b/tfhe/docs/tutorials/parity-bit.md
@@ -166,7 +166,7 @@ fn check_parity_bit_validity(
 ) -> bool
 ```

-To make it generic, the first step is:
+To make it generic, the first steps is:

 ```Rust
 fn compute_parity_bit<BoolType>(
--- a/tfhe/src/high_level_api/xof_key_set/mod.rs
+++ b/tfhe/src/high_level_api/xof_key_set/mod.rs
@@ -378,7 +378,7 @@ impl CompressedXofKeySet {
    }

    /// Decompress the KeySet
-    pub fn decompress(self) -> crate::Result<XofKeySet> {
+    pub fn decompress(&self) -> crate::Result<XofKeySet> {
        let tag = self.compressed_server_key.tag.clone();
        let (mut public_key, expanded_server_key) = self.expand();
        // Server key tag is the source of truth; sync public key