fix large (>512 elements) ecntt issue (#553)

This PR fixes an issue with large ECNTTs (more than 512 elements), where the CUDA thread blocks become too large to be scheduled on the GPU's streaming multiprocessors (SMs): each thread operates on a full elliptic-curve point rather than a single field element, so a maximum-size block exhausts the SM's resources. The fix is to cap the thread count per block at a lower value for ECNTT and increase the block count correspondingly, as the sketch after the first diff below illustrates.
Commit 5516320ad7 (parent a4b1eb3de9)
Author: yshekel
Date: 2024-07-04 15:33:49 +03:00 (committed via GitHub)
2 changed files with 3 additions and 2 deletions


@@ -320,7 +320,8 @@ namespace ntt {
       // less than max to allow more concurrent blocks on SM
       const int logn_shmem = is_shared_mem_enabled ? int(log(2 * num_threads) / log(2))
                                                    : 0; // TODO: shared memory support only for types <= 32 bytes
-      int num_threads_coset = max(min(n / 2, MAX_NUM_THREADS), 1);
+      // Note: for ecntt we limit the block size (= #threads per block) since otherwise it doesn't fit the SM resources.
+      int num_threads_coset = max(min(n / 2, IS_ECNTT ? MAX_THREADS_BATCH_ECNTT : MAX_NUM_THREADS), 1);
       int num_blocks_coset = (n * batch_size + num_threads_coset - 1) / num_threads_coset;
       if (inverse) {
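
For reference, here is a minimal standalone C++ sketch of the launch-size computation after this change. The constant values (512 and 128) are assumptions for illustration only, as is the runtime is_ecntt flag standing in for the diff's IS_ECNTT compile-time parameter; only the identifier names MAX_NUM_THREADS and MAX_THREADS_BATCH_ECNTT come from the diff.

    // Sketch of the coset-kernel launch configuration (not the library's API).
    #include <algorithm>
    #include <cstdio>

    constexpr int MAX_NUM_THREADS = 512;         // assumed generic per-block cap
    constexpr int MAX_THREADS_BATCH_ECNTT = 128; // assumed reduced cap for EC points

    struct LaunchConfig {
      int threads_per_block;
      int num_blocks;
    };

    // Cap threads per block (lower for ECNTT, whose per-thread state is a full
    // curve point), then take enough blocks to cover all n * batch_size
    // elements via ceiling division.
    LaunchConfig coset_launch_config(int n, int batch_size, bool is_ecntt) {
      const int cap = is_ecntt ? MAX_THREADS_BATCH_ECNTT : MAX_NUM_THREADS;
      const int threads = std::max(std::min(n / 2, cap), 1);
      const int blocks = (n * batch_size + threads - 1) / threads;
      return {threads, blocks};
    }

    int main() {
      // 1 << 18 points, batch 1: ECNTT now launches 128-thread blocks
      // (2048 of them) instead of 512-thread blocks.
      const LaunchConfig cfg = coset_launch_config(1 << 18, 1, /*is_ecntt=*/true);
      std::printf("threads=%d blocks=%d\n", cfg.threads_per_block, cfg.num_blocks);
      return 0;
    }

The ceiling division guarantees threads_per_block * num_blocks >= n * batch_size, so lowering the per-block cap simply shifts the same work into more, smaller blocks that the SM scheduler can actually place.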


@@ -15,7 +15,7 @@ pub fn check_ecntt<C: Curve>()
 where
     <C::ScalarField as FieldImpl>::Config: ECNTT<C>,
 {
-    let test_sizes = [1 << 4, 1 << 9];
+    let test_sizes = [1 << 4, 1 << 9, 1 << 18];
     for test_size in test_sizes {
         let points = C::generate_random_projective_points(test_size);
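
The previous largest test size, 1 << 9 = 512, sits exactly at the threshold named in the title, so the added 1 << 18 = 262,144 case is what actually exercises the reduced-block-size path for large ECNTTs.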