fix large (>512 elements) ecntt issue (#553)

This PR fixes an issue with large ECNTTs (more than 512 elements), where the CUDA thread blocks become too large to be scheduled on the GPU's streaming multiprocessors (SMs): each thread operates on a full elliptic-curve point rather than a single field element, so a maximum-size block exhausts the SM's resources. The fix is to cap the thread count per block at a lower value for ECNTT and increase the block count correspondingly, as the sketch after the first diff below illustrates.
Commit 5516320ad7 (parent a4b1eb3de9)
Author: yshekel
Date: 2024-07-04 15:33:49 +03:00 (committed via GitHub)
2 changed files with 3 additions and 2 deletions


@@ -320,7 +320,8 @@ namespace ntt {
       // less than max to allow more concurrent blocks on SM
       const int logn_shmem = is_shared_mem_enabled ? int(log(2 * num_threads) / log(2))
                                                    : 0; // TODO: shared memory support only for types <= 32 bytes
-      int num_threads_coset = max(min(n / 2, MAX_NUM_THREADS), 1);
+      // Note: for ecntt we limit the block size (= #threads per block) since otherwise it doesn't fit the SM resources.
+      int num_threads_coset = max(min(n / 2, IS_ECNTT ? MAX_THREADS_BATCH_ECNTT : MAX_NUM_THREADS), 1);
       int num_blocks_coset = (n * batch_size + num_threads_coset - 1) / num_threads_coset;
       if (inverse) {
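
For reference, here is a minimal standalone C++ sketch of the launch-size computation after this change. The constant values (512 and 128) are assumptions for illustration only, as is the runtime is_ecntt flag standing in for the diff's IS_ECNTT compile-time parameter; only the identifier names MAX_NUM_THREADS and MAX_THREADS_BATCH_ECNTT come from the diff.

    // Sketch of the coset-kernel launch configuration (not the library's API).
    #include <algorithm>
    #include <cstdio>

    constexpr int MAX_NUM_THREADS = 512;         // assumed generic per-block cap
    constexpr int MAX_THREADS_BATCH_ECNTT = 128; // assumed reduced cap for EC points

    struct LaunchConfig {
      int threads_per_block;
      int num_blocks;
    };

    // Cap threads per block (lower for ECNTT, whose per-thread state is a full
    // curve point), then take enough blocks to cover all n * batch_size
    // elements via ceiling division.
    LaunchConfig coset_launch_config(int n, int batch_size, bool is_ecntt) {
      const int cap = is_ecntt ? MAX_THREADS_BATCH_ECNTT : MAX_NUM_THREADS;
      const int threads = std::max(std::min(n / 2, cap), 1);
      const int blocks = (n * batch_size + threads - 1) / threads;
      return {threads, blocks};
    }

    int main() {
      // 1 << 18 points, batch 1: ECNTT now launches 128-thread blocks
      // (2048 of them) instead of 512-thread blocks.
      const LaunchConfig cfg = coset_launch_config(1 << 18, 1, /*is_ecntt=*/true);
      std::printf("threads=%d blocks=%d\n", cfg.threads_per_block, cfg.num_blocks);
      return 0;
    }

The ceiling division guarantees threads_per_block * num_blocks >= n * batch_size, so lowering the per-block cap simply shifts the same work into more, smaller blocks that the SM scheduler can actually place.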


@@ -15,7 +15,7 @@ pub fn check_ecntt<C: Curve>()
 where
     <C::ScalarField as FieldImpl>::Config: ECNTT<C>,
 {
-    let test_sizes = [1 << 4, 1 << 9];
+    let test_sizes = [1 << 4, 1 << 9, 1 << 18];
     for test_size in test_sizes {
         let points = C::generate_random_projective_points(test_size);
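
The previous largest test size, 1 << 9 = 512, sits exactly at the threshold named in the title, so the added 1 << 18 = 262,144 case is what actually exercises the reduced-block-size path for large ECNTTs.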