mirror of
https://github.com/pseXperiments/icicle.git
synced 2026-01-07 22:53:56 -05:00
fix large (>512 elements) ecntt issue (#553)
This PR fixes an issue with large ECNTT where the CUDA blocks are too large to be assigned to SMs. The fix is to reduce the thread count per block and increase the block count in that case.
This commit is contained in:
@@ -320,7 +320,8 @@ namespace ntt {
|
||||
// less than max to allow more concurrent blocks on SM
|
||||
const int logn_shmem = is_shared_mem_enabled ? int(log(2 * num_threads) / log(2))
|
||||
: 0; // TODO: shared memory support only for types <= 32 bytes
|
||||
int num_threads_coset = max(min(n / 2, MAX_NUM_THREADS), 1);
|
||||
// Note: for ecntt we limit block size (=#threads per block) since otherwise it doesn't fit the SM resources.
|
||||
int num_threads_coset = max(min(n / 2, IS_ECNTT ? MAX_THREADS_BATCH_ECNTT : MAX_NUM_THREADS), 1);
|
||||
int num_blocks_coset = (n * batch_size + num_threads_coset - 1) / num_threads_coset;
|
||||
|
||||
if (inverse) {
|
||||
|
||||
@@ -15,7 +15,7 @@ pub fn check_ecntt<C: Curve>()
|
||||
where
|
||||
<C::ScalarField as FieldImpl>::Config: ECNTT<C>,
|
||||
{
|
||||
let test_sizes = [1 << 4, 1 << 9];
|
||||
let test_sizes = [1 << 4, 1 << 9, 1 << 18];
|
||||
for test_size in test_sizes {
|
||||
let points = C::generate_random_projective_points(test_size);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user