chore(gpu): define higher values for the max shared memory size based on compute capability

This commit is contained in:
Agnes Leroy
2024-08-01 10:15:48 +02:00
committed by Agnès Leroy
parent 80fe45f354
commit d69dd20079
6 changed files with 31 additions and 12 deletions

View File

@@ -144,20 +144,20 @@ jobs:
- name: Run core crypto and internal CUDA backend tests
run: |
make test_core_crypto_gpu
make test_cuda_backend
BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
BIG_TESTS_INSTANCE=TRUE make test_cuda_backend
- name: Run user docs tests
run: |
make test_user_doc_gpu
BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
- name: Test C API
run: |
make test_c_api_gpu
BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
- name: Run High Level API Tests
run: |
make test_high_level_api_gpu
BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
slack-notify:
name: Slack Notification

View File

@@ -144,11 +144,11 @@ jobs:
- name: Run signed integer tests
run: |
make test_signed_integer_gpu_ci
BIG_TESTS_INSTANCE=TRUE make test_signed_integer_gpu_ci
- name: Run signed integer multi-bit tests
run: |
make test_signed_integer_multi_bit_gpu_ci
BIG_TESTS_INSTANCE=TRUE make test_signed_integer_multi_bit_gpu_ci
slack-notify:
name: Slack Notification

View File

@@ -144,11 +144,11 @@ jobs:
- name: Run unsigned integer tests
run: |
make test_unsigned_integer_gpu_ci
BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_gpu_ci
- name: Run unsigned integer multi-bit tests
run: |
make test_unsigned_integer_multi_bit_gpu_ci
BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_multi_bit_gpu_ci
slack-notify:
name: Slack Notification

View File

@@ -247,5 +247,14 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
gpu_index);
check_cuda_error(cudaGetLastError());
#if CUDA_ARCH == 900
max_shared_memory = 226000;
#elif CUDA_ARCH == 890
max_shared_memory = 127000;
#elif CUDA_ARCH == 800
max_shared_memory = 163000;
#elif CUDA_ARCH == 700
max_shared_memory = 95000;
#endif
return max_shared_memory;
}

View File

@@ -234,7 +234,12 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
int32_t h_smart_copy_in[r * num_blocks];
int32_t h_smart_copy_out[r * num_blocks];
auto max_shared_memory = cuda_get_max_shared_memory(gpu_indexes[0]);
/// Here it is important to query the default max shared memory per block on
/// device 0 directly instead of calling cuda_get_max_shared_memory (which
/// returns hard-coded, architecture-specific values), to avoid bugs with
/// tree_add_chunks trying to use too much shared memory
int max_shared_memory = 0;
check_cuda_error(cudaDeviceGetAttribute(
&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));
// create lut object for message and carry
// we allocate luts_message_carry in the host function (instead of scratch)

View File

@@ -129,8 +129,13 @@ fi
# Override test-threads number to avoid Out-of-memory issues on GPU instances
if [[ "${backend}" == "gpu" ]]; then
test_threads=5
doctest_threads=5
if [[ "${BIG_TESTS_INSTANCE}" == TRUE ]]; then
test_threads=5
doctest_threads=5
else
test_threads=3
doctest_threads=3
fi
fi
filter_expression=$(/usr/bin/python3 scripts/test_filtering.py --layer integer --backend "${backend}" ${fast_tests_argument} ${nightly_tests_argument} ${multi_bit_argument} ${sign_argument} ${no_big_params_argument})