chore(gpu): define higher values for the shared memory size based on compute capability

2026-01-10 07:08:03 -05:00 · 2024-08-01 10:15:48 +02:00
parent 80fe45f354
commit d69dd20079
6 changed files with 31 additions and 12 deletions
--- a/.github/workflows/hyperstack_tfhe_gpu_tests.yml
+++ b/.github/workflows/hyperstack_tfhe_gpu_tests.yml
@@ -144,20 +144,20 @@ jobs:

      - name: Run core crypto and internal CUDA backend tests
        run: |
-          make test_core_crypto_gpu
-          make test_cuda_backend
+          BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
+          BIG_TESTS_INSTANCE=TRUE make test_cuda_backend

      - name: Run user docs tests
        run: |
-          make test_user_doc_gpu
+          BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu

      - name: Test C API
        run: |
-          make test_c_api_gpu
+          BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu

      - name: Run High Level API Tests
        run: |
-          make test_high_level_api_gpu
+          BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu

  slack-notify:
    name: Slack Notification
--- a/.github/workflows/hyperstack_tfhe_signed_integer_gpu_tests.yml
+++ b/.github/workflows/hyperstack_tfhe_signed_integer_gpu_tests.yml
@@ -144,11 +144,11 @@ jobs:

      - name: Run signed integer tests
        run: |
-          make test_signed_integer_gpu_ci
+          BIG_TESTS_INSTANCE=TRUE make test_signed_integer_gpu_ci

      - name: Run signed integer multi-bit tests
        run: |
-          make test_signed_integer_multi_bit_gpu_ci
+          BIG_TESTS_INSTANCE=TRUE make test_signed_integer_multi_bit_gpu_ci

  slack-notify:
    name: Slack Notification
--- a/.github/workflows/hyperstack_tfhe_unsigned_integer_gpu_tests.yml
+++ b/.github/workflows/hyperstack_tfhe_unsigned_integer_gpu_tests.yml
@@ -144,11 +144,11 @@ jobs:

      - name: Run unsigned integer tests
        run: |
-          make test_unsigned_integer_gpu_ci
+          BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_gpu_ci

      - name: Run unsigned integer multi-bit tests
        run: |
-          make test_unsigned_integer_multi_bit_gpu_ci
+          BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_multi_bit_gpu_ci

  slack-notify:
    name: Slack Notification
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -247,5 +247,14 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
  cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
                         gpu_index);
  check_cuda_error(cudaGetLastError());
+#if CUDA_ARCH == 900
+  max_shared_memory = 226000; // below the 227 KB opt-in per-block limit
+#elif CUDA_ARCH == 890
+  max_shared_memory = 100000; // cc 8.9 allows at most 99 KB per block
+#elif CUDA_ARCH == 800
+  max_shared_memory = 163000; // below the 163 KB opt-in per-block limit
+#elif CUDA_ARCH == 700
+  max_shared_memory = 95000; // below the 96 KB opt-in per-block limit
+#endif
  return max_shared_memory;
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -234,7 +234,12 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
  int32_t h_smart_copy_in[r * num_blocks];
  int32_t h_smart_copy_out[r * num_blocks];

-  auto max_shared_memory = cuda_get_max_shared_memory(gpu_indexes[0]);
+  /// Here it is important to query the default max shared memory of the
+  /// device in use directly, instead of calling cuda_get_max_shared_memory,
+  /// to avoid bugs with tree_add_chunks trying to use too much shared memory
+  int max_shared_memory = 0;
+  check_cuda_error(cudaDeviceGetAttribute(
+      &max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, gpu_indexes[0]));

  // create lut object for message and carry
  // we allocate luts_message_carry in the host function (instead of scratch)
--- a/scripts/integer-tests.sh
+++ b/scripts/integer-tests.sh
@@ -129,8 +129,13 @@ fi

 # Override test-threads number to avoid Out-of-memory issues on GPU instances
 if [[ "${backend}" == "gpu" ]]; then
-    test_threads=5
-    doctest_threads=5
+    if [[ "${BIG_TESTS_INSTANCE}" == TRUE ]]; then
+        test_threads=5
+        doctest_threads=5
+    else
+        test_threads=3
+        doctest_threads=3
+    fi
 fi

 filter_expression=$(/usr/bin/python3 scripts/test_filtering.py --layer integer --backend "${backend}" ${fast_tests_argument} ${nightly_tests_argument} ${multi_bit_argument} ${sign_argument} ${no_big_params_argument})