bench(ci): fix concrete-cuda benchmarks

2026-04-17 03:00:54 -04:00 · 2023-03-29 15:19:41 +02:00
parent 9cacd4adff
commit d9652b8936
10 changed files with 68 additions and 59 deletions
--- a/.github/workflows/concrete_cuda_benchmark.yml
+++ b/.github/workflows/concrete_cuda_benchmark.yml
@@ -44,21 +44,23 @@ jobs:
        run: |
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
      - name: Export CUDA variables
        if: ${{ !cancelled() }}
        run: |
          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-11.8/bin/nvcc" >> "${GITHUB_ENV}"
      # Specify the correct host compilers
      - name: Export gcc and g++ variables
        if: ${{ !cancelled() }}
        run: |
-          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
+          echo "CC=/usr/bin/gcc-8" >> "${GITHUB_ENV}"
+          echo "CXX=/usr/bin/g++-8" >> "${GITHUB_ENV}"
+          echo "CUDAHOSTCXX=/usr/bin/g++-8" >> "${GITHUB_ENV}"
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Rust install
@@ -78,20 +80,21 @@ jobs:
      - name: Benchmark concrete-cuda
        if: ${{ !cancelled() }}
        run: |
-          ${{ BENCHMARK_DIR }}/benchmark_concrete_cuda --benchmark_out=benchmarks_results.json --benchmark_out_format=json
+          ${{ env.BENCHMARK_DIR }}/benchmark_concrete_cuda --benchmark_out=benchmarks_results.json \
+            --benchmark_out_format=json

      - name: Upload raw results artifact
        uses: actions/upload-artifact@v3
        with:
          name: concrete_cuda_${{ github.sha }}_raw
-          path: ${{ BENCHMARK_DIR }}/benchmarks_results.json
+          path: benchmarks_results.json

      - name: Parse results
        shell: bash
        run: |
          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
          COMMIT_HASH="$(git describe --tags --dirty)"
-          python3 ./ci/benchmark_parser.py ${{ BENCHMARK_DIR }}/benchmarks_results.json ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/benchmark_parser.py benchmarks_results.json ${{ env.RESULTS_FILENAME }} \
          --database compiler_benchmarks \
          --hardware ${{ inputs.instance_type }} \
          --project-version ${COMMIT_HASH} \
--- a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_bit_extraction.cpp
+++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_bit_extraction.cpp
@@ -17,7 +17,7 @@ typedef struct {
  int number_of_inputs;
 } BitExtractionBenchmarkParams;

-class BitExtractionBenchmark_u64 : public benchmark::Fixture {
+class BitExtraction_u64 : public benchmark::Fixture {
 protected:
  int lwe_dimension;
  int glwe_dimension;
@@ -79,7 +79,7 @@ public:
  }
 };

-BENCHMARK_DEFINE_F(BitExtractionBenchmark_u64, BitExtraction)
+BENCHMARK_DEFINE_F(BitExtraction_u64, ConcreteCuda_BitExtraction)
 (benchmark::State &st) {
  for (auto _ : st) {
    // Execute bit extract
@@ -109,5 +109,5 @@ BitExtractionBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
             x.number_of_bits_to_extract, x.number_of_inputs});
 }

-BENCHMARK_REGISTER_F(BitExtractionBenchmark_u64, BitExtraction)
+BENCHMARK_REGISTER_F(BitExtraction_u64, ConcreteCuda_BitExtraction)
    ->Apply(BitExtractionBenchmarkGenerateParams);
--- a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_bootstrap.cpp
+++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_bootstrap.cpp
@@ -13,7 +13,7 @@ typedef struct {
  int input_lwe_ciphertext_count;
 } BootstrapBenchmarkParams;

-class BootstrapBenchmark_u64 : public benchmark::Fixture {
+class Bootstrap_u64 : public benchmark::Fixture {
 protected:
  int lwe_dimension;
  int glwe_dimension;
@@ -76,7 +76,8 @@ public:
  }
 };

-BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, AmortizedPBS)(benchmark::State &st) {
+BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_AmortizedPBS)
+(benchmark::State &st) {
  void *v_stream = (void *)stream;

  for (auto _ : st) {
@@ -92,7 +93,7 @@ BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, AmortizedPBS)(benchmark::State &st) {
  }
 }

-BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, CopiesPlusAmortizedPBS)
+BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_CopiesPlusAmortizedPBS)
 (benchmark::State &st) {
  void *v_stream = (void *)stream;

@@ -119,7 +120,7 @@ BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, CopiesPlusAmortizedPBS)
  }
 }

-BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, LowLatencyPBS)
+BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_LowLatencyPBS)
 (benchmark::State &st) {
  for (auto _ : st) {
    // Execute PBS
@@ -134,7 +135,7 @@ BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, LowLatencyPBS)
  }
 }

-BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, CopiesPlusLowLatencyPBS)
+BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_CopiesPlusLowLatencyPBS)
 (benchmark::State &st) {
  void *v_stream = (void *)stream;

@@ -184,12 +185,12 @@ BootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
    }
 }

-BENCHMARK_REGISTER_F(BootstrapBenchmark_u64, AmortizedPBS)
+BENCHMARK_REGISTER_F(Bootstrap_u64, ConcreteCuda_AmortizedPBS)
    ->Apply(BootstrapBenchmarkGenerateParams);
-BENCHMARK_REGISTER_F(BootstrapBenchmark_u64, LowLatencyPBS)
+BENCHMARK_REGISTER_F(Bootstrap_u64, ConcreteCuda_LowLatencyPBS)
    ->Apply(BootstrapBenchmarkGenerateParams);

-BENCHMARK_REGISTER_F(BootstrapBenchmark_u64, CopiesPlusAmortizedPBS)
+BENCHMARK_REGISTER_F(Bootstrap_u64, ConcreteCuda_CopiesPlusAmortizedPBS)
    ->Apply(BootstrapBenchmarkGenerateParams);
-BENCHMARK_REGISTER_F(BootstrapBenchmark_u64, CopiesPlusLowLatencyPBS)
+BENCHMARK_REGISTER_F(Bootstrap_u64, ConcreteCuda_CopiesPlusLowLatencyPBS)
    ->Apply(BootstrapBenchmarkGenerateParams);
--- a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_circuit_bootstrap.cpp
+++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_circuit_bootstrap.cpp
@@ -17,7 +17,7 @@ typedef struct {
  int number_of_inputs;
 } CircuitBootstrapBenchmarkParams;

-class CircuitBootstrapBenchmark_u64 : public benchmark::Fixture {
+class CircuitBootstrap_u64 : public benchmark::Fixture {
 protected:
  int lwe_dimension;
  int glwe_dimension;
@@ -87,7 +87,7 @@ public:
  }
 };

-BENCHMARK_DEFINE_F(CircuitBootstrapBenchmark_u64, CircuitBootstrap)
+BENCHMARK_DEFINE_F(CircuitBootstrap_u64, ConcreteCuda_CircuitBootstrap)
 (benchmark::State &st) {
  for (auto _ : st) {
    // Execute circuit bootstrap
@@ -116,5 +116,5 @@ CircuitBootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
             x.cbs_base_log, x.cbs_level, x.number_of_inputs});
 }

-BENCHMARK_REGISTER_F(CircuitBootstrapBenchmark_u64, CircuitBootstrap)
+BENCHMARK_REGISTER_F(CircuitBootstrap_u64, ConcreteCuda_CircuitBootstrap)
    ->Apply(CircuitBootstrapBenchmarkGenerateParams);
--- a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_cmux_tree.cpp
+++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_cmux_tree.cpp
@@ -14,7 +14,7 @@ typedef struct {
  int level_count;
 } CMUXTreeBenchmarkParams;

-class CMUXTreeBenchmark_u64 : public benchmark::Fixture {
+class CMUXTree_u64 : public benchmark::Fixture {
 protected:
  int glwe_dimension;
  int polynomial_size;
@@ -65,7 +65,7 @@ public:
  }
 };

-BENCHMARK_DEFINE_F(CMUXTreeBenchmark_u64, CMUXTree)(benchmark::State &st) {
+BENCHMARK_DEFINE_F(CMUXTree_u64, ConcreteCuda_CMUXTree)(benchmark::State &st) {
  for (auto _ : st) {
    // Execute scratch/CMUX tree/cleanup
    cuda_cmux_tree_64(stream, gpu_index, (void *)d_glwe_out,
@@ -90,5 +90,5 @@ static void CMUXTreeBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
             x.level_count});
 }

-BENCHMARK_REGISTER_F(CMUXTreeBenchmark_u64, CMUXTree)
+BENCHMARK_REGISTER_F(CMUXTree_u64, ConcreteCuda_CMUXTree)
    ->Apply(CMUXTreeBenchmarkGenerateParams);
--- a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_keyswitch.cpp
+++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_keyswitch.cpp
@@ -12,7 +12,7 @@ typedef struct {
  int number_of_inputs;
 } KeyswitchBenchmarkParams;

-class KeyswitchBenchmark_u64 : public benchmark::Fixture {
+class Keyswitch_u64 : public benchmark::Fixture {
 protected:
  int input_lwe_dimension;
  int output_lwe_dimension;
@@ -61,7 +61,8 @@ public:
  }
 };

-BENCHMARK_DEFINE_F(KeyswitchBenchmark_u64, Keyswitch)(benchmark::State &st) {
+BENCHMARK_DEFINE_F(Keyswitch_u64, ConcreteCuda_Keyswitch)
+(benchmark::State &st) {
  for (auto _ : st) {
    // Execute keyswitch
    cuda_keyswitch_lwe_ciphertext_vector_64(
@@ -72,7 +73,7 @@ BENCHMARK_DEFINE_F(KeyswitchBenchmark_u64, Keyswitch)(benchmark::State &st) {
  }
 }

-BENCHMARK_DEFINE_F(KeyswitchBenchmark_u64, CopiesPlusKeyswitch)
+BENCHMARK_DEFINE_F(Keyswitch_u64, ConcreteCuda_CopiesPlusKeyswitch)
 (benchmark::State &st) {
  uint64_t *lwe_in_ct = (uint64_t *)malloc(
      number_of_inputs * (input_lwe_dimension + 1) * sizeof(uint64_t));
@@ -110,8 +111,8 @@ KeyswitchBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
             x.ksk_level, x.number_of_inputs});
 }

-BENCHMARK_REGISTER_F(KeyswitchBenchmark_u64, Keyswitch)
+BENCHMARK_REGISTER_F(Keyswitch_u64, ConcreteCuda_Keyswitch)
    ->Apply(KeyswitchBenchmarkGenerateParams);

-BENCHMARK_REGISTER_F(KeyswitchBenchmark_u64, CopiesPlusKeyswitch)
+BENCHMARK_REGISTER_F(Keyswitch_u64, ConcreteCuda_CopiesPlusKeyswitch)
    ->Apply(KeyswitchBenchmarkGenerateParams);
--- a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_linear_algebra.cpp
+++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_linear_algebra.cpp
@@ -9,7 +9,7 @@ typedef struct {
  int input_lwe_ciphertext_count;
 } LinearAlgebraBenchmarkParams;

-class LinearAlgebraBenchmark_u64 : public benchmark::Fixture {
+class LinearAlgebra_u64 : public benchmark::Fixture {
 protected:
  int lwe_dimension;
  double noise_variance = 2.9802322387695312e-08;
@@ -62,7 +62,8 @@ public:
  }
 };

-BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, Addition)(benchmark::State &st) {
+BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_Addition)
+(benchmark::State &st) {
  // Execute addition
  for (auto _ : st) {
    cuda_add_lwe_ciphertext_vector_64(
@@ -72,7 +73,7 @@ BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, Addition)(benchmark::State &st) {
  }
 }

-BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, CopiesPlusAddition)
+BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_CopiesPlusAddition)
 (benchmark::State &st) {
  // Execute addition
  for (auto _ : st) {
@@ -97,7 +98,7 @@ BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, CopiesPlusAddition)
  }
 }

-BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, PlaintextAddition)
+BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_PlaintextAddition)
 (benchmark::State &st) {
  for (auto _ : st) {
    // Execute addition
@@ -108,7 +109,7 @@ BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, PlaintextAddition)
  }
 }

-BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, CopiesPlusPlaintextAddition)
+BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_CopiesPlusPlaintextAddition)
 (benchmark::State &st) {
  for (auto _ : st) {

@@ -131,7 +132,7 @@ BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, CopiesPlusPlaintextAddition)
  }
 }

-BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, PlaintextMultiplication)
+BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_CleartextMultiplication)
 (benchmark::State &st) {
  for (auto _ : st) {
    // Execute addition
@@ -142,8 +143,8 @@ BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, PlaintextMultiplication)
  }
 }

-BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64,
-                   CopiesPlusPlaintextMultiplication)
+BENCHMARK_DEFINE_F(LinearAlgebra_u64,
+                   ConcreteCuda_CopiesPlusCleartextMultiplication)
 (benchmark::State &st) {
  for (auto _ : st) {
    cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
@@ -165,7 +166,8 @@ BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64,
  }
 }

-BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, Negate)(benchmark::State &st) {
+BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_Negation)
+(benchmark::State &st) {
  for (auto _ : st) {
    // Execute addition
    cuda_negate_lwe_ciphertext_vector_64(
@@ -175,7 +177,7 @@ BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, Negate)(benchmark::State &st) {
  }
 }

-BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, CopiesPlusNegate)
+BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_CopiesPlusNegation)
 (benchmark::State &st) {
  for (auto _ : st) {
    cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
@@ -208,20 +210,21 @@ LinearAlgebraBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
    b->Args({x.lwe_dimension, x.input_lwe_ciphertext_count});
 }

-BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, Addition)
+BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_Addition)
    ->Apply(LinearAlgebraBenchmarkGenerateParams);
-BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, CopiesPlusAddition)
+BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_CopiesPlusAddition)
    ->Apply(LinearAlgebraBenchmarkGenerateParams);
-BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, PlaintextAddition)
+BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_PlaintextAddition)
    ->Apply(LinearAlgebraBenchmarkGenerateParams);
-BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, CopiesPlusPlaintextAddition)
+BENCHMARK_REGISTER_F(LinearAlgebra_u64,
+                     ConcreteCuda_CopiesPlusPlaintextAddition)
    ->Apply(LinearAlgebraBenchmarkGenerateParams);
-BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, PlaintextMultiplication)
+BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_CleartextMultiplication)
    ->Apply(LinearAlgebraBenchmarkGenerateParams);
-BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64,
-                     CopiesPlusPlaintextMultiplication)
+BENCHMARK_REGISTER_F(LinearAlgebra_u64,
+                     ConcreteCuda_CopiesPlusCleartextMultiplication)
    ->Apply(LinearAlgebraBenchmarkGenerateParams);
-BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, Negate)
+BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_Negation)
    ->Apply(LinearAlgebraBenchmarkGenerateParams);
-BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, CopiesPlusNegate)
+BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_CopiesPlusNegation)
    ->Apply(LinearAlgebraBenchmarkGenerateParams);
--- a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_wop_bootstrap.cpp
+++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_wop_bootstrap.cpp
@@ -19,7 +19,7 @@ typedef struct {
  int tau;
 } WopPBSBenchmarkParams;

-class WopPBSBenchmark_u64 : public benchmark::Fixture {
+class WopPBS_u64 : public benchmark::Fixture {
 protected:
  int lwe_dimension;
  int glwe_dimension;
@@ -111,7 +111,7 @@ public:
  }
 };

-BENCHMARK_DEFINE_F(WopPBSBenchmark_u64, WopPBS)(benchmark::State &st) {
+BENCHMARK_DEFINE_F(WopPBS_u64, ConcreteCuda_WopPBS)(benchmark::State &st) {
  for (auto _ : st) {
    // Execute wop pbs
    cuda_wop_pbs_64(
@@ -125,7 +125,7 @@ BENCHMARK_DEFINE_F(WopPBSBenchmark_u64, WopPBS)(benchmark::State &st) {
  }
 }

-BENCHMARK_DEFINE_F(WopPBSBenchmark_u64, CopiesPlusWopPBS)
+BENCHMARK_DEFINE_F(WopPBS_u64, ConcreteCuda_CopiesPlusWopPBS)
 (benchmark::State &st) {
  for (auto _ : st) {
    cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_in_array,
@@ -163,7 +163,7 @@ static void WopPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
             x.tau});
 }

-BENCHMARK_REGISTER_F(WopPBSBenchmark_u64, WopPBS)
+BENCHMARK_REGISTER_F(WopPBS_u64, ConcreteCuda_WopPBS)
    ->Apply(WopPBSBenchmarkGenerateParams);
-BENCHMARK_REGISTER_F(WopPBSBenchmark_u64, CopiesPlusWopPBS)
+BENCHMARK_REGISTER_F(WopPBS_u64, ConcreteCuda_CopiesPlusWopPBS)
    ->Apply(WopPBSBenchmarkGenerateParams);
--- a/ci/ec2_products_cost.json
+++ b/ci/ec2_products_cost.json
@@ -1,4 +1,5 @@
 {
-  "m6i.metal": 7.168,
-  "c6a.metal": 7.344
+    "m6i.metal": 7.168,
+    "c6a.metal": 7.344,
+    "p3.2xlarge": 1.061
 }
--- a/ci/slab.toml
+++ b/ci/slab.toml
@@ -21,7 +21,7 @@ security_group= ["sg-0bf1c1d79c97bc88f", ]

 [profile.gpu-bench]
 region = "us-east-1"
-image_id = "ami-03f11dc8c6a5f5c0a"
+image_id = "ami-08e27480d79e82238"
 instance_type = "p3.2xlarge"
 subnet_id = "subnet-8123c9e7"
 security_group= ["sg-0f8b52622a2669491", ]
@@ -77,7 +77,7 @@ max_parallel_jobs = 2

 [command.concrete-cuda-benchmark]
 workflow = "concrete_cuda_benchmark.yml"
-profile = "gpu-test"
+profile = "gpu-bench"
 check_run_name = "Concrete Cuda Performances Benchmarks"

 #################################################