bench(ci): fix concrete-cuda benchmarks

This commit is contained in:
Agnes Leroy
2023-03-29 15:19:41 +02:00
committed by Agnès Leroy
parent 9cacd4adff
commit d9652b8936
10 changed files with 68 additions and 59 deletions

View File

@@ -44,21 +44,23 @@ jobs:
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- uses: actions/checkout@v2
- uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-11.8/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CC=/usr/bin/gcc-8" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-8" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-8" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Rust install
@@ -78,20 +80,21 @@ jobs:
- name: Benchmark concrete-cuda
if: ${{ !cancelled() }}
run: |
${{ BENCHMARK_DIR }}/benchmark_concrete_cuda --benchmark_out=benchmarks_results.json --benchmark_out_format=json
# Keep both flags on one invocation: the trailing backslash continues the command,
# otherwise the second line would be run as a separate (non-existent) command.
${{ env.BENCHMARK_DIR }}/benchmark_concrete_cuda --benchmark_out=benchmarks_results.json \
--benchmark_out_format=json
- name: Upload raw results artifact
uses: actions/upload-artifact@v3
with:
name: concrete_cuda_${{ github.sha }}_raw
path: ${{ BENCHMARK_DIR }}/benchmarks_results.json
path: benchmarks_results.json
- name: Parse results
shell: bash
run: |
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
COMMIT_HASH="$(git describe --tags --dirty)"
python3 ./ci/benchmark_parser.py ${{ BENCHMARK_DIR }}/benchmarks_results.json ${{ env.RESULTS_FILENAME }} \
python3 ./ci/benchmark_parser.py benchmarks_results.json ${{ env.RESULTS_FILENAME }} \
--database compiler_benchmarks \
--hardware ${{ inputs.instance_type }} \
--project-version ${COMMIT_HASH} \

View File

@@ -17,7 +17,7 @@ typedef struct {
int number_of_inputs;
} BitExtractionBenchmarkParams;
class BitExtractionBenchmark_u64 : public benchmark::Fixture {
class BitExtraction_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
@@ -79,7 +79,7 @@ public:
}
};
BENCHMARK_DEFINE_F(BitExtractionBenchmark_u64, BitExtraction)
BENCHMARK_DEFINE_F(BitExtraction_u64, ConcreteCuda_BitExtraction)
(benchmark::State &st) {
for (auto _ : st) {
// Execute bit extract
@@ -109,5 +109,5 @@ BitExtractionBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
x.number_of_bits_to_extract, x.number_of_inputs});
}
BENCHMARK_REGISTER_F(BitExtractionBenchmark_u64, BitExtraction)
BENCHMARK_REGISTER_F(BitExtraction_u64, ConcreteCuda_BitExtraction)
->Apply(BitExtractionBenchmarkGenerateParams);

View File

@@ -13,7 +13,7 @@ typedef struct {
int input_lwe_ciphertext_count;
} BootstrapBenchmarkParams;
class BootstrapBenchmark_u64 : public benchmark::Fixture {
class Bootstrap_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
@@ -76,7 +76,8 @@ public:
}
};
BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, AmortizedPBS)(benchmark::State &st) {
BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_AmortizedPBS)
(benchmark::State &st) {
void *v_stream = (void *)stream;
for (auto _ : st) {
@@ -92,7 +93,7 @@ BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, AmortizedPBS)(benchmark::State &st) {
}
}
BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, CopiesPlusAmortizedPBS)
BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_CopiesPlusAmortizedPBS)
(benchmark::State &st) {
void *v_stream = (void *)stream;
@@ -119,7 +120,7 @@ BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, CopiesPlusAmortizedPBS)
}
}
BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, LowLatencyPBS)
BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_LowLatencyPBS)
(benchmark::State &st) {
for (auto _ : st) {
// Execute PBS
@@ -134,7 +135,7 @@ BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, LowLatencyPBS)
}
}
BENCHMARK_DEFINE_F(BootstrapBenchmark_u64, CopiesPlusLowLatencyPBS)
BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_CopiesPlusLowLatencyPBS)
(benchmark::State &st) {
void *v_stream = (void *)stream;
@@ -184,12 +185,12 @@ BootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
}
}
BENCHMARK_REGISTER_F(BootstrapBenchmark_u64, AmortizedPBS)
BENCHMARK_REGISTER_F(Bootstrap_u64, ConcreteCuda_AmortizedPBS)
->Apply(BootstrapBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(BootstrapBenchmark_u64, LowLatencyPBS)
BENCHMARK_REGISTER_F(Bootstrap_u64, ConcreteCuda_LowLatencyPBS)
->Apply(BootstrapBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(BootstrapBenchmark_u64, CopiesPlusAmortizedPBS)
BENCHMARK_REGISTER_F(Bootstrap_u64, ConcreteCuda_CopiesPlusAmortizedPBS)
->Apply(BootstrapBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(BootstrapBenchmark_u64, CopiesPlusLowLatencyPBS)
BENCHMARK_REGISTER_F(Bootstrap_u64, ConcreteCuda_CopiesPlusLowLatencyPBS)
->Apply(BootstrapBenchmarkGenerateParams);

View File

@@ -17,7 +17,7 @@ typedef struct {
int number_of_inputs;
} CircuitBootstrapBenchmarkParams;
class CircuitBootstrapBenchmark_u64 : public benchmark::Fixture {
class CircuitBootstrap_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
@@ -87,7 +87,7 @@ public:
}
};
BENCHMARK_DEFINE_F(CircuitBootstrapBenchmark_u64, CircuitBootstrap)
BENCHMARK_DEFINE_F(CircuitBootstrap_u64, ConcreteCuda_CircuitBootstrap)
(benchmark::State &st) {
for (auto _ : st) {
// Execute circuit bootstrap
@@ -116,5 +116,5 @@ CircuitBootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
x.cbs_base_log, x.cbs_level, x.number_of_inputs});
}
BENCHMARK_REGISTER_F(CircuitBootstrapBenchmark_u64, CircuitBootstrap)
BENCHMARK_REGISTER_F(CircuitBootstrap_u64, ConcreteCuda_CircuitBootstrap)
->Apply(CircuitBootstrapBenchmarkGenerateParams);

View File

@@ -14,7 +14,7 @@ typedef struct {
int level_count;
} CMUXTreeBenchmarkParams;
class CMUXTreeBenchmark_u64 : public benchmark::Fixture {
class CMUXTree_u64 : public benchmark::Fixture {
protected:
int glwe_dimension;
int polynomial_size;
@@ -65,7 +65,7 @@ public:
}
};
BENCHMARK_DEFINE_F(CMUXTreeBenchmark_u64, CMUXTree)(benchmark::State &st) {
BENCHMARK_DEFINE_F(CMUXTree_u64, ConcreteCuda_CMUXTree)(benchmark::State &st) {
for (auto _ : st) {
// Execute scratch/CMUX tree/cleanup
cuda_cmux_tree_64(stream, gpu_index, (void *)d_glwe_out,
@@ -90,5 +90,5 @@ static void CMUXTreeBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
x.level_count});
}
BENCHMARK_REGISTER_F(CMUXTreeBenchmark_u64, CMUXTree)
BENCHMARK_REGISTER_F(CMUXTree_u64, ConcreteCuda_CMUXTree)
->Apply(CMUXTreeBenchmarkGenerateParams);

View File

@@ -12,7 +12,7 @@ typedef struct {
int number_of_inputs;
} KeyswitchBenchmarkParams;
class KeyswitchBenchmark_u64 : public benchmark::Fixture {
class Keyswitch_u64 : public benchmark::Fixture {
protected:
int input_lwe_dimension;
int output_lwe_dimension;
@@ -61,7 +61,8 @@ public:
}
};
BENCHMARK_DEFINE_F(KeyswitchBenchmark_u64, Keyswitch)(benchmark::State &st) {
BENCHMARK_DEFINE_F(Keyswitch_u64, ConcreteCuda_Keyswitch)
(benchmark::State &st) {
for (auto _ : st) {
// Execute keyswitch
cuda_keyswitch_lwe_ciphertext_vector_64(
@@ -72,7 +73,7 @@ BENCHMARK_DEFINE_F(KeyswitchBenchmark_u64, Keyswitch)(benchmark::State &st) {
}
}
BENCHMARK_DEFINE_F(KeyswitchBenchmark_u64, CopiesPlusKeyswitch)
BENCHMARK_DEFINE_F(Keyswitch_u64, ConcreteCuda_CopiesPlusKeyswitch)
(benchmark::State &st) {
uint64_t *lwe_in_ct = (uint64_t *)malloc(
number_of_inputs * (input_lwe_dimension + 1) * sizeof(uint64_t));
@@ -110,8 +111,8 @@ KeyswitchBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
x.ksk_level, x.number_of_inputs});
}
BENCHMARK_REGISTER_F(KeyswitchBenchmark_u64, Keyswitch)
BENCHMARK_REGISTER_F(Keyswitch_u64, ConcreteCuda_Keyswitch)
->Apply(KeyswitchBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(KeyswitchBenchmark_u64, CopiesPlusKeyswitch)
BENCHMARK_REGISTER_F(Keyswitch_u64, ConcreteCuda_CopiesPlusKeyswitch)
->Apply(KeyswitchBenchmarkGenerateParams);

View File

@@ -9,7 +9,7 @@ typedef struct {
int input_lwe_ciphertext_count;
} LinearAlgebraBenchmarkParams;
class LinearAlgebraBenchmark_u64 : public benchmark::Fixture {
class LinearAlgebra_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
double noise_variance = 2.9802322387695312e-08;
@@ -62,7 +62,8 @@ public:
}
};
BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, Addition)(benchmark::State &st) {
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_Addition)
(benchmark::State &st) {
// Execute addition
for (auto _ : st) {
cuda_add_lwe_ciphertext_vector_64(
@@ -72,7 +73,7 @@ BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, Addition)(benchmark::State &st) {
}
}
BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, CopiesPlusAddition)
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_CopiesPlusAddition)
(benchmark::State &st) {
// Execute addition
for (auto _ : st) {
@@ -97,7 +98,7 @@ BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, CopiesPlusAddition)
}
}
BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, PlaintextAddition)
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_PlaintextAddition)
(benchmark::State &st) {
for (auto _ : st) {
// Execute addition
@@ -108,7 +109,7 @@ BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, PlaintextAddition)
}
}
BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, CopiesPlusPlaintextAddition)
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_CopiesPlusPlaintextAddition)
(benchmark::State &st) {
for (auto _ : st) {
@@ -131,7 +132,7 @@ BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, CopiesPlusPlaintextAddition)
}
}
BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, PlaintextMultiplication)
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_CleartextMultiplication)
(benchmark::State &st) {
for (auto _ : st) {
// Execute multiplication
@@ -142,8 +143,8 @@ BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, PlaintextMultiplication)
}
}
BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64,
CopiesPlusPlaintextMultiplication)
BENCHMARK_DEFINE_F(LinearAlgebra_u64,
ConcreteCuda_CopiesPlusCleartextMultiplication)
(benchmark::State &st) {
for (auto _ : st) {
cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
@@ -165,7 +166,8 @@ BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64,
}
}
BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, Negate)(benchmark::State &st) {
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_Negation)
(benchmark::State &st) {
for (auto _ : st) {
// Execute negation
cuda_negate_lwe_ciphertext_vector_64(
@@ -175,7 +177,7 @@ BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, Negate)(benchmark::State &st) {
}
}
BENCHMARK_DEFINE_F(LinearAlgebraBenchmark_u64, CopiesPlusNegate)
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_CopiesPlusNegation)
(benchmark::State &st) {
for (auto _ : st) {
cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
@@ -208,20 +210,21 @@ LinearAlgebraBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
b->Args({x.lwe_dimension, x.input_lwe_ciphertext_count});
}
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, Addition)
BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_Addition)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, CopiesPlusAddition)
BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_CopiesPlusAddition)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, PlaintextAddition)
BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_PlaintextAddition)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, CopiesPlusPlaintextAddition)
BENCHMARK_REGISTER_F(LinearAlgebra_u64,
ConcreteCuda_CopiesPlusPlaintextAddition)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, PlaintextMultiplication)
BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_CleartextMultiplication)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64,
CopiesPlusPlaintextMultiplication)
BENCHMARK_REGISTER_F(LinearAlgebra_u64,
ConcreteCuda_CopiesPlusCleartextMultiplication)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, Negate)
BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_Negation)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebraBenchmark_u64, CopiesPlusNegate)
BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_CopiesPlusNegation)
->Apply(LinearAlgebraBenchmarkGenerateParams);

View File

@@ -19,7 +19,7 @@ typedef struct {
int tau;
} WopPBSBenchmarkParams;
class WopPBSBenchmark_u64 : public benchmark::Fixture {
class WopPBS_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
@@ -111,7 +111,7 @@ public:
}
};
BENCHMARK_DEFINE_F(WopPBSBenchmark_u64, WopPBS)(benchmark::State &st) {
BENCHMARK_DEFINE_F(WopPBS_u64, ConcreteCuda_WopPBS)(benchmark::State &st) {
for (auto _ : st) {
// Execute wop pbs
cuda_wop_pbs_64(
@@ -125,7 +125,7 @@ BENCHMARK_DEFINE_F(WopPBSBenchmark_u64, WopPBS)(benchmark::State &st) {
}
}
BENCHMARK_DEFINE_F(WopPBSBenchmark_u64, CopiesPlusWopPBS)
BENCHMARK_DEFINE_F(WopPBS_u64, ConcreteCuda_CopiesPlusWopPBS)
(benchmark::State &st) {
for (auto _ : st) {
cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_in_array,
@@ -163,7 +163,7 @@ static void WopPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
x.tau});
}
BENCHMARK_REGISTER_F(WopPBSBenchmark_u64, WopPBS)
BENCHMARK_REGISTER_F(WopPBS_u64, ConcreteCuda_WopPBS)
->Apply(WopPBSBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(WopPBSBenchmark_u64, CopiesPlusWopPBS)
BENCHMARK_REGISTER_F(WopPBS_u64, ConcreteCuda_CopiesPlusWopPBS)
->Apply(WopPBSBenchmarkGenerateParams);

View File

@@ -1,4 +1,5 @@
{
"m6i.metal": 7.168,
"c6a.metal": 7.344
"m6i.metal": 7.168,
"c6a.metal": 7.344,
"p3.2xlarge": 1.061
}

View File

@@ -21,7 +21,7 @@ security_group= ["sg-0bf1c1d79c97bc88f", ]
[profile.gpu-bench]
region = "us-east-1"
image_id = "ami-03f11dc8c6a5f5c0a"
image_id = "ami-08e27480d79e82238"
instance_type = "p3.2xlarge"
subnet_id = "subnet-8123c9e7"
security_group= ["sg-0f8b52622a2669491", ]
@@ -77,7 +77,7 @@ max_parallel_jobs = 2
[command.concrete-cuda-benchmark]
workflow = "concrete_cuda_benchmark.yml"
profile = "gpu-test"
profile = "gpu-bench"
check_run_name = "Concrete Cuda Performances Benchmarks"
#################################################