mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-04-28 03:01:21 -04:00
Compare commits
1 Commits
as/benchma
...
go/feat/im
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4e9dc1caee |
1
.github/workflows/benchmark_gpu.yml
vendored
1
.github/workflows/benchmark_gpu.yml
vendored
@@ -38,7 +38,6 @@ on:
|
||||
- integer_aes
|
||||
- integer_aes256
|
||||
- hlapi_erc7984
|
||||
- hlapi_erc7984_multi_group
|
||||
- hlapi_dex
|
||||
- hlapi_noise_squash
|
||||
op_flavor:
|
||||
|
||||
35
.github/workflows/benchmark_gpu_common.yml
vendored
35
.github/workflows/benchmark_gpu_common.yml
vendored
@@ -237,10 +237,8 @@ jobs:
|
||||
BENCH_PARAMS_TYPE: ${{ matrix.params_type }}
|
||||
BENCH_COMMAND: ${{ matrix.command }}
|
||||
PRECISIONS_SET: ${{ inputs.precisions_set }}
|
||||
__TFHE_RS_BENCH_MULTI_PROC_GROUPS: 2
|
||||
|
||||
- name: Parse results
|
||||
if: ${{ inputs.command != 'hlapi_erc7984_multi_group' }}
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
|
||||
--database tfhe_rs \
|
||||
@@ -258,39 +256,6 @@ jobs:
|
||||
REF_NAME: ${{ github.ref_name }}
|
||||
BENCH_TYPE: ${{ matrix.bench_type }}
|
||||
|
||||
- name: Parse and merge erc7984_multi_group results
|
||||
if: ${{ inputs.command == 'hlapi_erc7984_multi_group' }}
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py tfhe-benchmark/target_p0/criterion "${RESULTS_FILENAME_P0}" \
|
||||
--database tfhe_rs \
|
||||
--hardware "${INPUTS_HARDWARE_NAME}" \
|
||||
--backend gpu \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--branch "${REF_NAME}" \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${BENCH_DATE}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--bench-type "${BENCH_TYPE}"
|
||||
python3 ./ci/benchmark_parser.py tfhe-benchmark/target_p1/criterion "${RESULTS_FILENAME_P1}" \
|
||||
--database tfhe_rs \
|
||||
--hardware "${INPUTS_HARDWARE_NAME}" \
|
||||
--backend gpu \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--branch "${REF_NAME}" \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${BENCH_DATE}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--bench-type "${BENCH_TYPE}"
|
||||
python3 ./ci/merge_multi_group_results.py --bench-type "${BENCH_TYPE}" --output "${RESULTS_FILENAME}" "${RESULTS_FILENAME_P0}" "${RESULTS_FILENAME_P1}"
|
||||
env:
|
||||
INPUTS_HARDWARE_NAME: ${{ inputs.hardware_name }}
|
||||
REF_NAME: ${{ github.ref_name }}
|
||||
BENCH_TYPE: ${{ matrix.bench_type }}
|
||||
RESULTS_FILENAME_P0: parsed_benchmark_results_p0_${{ github.sha }}.json
|
||||
RESULTS_FILENAME_P1: parsed_benchmark_results_p1_${{ github.sha }}.json
|
||||
|
||||
- name: Parse additional benchmarks results files
|
||||
if: ${{ inputs.additional_file_to_parse }}
|
||||
run: |
|
||||
|
||||
21
.github/workflows/benchmark_summary.yml
vendored
21
.github/workflows/benchmark_summary.yml
vendored
@@ -114,27 +114,6 @@ jobs:
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
|
||||
|
||||
run-benchmarks-gpu-erc7984-multi-group:
|
||||
name: benchmark_documentation/run-benchmarks-gpu-erc7984-multi-group
|
||||
uses: ./.github/workflows/benchmark_gpu_common.yml
|
||||
if: inputs.run-gpu-benchmarks
|
||||
needs: parse-gpu-inputs
|
||||
with:
|
||||
profile: ${{ needs.parse-gpu-inputs.outputs.profile }}
|
||||
hardware_name: ${{ needs.parse-gpu-inputs.outputs.hardware_name }}
|
||||
command: hlapi_erc7984_multi_group
|
||||
bench_type: throughput
|
||||
params_type: multi_bit
|
||||
secrets:
|
||||
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
|
||||
|
||||
# TODO add make recipe for HPU benchmarks
|
||||
# run-benchmarks-hpu:
|
||||
# name: benchmark_documentation/run-benchmarks-hpu
|
||||
|
||||
10
.github/workflows/gpu_core_h100_tests.yml
vendored
10
.github/workflows/gpu_core_h100_tests.yml
vendored
@@ -23,7 +23,7 @@ on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled, opened, synchronize ]
|
||||
types: [ labeled ]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
@@ -38,7 +38,6 @@ jobs:
|
||||
pull-requests: read # Needed to check for file change
|
||||
outputs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
@@ -63,16 +62,15 @@ jobs:
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**/**.md'
|
||||
- '.github/workflows/gpu_core_h100_tests.yml'
|
||||
core_crypto:
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
|
||||
setup-instance:
|
||||
name: gpu_core_h100_tests/setup-instance
|
||||
needs: should-run
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
|
||||
9
.github/workflows/gpu_hlapi_h100_tests.yml
vendored
9
.github/workflows/gpu_hlapi_h100_tests.yml
vendored
@@ -23,7 +23,7 @@ on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled, opened, synchronize ]
|
||||
types: [ labeled ]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
@@ -38,7 +38,6 @@ jobs:
|
||||
pull-requests: read # Needed to check for file change
|
||||
outputs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
@@ -66,15 +65,13 @@ jobs:
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**/**.md'
|
||||
- '.github/workflows/gpu_hlapi_h100_tests.yml'
|
||||
core_crypto:
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
|
||||
setup-instance:
|
||||
name: gpu_hlapi_h100_tests/setup-instance
|
||||
needs: should-run
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
|
||||
39
.github/workflows/gpu_integer_long_run_tests.yml
vendored
39
.github/workflows/gpu_integer_long_run_tests.yml
vendored
@@ -17,8 +17,8 @@ on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
# Weekly tests will be triggered every Monday at 8p.m.
|
||||
- cron: "0 20 * * 1"
|
||||
# Nightly tests will be triggered each evening 8p.m.
|
||||
- cron: "0 20 * * *"
|
||||
pull_request:
|
||||
|
||||
|
||||
@@ -28,41 +28,10 @@ permissions:
|
||||
# zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
name: gpu_integer_long_run_tests/should-run
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: read # Needed to check for file change
|
||||
outputs:
|
||||
is_needed_in_gpu_ci: ${{ env.IS_PR == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
|
||||
with:
|
||||
files_yaml: |
|
||||
gpu:
|
||||
- tfhe/Cargo.toml
|
||||
- tfhe/build.rs
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- '.github/workflows/gpu_integer_long_run_tests.yml'
|
||||
|
||||
setup-instance:
|
||||
name: gpu_integer_long_run_tests/setup-instance
|
||||
needs: [should-run]
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
needs.should-run.outputs.is_needed_in_gpu_ci == 'true'
|
||||
if: github.event_name != 'schedule' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
|
||||
8
.github/workflows/gpu_pcc.yml
vendored
8
.github/workflows/gpu_pcc.yml
vendored
@@ -131,10 +131,6 @@ jobs:
|
||||
env:
|
||||
GCC_VERSION: ${{ matrix.gcc }}
|
||||
|
||||
- name: Run semgrep and lint checks on CUDA code
|
||||
run: |
|
||||
make semgrep_and_lint_gpu_code
|
||||
|
||||
- name: Run fmt checks
|
||||
run: |
|
||||
make check_fmt_gpu
|
||||
@@ -143,6 +139,10 @@ jobs:
|
||||
run: |
|
||||
make pcc_gpu
|
||||
|
||||
- name: Run semgrep and lint checks on CUDA code
|
||||
run: |
|
||||
make semgrep_and_lint_gpu_code
|
||||
|
||||
- name: Run semver checks on tfhe-cuda-backend
|
||||
run: |
|
||||
make semver_check_cuda_backend
|
||||
|
||||
@@ -63,6 +63,7 @@ jobs:
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**/**.md'
|
||||
- '.github/workflows/gpu_signed_integer_classic_tests.yml'
|
||||
- scripts/integer-tests.sh
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled, opened, synchronize ]
|
||||
types: [ labeled ]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
@@ -38,7 +38,6 @@ jobs:
|
||||
pull-requests: read # Needed to check for file change
|
||||
outputs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
@@ -64,17 +63,16 @@ jobs:
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**/**.md'
|
||||
- '.github/workflows/gpu_signed_integer_h100_tests.yml'
|
||||
- scripts/integer-tests.sh
|
||||
core_crypto:
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
|
||||
setup-instance:
|
||||
name: gpu_signed_integer_h100_tests/setup-instance
|
||||
needs: should-run
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
|
||||
@@ -64,6 +64,7 @@ jobs:
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**/**.md'
|
||||
- '.github/workflows/gpu_signed_integer_tests.yml'
|
||||
- scripts/integer-tests.sh
|
||||
|
||||
|
||||
@@ -63,6 +63,7 @@ jobs:
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**/**.md'
|
||||
- '.github/workflows/gpu_unsigned_integer_classic_tests.yml'
|
||||
- scripts/integer-tests.sh
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled, opened, synchronize ]
|
||||
types: [ labeled ]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
@@ -38,7 +38,6 @@ jobs:
|
||||
pull-requests: read # Needed to check for file change
|
||||
outputs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
@@ -64,17 +63,16 @@ jobs:
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**/**.md'
|
||||
- '.github/workflows/gpu_unsigned_integer_h100_tests.yml'
|
||||
- scripts/integer-tests.sh
|
||||
core_crypto:
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
|
||||
setup-instance:
|
||||
name: gpu_unsigned_integer_h100_tests/setup-instance
|
||||
needs: should-run
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
|
||||
@@ -64,6 +64,7 @@ jobs:
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**/**.md'
|
||||
- '.github/workflows/gpu_unsigned_integer_tests.yml'
|
||||
- scripts/integer-tests.sh
|
||||
|
||||
|
||||
3
.github/workflows/gpu_zk_tests.yml
vendored
3
.github/workflows/gpu_zk_tests.yml
vendored
@@ -55,9 +55,12 @@ jobs:
|
||||
- tfhe/build.rs
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- backends/zk-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/zk/**
|
||||
- tfhe-zk-pok/**
|
||||
- 'tfhe/docs/**/**.md'
|
||||
- '.github/workflows/gpu_zk_tests.yml'
|
||||
- ci/slab.toml
|
||||
|
||||
|
||||
47
Makefile
47
Makefile
@@ -312,10 +312,8 @@ semgrep_and_lint_gpu_code: semgrep_lint_setup_venv
|
||||
find "$(TFHECUDA_SRC)" -name '*.h' -o -name '*.cuh' -o -name '*.cu' \
|
||||
| grep -v '/cmake-build-debug/' \
|
||||
| grep -v '/build/' \
|
||||
| xargs venv/bin/semgrep --error --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
|
||||
| xargs venv/bin/semgrep --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
|
||||
venv/bin/python3 "scripts/check_scratch_cleanup.py"
|
||||
@# Split the search string using shell string concatenation so the Makefile line doesn't match itself
|
||||
! git ls-files | xargs grep -n 'TODO: ADD COMM''ENT'
|
||||
|
||||
.PHONY: semver_check_cuda_backend # Run semver checks on tfhe-cuda-backend
|
||||
semver_check_cuda_backend:
|
||||
@@ -1957,40 +1955,6 @@ bench_hlapi_erc7984_gpu_classical: install_rs_check_toolchain
|
||||
--bench hlapi-erc7984 \
|
||||
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
|
||||
|
||||
.PHONY: bench_hlapi_erc7984_multi_group_gpu # Runs ERC7984 bench in two processes (half of gpus for each) and aggregates results
|
||||
bench_hlapi_erc7984_multi_group_gpu: install_rs_check_toolchain
|
||||
# This next line must be kept here: the code can not remove this file without a risk of concurrency issues
|
||||
# we don't know which process starts first and which one deletes the files - file deletion may also be not atomic)
|
||||
rm -f /dev/shm/sem.tfhe_bench_*
|
||||
NUM_GROUPS=$${__TFHE_RS_BENCH_MULTI_PROC_GROUPS:-2}; \
|
||||
[ "$$NUM_GROUPS" -ge 2 ] || { echo "Error: __TFHE_RS_BENCH_MULTI_PROC_GROUPS must be at least 2, got $$NUM_GROUPS"; exit 1; }; \
|
||||
trap "echo 'User interrupted the benchmark, stopping all workers!'; rm -f /dev/shm/sem.tfhe_bench_*; kill 0" INT TERM; \
|
||||
for i in $$(seq 0 $$((NUM_GROUPS - 1))); do \
|
||||
GPU_LIST=$$(python3 ci/split_gpus.py $$i $$NUM_GROUPS) || exit 1; \
|
||||
echo "Starting benchmark group $$i with CUDA_VISIBLE_DEVICES=$$GPU_LIST"; \
|
||||
CUDA_VISIBLE_DEVICES=$$GPU_LIST CARGO_TARGET_DIR=target_p$$i RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_GPU_PROCESS_COUNT=$$NUM_GROUPS \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc7984 \
|
||||
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow' & \
|
||||
done; \
|
||||
wait
|
||||
|
||||
.PHONY: bench_hlapi_erc7984_multi_group_fake_multi_gpu # Runs ERC7984 bench in two processes in parallel on a single GPU (use to debug bench_hlapi_erc7984_multi_group_gpu)
|
||||
bench_hlapi_erc7984_multi_group_fake_multi_gpu: install_rs_check_toolchain
|
||||
# This next line must be kept here: the code can not remove this file without a risk of concurrency issues
|
||||
# we don't know which process starts first and which one deletes the files - file deletion may also be not atomic)
|
||||
rm -f /dev/shm/sem.tfhe_bench_*
|
||||
trap "echo 'User interrupted the benchmark, stopping all workers!'; rm -f /dev/shm/sem.tfhe_bench_*; kill 0" INT TERM; \
|
||||
CARGO_TARGET_DIR=target_p0 RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=throughput __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_GPU_PROCESS_COUNT=2 \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc7984 \
|
||||
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow' & \
|
||||
CARGO_TARGET_DIR=target_p1 RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=throughput __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_GPU_PROCESS_COUNT=2 \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc7984 \
|
||||
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow' & \
|
||||
wait
|
||||
|
||||
.PHONY: bench_hlapi_dex # Run benchmarks for DEX operations
|
||||
bench_hlapi_dex: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
@@ -2119,16 +2083,11 @@ bench_summary_gpu: install_rs_check_toolchain
|
||||
--bench hlapi-noise-squash \
|
||||
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::decomp_noise_squash_comp::'
|
||||
|
||||
# This make target only runs the latency benchmark. This is because
|
||||
# summary benchmarks must use the multi-process-multi-group throughput target
|
||||
# to measure throughput. That target must be followed by specific post-processing steps.
|
||||
# Thus that target is run in a separate step in benchmark_summary.yml.
|
||||
ifneq ($(filter latency both,$(BENCH_TYPE)),)
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=latency __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
|
||||
# ERC7984
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc7984 \
|
||||
--features=integer,gpu,internal-keycache -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow'
|
||||
endif
|
||||
|
||||
# DEX
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
|
||||
|
||||
@@ -301,7 +301,11 @@ __global__ void keyswitch_zero_output_with_output_indices(
|
||||
// in two parts, a constant part is calculated before the loop, and a variable
|
||||
// part is calculated inside the loop. This seems to help with the register
|
||||
// pressure as well.
|
||||
template <typename Torus, typename KSTorus>
|
||||
// LevelCount template fully unrolls the level loop.
|
||||
// Accumulation step is divided in even and odd to expose more instruction level
|
||||
// parallelism. When LevelCount == 0 (default), the original runtime path is
|
||||
// used unchanged.
|
||||
template <typename Torus, typename KSTorus, int LevelCount = 0>
|
||||
__global__ void
|
||||
keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
@@ -318,13 +322,15 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
|
||||
if (tid <= lwe_dimension_out) {
|
||||
|
||||
KSTorus local_lwe_out = 0;
|
||||
// Accumulator is splited into even and odd iterations.
|
||||
KSTorus local_lwe_out_even = 0;
|
||||
KSTorus local_lwe_out_odd = 0;
|
||||
auto block_lwe_array_in = get_chunk(
|
||||
lwe_array_in, lwe_input_indexes[blockIdx.x], lwe_dimension_in + 1);
|
||||
|
||||
if (tid == lwe_dimension_out && threadIdx.y == 0) {
|
||||
if constexpr (std::is_same_v<KSTorus, Torus>) {
|
||||
local_lwe_out = -block_lwe_array_in[lwe_dimension_in];
|
||||
local_lwe_out_even = -block_lwe_array_in[lwe_dimension_in];
|
||||
} else {
|
||||
auto new_body = closest_repr(block_lwe_array_in[lwe_dimension_in],
|
||||
sizeof(KSTorus) * 8, 1);
|
||||
@@ -337,7 +343,7 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
auto rounded_downscaled_body =
|
||||
(KSTorus)(new_body >> input_to_output_scaling_factor);
|
||||
|
||||
local_lwe_out = -rounded_downscaled_body;
|
||||
local_lwe_out_even = -rounded_downscaled_body;
|
||||
}
|
||||
}
|
||||
const Torus mask_mod_b = (1ll << base_log) - 1ll;
|
||||
@@ -351,18 +357,41 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
for (int i = start_i; i < end_i; i++) {
|
||||
Torus state =
|
||||
init_decomposer_state(block_lwe_array_in[i], base_log, level_count);
|
||||
uint32_t offset = i * level_count * (lwe_dimension_out + 1);
|
||||
#pragma unroll 1
|
||||
for (int j = 0; j < level_count; j++) {
|
||||
|
||||
KSTorus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
|
||||
local_lwe_out +=
|
||||
(KSTorus)ksk[tid + j * (lwe_dimension_out + 1) + offset] *
|
||||
decomposed;
|
||||
// Fully unrolled level loop in case LevelCount is precompiled.
|
||||
if constexpr (LevelCount > 0) {
|
||||
uint32_t offset = i * LevelCount * (lwe_dimension_out + 1);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < LevelCount; j++) {
|
||||
KSTorus decomposed =
|
||||
decompose_one<Torus>(state, mask_mod_b, base_log);
|
||||
if (j & 1)
|
||||
local_lwe_out_odd +=
|
||||
(KSTorus)ksk[tid + j * (lwe_dimension_out + 1) + offset] *
|
||||
decomposed;
|
||||
else
|
||||
local_lwe_out_even +=
|
||||
(KSTorus)ksk[tid + j * (lwe_dimension_out + 1) + offset] *
|
||||
decomposed;
|
||||
}
|
||||
} else {
|
||||
// Runtime fallback: original behaviour preserved exactly.
|
||||
uint32_t offset = i * level_count * (lwe_dimension_out + 1);
|
||||
#pragma unroll 1
|
||||
for (int j = 0; j < level_count; j++) {
|
||||
KSTorus decomposed =
|
||||
decompose_one<Torus>(state, mask_mod_b, base_log);
|
||||
local_lwe_out_even +=
|
||||
(KSTorus)ksk[tid + j * (lwe_dimension_out + 1) + offset] *
|
||||
decomposed;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lwe_acc_out[shmem_index] = local_lwe_out;
|
||||
if constexpr (LevelCount > 0)
|
||||
lwe_acc_out[shmem_index] = local_lwe_out_even + local_lwe_out_odd;
|
||||
else
|
||||
lwe_acc_out[shmem_index] = local_lwe_out_even;
|
||||
}
|
||||
|
||||
for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
|
||||
@@ -405,9 +434,46 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
|
||||
dim3 grid(num_samples, num_blocks_per_sample, 1);
|
||||
dim3 threads(num_threads_x, num_threads_y, 1);
|
||||
|
||||
keyswitch<Torus, KSTorus><<<grid, threads, shared_mem, stream>>>(
|
||||
lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count);
|
||||
// Dispatch to a statically-specialised kernel for common level_count values
|
||||
// so the level loop is fully unrolled.
|
||||
#define KS_LAUNCH(N) \
|
||||
keyswitch<Torus, KSTorus, N><<<grid, threads, shared_mem, stream>>>( \
|
||||
lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk, \
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count)
|
||||
|
||||
switch (level_count) {
|
||||
case 1:
|
||||
KS_LAUNCH(1);
|
||||
break;
|
||||
case 2:
|
||||
KS_LAUNCH(2);
|
||||
break;
|
||||
case 3:
|
||||
KS_LAUNCH(3);
|
||||
break;
|
||||
case 4:
|
||||
KS_LAUNCH(4);
|
||||
break;
|
||||
case 5:
|
||||
KS_LAUNCH(5);
|
||||
break;
|
||||
case 6:
|
||||
KS_LAUNCH(6);
|
||||
break;
|
||||
case 7:
|
||||
KS_LAUNCH(7);
|
||||
break;
|
||||
case 8:
|
||||
KS_LAUNCH(8);
|
||||
break;
|
||||
case 9:
|
||||
KS_LAUNCH(9);
|
||||
break;
|
||||
default:
|
||||
KS_LAUNCH(0);
|
||||
break;
|
||||
}
|
||||
#undef KS_LAUNCH
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
|
||||
@@ -489,7 +489,7 @@ template <typename Torus>
|
||||
__host__ void host_modulus_switch_multi_bit(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *array_out, Torus *array_in,
|
||||
int size, uint32_t log_modulus, uint32_t degree, uint32_t grouping_factor) {
|
||||
check_cuda_error(cudaSetDevice(gpu_index));
|
||||
cudaSetDevice(gpu_index);
|
||||
int multibit_size = size / grouping_factor;
|
||||
int num_threads = 0, num_blocks = 0;
|
||||
getNumBlocksAndThreads(multibit_size, 1024, num_blocks, num_threads);
|
||||
|
||||
@@ -308,7 +308,6 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_128(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer) {
|
||||
cleanup_cuda_multi_bit_programmable_bootstrap_128(stream, gpu_index,
|
||||
pbs_buffer);
|
||||
cuda_synchronize_stream(static_cast<cudaStream_t>(stream), gpu_index);
|
||||
}
|
||||
|
||||
// Noise tests variant of the 128-bit multi-bit PBS, restricted to
|
||||
|
||||
@@ -264,7 +264,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
|
||||
scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, pbs_level,
|
||||
input_lwe_ciphertext_count, true, false);
|
||||
input_lwe_ciphertext_count, true, PBS_MS_REDUCTION_T::NO_REDUCTION);
|
||||
uint32_t num_many_lut = 1;
|
||||
uint32_t lut_stride = 0;
|
||||
for (auto _ : st) {
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
"p4d.24xlarge": 32.7726,
|
||||
"p5.48xlarge": 98.32,
|
||||
"rtx4090": 0.04,
|
||||
"n3-L40x4": 3.2,
|
||||
"n3-H100x1": 1.52,
|
||||
"n3-H100x8-NVLink": 12.48,
|
||||
"n3-H100x8": 12.16,
|
||||
|
||||
@@ -1,66 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# This script aggregates multi-process-group benchmark results
|
||||
# that are obtained by running benchmarks in a multi-process approach
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
|
||||
ACCEPTED_TEST_PREFIXES = {
|
||||
"throughput": ["hlapi::cuda::erc7984::throughput"],
|
||||
"latency": ["hlapi::cuda::erc7984::latency"],
|
||||
}
|
||||
|
||||
|
||||
# Looks at the Slab JSON benchmark results and aggregates the "value" field.
|
||||
# For throughput, values are summed across groups.
|
||||
# For latency, values are averaged across groups.
|
||||
def merge_multi_group_results(input_files, output_file, bench_type):
|
||||
accumulated = {}
|
||||
counts = {}
|
||||
metadata = None
|
||||
accepted_prefixes = ACCEPTED_TEST_PREFIXES[bench_type]
|
||||
|
||||
for path in input_files:
|
||||
with open(path) as f:
|
||||
data = json.load(f)
|
||||
if metadata is None:
|
||||
metadata = {k: v for k, v in data.items() if k != "points"}
|
||||
for point in data["points"]:
|
||||
test = point["test"]
|
||||
if not any(test.startswith(prefix) for prefix in accepted_prefixes):
|
||||
print(
|
||||
f"Error: unexpected test '{test}' in {path}: "
|
||||
f"this script only supports aggregation of: {accepted_prefixes}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
sys.exit(1)
|
||||
if test in accumulated:
|
||||
accumulated[test]["value"] += point["value"]
|
||||
counts[test] += 1
|
||||
else:
|
||||
accumulated[test] = dict(point)
|
||||
counts[test] = 1
|
||||
|
||||
if bench_type == "latency":
|
||||
for test in accumulated:
|
||||
accumulated[test]["value"] /= counts[test]
|
||||
|
||||
result = dict(metadata)
|
||||
result["points"] = list(accumulated.values())
|
||||
|
||||
with open(output_file, "w") as f:
|
||||
json.dump(result, f, indent=2)
|
||||
|
||||
|
||||
# The output is a positional argument, for file names we accept 2+
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("input_files", nargs="+")
|
||||
parser.add_argument("--output", required=True)
|
||||
parser.add_argument("--bench-type", required=True, choices=["throughput", "latency"])
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
if len(args.input_files) < 2:
|
||||
print("Error: at least 2 input files required", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
merge_multi_group_results(args.input_files, args.output, args.bench_type)
|
||||
@@ -1,55 +0,0 @@
|
||||
import argparse
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
|
||||
# List the gpus for a sub-group (group_index) of gpus grouped
|
||||
# in num_groups groups. The output string is passed to CUDA_VISIBLE_DEVICES
|
||||
def get_gpu_count() -> int:
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
except FileNotFoundError:
|
||||
print("Error: nvidia-smi not found", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
except subprocess.CalledProcessError as err:
|
||||
print(f"Error: nvidia-smi failed: {err.stderr.strip()}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
return len(result.stdout.strip().splitlines())
|
||||
|
||||
|
||||
def gpu_list_for_group(num_gpus: int, group_index: int, num_groups: int) -> str:
|
||||
# Splits the available gpus un groups and returns
|
||||
# the gpus assigned to group group_index.
|
||||
if num_gpus < num_groups:
|
||||
print(
|
||||
f"Error: cannot split {num_gpus} GPU(s) across {num_groups} group(s): "
|
||||
"not enough GPUs",
|
||||
file=sys.stderr,
|
||||
)
|
||||
sys.exit(1)
|
||||
if num_gpus % num_groups != 0:
|
||||
print(
|
||||
f"Error: {num_gpus} GPU(s) is not evenly divisible by {num_groups} group(s)",
|
||||
file=sys.stderr,
|
||||
)
|
||||
sys.exit(1)
|
||||
gpus_per_group = num_gpus // num_groups
|
||||
start = group_index * gpus_per_group
|
||||
return ",".join(str(i) for i in range(start, start + gpus_per_group))
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Print the CUDA_VISIBLE_DEVICES value for one process in a multi-GPU split."
|
||||
)
|
||||
parser.add_argument("group_index", type=int, help="0-based index of this process group")
|
||||
parser.add_argument("num_groups", type=int, help="Total number of process groups")
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
num_gpus = get_gpu_count()
|
||||
print(gpu_list_for_group(num_gpus, args.group_index, args.num_groups))
|
||||
@@ -14,9 +14,6 @@ publish = false
|
||||
name = "benchmark"
|
||||
path = "src/lib.rs"
|
||||
|
||||
[target.'cfg(unix)'.dependencies]
|
||||
libc = "0.2"
|
||||
|
||||
[dependencies]
|
||||
bincode = { workspace = true }
|
||||
# clap has to be pinned as its minimum supported rust version
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
#[cfg(feature = "gpu")]
|
||||
use benchmark::utilities::{
|
||||
bench_sync_barrier, configure_gpu, get_bench_gpu_instances, get_param_type, ParamType,
|
||||
};
|
||||
use benchmark::utilities::{configure_gpu, get_param_type, ParamType};
|
||||
use benchmark::utilities::{write_to_json_unchecked, OperatorType};
|
||||
use benchmark_spec::{get_bench_type, BenchmarkType};
|
||||
use criterion::measurement::WallTime;
|
||||
@@ -562,11 +560,6 @@ fn cuda_bench_transfer_throughput<FheType, F>(
|
||||
let num_streams_per_gpu = 6; // Hard coded stream value for FheUint64
|
||||
let chunk_size = (num_elems / num_gpus) as usize;
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
if let Some(n) = get_bench_gpu_instances() {
|
||||
bench_sync_barrier(n);
|
||||
}
|
||||
|
||||
b.iter(|| {
|
||||
from_amounts
|
||||
.par_chunks(chunk_size) // Split into chunks of num_gpus
|
||||
|
||||
@@ -472,106 +472,6 @@ pub fn get_param_type() -> &'static ParamType {
|
||||
PARAM_TYPE.get_or_init(|| ParamType::from_env().unwrap())
|
||||
}
|
||||
|
||||
pub fn get_bench_gpu_instances() -> Option<usize> {
|
||||
env::var("__TFHE_RS_BENCH_GPU_PROCESS_COUNT").ok().map(|v| {
|
||||
v.parse::<usize>().unwrap_or_else(|_| {
|
||||
panic!("__TFHE_RS_BENCH_GPU_PROCESS_COUNT must be a positive integer, got '{v}'")
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
/// Multi-process barrier that ensures num_instances processes
|
||||
/// start at the same time
|
||||
#[cfg(target_os = "linux")]
|
||||
pub fn bench_sync_barrier(num_instances: usize) {
|
||||
use std::ffi::CString;
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
|
||||
const BARRIER_TIMEOUT_SECS: u64 = 120;
|
||||
const MUTEX_NAME_PREFIX: &str = "tfhe_bench";
|
||||
|
||||
// Three POSIX semaphores are used for synchronization
|
||||
// The first one is used to make sure the processes increment the
|
||||
// counter and get the value of the counter atomically .
|
||||
let sem_mutex = CString::new(format!("/{MUTEX_NAME_PREFIX}_mutex")).unwrap();
|
||||
let sem_arrive = CString::new(format!("/{MUTEX_NAME_PREFIX}_arrive")).unwrap();
|
||||
let sem_gate = CString::new(format!("/{MUTEX_NAME_PREFIX}_gate")).unwrap();
|
||||
|
||||
let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();
|
||||
let deadline_t = now + Duration::from_secs(BARRIER_TIMEOUT_SECS);
|
||||
let deadline = libc::timespec {
|
||||
tv_sec: deadline_t.as_secs() as libc::time_t,
|
||||
tv_nsec: deadline_t.subsec_nanos() as libc::c_long,
|
||||
};
|
||||
|
||||
let open_sem = |name: &CString, init: u32| {
|
||||
let sem = unsafe { libc::sem_open(name.as_ptr(), libc::O_CREAT, 0o600u32, init) };
|
||||
assert!(
|
||||
sem != libc::SEM_FAILED,
|
||||
"sem_open({:?}) failed: {}",
|
||||
name,
|
||||
std::io::Error::last_os_error()
|
||||
);
|
||||
sem
|
||||
};
|
||||
|
||||
let timed_wait = |sem: *mut libc::sem_t, label: &str| {
|
||||
let ret = unsafe { libc::sem_timedwait(sem, &deadline) };
|
||||
if ret != 0 {
|
||||
panic!(
|
||||
"bench_sync_barrier: timed out on '{label}' after {BARRIER_TIMEOUT_SECS}s \
|
||||
(__TFHE_RS_BENCH_GPU_PROCESS_COUNT={num_instances}). \
|
||||
If semaphores are stale from a prior crash, clean up with: \
|
||||
rm -f /dev/shm/sem.{MUTEX_NAME_PREFIX}_*\n\
|
||||
OS error: {}",
|
||||
std::io::Error::last_os_error()
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
let mutex = open_sem(&sem_mutex, 1);
|
||||
let arrive = open_sem(&sem_arrive, 0);
|
||||
let gate = open_sem(&sem_gate, 0);
|
||||
|
||||
// Process 0 to arrive doesn't need to wait
|
||||
// Processes 1..N to arrive need to wait
|
||||
timed_wait(mutex, "mutex");
|
||||
// Each process posts to the arrive semaphore, incrementing its value
|
||||
unsafe { libc::sem_post(arrive) };
|
||||
// The last process to post to "arrive" will read a value equal to "num_instances"
|
||||
// The other processes read a lower value. "mutex" ensures
|
||||
// the post + get_value are atomic
|
||||
let mut count = 0i32;
|
||||
unsafe { libc::sem_getvalue(arrive, &mut count) };
|
||||
|
||||
// Once a process has posted to arrive and got the value (atomic)
|
||||
// it allows the other processes to do the same
|
||||
unsafe { libc::sem_post(mutex) };
|
||||
|
||||
// The last process reads the "num_instances" value from arrive.
|
||||
// it must then tell the others to continue work. if it doesn't
|
||||
// the other processes will time out at the "gate"
|
||||
if count as usize == num_instances {
|
||||
for _ in 0..num_instances {
|
||||
// Open the gate
|
||||
unsafe { libc::sem_post(gate) };
|
||||
}
|
||||
}
|
||||
|
||||
// Every process waits at the gate. If it doesn't open in a certain time, then we panic
|
||||
timed_wait(gate, "gate");
|
||||
|
||||
// Clean up
|
||||
unsafe {
|
||||
libc::sem_close(mutex);
|
||||
libc::sem_close(arrive);
|
||||
libc::sem_close(gate);
|
||||
libc::sem_unlink(sem_mutex.as_ptr());
|
||||
libc::sem_unlink(sem_arrive.as_ptr());
|
||||
libc::sem_unlink(sem_gate.as_ptr());
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate a number of threads to use to saturate current machine for throughput measurements.
|
||||
pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
|
||||
let ref_block_count = 32; // Represent a ciphertext of 64 bits for 2_2 parameters set
|
||||
|
||||
Reference in New Issue
Block a user