Compare commits

..

1 Commits

Author SHA1 Message Date
Guillermo Oyarzun
4e9dc1caee feat(gpu): expose ILP in standard keyswitch 2026-04-15 13:19:33 +02:00
25 changed files with 117 additions and 415 deletions

View File

@@ -38,7 +38,6 @@ on:
- integer_aes
- integer_aes256
- hlapi_erc7984
- hlapi_erc7984_multi_group
- hlapi_dex
- hlapi_noise_squash
op_flavor:

View File

@@ -237,10 +237,8 @@ jobs:
BENCH_PARAMS_TYPE: ${{ matrix.params_type }}
BENCH_COMMAND: ${{ matrix.command }}
PRECISIONS_SET: ${{ inputs.precisions_set }}
__TFHE_RS_BENCH_MULTI_PROC_GROUPS: 2
- name: Parse results
if: ${{ inputs.command != 'hlapi_erc7984_multi_group' }}
run: |
python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
--database tfhe_rs \
@@ -258,39 +256,6 @@ jobs:
REF_NAME: ${{ github.ref_name }}
BENCH_TYPE: ${{ matrix.bench_type }}
- name: Parse and merge erc7984_multi_group results
if: ${{ inputs.command == 'hlapi_erc7984_multi_group' }}
run: |
python3 ./ci/benchmark_parser.py tfhe-benchmark/target_p0/criterion "${RESULTS_FILENAME_P0}" \
--database tfhe_rs \
--hardware "${INPUTS_HARDWARE_NAME}" \
--backend gpu \
--project-version "${COMMIT_HASH}" \
--branch "${REF_NAME}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${BENCH_DATE}" \
--walk-subdirs \
--name-suffix avx512 \
--bench-type "${BENCH_TYPE}"
python3 ./ci/benchmark_parser.py tfhe-benchmark/target_p1/criterion "${RESULTS_FILENAME_P1}" \
--database tfhe_rs \
--hardware "${INPUTS_HARDWARE_NAME}" \
--backend gpu \
--project-version "${COMMIT_HASH}" \
--branch "${REF_NAME}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${BENCH_DATE}" \
--walk-subdirs \
--name-suffix avx512 \
--bench-type "${BENCH_TYPE}"
python3 ./ci/merge_multi_group_results.py --bench-type "${BENCH_TYPE}" --output "${RESULTS_FILENAME}" "${RESULTS_FILENAME_P0}" "${RESULTS_FILENAME_P1}"
env:
INPUTS_HARDWARE_NAME: ${{ inputs.hardware_name }}
REF_NAME: ${{ github.ref_name }}
BENCH_TYPE: ${{ matrix.bench_type }}
RESULTS_FILENAME_P0: parsed_benchmark_results_p0_${{ github.sha }}.json
RESULTS_FILENAME_P1: parsed_benchmark_results_p1_${{ github.sha }}.json
- name: Parse additional benchmarks results files
if: ${{ inputs.additional_file_to_parse }}
run: |

View File

@@ -114,27 +114,6 @@ jobs:
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-gpu-erc7984-multi-group:
name: benchmark_documentation/run-benchmarks-gpu-erc7984-multi-group
uses: ./.github/workflows/benchmark_gpu_common.yml
if: inputs.run-gpu-benchmarks
needs: parse-gpu-inputs
with:
profile: ${{ needs.parse-gpu-inputs.outputs.profile }}
hardware_name: ${{ needs.parse-gpu-inputs.outputs.hardware_name }}
command: hlapi_erc7984_multi_group
bench_type: throughput
params_type: multi_bit
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
# TODO add make recipe for HPU benchmarks
# run-benchmarks-hpu:
# name: benchmark_documentation/run-benchmarks-hpu

View File

@@ -23,7 +23,7 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled, opened, synchronize ]
types: [ labeled ]
permissions:
contents: read
@@ -38,7 +38,6 @@ jobs:
pull-requests: read # Needed to check for file change
outputs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -63,16 +62,15 @@ jobs:
- tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
- tfhe/src/shortint/parameters/**
- tfhe/src/c_api/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_core_h100_tests.yml'
core_crypto:
- tfhe/src/core_crypto/gpu/**
setup-instance:
name: gpu_core_h100_tests/setup-instance
needs: should-run
if: github.event_name != 'pull_request' ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}

View File

@@ -23,7 +23,7 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled, opened, synchronize ]
types: [ labeled ]
permissions:
contents: read
@@ -38,7 +38,6 @@ jobs:
pull-requests: read # Needed to check for file change
outputs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -66,15 +65,13 @@ jobs:
- tfhe/src/c_api/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_hlapi_h100_tests.yml'
core_crypto:
- tfhe/src/core_crypto/gpu/**
setup-instance:
name: gpu_hlapi_h100_tests/setup-instance
needs: should-run
if: github.event_name != 'pull_request' ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}

View File

@@ -17,8 +17,8 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
schedule:
# Weekly tests will be triggered every Monday at 8p.m.
- cron: "0 20 * * 1"
# Nightly tests will be triggered each evening 8p.m.
- cron: "0 20 * * *"
pull_request:
@@ -28,41 +28,10 @@ permissions:
# zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning
jobs:
should-run:
name: gpu_integer_long_run_tests/should-run
runs-on: ubuntu-latest
permissions:
pull-requests: read # Needed to check for file change
outputs:
is_needed_in_gpu_ci: ${{ env.IS_PR == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
gpu:
- tfhe/Cargo.toml
- tfhe/build.rs
- backends/tfhe-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/src/shortint/parameters/**
- '.github/workflows/gpu_integer_long_run_tests.yml'
setup-instance:
name: gpu_integer_long_run_tests/setup-instance
needs: [should-run]
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
needs.should-run.outputs.is_needed_in_gpu_ci == 'true'
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}

View File

@@ -131,10 +131,6 @@ jobs:
env:
GCC_VERSION: ${{ matrix.gcc }}
- name: Run semgrep and lint checks on CUDA code
run: |
make semgrep_and_lint_gpu_code
- name: Run fmt checks
run: |
make check_fmt_gpu
@@ -143,6 +139,10 @@ jobs:
run: |
make pcc_gpu
- name: Run semgrep and lint checks on CUDA code
run: |
make semgrep_and_lint_gpu_code
- name: Run semver checks on tfhe-cuda-backend
run: |
make semver_check_cuda_backend

View File

@@ -63,6 +63,7 @@ jobs:
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_signed_integer_classic_tests.yml'
- scripts/integer-tests.sh

View File

@@ -23,7 +23,7 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled, opened, synchronize ]
types: [ labeled ]
permissions:
contents: read
@@ -38,7 +38,6 @@ jobs:
pull-requests: read # Needed to check for file change
outputs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -64,17 +63,16 @@ jobs:
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_signed_integer_h100_tests.yml'
- scripts/integer-tests.sh
core_crypto:
- tfhe/src/core_crypto/gpu/**
setup-instance:
name: gpu_signed_integer_h100_tests/setup-instance
needs: should-run
if: github.event_name != 'pull_request' ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}

View File

@@ -64,6 +64,7 @@ jobs:
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_signed_integer_tests.yml'
- scripts/integer-tests.sh

View File

@@ -63,6 +63,7 @@ jobs:
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_unsigned_integer_classic_tests.yml'
- scripts/integer-tests.sh

View File

@@ -23,7 +23,7 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled, opened, synchronize ]
types: [ labeled ]
permissions:
contents: read
@@ -38,7 +38,6 @@ jobs:
pull-requests: read # Needed to check for file change
outputs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -64,17 +63,16 @@ jobs:
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_unsigned_integer_h100_tests.yml'
- scripts/integer-tests.sh
core_crypto:
- tfhe/src/core_crypto/gpu/**
setup-instance:
name: gpu_unsigned_integer_h100_tests/setup-instance
needs: should-run
if: github.event_name == 'workflow_dispatch' ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}

View File

@@ -64,6 +64,7 @@ jobs:
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_unsigned_integer_tests.yml'
- scripts/integer-tests.sh

View File

@@ -55,9 +55,12 @@ jobs:
- tfhe/build.rs
- backends/tfhe-cuda-backend/**
- backends/zk-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/src/shortint/parameters/**
- tfhe/src/zk/**
- tfhe-zk-pok/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_zk_tests.yml'
- ci/slab.toml

View File

@@ -312,10 +312,8 @@ semgrep_and_lint_gpu_code: semgrep_lint_setup_venv
find "$(TFHECUDA_SRC)" -name '*.h' -o -name '*.cuh' -o -name '*.cu' \
| grep -v '/cmake-build-debug/' \
| grep -v '/build/' \
| xargs venv/bin/semgrep --error --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
| xargs venv/bin/semgrep --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
venv/bin/python3 "scripts/check_scratch_cleanup.py"
@# Split the search string using shell string concatenation so the Makefile line doesn't match itself
! git ls-files | xargs grep -n 'TODO: ADD COMM''ENT'
.PHONY: semver_check_cuda_backend # Run semver checks on tfhe-cuda-backend
semver_check_cuda_backend:
@@ -1957,40 +1955,6 @@ bench_hlapi_erc7984_gpu_classical: install_rs_check_toolchain
--bench hlapi-erc7984 \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_hlapi_erc7984_multi_group_gpu # Runs ERC7984 bench in two processes (half of gpus for each) and aggregates results
bench_hlapi_erc7984_multi_group_gpu: install_rs_check_toolchain
# This next line must be kept here: the code can not remove this file without a risk of concurrency issues
# we don't know which process starts first and which one deletes the files - file deletion may also be not atomic)
rm -f /dev/shm/sem.tfhe_bench_*
NUM_GROUPS=$${__TFHE_RS_BENCH_MULTI_PROC_GROUPS:-2}; \
[ "$$NUM_GROUPS" -ge 2 ] || { echo "Error: __TFHE_RS_BENCH_MULTI_PROC_GROUPS must be at least 2, got $$NUM_GROUPS"; exit 1; }; \
trap "echo 'User interrupted the benchmark, stopping all workers!'; rm -f /dev/shm/sem.tfhe_bench_*; kill 0" INT TERM; \
for i in $$(seq 0 $$((NUM_GROUPS - 1))); do \
GPU_LIST=$$(python3 ci/split_gpus.py $$i $$NUM_GROUPS) || exit 1; \
echo "Starting benchmark group $$i with CUDA_VISIBLE_DEVICES=$$GPU_LIST"; \
CUDA_VISIBLE_DEVICES=$$GPU_LIST CARGO_TARGET_DIR=target_p$$i RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_GPU_PROCESS_COUNT=$$NUM_GROUPS \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc7984 \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow' & \
done; \
wait
.PHONY: bench_hlapi_erc7984_multi_group_fake_multi_gpu # Runs ERC7984 bench in two processes in parallel on a single GPU (use to debug bench_hlapi_erc7984_multi_group_gpu)
bench_hlapi_erc7984_multi_group_fake_multi_gpu: install_rs_check_toolchain
# This next line must be kept here: the code can not remove this file without a risk of concurrency issues
# we don't know which process starts first and which one deletes the files - file deletion may also be not atomic)
rm -f /dev/shm/sem.tfhe_bench_*
trap "echo 'User interrupted the benchmark, stopping all workers!'; rm -f /dev/shm/sem.tfhe_bench_*; kill 0" INT TERM; \
CARGO_TARGET_DIR=target_p0 RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=throughput __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_GPU_PROCESS_COUNT=2 \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc7984 \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow' & \
CARGO_TARGET_DIR=target_p1 RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=throughput __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_GPU_PROCESS_COUNT=2 \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc7984 \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow' & \
wait
.PHONY: bench_hlapi_dex # Run benchmarks for DEX operations
bench_hlapi_dex: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
@@ -2119,16 +2083,11 @@ bench_summary_gpu: install_rs_check_toolchain
--bench hlapi-noise-squash \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::decomp_noise_squash_comp::'
# This make target only runs the latency benchmark. This is because
# summary benchmarks must use the multi-process-multi-group throughput target
# to measure throughput. That target must be followed by specific post-processing steps.
# Thus that target is run in a separate step in benchmark_summary.yml.
ifneq ($(filter latency both,$(BENCH_TYPE)),)
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=latency __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
# ERC7984
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc7984 \
--features=integer,gpu,internal-keycache -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow'
endif
# DEX
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \

View File

@@ -301,7 +301,11 @@ __global__ void keyswitch_zero_output_with_output_indices(
// in two parts, a constant part is calculated before the loop, and a variable
// part is calculated inside the loop. This seems to help with the register
// pressure as well.
template <typename Torus, typename KSTorus>
// LevelCount template fully unrolls the level loop.
// Accumulation step is divided in even and odd to expose more instruction level
// parallelism. When LevelCount == 0 (default), the original runtime path is
// used unchanged.
template <typename Torus, typename KSTorus, int LevelCount = 0>
__global__ void
keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const Torus *__restrict__ lwe_array_in,
@@ -318,13 +322,15 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
if (tid <= lwe_dimension_out) {
KSTorus local_lwe_out = 0;
// Accumulator is splited into even and odd iterations.
KSTorus local_lwe_out_even = 0;
KSTorus local_lwe_out_odd = 0;
auto block_lwe_array_in = get_chunk(
lwe_array_in, lwe_input_indexes[blockIdx.x], lwe_dimension_in + 1);
if (tid == lwe_dimension_out && threadIdx.y == 0) {
if constexpr (std::is_same_v<KSTorus, Torus>) {
local_lwe_out = -block_lwe_array_in[lwe_dimension_in];
local_lwe_out_even = -block_lwe_array_in[lwe_dimension_in];
} else {
auto new_body = closest_repr(block_lwe_array_in[lwe_dimension_in],
sizeof(KSTorus) * 8, 1);
@@ -337,7 +343,7 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
auto rounded_downscaled_body =
(KSTorus)(new_body >> input_to_output_scaling_factor);
local_lwe_out = -rounded_downscaled_body;
local_lwe_out_even = -rounded_downscaled_body;
}
}
const Torus mask_mod_b = (1ll << base_log) - 1ll;
@@ -351,18 +357,41 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
for (int i = start_i; i < end_i; i++) {
Torus state =
init_decomposer_state(block_lwe_array_in[i], base_log, level_count);
uint32_t offset = i * level_count * (lwe_dimension_out + 1);
#pragma unroll 1
for (int j = 0; j < level_count; j++) {
KSTorus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
local_lwe_out +=
(KSTorus)ksk[tid + j * (lwe_dimension_out + 1) + offset] *
decomposed;
// Fully unrolled level loop in case LevelCount is precompiled.
if constexpr (LevelCount > 0) {
uint32_t offset = i * LevelCount * (lwe_dimension_out + 1);
#pragma unroll
for (int j = 0; j < LevelCount; j++) {
KSTorus decomposed =
decompose_one<Torus>(state, mask_mod_b, base_log);
if (j & 1)
local_lwe_out_odd +=
(KSTorus)ksk[tid + j * (lwe_dimension_out + 1) + offset] *
decomposed;
else
local_lwe_out_even +=
(KSTorus)ksk[tid + j * (lwe_dimension_out + 1) + offset] *
decomposed;
}
} else {
// Runtime fallback: original behaviour preserved exactly.
uint32_t offset = i * level_count * (lwe_dimension_out + 1);
#pragma unroll 1
for (int j = 0; j < level_count; j++) {
KSTorus decomposed =
decompose_one<Torus>(state, mask_mod_b, base_log);
local_lwe_out_even +=
(KSTorus)ksk[tid + j * (lwe_dimension_out + 1) + offset] *
decomposed;
}
}
}
lwe_acc_out[shmem_index] = local_lwe_out;
if constexpr (LevelCount > 0)
lwe_acc_out[shmem_index] = local_lwe_out_even + local_lwe_out_odd;
else
lwe_acc_out[shmem_index] = local_lwe_out_even;
}
for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
@@ -405,9 +434,46 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
dim3 grid(num_samples, num_blocks_per_sample, 1);
dim3 threads(num_threads_x, num_threads_y, 1);
keyswitch<Torus, KSTorus><<<grid, threads, shared_mem, stream>>>(
lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
lwe_dimension_in, lwe_dimension_out, base_log, level_count);
// Dispatch to a statically-specialised kernel for common level_count values
// so the level loop is fully unrolled.
#define KS_LAUNCH(N) \
keyswitch<Torus, KSTorus, N><<<grid, threads, shared_mem, stream>>>( \
lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk, \
lwe_dimension_in, lwe_dimension_out, base_log, level_count)
switch (level_count) {
case 1:
KS_LAUNCH(1);
break;
case 2:
KS_LAUNCH(2);
break;
case 3:
KS_LAUNCH(3);
break;
case 4:
KS_LAUNCH(4);
break;
case 5:
KS_LAUNCH(5);
break;
case 6:
KS_LAUNCH(6);
break;
case 7:
KS_LAUNCH(7);
break;
case 8:
KS_LAUNCH(8);
break;
case 9:
KS_LAUNCH(9);
break;
default:
KS_LAUNCH(0);
break;
}
#undef KS_LAUNCH
check_cuda_error(cudaGetLastError());
}

View File

@@ -489,7 +489,7 @@ template <typename Torus>
__host__ void host_modulus_switch_multi_bit(
cudaStream_t stream, uint32_t gpu_index, Torus *array_out, Torus *array_in,
int size, uint32_t log_modulus, uint32_t degree, uint32_t grouping_factor) {
check_cuda_error(cudaSetDevice(gpu_index));
cudaSetDevice(gpu_index);
int multibit_size = size / grouping_factor;
int num_threads = 0, num_blocks = 0;
getNumBlocksAndThreads(multibit_size, 1024, num_blocks, num_threads);

View File

@@ -308,7 +308,6 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_128(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer) {
cleanup_cuda_multi_bit_programmable_bootstrap_128(stream, gpu_index,
pbs_buffer);
cuda_synchronize_stream(static_cast<cudaStream_t>(stream), gpu_index);
}
// Noise tests variant of the 128-bit multi-bit PBS, restricted to

View File

@@ -264,7 +264,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_level,
input_lwe_ciphertext_count, true, false);
input_lwe_ciphertext_count, true, PBS_MS_REDUCTION_T::NO_REDUCTION);
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
for (auto _ : st) {

View File

@@ -6,7 +6,6 @@
"p4d.24xlarge": 32.7726,
"p5.48xlarge": 98.32,
"rtx4090": 0.04,
"n3-L40x4": 3.2,
"n3-H100x1": 1.52,
"n3-H100x8-NVLink": 12.48,
"n3-H100x8": 12.16,

View File

@@ -1,66 +0,0 @@
#!/usr/bin/env python3
# This script aggregates multi-process-group benchmark results
# that are obtained by running benchmarks in a multi-process approach
import argparse
import json
import sys
ACCEPTED_TEST_PREFIXES = {
"throughput": ["hlapi::cuda::erc7984::throughput"],
"latency": ["hlapi::cuda::erc7984::latency"],
}
# Looks at the Slab JSON benchmark results and aggregates the "value" field.
# For throughput, values are summed across groups.
# For latency, values are averaged across groups.
def merge_multi_group_results(input_files, output_file, bench_type):
accumulated = {}
counts = {}
metadata = None
accepted_prefixes = ACCEPTED_TEST_PREFIXES[bench_type]
for path in input_files:
with open(path) as f:
data = json.load(f)
if metadata is None:
metadata = {k: v for k, v in data.items() if k != "points"}
for point in data["points"]:
test = point["test"]
if not any(test.startswith(prefix) for prefix in accepted_prefixes):
print(
f"Error: unexpected test '{test}' in {path}: "
f"this script only supports aggregation of: {accepted_prefixes}",
file=sys.stderr,
)
sys.exit(1)
if test in accumulated:
accumulated[test]["value"] += point["value"]
counts[test] += 1
else:
accumulated[test] = dict(point)
counts[test] = 1
if bench_type == "latency":
for test in accumulated:
accumulated[test]["value"] /= counts[test]
result = dict(metadata)
result["points"] = list(accumulated.values())
with open(output_file, "w") as f:
json.dump(result, f, indent=2)
# The output is a positional argument, for file names we accept 2+
parser = argparse.ArgumentParser()
parser.add_argument("input_files", nargs="+")
parser.add_argument("--output", required=True)
parser.add_argument("--bench-type", required=True, choices=["throughput", "latency"])
if __name__ == "__main__":
args = parser.parse_args()
if len(args.input_files) < 2:
print("Error: at least 2 input files required", file=sys.stderr)
sys.exit(1)
merge_multi_group_results(args.input_files, args.output, args.bench_type)

View File

@@ -1,55 +0,0 @@
import argparse
import subprocess
import sys
# List the gpus for a sub-group (group_index) of gpus grouped
# in num_groups groups. The output string is passed to CUDA_VISIBLE_DEVICES
def get_gpu_count() -> int:
try:
result = subprocess.run(
["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
capture_output=True,
text=True,
check=True,
)
except FileNotFoundError:
print("Error: nvidia-smi not found", file=sys.stderr)
sys.exit(1)
except subprocess.CalledProcessError as err:
print(f"Error: nvidia-smi failed: {err.stderr.strip()}", file=sys.stderr)
sys.exit(1)
return len(result.stdout.strip().splitlines())
def gpu_list_for_group(num_gpus: int, group_index: int, num_groups: int) -> str:
# Splits the available gpus un groups and returns
# the gpus assigned to group group_index.
if num_gpus < num_groups:
print(
f"Error: cannot split {num_gpus} GPU(s) across {num_groups} group(s): "
"not enough GPUs",
file=sys.stderr,
)
sys.exit(1)
if num_gpus % num_groups != 0:
print(
f"Error: {num_gpus} GPU(s) is not evenly divisible by {num_groups} group(s)",
file=sys.stderr,
)
sys.exit(1)
gpus_per_group = num_gpus // num_groups
start = group_index * gpus_per_group
return ",".join(str(i) for i in range(start, start + gpus_per_group))
parser = argparse.ArgumentParser(
description="Print the CUDA_VISIBLE_DEVICES value for one process in a multi-GPU split."
)
parser.add_argument("group_index", type=int, help="0-based index of this process group")
parser.add_argument("num_groups", type=int, help="Total number of process groups")
if __name__ == "__main__":
args = parser.parse_args()
num_gpus = get_gpu_count()
print(gpu_list_for_group(num_gpus, args.group_index, args.num_groups))

View File

@@ -14,9 +14,6 @@ publish = false
name = "benchmark"
path = "src/lib.rs"
[target.'cfg(unix)'.dependencies]
libc = "0.2"
[dependencies]
bincode = { workspace = true }
# clap has to be pinned as its minimum supported rust version

View File

@@ -1,7 +1,5 @@
#[cfg(feature = "gpu")]
use benchmark::utilities::{
bench_sync_barrier, configure_gpu, get_bench_gpu_instances, get_param_type, ParamType,
};
use benchmark::utilities::{configure_gpu, get_param_type, ParamType};
use benchmark::utilities::{write_to_json_unchecked, OperatorType};
use benchmark_spec::{get_bench_type, BenchmarkType};
use criterion::measurement::WallTime;
@@ -562,11 +560,6 @@ fn cuda_bench_transfer_throughput<FheType, F>(
let num_streams_per_gpu = 6; // Hard coded stream value for FheUint64
let chunk_size = (num_elems / num_gpus) as usize;
#[cfg(target_os = "linux")]
if let Some(n) = get_bench_gpu_instances() {
bench_sync_barrier(n);
}
b.iter(|| {
from_amounts
.par_chunks(chunk_size) // Split into chunks of num_gpus

View File

@@ -472,106 +472,6 @@ pub fn get_param_type() -> &'static ParamType {
PARAM_TYPE.get_or_init(|| ParamType::from_env().unwrap())
}
pub fn get_bench_gpu_instances() -> Option<usize> {
env::var("__TFHE_RS_BENCH_GPU_PROCESS_COUNT").ok().map(|v| {
v.parse::<usize>().unwrap_or_else(|_| {
panic!("__TFHE_RS_BENCH_GPU_PROCESS_COUNT must be a positive integer, got '{v}'")
})
})
}
/// Multi-process barrier that ensures num_instances processes
/// start at the same time
#[cfg(target_os = "linux")]
pub fn bench_sync_barrier(num_instances: usize) {
use std::ffi::CString;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
const BARRIER_TIMEOUT_SECS: u64 = 120;
const MUTEX_NAME_PREFIX: &str = "tfhe_bench";
// Three POSIX semaphores are used for synchronization
// The first one is used to make sure the processes increment the
// counter and get the value of the counter atomically .
let sem_mutex = CString::new(format!("/{MUTEX_NAME_PREFIX}_mutex")).unwrap();
let sem_arrive = CString::new(format!("/{MUTEX_NAME_PREFIX}_arrive")).unwrap();
let sem_gate = CString::new(format!("/{MUTEX_NAME_PREFIX}_gate")).unwrap();
let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();
let deadline_t = now + Duration::from_secs(BARRIER_TIMEOUT_SECS);
let deadline = libc::timespec {
tv_sec: deadline_t.as_secs() as libc::time_t,
tv_nsec: deadline_t.subsec_nanos() as libc::c_long,
};
let open_sem = |name: &CString, init: u32| {
let sem = unsafe { libc::sem_open(name.as_ptr(), libc::O_CREAT, 0o600u32, init) };
assert!(
sem != libc::SEM_FAILED,
"sem_open({:?}) failed: {}",
name,
std::io::Error::last_os_error()
);
sem
};
let timed_wait = |sem: *mut libc::sem_t, label: &str| {
let ret = unsafe { libc::sem_timedwait(sem, &deadline) };
if ret != 0 {
panic!(
"bench_sync_barrier: timed out on '{label}' after {BARRIER_TIMEOUT_SECS}s \
(__TFHE_RS_BENCH_GPU_PROCESS_COUNT={num_instances}). \
If semaphores are stale from a prior crash, clean up with: \
rm -f /dev/shm/sem.{MUTEX_NAME_PREFIX}_*\n\
OS error: {}",
std::io::Error::last_os_error()
);
}
};
let mutex = open_sem(&sem_mutex, 1);
let arrive = open_sem(&sem_arrive, 0);
let gate = open_sem(&sem_gate, 0);
// Process 0 to arrive doesn't need to wait
// Processes 1..N to arrive need to wait
timed_wait(mutex, "mutex");
// Each process posts to the arrive semaphore, incrementing its value
unsafe { libc::sem_post(arrive) };
// The last process to post to "arrive" will read a value equal to "num_instances"
// The other processes read a lower value. "mutex" ensures
// the post + get_value are atomic
let mut count = 0i32;
unsafe { libc::sem_getvalue(arrive, &mut count) };
// Once a process has posted to arrive and got the value (atomic)
// it allows the other processes to do the same
unsafe { libc::sem_post(mutex) };
// The last process reads the "num_instances" value from arrive.
// it must then tell the others to continue work. if it doesn't
// the other processes will time out at the "gate"
if count as usize == num_instances {
for _ in 0..num_instances {
// Open the gate
unsafe { libc::sem_post(gate) };
}
}
// Every process waits at the gate. If it doesn't open in a certain time, then we panic
timed_wait(gate, "gate");
// Clean up
unsafe {
libc::sem_close(mutex);
libc::sem_close(arrive);
libc::sem_close(gate);
libc::sem_unlink(sem_mutex.as_ptr());
libc::sem_unlink(sem_arrive.as_ptr());
libc::sem_unlink(sem_gate.as_ptr());
}
}
/// Generate a number of threads to use to saturate current machine for throughput measurements.
pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
let ref_block_count = 32; // Represent a ciphertext of 64 bits for 2_2 parameters set