Compare commits

..

2 Commits

Author SHA1 Message Date
David Testé
014b9d1b5c WIP: run gpu documentation benchmarks on scaleway 2026-03-26 15:12:55 +01:00
David Testé
3518aa4ed2 chore(ci): add terraform script for gpu benchmarks
This would spawn a H100-SXM-8-80G on Scaleway platform.
2026-03-25 15:28:42 +01:00
338 changed files with 5357 additions and 26666 deletions

View File

@@ -4,9 +4,6 @@ ignore = [
"RUSTSEC-2024-0436",
# Ignoring unmaintained 'bincode' crate. Getting rid of it would be too complex in the short term.
"RUSTSEC-2025-0141",
# Ignoring unsoundness in 'rand' with custom logger. Rand update is currently blocked by
# arkworks and we do not use custom loggers.
"RUSTSEC-2026-0097",
]
[output]

View File

@@ -82,10 +82,11 @@ runs:
sudo apt update
sudo apt -y install cuda-toolkit-"${TOOLKIT_VERSION}"
# Command to put back in once nvcc check is fixed
# find /usr/local -executable -name "nvcc"
- name: Export CUDA variables
shell: bash
run: |
find /usr/local -executable -name "nvcc"
CUDA_PATH=/usr/local/cuda-"${CUDA_VERSION}"
{
echo "CUDA_PATH=$CUDA_PATH";

View File

@@ -54,7 +54,7 @@ jobs:
- name: Retrieve data from cache
id: retrieve-data-cache
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
with:
path: |
utils/tfhe-backward-compat-data/**/*.cbor
@@ -89,7 +89,7 @@ jobs:
- name: Store data in cache
if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
continue-on-error: true
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
with:
path: |
utils/tfhe-backward-compat-data/**/*.cbor

View File

@@ -16,6 +16,7 @@ env:
PULL_REQUEST_MD_LINK: ""
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
@@ -36,7 +37,6 @@ jobs:
csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
versionable_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.versionable_any_changed }}
safe_serialize_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.safe_serialize_any_changed }}
core_crypto_test: ${{ env.IS_PULL_REQUEST == 'false' ||
steps.changed-files.outputs.core_crypto_any_changed ||
steps.changed-files.outputs.dependencies_any_changed }}
@@ -64,7 +64,7 @@ jobs:
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: "false"
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Check for file changes
@@ -79,7 +79,6 @@ jobs:
- tfhe-zk-pok/**
- utils/tfhe-versionable/**
- utils/tfhe-versionable-derive/**
- utils/tfhe-safe-serialize/**
csprng:
- tfhe-csprng/**
zk_pok:
@@ -87,8 +86,6 @@ jobs:
versionable:
- utils/tfhe-versionable/**
- utils/tfhe-versionable-derive/**
safe_serialize:
- utils/tfhe-safe-serialize/**
core_crypto:
- tfhe/src/core_crypto/**
boolean:
@@ -125,7 +122,6 @@ jobs:
steps.changed-files.outputs.csprng_any_changed == 'true' ||
steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
steps.changed-files.outputs.versionable_any_changed == 'true' ||
steps.changed-files.outputs.safe_serialize_any_changed == 'true' ||
steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
steps.changed-files.outputs.boolean_any_changed == 'true' ||
steps.changed-files.outputs.shortint_any_changed == 'true' ||
@@ -149,7 +145,7 @@ jobs:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: "false"
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
@@ -174,11 +170,6 @@ jobs:
run: |
make test_versionable
- name: Run tfhe-safe-serialize tests
if: needs.should-run.outputs.safe_serialize_test == 'true'
run: |
make test_safe_serialize
- name: Run core tests
if: needs.should-run.outputs.core_crypto_test == 'true'
run: |
@@ -200,7 +191,7 @@ jobs:
- name: Node cache restoration
id: node-cache
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
with:
path: |
~/.nvm
@@ -213,7 +204,7 @@ jobs:
make install_node
- name: Node cache save
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
if: steps.node-cache.outputs.cache-hit != 'true'
with:
path: |

View File

@@ -34,7 +34,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +99,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -14,11 +14,12 @@ env:
PULL_REQUEST_MD_LINK: ""
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [labeled]
types: [ labeled ]
permissions:
contents: read
@@ -31,16 +32,16 @@ jobs:
if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved')
runs-on: ubuntu-latest
permissions:
pull-requests: read # Needed to check for file change
pull-requests: read # Needed to check for file change
outputs:
wasm_test: ${{ github.event_name == 'workflow_dispatch' ||
steps.changed-files.outputs.wasm_any_changed }}
steps.changed-files.outputs.wasm_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: "false"
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Check for file changes
@@ -62,7 +63,6 @@ jobs:
- tfhe/js_on_wasm_tests/**
- tfhe/web_wasm_parallel_tests/**
- utils/tfhe-versionable/**
- utils/tfhe-safe-serialize/**
- .github/workflows/aws_tfhe_wasm_tests.yml
wasm-tests:
@@ -78,7 +78,7 @@ jobs:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: "false"
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
@@ -92,7 +92,7 @@ jobs:
- name: Node cache restoration
id: node-cache
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
with:
path: |
~/.nvm
@@ -105,7 +105,7 @@ jobs:
make install_node
- name: Node cache save
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
if: steps.node-cache.outputs.cache-hit != 'true'
with:
path: |
@@ -128,21 +128,15 @@ jobs:
run: |
make test_nodejs_wasm_api_ci
- name: Run parallel wasm tests
run: |
make test_web_js_api_parallel_chrome_ci
- name: Run wasm_par_mq tests
run: |
make test_wasm_par_mq_chrome_ci
make test_wasm_par_mq_firefox_ci
- name: Run parallel wasm tests
run: |
make test_web_js_api_parallel_chrome_ci
make test_web_js_api_parallel_firefox_ci
- name: Run cross origin wasm tests
run: |
make test_web_js_api_cross_origin_chrome_ci
make test_web_js_api_cross_origin_firefox_ci
- name: Run x86_64/wasm zk compatibility tests
run: |
make test_zk_wasm_x86_compat_ci

View File

@@ -6,9 +6,6 @@ name: backward_compat_pr_change_report
on:
pull_request:
env:
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
permissions:
contents: read
@@ -17,35 +14,9 @@ concurrency:
cancel-in-progress: true
jobs:
should-run:
name: backward_compat_pr_change_report/should-run
runs-on: ubuntu-latest
permissions:
pull-requests: read # Needed to check for file change
outputs:
backward_report: ${{ steps.changed-files.outputs.backward_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
backward:
- utils/tfhe-lints/snapshots/*.json
change-report:
name: backward_compat_pr_change_report/change-report (bpr)
runs-on: ubuntu-latest
needs: should-run
if:
needs.should-run.outputs.backward_report == 'true'
permissions:
pull-requests: write # To send and modify message in the PR
steps:

View File

@@ -14,12 +14,11 @@ on:
- signed_integer
- integer_compression
- integer_zk
- msm_zk
- shortint
- shortint_oprf
- hlapi_unsigned
- hlapi_signed
- hlapi_erc7984
- hlapi_erc20
- hlapi_dex
- hlapi_noise_squash
- hlapi_kvstore
@@ -93,8 +92,8 @@ jobs:
if inputs_command == "integer_zk":
files_to_parse.append("pke_zk_crs_sizes.csv")
elif inputs_command == "hlapi_erc7984":
files_to_parse.append("erc7984_pbs_count.csv")
elif inputs_command == "hlapi_erc20":
files_to_parse.append("erc20_pbs_count.csv")
elif inputs_command == "hlapi_dex":
files_to_parse.extend(
[

View File

@@ -107,7 +107,7 @@ jobs:
]:
f.write(f"""{env_name}=["{'", "'.join(values_to_join)}"]\n""")
- name: Set matrix arguments outputs
- name: Set martix arguments outputs
id: set_matrix_args
run: | # zizmor: ignore[template-injection] these env variable are safe
{
@@ -126,7 +126,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -261,7 +261,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -108,14 +108,14 @@ jobs:
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-hlapi-erc7984:
name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc7984
run-benchmarks-hlapi-erc20:
name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc20
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_cpu_common.yml
with:
command: hlapi_erc7984
additional_file_to_parse: erc7984_pbs_count.csv
command: hlapi_erc20
additional_file_to_parse: erc20_pbs_count.csv
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}

View File

@@ -33,7 +33,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -17,10 +17,6 @@ on:
description: "Run GPU core-crypto benchmarks"
type: boolean
default: true
run-gpu-zk-benchmarks:
description: "Run GPU ZK benchmarks"
type: boolean
default: true
run-hpu-benchmarks:
description: "Run HPU benchmarks"
type: boolean
@@ -40,7 +36,7 @@ jobs:
uses: ./.github/workflows/benchmark_cpu_common.yml
if: inputs.run-cpu-benchmarks
with:
command: integer,hlapi_erc7984
command: integer,hlapi_erc20
op_flavor: fast_default
bench_type: both
precisions_set: documentation
@@ -93,9 +89,10 @@ jobs:
uses: ./.github/workflows/benchmark_gpu_common.yml
if: inputs.run-gpu-integer-benchmarks
with:
profile: multi-h100-sxm5
backend: terraform
profile: scaleway-multi-h100-sxm5
hardware_name: n3-H100-SXM5x8
command: integer_multi_bit,hlapi_erc7984
command: integer_multi_bit,hlapi_erc20
op_flavor: fast_default
bench_type: both
precisions_set: documentation
@@ -107,14 +104,14 @@ jobs:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
run-benchmarks-hpu-integer:
name: benchmark_documentation/run-benchmarks-hpu-integer
uses: ./.github/workflows/benchmark_hpu_common.yml
if: inputs.run-hpu-benchmarks
with:
command: integer,hlapi_erc7984
command: integer,hlapi_erc20
op_flavor: default
bench_type: both
precisions_set: documentation
@@ -154,7 +151,8 @@ jobs:
uses: ./.github/workflows/benchmark_gpu_common.yml
if: inputs.run-gpu-core-crypto-benchmarks
with:
profile: multi-h100-sxm5
backend: terraform
profile: scaleway-multi-h100-sxm5
hardware_name: n3-H100-SXM5x8
command: pbs, ks_pbs
bench_type: latency
@@ -167,44 +165,23 @@ jobs:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-gpu-zk-server:
name: benchmark_documentation/run-benchmarks-gpu-zk-server
uses: ./.github/workflows/benchmark_gpu_common.yml
if: inputs.run-gpu-zk-benchmarks
with:
profile: multi-h100-sxm5
hardware_name: n3-H100-SXM5x8
command: integer_zk
op_flavor: default
bench_type: both
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
generate-svgs-with-benchmarks-run:
name: benchmark-documentation/generate-svgs-with-benchmarks-run
if: ${{ always() &&
(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks || inputs.run-hpu-benchmarks) &&
(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks ||inputs.run-hpu-benchmarks) &&
inputs.generate-svgs }}
needs: [
run-benchmarks-cpu-integer, run-benchmarks-gpu-integer, run-benchmarks-hpu-integer,
run-benchmarks-cpu-zk-server, run-benchmarks-cpu-zk-client,
run-benchmarks-cpu-core-crypto, run-benchmarks-gpu-core-crypto,
run-benchmarks-gpu-zk-server
run-benchmarks-cpu-core-crypto, run-benchmarks-gpu-core-crypto
]
uses: ./.github/workflows/generate_svgs.yml
with:
time_span_days: 5
generate-cpu-svgs: ${{ inputs.run-cpu-benchmarks }}
generate-gpu-svgs: ${{ inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks }}
generate-gpu-svgs: ${{ inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks }}
generate-hpu-svgs: ${{ inputs.run-hpu-benchmarks }}
secrets:
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
@@ -213,7 +190,7 @@ jobs:
generate-svgs-without-benchmarks-run:
name: benchmark-documentation/generate-svgs-without-benchmarks-run
if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks || inputs.run-hpu-benchmarks) &&
if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-hpu-benchmarks) &&
inputs.generate-svgs }}
uses: ./.github/workflows/generate_svgs.yml
with:

View File

@@ -31,13 +31,10 @@ on:
- pbs128
- ks
- ks_pbs
- tfhe_zk_pok
- msm_zk
- integer_zk
- integer_zk_experimental
- integer_aes
- integer_aes256
- hlapi_erc7984
- hlapi_erc20
- hlapi_dex
- hlapi_noise_squash
op_flavor:
@@ -123,8 +120,8 @@ jobs:
if inputs_command == "integer_zk":
files_to_parse.append("pke_zk_crs_sizes.csv")
elif inputs_command == "hlapi_erc7984":
files_to_parse.append("erc7984_pbs_count.csv")
elif inputs_command == "hlapi_erc20":
files_to_parse.append("erc20_pbs_count.csv")
elif inputs_command == "hlapi_dex":
files_to_parse.extend(
[

View File

@@ -111,7 +111,7 @@ jobs:
]:
f.write(f"""{env_name}=["{'", "'.join(values_to_join)}"]\n""")
- name: Set matrix arguments outputs
- name: Set martix arguments outputs
id: set_matrix_args
run: | # zizmor: ignore[template-injection] these env variable are safe
{
@@ -126,11 +126,17 @@ jobs:
needs: prepare-matrix
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
# Check the permanent instance label first: the on-demand instance label output is set before the start-remote-instance step finishes.
# If that step then fails because the GitHub Actions runner setup failed, we have to fall back on the permanent instance.
# Since the on-demand label is already set when the failure happens, the logical OR must be written in this order,
# otherwise the next job would try to run on a non-existent on-demand instance.
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
- name: Start remote instance
id: start-remote-instance
continue-on-error: true
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -139,6 +145,25 @@ jobs:
backend: ${{ inputs.backend }}
profile: ${{ inputs.profile }}
- name: Acknowledge remote instance failure
if: steps.start-remote-instance.outcome == 'failure' &&
inputs.profile != 'single-h100'
run: |
echo "Remote instance instance has failed to start (profile provided: '${INPUTS_PROFILE}')"
echo "Permanent instance instance cannot be used as a substitute (profile needed: 'single-h100')"
exit 1
env:
INPUTS_PROFILE: ${{ inputs.profile }}
# This allows falling back on permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: env.SECRETS_AVAILABLE == 'true' &&
steps.start-remote-instance.outcome == 'failure' &&
inputs.profile == 'single-h100'
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
# Install dependencies only once since cuda-benchmarks uses a matrix strategy, thus running multiple times.
install-dependencies:
name: benchmark_gpu_common/install-dependencies
@@ -159,6 +184,7 @@ jobs:
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Setup Hyperstack dependencies
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
uses: ./.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
@@ -307,13 +333,13 @@ jobs:
teardown-instance:
name: benchmark_gpu_common/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-benchmarks, slack-notify ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -42,7 +42,7 @@ env:
OPTIMIZATION_TARGET: "throughput"
BATCH_SIZE: "5000"
SCHEDULING_POLICY: "MAX_PARALLELISM"
BENCHMARKS: "erc7984"
BENCHMARKS: "erc20"
BRANCH_NAME: ${{ github.ref_name }}
COMMIT_SHA: ${{ github.sha }}
SLAB_SECRET: ${{ secrets.JOB_SECRET }}
@@ -77,7 +77,7 @@ jobs:
if [[ ${IS_MANUAL_RUN} == true ]]; then
PROFILE_RAW="${PROFILE_MANUAL_RUN}"
else
PROFILE_RAW="${PROFILE_SCHEDULED_RUN}"
PROFILE_RAW="${PROFILE}"
fi
# shellcheck disable=SC2001
PROFILE_VAL=$(echo "${PROFILE_RAW}" | sed 's|.*[[:space:]](\(.*\))|\1|')
@@ -94,7 +94,7 @@ jobs:
steps:
- name: Start remote instance
id: start-remote-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -204,7 +204,7 @@ jobs:
uses: foundry-rs/foundry-toolchain@8789b3e21e6c11b2697f5eb56eddae542f746c10
- name: Cache cargo
uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
with:
path: |
~/.cargo/registry
@@ -214,14 +214,14 @@ jobs:
restore-keys: ${{ runner.os }}-cargo-
- name: Login to GitHub Container Registry
uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Login to Chainguard Registry
uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0
with:
registry: cgr.dev
username: ${{ secrets.CGR_USERNAME }}
@@ -248,13 +248,13 @@ jobs:
npm install && npm run deploy:emptyProxies && npx hardhat compile
working-directory: fhevm/
- name: Profile erc7984 no-cmux benchmark on GPU
- name: Profile erc20 no-cmux benchmark on GPU
run: |
BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" \
FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" \
BENCHMARK_TYPE="THROUGHPUT_200" \
OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" \
make -e "profile_erc7984_gpu"
make -e "profile_erc20_gpu"
working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
- name: Get nsys profile name
@@ -333,7 +333,7 @@ jobs:
steps:
- name: Stop remote instance
id: stop-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -14,7 +14,7 @@ on:
- integer
- hlapi_unsigned
- hlapi_signed
- hlapi_erc7984
- hlapi_erc20
op_flavor:
description: "Operations set to run"
type: choice

View File

@@ -95,7 +95,7 @@ jobs:
]:
f.write(f"""{env_name}=["{'", "'.join(values_to_join)}"]\n""")
- name: Set matrix arguments outputs
- name: Set martix arguments outputs
id: set_matrix_args
run: | # zizmor: ignore[template-injection] these env variable are safe
{

View File

@@ -143,7 +143,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -387,7 +387,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -63,7 +63,7 @@ jobs:
with open(env_file, "a") as f:
f.write(f"""BROWSER=["{'", "'.join(split_browser)}"]\n""")
- name: Set matrix arguments output
- name: Set martix arguments output
id: set_matrix_arg
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "browser=${{ toJSON(env.BROWSER) }}" >> "${GITHUB_OUTPUT}"
@@ -77,7 +77,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -124,7 +124,7 @@ jobs:
- name: Node cache restoration
id: node-cache
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
with:
path: |
~/.nvm
@@ -137,7 +137,7 @@ jobs:
make install_node
- name: Node cache save
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
if: steps.node-cache.outputs.cache-hit != 'true'
with:
path: |
@@ -158,9 +158,9 @@ jobs:
env:
BROWSER: ${{ matrix.browser }}
- name: Run benchmarks (cross origin)
- name: Run benchmarks (unsafe coop)
run: |
make bench_web_js_api_cross_origin_"${BROWSER}"_ci
make bench_web_js_api_unsafe_coop_"${BROWSER}"_ci
env:
BROWSER: ${{ matrix.browser }}
@@ -218,7 +218,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -94,7 +94,7 @@ jobs:
with open(env_file, "a") as f:
f.write(f"""RUNNERS=["{'", "'.join(runners)}"]\n""")
- name: Set matrix runners outputs
- name: Set martix runners outputs
id: set_matrix_runners
run: | # zizmor: ignore[template-injection] these env variable are safe
echo "runners=${{ toJSON(env.RUNNERS) }}" >> "${GITHUB_OUTPUT}"
@@ -138,7 +138,7 @@ jobs:
- name: Node cache restoration
if: inputs.run-pcc-cpu-batch == 'pcc_batch_2'
id: node-cache
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
with:
path: |
~/.nvm
@@ -151,7 +151,7 @@ jobs:
make install_node
- name: Node cache save
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
if: inputs.run-pcc-cpu-batch == 'pcc_batch_2' && steps.node-cache.outputs.cache-hit != 'true'
with:
path: |

View File

@@ -63,7 +63,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -146,7 +146,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -50,7 +50,7 @@ jobs:
version: ${{ steps.get_zizmor.outputs.version }}
- name: Ensure SHA pinned actions
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@ca46236c6ce584ae24bc6283ba8dcf4b3ec8a066 # v5.0.4
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@70c4af2ed5282c51ba40566d026d6647852ffa3e # v5.0.1
with:
allowlist: |
slsa-framework/slsa-github-generator

View File

@@ -74,7 +74,7 @@ jobs:
make test_shortint_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2
uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}
@@ -88,7 +88,7 @@ jobs:
make test_integer_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2
uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}

View File

@@ -209,98 +209,60 @@ jobs:
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
gpu-zk-server-latency-table:
name: generate_documentation_svgs/gpu-zk-server-latency-table
uses: ./.github/workflows/generate_svg_common.yml
if: inputs.generate-gpu-svgs
with:
backend: gpu
hardware_name: n3-H100-SXM5x8
layer: integer
bench_subset: zk
pbs_kind: multi_bit
grouping_factor: 4
bench_type: latency
time_span_days: ${{ inputs.time_span_days }}
output_filename: gpu-zk-benchmark-latency
secrets:
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
gpu-zk-server-throughput-table:
name: generate_documentation_svgs/gpu-zk-server-throughput-table
uses: ./.github/workflows/generate_svg_common.yml
if: inputs.generate-gpu-svgs
with:
backend: gpu
hardware_name: n3-H100-SXM5x8
layer: integer
bench_subset: zk
pbs_kind: multi_bit
grouping_factor: 4
bench_type: throughput
time_span_days: ${{ inputs.time_span_days }}
output_filename: gpu-zk-benchmark-throughput
secrets:
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
# -----------------------------------------------------------
# ERC7984 benchmarks tables
# ERC20 benchmarks tables
# -----------------------------------------------------------
cpu-erc7984-latency-throughput-table:
name: generate_documentation_svgs/cpu-erc7984-latency-throughput-table
cpu-erc20-latency-throughput-table:
name: generate_documentation_svgs/cpu-erc20-latency-throughput-table
uses: ./.github/workflows/generate_svg_common.yml
if: inputs.generate-cpu-svgs
with:
backend: cpu
hardware_name: hpc7a.96xlarge
layer: hlapi
bench_subset: erc7984
bench_subset: erc20
pbs_kind: classical
bench_type: both
time_span_days: ${{ inputs.time_span_days }}
output_filename: cpu-hlapi-erc7984-benchmark-latency-throughput
output_filename: cpu-hlapi-erc20-benchmark-latency-throughput
secrets:
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
gpu-erc7984-latency-throughput-table:
name: generate_documentation_svgs/gpu-erc7984-latency-throughput-table
gpu-erc20-latency-throughput-table:
name: generate_documentation_svgs/gpu-erc20-latency-throughput-table
uses: ./.github/workflows/generate_svg_common.yml
if: inputs.generate-gpu-svgs
with:
backend: gpu
hardware_name: n3-H100-SXM5x8
layer: hlapi
bench_subset: erc7984
bench_subset: erc20
pbs_kind: multi_bit
grouping_factor: 4
bench_type: both
time_span_days: ${{ inputs.time_span_days }}
output_filename: gpu-hlapi-erc7984-benchmark-h100x8-sxm5-latency-throughput
output_filename: gpu-hlapi-erc20-benchmark-h100x8-sxm5-latency-throughput
secrets:
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
hpu-erc7984-latency-throughput-table:
name: generate_documentation_svgs/hpu-erc7984-latency-throughput-table
hpu-erc20-latency-throughput-table:
name: generate_documentation_svgs/hpu-erc20-latency-throughput-table
uses: ./.github/workflows/generate_svg_common.yml
if: inputs.generate-hpu-svgs
with:
backend: hpu
hardware_name: hpu_x1
layer: hlapi
bench_subset: erc7984
bench_subset: erc20
pbs_kind: classical
bench_type: both
time_span_days: ${{ inputs.time_span_days }}
output_filename: hpu-hlapi-erc7984-benchmark-hpux1-latency-throughput.svg
output_filename: hpu-hlapi-erc20-benchmark-hpux1-latency-throughput.svg
secrets:
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}

View File

@@ -43,7 +43,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -149,7 +149,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -23,7 +23,7 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled, opened, synchronize ]
types: [ labeled ]
permissions:
contents: read
@@ -38,7 +38,6 @@ jobs:
pull-requests: read # Needed to check for file change
outputs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -63,24 +62,29 @@ jobs:
- tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
- tfhe/src/shortint/parameters/**
- tfhe/src/c_api/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_core_h100_tests.yml'
core_crypto:
- tfhe/src/core_crypto/gpu/**
setup-instance:
name: gpu_core_h100_tests/setup-instance
needs: should-run
if: github.event_name != 'pull_request' ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
# Use the permanent remote instance label first, as the on-demand remote instance label output is set before the end of the start-remote-instance step.
# If that step fails due to a failed GitHub Actions runner setup, we have to fall back on the permanent instance.
# Since the on-demand remote label is set before the failure, the logical OR has to be done in this order,
# otherwise we would try to run the next job on a non-existent on-demand instance.
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
steps:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
continue-on-error: true
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -89,6 +93,13 @@ jobs:
backend: hyperstack
profile: single-h100
# This allows falling back on permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
# This instance will be spawned specifically for pull requests from forked repositories
- name: Start GitHub instance
id: start-github-instance
@@ -121,6 +132,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Setup Hyperstack dependencies
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
uses: ./.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
@@ -164,14 +176,14 @@ jobs:
teardown-instance:
name: gpu_core_h100_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -77,7 +77,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -182,7 +182,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -25,11 +25,17 @@ jobs:
name: gpu_full_h100_tests/setup-instance
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
# Use the permanent remote instance label first, as the on-demand remote instance label output is set before the end of the start-remote-instance step.
# If that step fails due to a failed GitHub Actions runner setup, we have to fall back on the permanent instance.
# Since the on-demand remote label is set before the failure, the logical OR has to be done in this order,
# otherwise we would try to run the next job on a non-existent on-demand instance.
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
- name: Start remote instance
id: start-remote-instance
continue-on-error: true
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -38,6 +44,13 @@ jobs:
backend: hyperstack
profile: single-h100
# This allows falling back on permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
cuda-tests-linux:
name: gpu_full_h100_tests/cuda-tests-linux
needs: [ setup-instance ]
@@ -61,6 +74,7 @@ jobs:
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Setup Hyperstack dependencies
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
uses: ./.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
@@ -104,13 +118,13 @@ jobs:
teardown-instance:
name: gpu_full_h100_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -80,7 +80,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -186,7 +186,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -23,7 +23,7 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled, opened, synchronize ]
types: [ labeled ]
permissions:
contents: read
@@ -38,7 +38,6 @@ jobs:
pull-requests: read # Needed to check for file change
outputs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -66,23 +65,27 @@ jobs:
- tfhe/src/c_api/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_hlapi_h100_tests.yml'
core_crypto:
- tfhe/src/core_crypto/gpu/**
setup-instance:
name: gpu_hlapi_h100_tests/setup-instance
needs: should-run
if: github.event_name != 'pull_request' ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
# Use the permanent remote instance label first, as the on-demand remote instance label output is set before the end of the start-remote-instance step.
# If that step fails due to a failed GitHub Actions runner setup, we have to fall back on the permanent instance.
# Since the on-demand remote label is set before the failure, the logical OR has to be done in this order,
# otherwise we would try to run the next job on a non-existent on-demand instance.
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
steps:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
continue-on-error: true
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -91,6 +94,13 @@ jobs:
backend: hyperstack
profile: single-h100
# This allows falling back on permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
# This instance will be spawned specifically for pull requests from forked repositories
- name: Start GitHub instance
id: start-github-instance
@@ -123,6 +133,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Setup Hyperstack dependencies
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
uses: ./.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
@@ -173,14 +184,14 @@ jobs:
teardown-instance:
name: gpu_hlapi_h100_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -17,8 +17,8 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
schedule:
# Weekly tests will be triggered every Monday at 8p.m.
- cron: "0 20 * * 1"
# Nightly tests will be triggered each evening 8p.m.
- cron: "0 20 * * *"
pull_request:
@@ -28,48 +28,17 @@ permissions:
# zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning
jobs:
should-run:
name: gpu_integer_long_run_tests/should-run
runs-on: ubuntu-latest
permissions:
pull-requests: read # Needed to check for file change
outputs:
is_needed_in_gpu_ci: ${{ env.IS_PR == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
with:
files_yaml: |
gpu:
- tfhe/Cargo.toml
- tfhe/build.rs
- backends/tfhe-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/src/shortint/parameters/**
- '.github/workflows/gpu_integer_long_run_tests.yml'
setup-instance:
name: gpu_integer_long_run_tests/setup-instance
needs: [should-run]
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
needs.should-run.outputs.is_needed_in_gpu_ci == 'true'
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -143,7 +112,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -74,7 +74,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -166,7 +166,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -74,7 +74,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -166,7 +166,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -38,7 +38,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -131,10 +131,6 @@ jobs:
env:
GCC_VERSION: ${{ matrix.gcc }}
- name: Run semgrep and lint checks on CUDA code
run: |
make semgrep_and_lint_gpu_code
- name: Run fmt checks
run: |
make check_fmt_gpu
@@ -143,6 +139,10 @@ jobs:
run: |
make pcc_gpu
- name: Run semgrep and lint checks on CUDA code
run: |
make semgrep_and_lint_gpu_code
- name: Run semver checks on tfhe-cuda-backend
run: |
make semver_check_cuda_backend
@@ -176,7 +176,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -63,6 +63,7 @@ jobs:
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_signed_integer_classic_tests.yml'
- scripts/integer-tests.sh
@@ -79,7 +80,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -168,7 +169,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -23,7 +23,7 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled, opened, synchronize ]
types: [ labeled ]
permissions:
contents: read
@@ -38,7 +38,6 @@ jobs:
pull-requests: read # Needed to check for file change
outputs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -64,25 +63,30 @@ jobs:
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_signed_integer_h100_tests.yml'
- scripts/integer-tests.sh
core_crypto:
- tfhe/src/core_crypto/gpu/**
setup-instance:
name: gpu_signed_integer_h100_tests/setup-instance
needs: should-run
if: github.event_name != 'pull_request' ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
# Use the permanent remote instance label first, as the on-demand remote instance label output is set before the end of the start-remote-instance step.
# If that step fails due to a failed GitHub Actions runner setup, we have to fall back on the permanent instance.
# Since the on-demand remote label is set before the failure, the logical OR has to be done in this order,
# otherwise we would try to run the next job on a non-existent on-demand instance.
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
steps:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
continue-on-error: true
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -91,6 +95,13 @@ jobs:
backend: hyperstack
profile: single-h100
# This allows falling back on permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
# This instance will be spawned specifically for pull requests from forked repositories
- name: Start GitHub instance
id: start-github-instance
@@ -123,6 +134,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Setup Hyperstack dependencies
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
uses: ./.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
@@ -164,14 +176,14 @@ jobs:
teardown-instance:
name: gpu_signed_integer_h100_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -64,6 +64,7 @@ jobs:
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_signed_integer_tests.yml'
- scripts/integer-tests.sh
@@ -80,7 +81,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -177,7 +178,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -63,6 +63,7 @@ jobs:
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_unsigned_integer_classic_tests.yml'
- scripts/integer-tests.sh
@@ -79,7 +80,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -168,7 +169,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -23,7 +23,7 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled, opened, synchronize ]
types: [ labeled ]
permissions:
contents: read
@@ -38,7 +38,6 @@ jobs:
pull-requests: read # Needed to check for file change
outputs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -64,25 +63,30 @@ jobs:
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_unsigned_integer_h100_tests.yml'
- scripts/integer-tests.sh
core_crypto:
- tfhe/src/core_crypto/gpu/**
setup-instance:
name: gpu_unsigned_integer_h100_tests/setup-instance
needs: should-run
if: github.event_name == 'workflow_dispatch' ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
# Use the permanent remote instance label first, as the on-demand remote instance label output is set before the end of the start-remote-instance step.
# If that step fails due to a failed GitHub Actions runner setup, we have to fall back on the permanent instance.
# Since the on-demand remote label is set before the failure, the logical OR has to be done in this order,
# otherwise we would try to run the next job on a non-existent on-demand instance.
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
steps:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
continue-on-error: true
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -91,6 +95,13 @@ jobs:
backend: hyperstack
profile: single-h100
# This allows falling back on permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
# This instance will be spawned specifically for pull requests from forked repositories
- name: Start GitHub instance
id: start-github-instance
@@ -123,6 +134,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Setup Hyperstack dependencies
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
uses: ./.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
@@ -164,14 +176,14 @@ jobs:
teardown-instance:
name: gpu_unsigned_integer_h100_tests/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -64,6 +64,7 @@ jobs:
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_unsigned_integer_tests.yml'
- scripts/integer-tests.sh
@@ -80,7 +81,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -177,7 +178,7 @@ jobs:
- name: Stop instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -51,13 +51,7 @@ jobs:
with:
files_yaml: |
gpu:
- tfhe/Cargo.toml
- tfhe/build.rs
- backends/tfhe-cuda-backend/**
- backends/zk-cuda-backend/**
- tfhe/src/shortint/parameters/**
- tfhe/src/zk/**
- tfhe-zk-pok/**
- '.github/workflows/gpu_zk_tests.yml'
- ci/slab.toml
@@ -73,7 +67,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -132,9 +126,6 @@ jobs:
- name: Run zk-cuda-backend integration tests
run: |
make test_zk_cuda_backend
make test_zk_pok_experimental_gpu
make test_integer_zk_gpu
make test_integer_zk_experimental_gpu
slack-notify:
name: gpu_zk_tests/slack-notify
@@ -167,7 +158,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -107,7 +107,7 @@ jobs:
path: target/package
- name: Authenticate on registry
uses: rust-lang/crates-io-auth-action@bbd81622f20ce9e2dd9622e3218b975523e45bbe # v1.0.4
uses: rust-lang/crates-io-auth-action@b7e9a28eded4986ec6b1fa40eeee8f8f165559ec # v1.0.3
id: auth
- name: Publish crate.io package

View File

@@ -1,36 +1,12 @@
# Common workflow to make crate release for CUDA backend
name: make_release_common_cuda
name: make_release_cuda
on:
workflow_call:
workflow_dispatch:
inputs:
package-name:
type: string
required: true
dry-run:
dry_run:
description: "Dry-run"
type: boolean
default: true
secrets:
REPO_CHECKOUT_TOKEN:
required: true
SLAB_ACTION_TOKEN:
required: true
SLAB_BASE_URL:
required: true
SLAB_URL:
required: true
JOB_SECRET:
required: true
SLACK_CHANNEL:
required: true
BOT_USERNAME:
required: true
SLACK_WEBHOOK:
required: true
ALLOWED_TEAM:
required: true
READ_ORG_TOKEN:
required: true
env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
@@ -45,15 +21,15 @@ permissions: {}
jobs:
verify-triggering-actor:
name: make_release_common_cuda/verify-triggering-actor
name: make_release_cuda/verify-triggering-actor
if: startsWith(github.ref, 'refs/tags/')
uses: ./.github/workflows/verify_triggering_actor.yml
secrets:
ALLOWED_TEAM: ${{ secrets.ALLOWED_TEAM }}
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
setup-instance:
name: make_release_common_cuda/setup-instance
name: make_release_cuda/setup-instance
needs: verify-triggering-actor
runs-on: ubuntu-latest
outputs:
@@ -61,7 +37,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -71,7 +47,7 @@ jobs:
profile: gpu-build
package:
name: make_release_common_cuda/package
name: make_release_cuda/package
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
outputs:
@@ -100,6 +76,7 @@ jobs:
toolchain: stable
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
{
@@ -112,6 +89,7 @@ jobs:
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
@@ -123,14 +101,12 @@ jobs:
GCC_VERSION: ${{ matrix.gcc }}
- name: Prepare package
env:
PACKAGE: ${{ inputs.package-name }}
run: |
cargo package -p "${PACKAGE}"
cargo package -p tfhe-cuda-backend
- uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: crate-${{ inputs.package-name }}
name: crate-tfhe-cuda-backend
path: target/package/*.crate
- name: generate hash
@@ -138,8 +114,8 @@ jobs:
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
provenance:
name: make_release_common_cuda/provenance
if: ${{ !inputs.dry-run }}
name: make_release_cuda/provenance
if: ${{ !inputs.dry_run }}
needs: [package]
# This action cannot be pinned to a specific commit (see https://github.com/slsa-framework/slsa-github-generator/blob/main/README.md#referencing-slsa-builders-and-generators)
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0 # zizmor: ignore[unpinned-uses] as said above SLSA cannot be pinned by tag today
@@ -152,7 +128,7 @@ jobs:
base64-subjects: ${{ needs.package.outputs.hash }}
publish-cuda-release:
name: make_release_common_cuda/publish-cuda-release
name: make_release_cuda/publish-cuda-release
needs: [setup-instance, package] # for comparing hashes
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
permissions:
@@ -174,6 +150,7 @@ jobs:
toolchain: stable
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
{
@@ -186,6 +163,7 @@ jobs:
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
@@ -199,23 +177,22 @@ jobs:
- name: Download artifact
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
with:
name: crate-${{ inputs.package-name }}
name: crate-tfhe-cuda-backend
path: target/package
- name: Authenticate on registry
uses: rust-lang/crates-io-auth-action@bbd81622f20ce9e2dd9622e3218b975523e45bbe # v1.0.4
uses: rust-lang/crates-io-auth-action@b7e9a28eded4986ec6b1fa40eeee8f8f165559ec # v1.0.3
id: auth
- name: Publish crate.io package
env:
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
PACKAGE: ${{ inputs.package-name }}
DRY-RUN: ${{ inputs.dry-run && '--dry-run' || '' }}
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
run: |
# dry-run expansion cannot be double quoted when variable contains empty string otherwise cargo publish
# would fail. This is safe since dry-run is handled in the env section above.
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
# would fail. This is safe since DRY_RUN is handled in the env section above.
# shellcheck disable=SC2086
cargo publish -p "${PACKAGE}" ${DRY-RUN}
cargo publish -p tfhe-cuda-backend ${DRY_RUN}
- name: Generate hash
id: published_hash
@@ -227,7 +204,7 @@ jobs:
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
env:
SLACK_COLOR: failure
SLACK_MESSAGE: "SLSA ${{ inputs.package-name }} crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "SLSA tfhe-cuda-backend crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
@@ -235,17 +212,17 @@ jobs:
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "${{ inputs.package-name }} release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "tfhe-cuda-backend release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: make_release_common_cuda/teardown-instance
name: make_release_cuda/teardown-instance
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [setup-instance, publish-cuda-release]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -258,4 +235,4 @@ jobs:
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (${{ inputs.package-name }} release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "Instance teardown (publish-cuda-release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -16,10 +16,6 @@ on:
description: "Push web js package"
type: boolean
default: true
push_web_compat_package:
description: "Push web compat (cross-origin) js package"
type: boolean
default: true
push_node_package:
description: "Push node js package"
type: boolean
@@ -103,23 +99,6 @@ jobs:
tag: ${{ env.NPM_TAG }}
provenance: true
- name: Build web compat (cross-origin) package
if: ${{ inputs.push_web_compat_package }}
run: |
rm -rf tfhe/pkg
make build_web_js_api
sed -i 's/"tfhe"/"tfhe-compat"/g' tfhe/pkg/package.json
- name: Publish web compat (cross-origin) package
if: ${{ inputs.push_web_compat_package }}
uses: JS-DevTools/npm-publish@0fd2f4369c5d6bcfcde6091a7c527d810b9b5c3f
with:
package: tfhe/pkg/package.json
dry-run: ${{ inputs.dry_run }}
tag: ${{ env.NPM_TAG }}
provenance: true
- name: Build Node package
if: ${{ inputs.push_node_package }}
run: |

View File

@@ -1,44 +0,0 @@
# Publish new release of tfhe-rs CUDA backend on crates.io.
name: make_release_tfhe_cuda
on:
workflow_dispatch:
inputs:
dry_run:
description: "Dry-run"
type: boolean
default: true
env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
permissions: {}
# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
jobs:
make-release:
name: make_release_tfhe_cuda/make-release
uses: ./.github/workflows/make_release_common_cuda.yml
with:
package-name: "tfhe-cuda-backend"
dry-run: ${{ inputs.dry_run }}
permissions:
actions: read # Needed to detect the GitHub Actions environment
id-token: write # Needed to create the provenance via GitHub OIDC
contents: write # Needed to upload assets/artifacts
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
SLAB_URL: ${{ secrets.SLAB_URL }}
JOB_SECRET: ${{ secrets.JOB_SECRET }}

View File

@@ -1,32 +0,0 @@
name: make_release_tfhe_safe_serialize
on:
workflow_dispatch:
inputs:
dry_run:
description: "Dry-run"
type: boolean
default: true
permissions: {}
# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
jobs:
make-release:
name: make_release_tfhe_safe_serialize/make-release
uses: ./.github/workflows/make_release_common.yml
with:
package-name: "tfhe-safe-serialize"
dry-run: ${{ inputs.dry_run }}
permissions:
actions: read # Needed to detect the GitHub Actions environment
id-token: write # Needed to create the provenance via GitHub OIDC
contents: write # Needed to upload assets/artifacts
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}

View File

@@ -1,44 +0,0 @@
# Publish new release of CUDA Zero-Knowledge primitives on crates.io.
name: make_release_zk_cuda
on:
workflow_dispatch:
inputs:
dry_run:
description: "Dry-run"
type: boolean
default: true
env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
permissions: {}
# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
jobs:
make-release:
name: make_release_zk_cuda/make-release
uses: ./.github/workflows/make_release_common_cuda.yml
with:
package-name: "zk-cuda-backend"
dry-run: ${{ inputs.dry_run }}
permissions:
actions: read # Needed to detect the GitHub Actions environment
id-token: write # Needed to create the provenance via GitHub OIDC
contents: write # Needed to upload assets/artifacts
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
SLAB_URL: ${{ secrets.SLAB_URL }}
JOB_SECRET: ${{ secrets.JOB_SECRET }}

View File

@@ -53,7 +53,7 @@ jobs:
- name: Restore Sagemath image from cache
id: docker-cache
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
with:
path: /tmp/sagemath_image
key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}
@@ -76,7 +76,7 @@ jobs:
- name: Store Sagemath image in cache
if: steps.docker-cache.outputs.cache-hit != 'true'
continue-on-error: true
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
with:
path: /tmp/sagemath_image
key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}

4
.gitignore vendored
View File

@@ -25,7 +25,6 @@ dieharder_run.log
# Cuda local build
backends/tfhe-cuda-backend/cuda/cmake-build-debug/
backends/tfhe-cuda-backend/cuda/build/
# WASM tests
tfhe/web_wasm_parallel_tests/server.PID
@@ -35,9 +34,6 @@ node_modules/
package-lock.json
utils/wasm-par-mq/examples/*/pkg/
# Commit lock files of backward data generation crates
!utils/tfhe-backward-compat-data/crates/generate_*/Cargo.lock
# Python .env
.env
__pycache__

View File

@@ -14,12 +14,10 @@ members = [
"tfhe-fft",
"tfhe-ntt",
"tfhe-zk-pok",
"utils/benchmark_spec",
"utils/param_dedup",
"utils/tfhe-backward-compat-checker",
"utils/tfhe-backward-compat-data",
"utils/tfhe-backward-compat-data/crates/add_new_version",
"utils/tfhe-safe-serialize",
"utils/tfhe-versionable",
"utils/tfhe-versionable-derive",
"utils/wasm-par-mq",
@@ -45,7 +43,6 @@ rand = "0.8"
rayon = "1.11"
serde = { version = "1.0", default-features = false }
wasm-bindgen = { version = "0.2.114" }
wasm-bindgen-futures = { version = "0.4.56" }
# js-sys (at this point in time) automatically enables the unsafe-eval feature which we do not want
# this does not prevent other deps from enabling it, but it at least conveys our need to not have it
# we still enable std, which was part of default before

View File

@@ -1,6 +1,6 @@
BSD 3-Clause Clear License
Copyright © 2026 ZAMA.
Copyright © 2025 ZAMA.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,

254
Makefile
View File

@@ -122,12 +122,6 @@ install_build_wasm32_target:
( echo "Unable to install wasm32-unknown-unknown target toolchain, check your rustup installation. \
Rustup can be downloaded at https://rustup.rs/" && exit 1 )
.PHONY: install_check_wasm32_target # Install the wasm32 toolchain used for checks
install_check_wasm32_target:
rustup target add wasm32-unknown-unknown --toolchain "$(RS_CHECK_TOOLCHAIN)" || \
( echo "Unable to install wasm32-unknown-unknown target toolchain, check your rustup installation. \
Rustup can be downloaded at https://rustup.rs/" && exit 1 )
.PHONY: install_cargo_nextest # Install cargo nextest used for shortint tests
install_cargo_nextest:
@cargo nextest --version > /dev/null 2>&1 || \
@@ -312,7 +306,7 @@ semgrep_and_lint_gpu_code: semgrep_lint_setup_venv
find "$(TFHECUDA_SRC)" -name '*.h' -o -name '*.cuh' -o -name '*.cu' \
| grep -v '/cmake-build-debug/' \
| grep -v '/build/' \
| xargs venv/bin/semgrep --error --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
| xargs venv/bin/semgrep --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
venv/bin/python3 "scripts/check_scratch_cleanup.py"
.PHONY: semver_check_cuda_backend # Run semver checks on tfhe-cuda-backend
@@ -356,23 +350,23 @@ check_fmt_js: check_nvm_installed
.PHONY: check_fmt_toml # Check TOML files format
check_fmt_toml: install_taplo
@RUST_LOG=warn taplo fmt --check || \
{ echo "TOML files format check failed. Please run 'make fmt_toml'"; exit 1; }
echo "TOML files format check failed. Please run 'make fmt_toml'"
.PHONY: check_typos # Check for typos in codebase
check_typos: install_typos_checker
@git ls-files ":!*.png" ":!*.cbor" ":!*.bcode" ":!*.ico" ":!*/twiddles.cu" | typos --file-list - && echo "No typos found"
@typos && echo "No typos found"
.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
clippy_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats,extended-types,zk-pok \
--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats,extended-types,zk-pok \
--all-targets \
-p tfhe -- --no-deps -D warnings
.PHONY: check_gpu # Run check on tfhe with "gpu" enabled
check_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats \
--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
--all-targets \
-p tfhe
@@ -386,7 +380,7 @@ clippy_hpu: install_rs_check_toolchain
.PHONY: clippy_gpu_hpu # Run clippy lints on tfhe with "gpu" and "hpu" enabled
clippy_gpu_hpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,hpu,pbs-stats,extended-types,zk-pok \
--features=boolean,shortint,integer,internal-keycache,gpu,hpu,pbs-stats,extended-types,zk-pok \
--all-targets \
-p tfhe -- --no-deps -D warnings
@@ -479,7 +473,7 @@ clippy_rustdoc_gpu: install_rs_check_toolchain
fi && \
CARGO_TERM_QUIET=true CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu,gpu-experimental-zk \
--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu \
-p tfhe -- --nocapture
.PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
@@ -490,17 +484,11 @@ clippy_c_api: install_rs_check_toolchain
.PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
clippy_js_wasm_api: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,extended-types \
-p tfhe -- --no-deps -D warnings
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok,extended-types \
-p tfhe -- --no-deps -D warnings
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok,extended-types,parallel-wasm-api \
-p tfhe -- --no-deps -D warnings
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok,extended-types,cross-origin-wasm-api \
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,extended-types \
-p tfhe -- --no-deps -D warnings
.PHONY: clippy_tasks # Run clippy lints on helper tasks crate.
@@ -541,15 +529,6 @@ clippy_zk_pok: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-zk-pok --features=experimental -- --no-deps -D warnings
.PHONY: clippy_zk_pok_wasm # Run clippy lints on tfhe-zk-pok for wasm32 target
clippy_zk_pok_wasm: install_rs_check_toolchain install_check_wasm32_target
RUSTFLAGS="$(WASM_RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--target wasm32-unknown-unknown \
-p tfhe-zk-pok -- --no-deps -D warnings
RUSTFLAGS="$(WASM_RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--target wasm32-unknown-unknown \
-p tfhe-zk-pok --features cross-origin-wasm -- --no-deps -D warnings
.PHONY: clippy_versionable # Run clippy lints on tfhe-versionable
clippy_versionable: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
@@ -557,11 +536,6 @@ clippy_versionable: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-versionable -- --no-deps -D warnings
.PHONY: clippy_safe_serialize # Run clippy lints on tfhe-safe-serialize
clippy_safe_serialize: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-safe-serialize -- --no-deps -D warnings
.PHONY: clippy_param_dedup # Run clippy lints on param_dedup tool
clippy_param_dedup: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
@@ -587,28 +561,15 @@ clippy_backward_compat_data: install_rs_check_toolchain # the toolchain is selec
echo "Cannot run clippy for backward compat crate on non x86 platform for now."; \
fi
.PHONY: check_backward_compat_locks_did_not_change # Check backward compat Cargo.lock files are up to date
check_backward_compat_locks_did_not_change: install_rs_check_toolchain
@for crate in `ls -1 $(BACKWARD_COMPAT_DATA_DIR)/crates/ | grep generate_`; do \
echo "checking Cargo.lock for $$crate"; \
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options \
-C $(BACKWARD_COMPAT_DATA_DIR)/crates/$$crate metadata --locked --format-version 1 > /dev/null || \
( echo "Cargo.lock for $$crate is out of date. Update it with:" && \
echo " cd $(BACKWARD_COMPAT_DATA_DIR)/crates/$$crate && cargo metadata --format-version 1 > /dev/null" && \
echo "then commit the updated Cargo.lock." && exit 1 ); \
done
.PHONY: clippy_test_vectors # Run clippy lints on the test vectors app
clippy_test_vectors: install_rs_check_toolchain
cd apps/test-vectors; RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-test-vectors -- --no-deps -D warnings
# WARNING: This target is not directly run in CI. When adding a subtarget here,
# MAKE SURE TO ALSO ADD IT TO A PCC BATCH BELOW
.PHONY: clippy_all # Run all clippy targets
clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_zk_pok_wasm clippy_trivium \
clippy_versionable clippy_safe_serialize clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_trivium \
clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
clippy_test_vectors clippy_backward_compat_data clippy_wasm_par_mq
.PHONY: clippy_fast # Run main clippy targets
@@ -705,7 +666,7 @@ build_c_api: install_rs_check_toolchain
.PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
build_c_api_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu,gpu-experimental-zk \
--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu \
-p tfhe
.PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
@@ -714,14 +675,11 @@ build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,experimental-force_fft_algo_dif4 \
-p tfhe
.PHONY: build_web_js_api # Build the js API targeting the web browser, in sequential or cross origin parallelism modes.
.PHONY: build_web_js_api # Build the js API targeting the web browser
build_web_js_api: install_wasm_pack
cd tfhe && \
RUSTFLAGS="$(WASM_RUSTFLAGS)" wasm-pack build --release --target=web \
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok,extended-types,cross-origin-wasm-api && \
find pkg/snippets -type f -iname worker_helpers.js -exec sed -i 's|import("../../..")|import("../../../tfhe.js")|g' {} \;
cp utils/wasm-par-mq/js/coordinator.js tfhe/pkg/
jq '.files += ["snippets"]' tfhe/pkg/package.json > tmp_pkg.json && mv -f tmp_pkg.json tfhe/pkg/package.json
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok,extended-types
.PHONY: build_web_js_api_parallel # Build the js API targeting the web browser with parallelism support
# parallel wasm requires specific build options, see https://github.com/rust-lang/rust/pull/147225
@@ -807,7 +765,7 @@ test_zk_cuda_backend:
.PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend test_zk_cuda_backend
test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
.PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
test_core_crypto_gpu:
@@ -1243,31 +1201,12 @@ test_tfhe_csprng_big_endian: install_cargo_cross
RUSTFLAGS="" cross test --profile $(CARGO_PROFILE) \
-p tfhe-csprng --target=powerpc64-unknown-linux-gnu
.PHONY: test_zk_pok # Run tfhe-zk-pok tests
test_zk_pok:
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
-p tfhe-zk-pok --features experimental
.PHONY: test_zk_pok_experimental_gpu # Run tfhe-zk-pok GPU-accelerated tests
test_zk_pok_experimental_gpu:
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
-p tfhe-zk-pok --features experimental,gpu-experimental -- gpu
.PHONY: test_integer_zk_gpu # Run tfhe-zk-pok tests
test_integer_zk_gpu:
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
--features=integer,zk-pok,gpu -p tfhe -- \
integer::gpu::zk::
.PHONY: test_integer_zk_experimental_gpu # Run tfhe-zk-pok tests
test_integer_zk_experimental_gpu:
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
--features=integer,zk-pok,gpu,gpu-experimental-zk -p tfhe -- \
integer::gpu::zk::
.PHONY: test_zk_cuda # Run all GPU MSM integration tests (CPU vs GPU comparison + integration test)
test_zk_cuda: test_zk_cuda_backend test_zk_pok_experimental_gpu test_integer_zk_gpu test_integer_zk_experimental_gpu
.PHONY: test_zk_wasm_x86_compat_ci
test_zk_wasm_x86_compat_ci: check_nvm_installed
source ~/.nvm/nvm.sh && \
@@ -1286,11 +1225,6 @@ test_versionable:
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
--all-targets -p tfhe-versionable
.PHONY: test_safe_serialize # Run tests for tfhe-safe-serialize subcrate
test_safe_serialize:
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
--all-targets -p tfhe-safe-serialize
# The backward compat data folder holds historical binary data but also rust code to generate and load them.
.PHONY: gen_backward_compat_data # Re-generate backward compatibility data
gen_backward_compat_data:
@@ -1425,19 +1359,6 @@ test_nodejs_wasm_api_ci: build_node_js_api
# This is an internal target, not meant to be called on its own.
run_web_js_api_parallel: build_web_js_api_parallel setup_venv
cd $(WEB_SERVER_DIR) && npm install && npm run build
source venv/bin/activate && \
python ci/webdriver.py \
--browser-path $(browser_path) \
--driver-path $(driver_path) \
--browser-kind $(browser_kind) \
--server-cmd $(server_cmd) \
--server-workdir "$(WEB_SERVER_DIR)" \
--id-pattern $(filter) \
--id-exclude-pattern asyncMainThread
# This is an internal target, not meant to be called on its own.
run_web_js_api_cross_origin: build_web_js_api setup_venv
cd $(WEB_SERVER_DIR) && npm install && npm run build
source venv/bin/activate && \
python ci/webdriver.py \
@@ -1480,38 +1401,6 @@ test_web_js_api_parallel_firefox_ci: setup_venv
nvm use $(NODE_VERSION) && \
$(MAKE) test_web_js_api_parallel_firefox
test_web_js_api_cross_origin_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
test_web_js_api_cross_origin_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
test_web_js_api_cross_origin_chrome: browser_kind = chrome
test_web_js_api_cross_origin_chrome: server_cmd = "npm run server:cross-origin"
test_web_js_api_cross_origin_chrome: filter = ZeroKnowledgeTest # Only run zk proof tests in cross-origin mode
.PHONY: test_web_js_api_cross_origin_chrome # Run tests for the web wasm api in cross-origin mode on Chrome
test_web_js_api_cross_origin_chrome: run_web_js_api_cross_origin
.PHONY: test_web_js_api_cross_origin_chrome_ci # Run tests for the web wasm api in cross-origin mode on Chrome
test_web_js_api_cross_origin_chrome_ci: setup_venv
source ~/.nvm/nvm.sh && \
nvm install $(NODE_VERSION) && \
nvm use $(NODE_VERSION) && \
$(MAKE) test_web_js_api_cross_origin_chrome
test_web_js_api_cross_origin_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
test_web_js_api_cross_origin_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
test_web_js_api_cross_origin_firefox: browser_kind = firefox
test_web_js_api_cross_origin_firefox: server_cmd = "npm run server:cross-origin"
test_web_js_api_cross_origin_firefox: filter = ZeroKnowledgeTest # Only run zk proof tests in cross-origin mode
.PHONY: test_web_js_api_cross_origin_firefox # Run tests for the web wasm api in cross-origin mode on Firefox
test_web_js_api_cross_origin_firefox: run_web_js_api_cross_origin
.PHONY: test_web_js_api_cross_origin_firefox_ci # Run tests for the web wasm api in cross-origin mode on Firefox
test_web_js_api_cross_origin_firefox_ci: setup_venv
source ~/.nvm/nvm.sh && \
nvm install $(NODE_VERSION) && \
nvm use $(NODE_VERSION) && \
$(MAKE) test_web_js_api_cross_origin_firefox
WASM_PAR_MQ_TEST_DIR=utils/wasm-par-mq/web_tests
.PHONY: build_wasm_par_mq_tests # Build the wasm-par-mq test WASM package
@@ -1675,50 +1564,27 @@ bench_integer_rerand_gpu: install_rs_check_toolchain
--bench integer-rerand \
--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_msm_zk
bench_msm_zk: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench zk-msm \
--features=zk-pok -p tfhe-benchmark --profile release --
# GPU benchmarks need --profile release for correct measurements
.PHONY: bench_msm_zk_gpu
bench_msm_zk_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench zk-msm \
--features=gpu,gpu-experimental-zk,zk-pok -p tfhe-benchmark --profile release -- zk::cuda::msm
# GPU benchmarks need --profile release for correct measurements
.PHONY: bench_integer_zk_gpu
bench_integer_zk_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-zk-pke \
--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release --
# GPU benchmarks need --profile release for correct measurements
.PHONY: bench_integer_zk_experimental_gpu
bench_integer_zk_experimental_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-zk-pke \
--features=integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats,zk-pok -p tfhe-benchmark --profile release --
--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_integer_aes_gpu # Run benchmarks for AES on GPU backend
bench_integer_aes_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-aes \
--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --
--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_integer_aes256_gpu # Run benchmarks for AES256 on GPU backend
bench_integer_aes256_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-aes256 \
--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --
--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_integer_trivium_gpu # Run benchmarks for trivium on GPU backend
bench_integer_trivium_gpu: install_rs_check_toolchain
@@ -1882,37 +1748,37 @@ bench_web_js_api_parallel_firefox_ci: setup_venv
nvm use $(NODE_VERSION) && \
$(MAKE) bench_web_js_api_parallel_firefox
bench_web_js_api_cross_origin_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
bench_web_js_api_cross_origin_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
bench_web_js_api_cross_origin_chrome: browser_kind = chrome
bench_web_js_api_cross_origin_chrome: server_cmd = "npm run server:cross-origin"
bench_web_js_api_cross_origin_chrome: filter = ZeroKnowledgeBench # Only bench zk with cross-origin workers
bench_web_js_api_unsafe_coop_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
bench_web_js_api_unsafe_coop_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
bench_web_js_api_unsafe_coop_chrome: browser_kind = chrome
bench_web_js_api_unsafe_coop_chrome: server_cmd = "npm run server:unsafe-coop"
bench_web_js_api_unsafe_coop_chrome: filter = ZeroKnowledgeBench # Only bench zk with unsafe coop
.PHONY: bench_web_js_api_cross_origin_chrome # Run benchmarks for the web wasm api without cross-origin isolation
bench_web_js_api_cross_origin_chrome: run_web_js_api_cross_origin
.PHONY: bench_web_js_api_unsafe_coop_chrome # Run benchmarks for the web wasm api without cross-origin isolation
bench_web_js_api_unsafe_coop_chrome: run_web_js_api_parallel
.PHONY: bench_web_js_api_cross_origin_chrome_ci # Run benchmarks for the web wasm api without cross-origin isolation
bench_web_js_api_cross_origin_chrome_ci: setup_venv
.PHONY: bench_web_js_api_unsafe_coop_chrome_ci # Run benchmarks for the web wasm api without cross-origin isolation
bench_web_js_api_unsafe_coop_chrome_ci: setup_venv
source ~/.nvm/nvm.sh && \
nvm install $(NODE_VERSION) && \
nvm use $(NODE_VERSION) && \
$(MAKE) bench_web_js_api_cross_origin_chrome
$(MAKE) bench_web_js_api_unsafe_coop_chrome
bench_web_js_api_cross_origin_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
bench_web_js_api_cross_origin_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
bench_web_js_api_cross_origin_firefox: browser_kind = firefox
bench_web_js_api_cross_origin_firefox: server_cmd = "npm run server:cross-origin"
bench_web_js_api_cross_origin_firefox: filter = ZeroKnowledgeBench # Only bench zk with cross-origin workers
bench_web_js_api_unsafe_coop_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
bench_web_js_api_unsafe_coop_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
bench_web_js_api_unsafe_coop_firefox: browser_kind = firefox
bench_web_js_api_unsafe_coop_firefox: server_cmd = "npm run server:unsafe-coop"
bench_web_js_api_unsafe_coop_firefox: filter = ZeroKnowledgeBench # Only bench zk with unsafe coop
.PHONY: bench_web_js_api_cross_origin_firefox # Run benchmarks for the web wasm api without cross-origin isolation
bench_web_js_api_cross_origin_firefox: run_web_js_api_cross_origin
.PHONY: bench_web_js_api_unsafe_coop_firefox # Run benchmarks for the web wasm api without cross-origin isolation
bench_web_js_api_unsafe_coop_firefox: run_web_js_api_parallel
.PHONY: bench_web_js_api_cross_origin_firefox_ci # Run benchmarks for the web wasm api without cross-origin isolation
bench_web_js_api_cross_origin_firefox_ci: setup_venv
.PHONY: bench_web_js_api_unsafe_coop_firefox_ci # Run benchmarks for the web wasm api without cross-origin isolation
bench_web_js_api_unsafe_coop_firefox_ci: setup_venv
source ~/.nvm/nvm.sh && \
nvm install $(NODE_VERSION) && \
nvm use $(NODE_VERSION) && \
$(MAKE) bench_web_js_api_cross_origin_firefox
$(MAKE) bench_web_js_api_unsafe_coop_firefox
.PHONY: bench_hlapi_unsigned # Run benchmarks for integer operations
bench_hlapi_unsigned: install_rs_check_toolchain
@@ -1945,25 +1811,25 @@ bench_hlapi_hpu: install_rs_check_toolchain
--bench hlapi \
--features=integer,internal-keycache,hpu,hpu-v80,pbs-stats -p tfhe-benchmark --
.PHONY: bench_hlapi_erc7984 # Run benchmarks for ERC7984 operations
bench_hlapi_erc7984: install_rs_check_toolchain
.PHONY: bench_hlapi_erc20 # Run benchmarks for ERC20 operations
bench_hlapi_erc20: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc7984 \
--bench hlapi-erc20 \
--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
.PHONY: bench_hlapi_erc7984_gpu # Run benchmarks for ERC7984 operations on GPU
bench_hlapi_erc7984_gpu: install_rs_check_toolchain
.PHONY: bench_hlapi_erc20_gpu # Run benchmarks for ERC20 operations on GPU
bench_hlapi_erc20_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc7984 \
--bench hlapi-erc20 \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_hlapi_erc7984_gpu_classical # Run benchmarks for ERC7984 operations on GPU with classical parameters
bench_hlapi_erc7984_gpu_classical: install_rs_check_toolchain
.PHONY: bench_hlapi_erc20_gpu_classical # Run benchmarks for ERC20 operations on GPU with classical parameters
bench_hlapi_erc20_gpu_classical: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=classical \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc7984 \
--bench hlapi-erc20 \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_hlapi_dex # Run benchmarks for DEX operations
@@ -1987,13 +1853,13 @@ bench_hlapi_dex_gpu_classical: install_rs_check_toolchain
--bench hlapi-dex \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_hlapi_erc7984_hpu # Run benchmarks for ECR20 operations on HPU
bench_hlapi_erc7984_hpu: install_rs_check_toolchain
.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ECR20 operations on HPU
bench_hlapi_erc20_hpu: install_rs_check_toolchain
source ./setup_hpu.sh --config $(HPU_CONFIG); \
export V80_PCIE_DEV=${V80_PCIE_DEV}; \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc7984 \
--bench hlapi-erc20 \
--features=integer,internal-keycache,hpu,hpu-v80,pbs-stats -p tfhe-benchmark --
.PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
@@ -2001,13 +1867,6 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --
.PHONY: bench_tfhe_zk_pok_gpu # Run benchmarks for the tfhe_zk_pok crate using GPU acceleration
bench_tfhe_zk_pok_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--package tfhe-zk-pok \
--features=gpu-experimental --profile release
.PHONY: bench_hlapi_noise_squash # Run benchmarks for noise squash operation
bench_hlapi_noise_squash: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
@@ -2049,10 +1908,10 @@ bench_summary: install_rs_check_toolchain
--bench hlapi-noise-squash \
--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::decomp_noise_squash_comp::'
# ERC7984
# ERC20
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc7984 \
--bench hlapi-erc20 \
--features=integer,internal-keycache -p tfhe-benchmark -- '::transfer::overflow'
# DEX
@@ -2094,10 +1953,10 @@ bench_summary_gpu: install_rs_check_toolchain
--bench hlapi-noise-squash \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::decomp_noise_squash_comp::'
# ERC7984
# ERC20
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc7984 \
--bench hlapi-erc20 \
--features=integer,gpu,internal-keycache -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow'
# DEX
@@ -2276,7 +2135,6 @@ pcc_batch_5:
$(call run_recipe_with_details,clippy_tfhe_lints)
$(call run_recipe_with_details,check_compile_tests)
$(call run_recipe_with_details,clippy_backward_compat_data)
$(call run_recipe_with_details,check_backward_compat_locks_did_not_change)
.PHONY: pcc_batch_6 # duration: 6'32''
pcc_batch_6:
@@ -2285,10 +2143,8 @@ pcc_batch_6:
$(call run_recipe_with_details,clippy_tasks)
$(call run_recipe_with_details,clippy_tfhe_csprng)
$(call run_recipe_with_details,clippy_zk_pok)
$(call run_recipe_with_details,clippy_zk_pok_wasm)
$(call run_recipe_with_details,clippy_trivium)
$(call run_recipe_with_details,clippy_versionable)
$(call run_recipe_with_details,clippy_safe_serialize)
$(call run_recipe_with_details,clippy_param_dedup)
$(call run_recipe_with_details,docs)

View File

@@ -15,3 +15,12 @@ extend-ignore-identifiers-re = [
"0x[0-9a-fA-F]+",
"xrt_coreutil",
]
[files]
extend-exclude = [
"backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu",
"backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu",
"backends/tfhe-hpu-backend/config_store/**/*.link_summary",
"*.cbor",
"*.bcode",
]

View File

@@ -1,6 +1,6 @@
BSD 3-Clause Clear License
Copyright © 2026 ZAMA.
Copyright © 2025 ZAMA.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,

View File

@@ -62,29 +62,3 @@ rules:
cuda_synchronize_stream(...);
...
}
- id: tfhe-cuda-unwrapped-cuda-runtime-call
message: "CUDA runtime API call is not wrapped in `check_cuda_error(...)`."
severity: WARNING
languages: [c, cpp]
options:
generic_ellipsis_max_span: 500
paths:
include:
- "*.cu"
- "*.cuh"
- "*.cpp"
- "*.h"
exclude:
- backends/tfhe-cuda-backend/cuda/check_cuda.cu # contains cuda checking functions
- backends/tfhe-cuda-backend/cuda/include/device.h # contains the cuda_check_error macro (and others)
patterns:
- pattern: $FUNC(...)
- metavariable-regex:
metavariable: $FUNC
regex: "^cuda[A-Z][A-Za-z0-9]*$" # matches cudaMalloc/cudaMemcpy/... (not project helpers like cuda_set_device)
- pattern-not-inside: check_cuda_error(...)
- pattern-not-inside: |
$FUNC(...);
check_cuda_error(cudaGetLastError());
- pattern-not-inside: $FUNC(...) == $VAL

View File

@@ -36,19 +36,5 @@ void cuda_glwe_sample_extract_128_async(
void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
uint32_t num_lwes_to_extract_per_glwe, uint32_t num_lwes_stored_per_glwe,
uint32_t glwe_dimension, uint32_t polynomial_size);
void cuda_modulus_switch_multi_bit_64_async(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in, uint32_t size,
uint32_t log_modulus,
uint32_t degree,
uint32_t grouping_factor);
void cuda_modulus_switch_multi_bit_128_async(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in, uint32_t size,
uint32_t log_modulus,
uint32_t degree,
uint32_t grouping_factor);
}
#endif

View File

@@ -382,17 +382,14 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
->use_sequential_algorithm_to_resolve_group_carries;
cuda_set_device(0);
check_cuda_error(
cudaEventCreateWithFlags(&create_indexes_done, cudaEventDisableTiming));
cudaEventCreateWithFlags(&create_indexes_done, cudaEventDisableTiming);
create_indexes_for_overflow_sub(streams.get_ith(0), num_blocks, group_size,
use_seq, allocate_gpu_memory, size_tracker);
check_cuda_error(cudaEventRecord(create_indexes_done, streams.stream(0)));
cudaEventRecord(create_indexes_done, streams.stream(0));
cuda_set_device(1);
check_cuda_error(
cudaStreamWaitEvent(streams.stream(1), create_indexes_done, 0));
cudaStreamWaitEvent(streams.stream(1), create_indexes_done, 0);
cuda_set_device(2);
check_cuda_error(
cudaStreamWaitEvent(streams.stream(2), create_indexes_done, 0));
cudaStreamWaitEvent(streams.stream(2), create_indexes_done, 0);
scatter_indexes_for_overflowing_sub(
streams.stream(1), streams.gpu_index(1),
@@ -845,7 +842,7 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
free(second_indexes_for_overflow_sub_gpu_2);
free(scalars_for_overflow_sub_gpu_2);
check_cuda_error(cudaEventDestroy(create_indexes_done));
cudaEventDestroy(create_indexes_done);
// release sub streams
sub_streams_1.release();

View File

@@ -39,28 +39,6 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_64(void *stream,
uint32_t gpu_index,
int8_t **pbs_buffer);
// Noise-tests-namespaced wrappers for scratch/cleanup, so that callers
// working with the noise-tests PBS variant use a consistent naming scheme.
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer);
// Noise tests variant: 64-bit torus, polynomial_size=2048 only. Uses the
// NOISE_TESTS keybundle mode for noise analysis purposes.
void cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_async(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
@@ -78,23 +56,6 @@ void cuda_multi_bit_programmable_bootstrap_128_async(
void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
const uint32_t gpu_index,
int8_t **buffer);
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_128(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer);
void cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lwe_array_in, void const *lwe_input_indexes,
void const *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride);
}
#endif // CUDA_MULTI_BIT_H

View File

@@ -105,11 +105,11 @@ template <typename Torus> struct zk_expand_mem {
uint32_t num_lwes;
uint32_t num_compact_lists;
int_radix_lut<Torus> *message_and_carry_extract_luts = nullptr;
int_radix_lut<Torus> *identity_lut = nullptr;
int_radix_lut<Torus> *message_and_carry_extract_luts;
int_radix_lut<Torus> *identity_lut;
Torus *tmp_expanded_lwes = nullptr;
Torus *tmp_ksed_small_to_big_expanded_lwes = nullptr;
Torus *tmp_expanded_lwes;
Torus *tmp_ksed_small_to_big_expanded_lwes;
bool gpu_memory_allocated;
@@ -148,6 +148,66 @@ template <typename Torus> struct zk_expand_mem {
PANIC("GPU backend requires carry_modulus equal to message_modulus")
}
// We create the identity LUT only if we are doing a SANITY_CHECK
if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
identity_lut =
new int_radix_lut<Torus>(streams, computing_params, 1, 2 * num_lwes,
allocate_gpu_memory, size_tracker);
auto identity_lut_f = [](Torus x) -> Torus { return x; };
identity_lut->generate_and_broadcast_lut(streams, {0}, {identity_lut_f},
LUT_0_FOR_ALL_BLOCKS);
}
auto message_extract_lut_f = [casting_params](Torus x) -> Torus {
return x % casting_params.message_modulus;
};
auto carry_extract_lut_f = [casting_params](Torus x) -> Torus {
return (x / casting_params.carry_modulus) %
casting_params.message_modulus;
};
// Booleans have to be sanitized
auto sanitize_bool_f = [](Torus x) -> Torus { return x == 0 ? 0 : 1; };
auto message_extract_and_sanitize_bool_lut_f =
[message_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
return sanitize_bool_f(message_extract_lut_f(x));
};
auto carry_extract_and_sanitize_bool_lut_f =
[carry_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
return sanitize_bool_f(carry_extract_lut_f(x));
};
/** In case the casting key casts from BIG to SMALL key we run a single KS
to expand using the casting key as ksk. Otherwise, in case the casting key
casts from SMALL to BIG key, we first keyswitch from SMALL to BIG using
the casting key as ksk, then we keyswitch from BIG to SMALL using the
computing ksk, and lastly we apply the PBS. The output is always on the
BIG key.
**/
auto params = casting_params;
if (casting_key_type == SMALL_TO_BIG) {
params = computing_params;
}
message_and_carry_extract_luts = new int_radix_lut<Torus>(
streams, params, 4, 2 * num_lwes, allocate_gpu_memory, size_tracker);
// We are always packing two LWEs. We just need to be sure we have enough
// space in the carry part to store a message of the same size as is in the
// message part.
if (params.carry_modulus < params.message_modulus)
PANIC("Carry modulus must be at least as large as message modulus");
auto num_packed_msgs = 2;
// Adjust indexes to permute the output and access the correct LUT
auto h_indexes_in = static_cast<Torus *>(
malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
auto h_indexes_out = static_cast<Torus *>(
malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
auto h_lut_indexes = static_cast<Torus *>(
malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
d_expand_jobs =
static_cast<expand_job<Torus> *>(cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<expand_job<Torus>>(num_lwes), streams.stream(0),
@@ -156,202 +216,144 @@ template <typename Torus> struct zk_expand_mem {
h_expand_jobs = static_cast<expand_job<Torus> *>(
malloc(safe_mul_sizeof<expand_job<Torus>>(num_lwes)));
// NO_CASTING expands directly into the output buffer — no LUTs, no PBS,
// no intermediate buffers needed.
if (expand_kind != EXPAND_KIND::NO_CASTING) {
/** In case the casting key casts from BIG to SMALL key we run a single KS
to expand using the casting key as ksk. Otherwise, in case the casting key
casts from SMALL to BIG key, we first keyswitch from SMALL to BIG using
the casting key as ksk, then we keyswitch from BIG to SMALL using the
computing ksk, and lastly we apply the PBS. The output is always on the
BIG key.
**/
auto params = casting_params;
if (casting_key_type == SMALL_TO_BIG) {
params = computing_params;
/*
* Each LWE contains encrypted data in both carry and message spaces
* that needs to be extracted.
*
* The loop processes each compact list (k) and for each LWE within that
* list:
* 1. Sets input indexes to read each LWE twice (for carry and message
* extraction)
* 2. Creates output indexes to properly reorder the results
* 3. Selects appropriate LUT index based on whether boolean sanitization is
* needed
*
* We want the output to have always first the content of the message part
* and then the content of the carry part of each LWE.
*
* i.e. msg_extract(LWE_0), carry_extract(LWE_0), msg_extract(LWE_1),
* carry_extract(LWE_1), ...
*
* Aiming that behavior, with 4 LWEs we would have:
*
* // Each LWE is processed twice
* h_indexes_in = {0, 1, 2, 3, 0, 1, 2, 3}
*
* // First 4 use message LUT, last 4 use carry LUT
* h_lut_indexes = {0, 0, 0, 0, 1, 1, 1, 1}
*
* // Reorders output so message and carry for each LWE appear together
* h_indexes_out = {0, 2, 4, 6, 1, 3, 5, 7}
*
* If an LWE contains a boolean value, its LUT index is shifted by
* num_packed_msgs to use the sanitization LUT (which ensures output is
* exactly 0 or 1).
*/
auto offset = 0;
for (int k = 0; k < num_compact_lists; k++) {
auto num_lwes_in_kth = this->num_lwes_per_compact_list[k];
for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
auto lwe_index = i + num_packed_msgs * offset;
auto lwe_index_in_list = i % num_lwes_in_kth;
PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
"Cuda error: index %d is beyond the max value %d",
lwe_index, num_packed_msgs * num_lwes);
h_indexes_in[lwe_index] = lwe_index_in_list + offset;
h_indexes_out[lwe_index] =
num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
"Cuda error: index %lu is beyond the max value %lu",
(unsigned long)h_indexes_in[lwe_index],
(unsigned long)(num_packed_msgs * num_lwes));
PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
"Cuda error: index %lu is beyond the max value %lu",
(unsigned long)h_indexes_out[lwe_index],
(unsigned long)(num_packed_msgs * num_lwes));
// is_boolean_array tells us which input is a boolean and thus the
// related output needs boolean sanitization. It naturally has
// total_blocks entries, but h_indexes_out reaches
// message_modulus * ceil(total_blocks/2) - 1. When total_blocks is odd,
// the ceiling causes out-of-bounds access. Reading garbage "true" would
// set h_lut_indexes to an invalid index pointing to uninitialized
// memory instead of a real LUT. Rust pads is_boolean_array with FALSE
// to match.
PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
"Cuda error: index %lu for is_boolean_array is out of "
"bounds (len is %lu)",
(unsigned long)h_indexes_out[lwe_index],
(unsigned long)is_boolean_array_len);
}
offset += num_lwes_in_kth;
}
// We always pack two LWEs (message and carry parts per LWE)
auto num_packed_msgs = 2;
message_and_carry_extract_luts->set_lwe_indexes(
streams.stream(0), streams.gpu_index(0), h_indexes_in, h_indexes_out);
// Adjust indexes to permute the output and access the correct LUT.
//
// The loop below fills h_indexes_in and h_indexes_out so that the output
// is ordered as: msg_extract(LWE_0), carry_extract(LWE_0),
// msg_extract(LWE_1), carry_extract(LWE_1), ...
//
// With 4 LWEs the arrays look like:
// h_indexes_in = {0, 1, 2, 3, 0, 1, 2, 3} (each LWE read twice)
// h_lut_indexes = {0, 0, 0, 0, 1, 1, 1, 1} (msg LUT then carry LUT)
// h_indexes_out = {0, 2, 4, 6, 1, 3, 5, 7} (interleaved output)
//
// If an LWE contains a boolean its LUT index is shifted by
// num_packed_msgs to use the sanitization LUT (output clamped to {0, 1}).
auto h_indexes_in = static_cast<Torus *>(
malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
auto h_indexes_out = static_cast<Torus *>(
malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
auto active_streams =
streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
// Index generator for message/carry extraction LUTs
auto index_gen = [num_compact_lists,
num_lwes_per_compact_list =
this->num_lwes_per_compact_list,
num_packed_msgs, is_boolean_array,
h_indexes_out](Torus *h_lut_indexes, uint32_t) {
auto offset = 0;
for (int k = 0; k < num_compact_lists; k++) {
auto num_lwes_in_kth = this->num_lwes_per_compact_list[k];
auto num_lwes_in_kth = num_lwes_per_compact_list[k];
for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
auto lwe_index = i + num_packed_msgs * offset;
auto lwe_index_in_list = i % num_lwes_in_kth;
PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
"Cuda error: index %d is beyond the max value %d",
lwe_index, num_packed_msgs * num_lwes);
h_indexes_in[lwe_index] = lwe_index_in_list + offset;
h_indexes_out[lwe_index] =
num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
"Cuda error: index %lu is beyond the max value %lu",
(unsigned long)h_indexes_in[lwe_index],
(unsigned long)(num_packed_msgs * num_lwes));
PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
"Cuda error: index %lu is beyond the max value %lu",
(unsigned long)h_indexes_out[lwe_index],
(unsigned long)(num_packed_msgs * num_lwes));
// is_boolean_array tells us which input is a boolean and thus the
// related output needs boolean sanitization. It naturally has
// total_blocks entries, but h_indexes_out reaches
// message_modulus * ceil(total_blocks/2) - 1. When total_blocks is
// odd, the ceiling causes out-of-bounds access. Reading garbage
// "true" would set h_lut_indexes to an invalid index pointing to
// uninitialized memory instead of a real LUT. Rust pads
// is_boolean_array with FALSE to match.
PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
"Cuda error: index %lu for is_boolean_array is out of "
"bounds (len is %lu)",
(unsigned long)h_indexes_out[lwe_index],
(unsigned long)is_boolean_array_len);
auto boolean_offset =
is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
}
offset += num_lwes_in_kth;
}
};
auto active_streams =
streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
message_and_carry_extract_luts->generate_and_broadcast_lut(
active_streams, {0, 1, 2, 3},
{message_extract_lut_f, carry_extract_lut_f,
message_extract_and_sanitize_bool_lut_f,
carry_extract_and_sanitize_bool_lut_f},
index_gen, true, {}, h_lut_indexes);
// SANITY_CHECK uses identity_lut (skipping the full message/carry
// extraction LUT and the SMALL_TO_BIG intermediate buffer).
if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
identity_lut =
new int_radix_lut<Torus>(streams, casting_params, 1, 2 * num_lwes,
allocate_gpu_memory, size_tracker);
message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
// The expanded LWEs will always be on the casting key format
tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<Torus>(num_lwes, casting_params.big_lwe_dimension + 1),
streams.stream(0), streams.gpu_index(0), size_tracker,
allocate_gpu_memory);
auto identity_lut_f = [](Torus x) -> Torus { return x; };
identity_lut->generate_and_broadcast_lut(streams, {0}, {identity_lut_f},
LUT_0_FOR_ALL_BLOCKS);
identity_lut->set_lwe_indexes(streams.stream(0), streams.gpu_index(0),
h_indexes_in, h_indexes_out);
identity_lut->allocate_lwe_vector_for_non_trivial_indexes(
active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
} else {
// We are always packing two LWEs. We just need to be sure we have
// enough space in the carry part to store a message of the same size
// as is in the message part.
if (params.carry_modulus < params.message_modulus)
PANIC("Carry modulus must be at least as large as message modulus");
message_and_carry_extract_luts =
new int_radix_lut<Torus>(streams, params, 4, 2 * num_lwes,
allocate_gpu_memory, size_tracker);
message_and_carry_extract_luts->set_lwe_indexes(
streams.stream(0), streams.gpu_index(0), h_indexes_in,
h_indexes_out);
auto message_extract_lut_f = [casting_params](Torus x) -> Torus {
return x % casting_params.message_modulus;
};
auto carry_extract_lut_f = [casting_params](Torus x) -> Torus {
return (x / casting_params.carry_modulus) %
casting_params.message_modulus;
};
auto sanitize_bool_f = [](Torus x) -> Torus { return x == 0 ? 0 : 1; };
auto message_extract_and_sanitize_bool_lut_f =
[message_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
return sanitize_bool_f(message_extract_lut_f(x));
};
auto carry_extract_and_sanitize_bool_lut_f =
[carry_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
return sanitize_bool_f(carry_extract_lut_f(x));
};
auto h_lut_indexes = static_cast<Torus *>(
malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
auto index_gen = [num_compact_lists,
num_lwes_per_compact_list =
this->num_lwes_per_compact_list,
num_packed_msgs, is_boolean_array,
h_indexes_out](Torus *h_lut_indexes, uint32_t) {
auto offset = 0;
for (int k = 0; k < num_compact_lists; k++) {
auto num_lwes_in_kth = num_lwes_per_compact_list[k];
for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
auto lwe_index = i + num_packed_msgs * offset;
auto boolean_offset = is_boolean_array[h_indexes_out[lwe_index]]
? num_packed_msgs
: 0;
h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
}
offset += num_lwes_in_kth;
}
};
message_and_carry_extract_luts->generate_and_broadcast_lut(
active_streams, {0, 1, 2, 3},
{message_extract_lut_f, carry_extract_lut_f,
message_extract_and_sanitize_bool_lut_f,
carry_extract_and_sanitize_bool_lut_f},
index_gen, true, {}, h_lut_indexes);
message_and_carry_extract_luts
->allocate_lwe_vector_for_non_trivial_indexes(
active_streams, 2 * num_lwes, size_tracker,
allocate_gpu_memory);
free(h_lut_indexes);
// SANITY_CHECK panics on SMALL_TO_BIG, so this buffer is only needed
// on the full casting path.
tmp_ksed_small_to_big_expanded_lwes =
(Torus *)cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<Torus>(num_lwes,
casting_params.big_lwe_dimension + 1),
streams.stream(0), streams.gpu_index(0), size_tracker,
allocate_gpu_memory);
}
// The expanded LWEs will always be on the casting key format
tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<Torus>(num_lwes,
casting_params.big_lwe_dimension + 1),
streams.stream(0), streams.gpu_index(0), size_tracker,
allocate_gpu_memory);
free(h_indexes_in);
free(h_indexes_out);
}
tmp_ksed_small_to_big_expanded_lwes =
(Torus *)cuda_malloc_with_size_tracking_async(
safe_mul_sizeof<Torus>(num_lwes,
casting_params.big_lwe_dimension + 1),
streams.stream(0), streams.gpu_index(0), size_tracker,
allocate_gpu_memory);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
free(h_indexes_in);
free(h_indexes_out);
free(h_lut_indexes);
}
void release(CudaStreams streams) {
if (expand_kind != EXPAND_KIND::NO_CASTING) {
if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
identity_lut->release(streams);
delete identity_lut;
} else {
message_and_carry_extract_luts->release(streams);
delete message_and_carry_extract_luts;
cuda_drop_with_size_tracking_async(
tmp_ksed_small_to_big_expanded_lwes, streams.stream(0),
streams.gpu_index(0), gpu_memory_allocated);
}
cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
message_and_carry_extract_luts->release(streams);
delete message_and_carry_extract_luts;
if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
identity_lut->release(streams);
delete identity_lut;
}
cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(tmp_ksed_small_to_big_expanded_lwes,
streams.stream(0), streams.gpu_index(0),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_expand_jobs, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);

View File

@@ -390,7 +390,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
XOR(&wires_a[6], &wires_a[15], &input_bits[7]);
XOR(&wires_a[10], &wires_a[15], &wires_b[0]);
XOR(&wires_a[11], &wires_a[20], &wires_a[9]);
FLUSH(&wires_a[6], &wires_a[10], &wires_a[11]);
FLUSH(&wires_a[6], &wires_a[10]);
XOR(&wires_a[7], &input_bits[7], &wires_a[11]);
FLUSH(&wires_a[7]);
XOR(&wires_a[17], &wires_a[10], &wires_a[11]);
@@ -426,7 +426,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
XOR(&wires_b[22], &wires_b[18], &wires_a[19]);
XOR(&wires_b[23], &wires_b[19], &wires_a[21]);
XOR(&wires_b[24], &wires_b[20], &wires_a[18]);
FLUSH(&wires_b[21], &wires_b[22], &wires_b[23], &wires_b[24]);
FLUSH(&wires_b[21], &wires_b[23], &wires_b[24]);
XOR(&wires_b[25], &wires_b[21], &wires_b[22]);
FLUSH(&wires_b[25]);
@@ -468,7 +468,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
XOR(&wires_b[37], &wires_b[36], &wires_b[34]);
XOR(&wires_b[38], &wires_b[27], &wires_b[36]);
FLUSH(&wires_b[38], &wires_b[37]);
FLUSH(&wires_b[38]);
XOR(&wires_b[44], &wires_b[33], &wires_b[37]);
CudaRadixCiphertextFFI *and_outs_6[] = {&wires_b[39]};
@@ -479,7 +479,7 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
XOR(&wires_b[40], &wires_b[25], &wires_b[39]);
XOR(&wires_b[41], &wires_b[40], &wires_b[37]);
XOR(&wires_b[43], &wires_b[29], &wires_b[40]);
FLUSH(&wires_b[41], &wires_b[40], &wires_b[43], &wires_b[44]);
FLUSH(&wires_b[41]);
XOR(&wires_b[45], &wires_b[42], &wires_b[41]);
FLUSH(&wires_b[45]);
@@ -514,7 +514,6 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
XOR(&wires_b[57], &wires_b[50], &wires_b[53]);
XOR(&wires_b[58], &wires_c[4], &wires_b[46]);
XOR(&wires_b[59], &wires_c[3], &wires_b[54]);
FLUSH(&wires_b[57], &wires_b[58]);
XOR(&wires_b[60], &wires_b[46], &wires_b[57]);
XOR(&wires_b[61], &wires_c[14], &wires_b[57]);
XOR(&wires_b[62], &wires_b[52], &wires_b[58]);
@@ -590,7 +589,6 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
#undef FLUSH
#undef AND
#undef ADD_ONE_FLUSH
#undef ADD_ONE
}
/**

View File

@@ -150,31 +150,3 @@ void cuda_glwe_sample_extract_128_async(
"N's are powers of two in the interval [256..4096].")
}
}
// Untyped entry point for the 64-bit multi-bit modulus switch. The opaque
// stream and buffer pointers are cast to their concrete types and every
// argument is then forwarded to host_modulus_switch_multi_bit<uint64_t>.
void cuda_modulus_switch_multi_bit_64_async(void *stream, uint32_t gpu_index,
                                            void *lwe_array_out,
                                            void *lwe_array_in, uint32_t size,
                                            uint32_t log_modulus,
                                            uint32_t degree,
                                            uint32_t grouping_factor) {
  auto cuda_stream = static_cast<cudaStream_t>(stream);
  auto *typed_out = static_cast<uint64_t *>(lwe_array_out);
  auto *typed_in = static_cast<uint64_t *>(lwe_array_in);

  host_modulus_switch_multi_bit<uint64_t>(cuda_stream, gpu_index, typed_out,
                                          typed_in, size, log_modulus, degree,
                                          grouping_factor);
}
// Untyped entry point for the 128-bit multi-bit modulus switch. The opaque
// stream and buffer pointers are cast to their concrete types and every
// argument is then forwarded to host_modulus_switch_multi_bit<__uint128_t>.
void cuda_modulus_switch_multi_bit_128_async(void *stream, uint32_t gpu_index,
                                             void *lwe_array_out,
                                             void *lwe_array_in, uint32_t size,
                                             uint32_t log_modulus,
                                             uint32_t degree,
                                             uint32_t grouping_factor) {
  auto cuda_stream = static_cast<cudaStream_t>(stream);
  auto *typed_out = static_cast<__uint128_t *>(lwe_array_out);
  auto *typed_in = static_cast<__uint128_t *>(lwe_array_in);

  host_modulus_switch_multi_bit<__uint128_t>(cuda_stream, gpu_index, typed_out,
                                             typed_in, size, log_modulus,
                                             degree, grouping_factor);
}

View File

@@ -463,48 +463,5 @@ __global__ void __launch_bounds__(512)
return;
}
}
// Standalone kernel used by the noise tests only. It applies the same
// monomial-degree logic that is embedded in the keybundle kernel, exposed as
// a __global__ function so it can be tested individually.
//
// Each thread whose global index is below `size` reads the group of
// `grouping_factor` inputs starting at index * grouping_factor and writes
// 2^grouping_factor outputs starting at index * 2^grouping_factor.
// NOTE: log_modulus is part of the signature but is not read in this body.
template <typename Torus, class params>
__global__ void
modulus_switch_multi_bit(Torus *array_out, const Torus *array_in, int size,
                         uint32_t log_modulus, uint32_t grouping_factor) {
  const int group_idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Threads past the last group have nothing to do.
  if (group_idx >= size)
    return;

  int monomial_count = 1 << grouping_factor;
  int in_base = group_idx * grouping_factor;
  int out_base = group_idx * monomial_count;
  const Torus *group_in = &array_in[in_base];

  // All monomials are computed, even though the first one is never used.
  for (int monomial = 0; monomial < monomial_count; ++monomial) {
    array_out[monomial + out_base] = calculates_monomial_degree<Torus, params>(
        group_in, monomial, grouping_factor);
  }
}
// Host-side launcher for the modulus_switch_multi_bit kernel. It is meant to
// be called from the noise tests only, which is why a single polynomial size
// (degree == 2048) is supported.
//
// - stream, gpu_index: stream the kernel is enqueued on and the device that
//   is made current before the launch.
// - array_out, array_in: buffers handed to the kernel unchanged.
// - size: number of input elements; the kernel is given
//   size / grouping_factor groups to process.
// - log_modulus: forwarded to the kernel.
// - degree: polynomial size used to select the kernel instantiation.
// - grouping_factor: must be non-zero.
//
// Panics on a zero grouping_factor or on an unsupported degree.
template <typename Torus>
__host__ void host_modulus_switch_multi_bit(
    cudaStream_t stream, uint32_t gpu_index, Torus *array_out, Torus *array_in,
    int size, uint32_t log_modulus, uint32_t degree, uint32_t grouping_factor) {
  // size / grouping_factor below would otherwise divide by zero.
  if (grouping_factor == 0) {
    PANIC("Cuda error: grouping_factor must be greater than zero.")
  }

  check_cuda_error(cudaSetDevice(gpu_index));

  int multibit_size = size / grouping_factor;
  int num_threads = 0, num_blocks = 0;
  getNumBlocksAndThreads(multibit_size, 1024, num_blocks, num_threads);

  switch (degree) {
  case 2048:
    modulus_switch_multi_bit<Torus, Degree<2048>>
        <<<num_blocks, num_threads, 0, stream>>>(
            array_out, array_in, multibit_size, log_modulus, grouping_factor);
    break;
  default:
    PANIC("Cuda error: unsupported polynomial size. The only supported N is "
          "2048.")
  }
  // Surface launch-configuration errors from the kernel launch above.
  check_cuda_error(cudaGetLastError());
}
#endif // CNCRT_TORUS_H

View File

@@ -326,10 +326,6 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
uint32_t gpu_index) {
if (size == 0)
return;
GPU_ASSERT(src != nullptr, "Cuda error: null device ptr");
GPU_ASSERT(dest != nullptr, "Cuda error: null device ptr");
cudaPointerAttributes attr_dest;
check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
PANIC_IF_FALSE(

View File

@@ -373,8 +373,7 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
// Get the number of streaming multiprocessors
int number_of_sm = 0;
check_cuda_error(
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}

View File

@@ -420,39 +420,6 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
}
}
// Noise-tests variant of host_cg_multi_bit_programmable_bootstrap: the loop
// structure is the same, but each chunk's keybundle is computed through
// execute_compute_keybundle_noise_tests (NOISE_TESTS keybundle mode) before
// the cooperative-groups external product loop runs on that chunk.
template <typename Torus, class params>
__host__ void host_cg_multi_bit_programmable_bootstrap_noise_tests(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, uint64_t const *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_many_lut, uint32_t lut_stride) {

  const auto chunk_stride = buffer->lwe_chunk_size;
  // Number of groups of `grouping_factor` mask elements to walk through.
  const uint32_t total_groups = lwe_dimension / grouping_factor;

  uint32_t lwe_offset = 0;
  while (lwe_offset < total_groups) {
    // Keybundle for this chunk, in NOISE_TESTS mode rather than GENERIC.
    execute_compute_keybundle_noise_tests<Torus, params>(
        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
        grouping_factor, level_count, lwe_offset);

    // External products over the chunk that was just prepared.
    execute_cg_external_product_loop<Torus, params>(
        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
        grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
        lut_stride);

    lwe_offset += chunk_stride;
  }
}
// Verify if the grid size satisfies the cooperative group constraints
template <typename Torus, class params>
__host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
@@ -517,8 +484,7 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
// Get the number of streaming multiprocessors
int number_of_sm = 0;
check_cuda_error(
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}

View File

@@ -784,9 +784,9 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc_128(
device_programmable_bootstrap_tbc_128<InputTorus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
full_sm)); // full_sm + minimum_sm_tbc));
check_cuda_error(cudaFuncSetCacheConfig(
cudaFuncSetCacheConfig(
device_programmable_bootstrap_tbc_128<InputTorus, params, FULLSM>,
cudaFuncCachePreferShared));
cudaFuncCachePreferShared);
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc_128<InputTorus, params, FULLSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
@@ -1271,8 +1271,7 @@ __host__ bool verify_cuda_programmable_bootstrap_128_cg_grid_size(
// Get the number of streaming multiprocessors
int number_of_sm = 0;
check_cuda_error(
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}

View File

@@ -645,103 +645,6 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_64(void *stream,
*buffer = nullptr;
}
// Noise-tests-namespaced scratch wrapper: callers of the noise-tests PBS
// variant get a consistently named API, while the work is delegated unchanged
// to scratch_cuda_multi_bit_programmable_bootstrap_64_async. The matching
// cleanup wrapper delegates to the standard cleanup in the same way.
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
  // The value produced by the standard scratch call is returned as-is.
  const uint64_t scratch_result =
      scratch_cuda_multi_bit_programmable_bootstrap_64_async(
          stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
  return scratch_result;
}
// Noise-tests-named cleanup for the 64-bit multi-bit PBS buffer: forwards its
// arguments unchanged to cleanup_cuda_multi_bit_programmable_bootstrap_64.
void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer) {
cleanup_cuda_multi_bit_programmable_bootstrap_64(stream, gpu_index,
pbs_buffer);
}
// Noise tests variant of the 64-bit multi-bit PBS, restricted to
// polynomial_size=2048. The main difference is that the input
// is assumed to be modulus switched before bootstrapping.
//
// The untyped pointers are cast to uint64_t buffers and the call is
// dispatched on the pbs_variant stored in the buffer behind mem_ptr. Every
// variant is instantiated with Degree<2048>.
// Panics when num_samples != 1, base_log > 64, polynomial_size != 2048, or
// the buffer's variant is not handled below.
void cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
// Preconditions of the noise-tests path: a single sample, a bounded base log
// and the only supported polynomial size.
PANIC_IF_FALSE(num_samples == 1,
"Cuda error (multi-bit PBS): num_samples (%d) should be 1",
num_samples);
PANIC_IF_FALSE(base_log <= 64,
"Cuda error (multi-bit PBS): base log (%d) should be <= 64",
base_log);
PANIC_IF_FALSE(polynomial_size == 2048,
"Cuda error (multi-bit PBS noise tests): only polynomial "
"size 2048 is supported, got %d.",
polynomial_size);
// mem_ptr is reinterpreted as the multi-bit PBS buffer; its pbs_variant field
// selects which host implementation is called.
pbs_buffer<uint64_t, MULTI_BIT> *buffer =
(pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
switch (buffer->pbs_variant) {
case PBS_VARIANT::TBC:
// The TBC implementation is only compiled when CUDA_ARCH >= 900; in other
// builds selecting this variant panics.
// NOTE(review): the #else branch has no break, so it relies on PANIC not
// returning - confirm against the macro definition.
#if CUDA_ARCH >= 900
{
host_tbc_multi_bit_programmable_bootstrap_noise_tests<uint64_t,
Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_many_lut, lut_stride);
} break;
#else
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
#endif
case PBS_VARIANT::CG:
// Cooperative-groups noise-tests implementation.
host_cg_multi_bit_programmable_bootstrap_noise_tests<uint64_t,
Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_many_lut, lut_stride);
break;
case PBS_VARIANT::DEFAULT:
// Default noise-tests implementation.
host_multi_bit_programmable_bootstrap_noise_tests<uint64_t, Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
}
}
/**
* Computes divisors of the product of num_sms (streaming multiprocessors on the
* GPU) and max_blocks_per_sm (maximum active blocks per SM to launch

View File

@@ -25,8 +25,7 @@ get_start_ith_ggsw_offset(uint32_t polynomial_size, int glwe_dimension,
level_count;
}
template <typename Torus, class params, sharedMemDegree SMD,
bool runs_noise_test = false>
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_multi_bit_programmable_bootstrap_keybundle(
const Torus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
@@ -56,6 +55,9 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
if (lwe_iteration < (lwe_dimension / grouping_factor)) {
const Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
double2 *keybundle = keybundle_array +
// select the input
input_idx * keybundle_size_per_input;
@@ -84,40 +86,10 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
// Precalculate the monomial degrees and store them in shared memory
uint32_t *monomial_degrees = (uint32_t *)selected_memory;
if (threadIdx.x < (1 << grouping_factor)) {
if constexpr (runs_noise_test == true) {
// For noise tests the input array contains the input lwe but also the
// modswitched results. This allows to avoid changing the accumulation
// kernel for the noise tests since the input body will stay in the same
// position. The layout of the input array is the following:
// | input lwe | modswitched inputs |
// | lwe size | lwe_size*grouping_factor |
// This offset allows to jump directly to the modswitched inputs,
// skipping the input lwe
const Torus modswitched_offset = lwe_dimension + 1;
const Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[input_idx] *
(lwe_dimension / grouping_factor) *
(1 << grouping_factor) +
modswitched_offset];
const Torus *lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * (1 << grouping_factor);
monomial_degrees[threadIdx.x] = lwe_array_group[threadIdx.x];
} else {
// In production we calculate the monomial degrees on the fly, since
// they are not stored in the input array.
const Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
const Torus *lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
monomial_degrees[threadIdx.x] =
calculates_monomial_degree<Torus, params>(
lwe_array_group, threadIdx.x, grouping_factor);
}
const Torus *lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
lwe_array_group, threadIdx.x, grouping_factor);
}
__syncthreads();
@@ -173,8 +145,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
// Then we can just calculate the offset needed to apply this coefficients, and
// the operation transforms into a pointwise vector multiplication, avoiding to
// perform extra instructions other than MADD
template <typename Torus, class params, sharedMemDegree SMD,
bool runs_noise_test = false>
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
const Torus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
@@ -248,40 +219,10 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
uint32_t *monomial_degrees = (uint32_t *)selected_memory;
if (threadIdx.x < (1 << grouping_factor)) {
if constexpr (runs_noise_test == true) {
// For noise tests the input array contains the input lwe but also the
// modswitched results. This allows to avoid changing the accumulation
// kernel for the noise tests since the input body will stay in the same
// position. The layout of the input array is the following:
// | input lwe | modswitched inputs |
// | lwe size | lwe_size*grouping_factor |
// This offset allows to jump directly to the modswitched inputs,
// skipping the input lwe
const Torus modswitched_offset = lwe_dimension + 1;
const Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[input_idx] *
(lwe_dimension / grouping_factor) *
(1 << grouping_factor) +
modswitched_offset];
const Torus *lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * (1 << grouping_factor);
monomial_degrees[threadIdx.x] = lwe_array_group[threadIdx.x];
} else {
// In production we calculate the monomial degrees on the fly, since
// they are not stored in the input array.
const Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
const Torus *lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
monomial_degrees[threadIdx.x] =
calculates_monomial_degree<Torus, params>(
lwe_array_group, threadIdx.x, grouping_factor);
}
const Torus *lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
lwe_array_group, threadIdx.x, grouping_factor);
}
__syncthreads();
@@ -721,7 +662,6 @@ enum class MultiBitKeybundleLaunchMode {
AUTO,
GENERIC,
SPECIALIZED_2_2,
NOISE_TESTS,
};
template <typename Torus, class params>
@@ -786,65 +726,30 @@ __host__ void execute_compute_keybundle_with_mode(
bool use_specialized =
launch_mode == MultiBitKeybundleLaunchMode::SPECIALIZED_2_2 ||
(launch_mode == MultiBitKeybundleLaunchMode::AUTO &&
can_use_specialized) ||
(launch_mode == MultiBitKeybundleLaunchMode::NOISE_TESTS &&
can_use_specialized);
bool use_noise_test_template =
launch_mode == MultiBitKeybundleLaunchMode::NOISE_TESTS;
if (use_specialized) {
dim3 thds_new_keybundle(512, 1, 1);
if (use_noise_test_template) {
// Set up the noise-test variant of the specialized 2_2 kernel
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
Torus, Degree<2048>, FULLSM, true>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
3 * full_sm_keybundle));
check_cuda_error(cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
Torus, Degree<2048>, FULLSM, true>,
cudaFuncCachePreferShared));
check_cuda_error(cudaGetLastError());
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
Torus, Degree<2048>, FULLSM, true>
<<<grid_keybundle, thds_new_keybundle, 3 * full_sm_keybundle,
stream>>>(lwe_array_in, lwe_input_indexes, keybundle_fft,
bootstrapping_key, lwe_dimension, lwe_offset,
chunk_size, keybundle_size_per_input);
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
Torus, Degree<2048>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
3 * full_sm_keybundle));
check_cuda_error(cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
Torus, Degree<2048>, FULLSM>,
cudaFuncCachePreferShared));
check_cuda_error(cudaGetLastError());
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
Torus, Degree<2048>, FULLSM><<<grid_keybundle, thds_new_keybundle,
3 * full_sm_keybundle, stream>>>(
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
lwe_dimension, lwe_offset, chunk_size, keybundle_size_per_input);
}
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
Torus, Degree<2048>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 3 * full_sm_keybundle));
check_cuda_error(cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
Torus, Degree<2048>, FULLSM>,
cudaFuncCachePreferShared));
check_cuda_error(cudaGetLastError());
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
Torus, Degree<2048>, FULLSM><<<grid_keybundle, thds_new_keybundle,
3 * full_sm_keybundle, stream>>>(
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
lwe_dimension, lwe_offset, chunk_size, keybundle_size_per_input);
} else {
if (use_noise_test_template) {
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM,
true>
<<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
lwe_array_in, lwe_input_indexes, keybundle_fft,
bootstrapping_key, lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, level_count, lwe_offset,
chunk_size, keybundle_size_per_input, d_mem, 0);
} else {
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM>
<<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
lwe_array_in, lwe_input_indexes, keybundle_fft,
bootstrapping_key, lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, level_count, lwe_offset,
chunk_size, keybundle_size_per_input, d_mem, 0);
}
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM>
<<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
level_count, lwe_offset, chunk_size, keybundle_size_per_input,
d_mem, 0);
}
}
check_cuda_error(cudaGetLastError());
@@ -891,20 +796,6 @@ __host__ void execute_compute_keybundle_2_2_specialized(
grouping_factor, level_count, lwe_offset,
MultiBitKeybundleLaunchMode::SPECIALIZED_2_2);
}
// Noise-tests-only keybundle entry point: forwards every argument to
// execute_compute_keybundle_with_mode with the launch mode fixed to
// MultiBitKeybundleLaunchMode::NOISE_TESTS.
template <typename Torus, class params>
__host__ void execute_compute_keybundle_noise_tests(
    cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
  constexpr MultiBitKeybundleLaunchMode noise_tests_mode =
      MultiBitKeybundleLaunchMode::NOISE_TESTS;

  execute_compute_keybundle_with_mode<Torus, params>(
      stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
      buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
      grouping_factor, level_count, lwe_offset, noise_tests_mode);
}
template <typename Torus, class params, bool is_first_iter>
__host__ void execute_step_one(
@@ -1064,62 +955,4 @@ __host__ void host_multi_bit_programmable_bootstrap(
}
}
}
// Noise-tests variant of the multi-bit PBS host loop. For every chunk of
// groups it computes the keybundle in NOISE_TESTS launch mode (which selects
// the runs_noise_test=true kernel variant), then runs step one and step two
// of the accumulation for each group of that chunk, as the standard path
// does. The first and last iterations overall use dedicated template
// instantiations of the step functions.
template <typename Torus, class params>
__host__ void host_multi_bit_programmable_bootstrap_noise_tests(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_many_lut, uint32_t lut_stride) {

  const auto chunk_stride = buffer->lwe_chunk_size;
  // Total number of groups of `grouping_factor` mask elements.
  const uint32_t total_groups = lwe_dimension / grouping_factor;

  for (uint32_t lwe_offset = 0; lwe_offset < total_groups;
       lwe_offset += chunk_stride) {
    // Keybundle for this chunk, built with the noise-tests kernel variant.
    execute_compute_keybundle_with_mode<Torus, params>(
        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
        grouping_factor, level_count, lwe_offset,
        MultiBitKeybundleLaunchMode::NOISE_TESTS);

    // The last chunk may hold fewer groups than a full stride.
    const uint32_t groups_left = total_groups - lwe_offset;
    const uint32_t groups_in_chunk =
        std::min(static_cast<uint32_t>(chunk_stride), groups_left);

    for (uint32_t j = 0; j < groups_in_chunk; ++j) {
      // Position of this iteration across all chunks.
      const uint32_t global_iter = j + lwe_offset;

      if (global_iter == 0) {
        execute_step_one<Torus, params, true>(
            stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
            lwe_input_indexes, buffer, num_samples, lwe_dimension,
            glwe_dimension, polynomial_size, base_log, level_count);
      } else {
        execute_step_one<Torus, params, false>(
            stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
            lwe_input_indexes, buffer, num_samples, lwe_dimension,
            glwe_dimension, polynomial_size, base_log, level_count);
      }

      if (global_iter + 1 == total_groups) {
        execute_step_two<Torus, params, true>(
            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
            num_samples, glwe_dimension, polynomial_size, level_count, j,
            num_many_lut, lut_stride);
      } else {
        execute_step_two<Torus, params, false>(
            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
            num_samples, glwe_dimension, polynomial_size, level_count, j,
            num_many_lut, lut_stride);
      }
    }
  }
}
#endif // MULTIBIT_PBS_H

View File

@@ -293,81 +293,6 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
*buffer = nullptr;
}
// Noise-tests-namespaced scratch wrapper: callers of the noise-tests PBS128
// variant get a consistently named API, while the work is delegated unchanged
// to scratch_cuda_multi_bit_programmable_bootstrap_128_async. The matching
// cleanup wrapper delegates to the standard cleanup in the same way.
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
  // The value produced by the standard scratch call is returned as-is.
  const uint64_t scratch_result =
      scratch_cuda_multi_bit_programmable_bootstrap_128_async(
          stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
  return scratch_result;
}
// Noise-tests-named cleanup for the 128-bit multi-bit PBS buffer: forwards
// its arguments to cleanup_cuda_multi_bit_programmable_bootstrap_128 and then
// synchronizes the given stream.
void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_128(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer) {
cleanup_cuda_multi_bit_programmable_bootstrap_128(stream, gpu_index,
pbs_buffer);
// Synchronize the stream once the standard cleanup has been issued.
cuda_synchronize_stream(static_cast<cudaStream_t>(stream), gpu_index);
}
// Noise tests variant of the 128-bit multi-bit PBS, restricted to
// polynomial_size=2048. The input is assumed to contain precomputed
// modswitched values in the extended input array layout.
//
// The output, LUT and bootstrapping key pointers are cast to __uint128_t
// buffers; the input and index pointers are cast to uint64_t buffers. The
// call is dispatched on the pbs_variant stored in the buffer behind mem_ptr,
// and every variant is instantiated with Degree<2048>.
// Panics when num_samples != 1, base_log > 64, polynomial_size != 2048, or
// the buffer's variant is neither CG nor DEFAULT.
void cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lwe_array_in, void const *lwe_input_indexes,
void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {
// Preconditions of the noise-tests path: a single sample, a bounded base log
// and the only supported polynomial size.
PANIC_IF_FALSE(num_samples == 1,
"Cuda error (multi-bit PBS): num_samples (%d) should be 1",
num_samples);
PANIC_IF_FALSE(base_log <= 64,
"Cuda error (multi-bit PBS): base log (%d) should be <= 64",
base_log);
PANIC_IF_FALSE(polynomial_size == 2048,
"Cuda error (multi-bit PBS128 noise tests): only polynomial "
"size 2048 is supported, got %d.",
polynomial_size);
// mem_ptr is reinterpreted as the 128-bit multi-bit PBS buffer; its
// pbs_variant field selects which host implementation is called.
auto *buffer =
reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(mem_ptr);
switch (buffer->pbs_variant) {
case PBS_VARIANT::CG:
// Cooperative-groups noise-tests implementation.
host_cg_multi_bit_programmable_bootstrap_noise_tests_128<uint64_t,
Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<__uint128_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const __uint128_t *>(lut_vector),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const __uint128_t *>(bootstrapping_key), buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_many_lut, lut_stride);
break;
case PBS_VARIANT::DEFAULT:
// Default noise-tests implementation.
host_multi_bit_programmable_bootstrap_noise_tests_128<uint64_t,
Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<__uint128_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const __uint128_t *>(lut_vector),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const __uint128_t *>(bootstrapping_key), buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
}
}
/**
* Computes divisors of the product of num_sms (streaming multiprocessors on the
* GPU) and max_blocks_per_sm (maximum active blocks per SM to launch

View File

@@ -18,8 +18,7 @@ uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle(
(size_t)2); // accumulator
}
template <typename InputTorus, class params, sharedMemDegree SMD,
bool runs_noise_test = false>
template <typename InputTorus, class params, sharedMemDegree SMD>
__global__ void device_multi_bit_programmable_bootstrap_keybundle_128(
const InputTorus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_input_indexes, double *keybundle_array,
@@ -81,35 +80,11 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_128(
// Precalculate the monomial degrees and store them in shared memory
uint32_t *monomial_degrees = (uint32_t *)selected_memory;
if (threadIdx.x < (1 << grouping_factor)) {
if constexpr (runs_noise_test == true) {
// For noise tests the input array contains the input lwe but also the
// modswitched results. This allows to avoid changing the accumulation
// kernel for the noise tests since the input body will stay in the same
// position. The layout of the input array is the following:
// | input lwe | modswitched inputs |
// | lwe size | lwe_size*grouping_factor |
// This offset allows to jump directly to the modswitched inputs,
// skipping the input lwe
const InputTorus modswitched_offset = lwe_dimension + 1;
const InputTorus *block_lwe_array_in_noise =
&lwe_array_in[lwe_input_indexes[input_idx] *
(lwe_dimension / grouping_factor) *
(1 << grouping_factor) +
modswitched_offset];
const InputTorus *lwe_array_group =
block_lwe_array_in_noise +
rev_lwe_iteration * (1 << grouping_factor);
monomial_degrees[threadIdx.x] = lwe_array_group[threadIdx.x];
} else {
auto lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
monomial_degrees[threadIdx.x] =
calculates_monomial_degree<InputTorus, params>(
lwe_array_group, threadIdx.x, grouping_factor);
}
auto lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
monomial_degrees[threadIdx.x] =
calculates_monomial_degree<InputTorus, params>(
lwe_array_group, threadIdx.x, grouping_factor);
}
__syncthreads();
@@ -613,74 +588,6 @@ __host__ void execute_compute_keybundle_128(
check_cuda_error(cudaGetLastError());
}
// Noise-tests-only keybundle launcher for the 128-bit multi-bit PBS. It
// launches device_multi_bit_programmable_bootstrap_keybundle_128 in its
// runs_noise_test=true variant, which reads modswitched inputs from the
// extended input array layout instead of computing them on the fly.
//
// The FULLSM instantiation is used when the device's maximum shared memory
// can hold the keybundle working set; otherwise the NOSM instantiation is
// launched with no dynamic shared memory and is given d_mem_keybundle plus
// the working-set size instead.
template <typename InputTorus, class params>
__host__ void execute_compute_keybundle_noise_tests_128(
    cudaStream_t stream, uint32_t gpu_index, InputTorus const *lwe_array_in,
    InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
    pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t num_samples,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
  cuda_set_device(gpu_index);

  auto max_chunk = buffer->lwe_chunk_size;
  // Groups handled by this launch: a full chunk, or whatever is left past
  // lwe_offset.
  uint64_t groups_in_launch = std::min(
      max_chunk, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);

  uint64_t bundle_stride_per_input =
      max_chunk * level_count * (glwe_dimension + 1) * (glwe_dimension + 1) *
      (polynomial_size / 2) * 4;

  uint64_t shared_bytes_needed =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle<
          __uint128_t>(polynomial_size);
  auto shared_bytes_available = cuda_get_max_shared_memory(gpu_index);

  auto scratch_mem = buffer->d_mem_keybundle;
  auto bundle_out = buffer->keybundle_fft;

  dim3 launch_grid(num_samples * groups_in_launch,
                   (glwe_dimension + 1) * (glwe_dimension + 1), level_count);
  dim3 launch_block(polynomial_size / params::opt, 1, 1);

  if (shared_bytes_available >= shared_bytes_needed) {
    // Enough shared memory: run the FULLSM variant.
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_keybundle_128<
            InputTorus, params, FULLSM, true>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, shared_bytes_needed));
    check_cuda_error(cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_keybundle_128<
            InputTorus, params, FULLSM, true>,
        cudaFuncCachePreferShared));
    device_multi_bit_programmable_bootstrap_keybundle_128<InputTorus, params,
                                                          FULLSM, true>
        <<<launch_grid, launch_block, shared_bytes_needed, stream>>>(
            lwe_array_in, lwe_input_indexes, bundle_out, bootstrapping_key,
            lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
            level_count, lwe_offset, groups_in_launch, bundle_stride_per_input,
            scratch_mem, 0);
  } else {
    // Not enough shared memory: run the NOSM variant.
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_keybundle_128<
            InputTorus, params, NOSM, true>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
    check_cuda_error(cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_keybundle_128<
            InputTorus, params, NOSM, true>,
        cudaFuncCachePreferShared));
    device_multi_bit_programmable_bootstrap_keybundle_128<InputTorus, params,
                                                          NOSM, true>
        <<<launch_grid, launch_block, 0, stream>>>(
            lwe_array_in, lwe_input_indexes, bundle_out, bootstrapping_key,
            lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
            level_count, lwe_offset, groups_in_launch, bundle_stride_per_input,
            scratch_mem, shared_bytes_needed);
  }
  // Surface launch errors from whichever variant was enqueued.
  check_cuda_error(cudaGetLastError());
}
template <typename InputTorus, class params, bool is_first_iter>
__host__ void execute_step_one_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
@@ -1212,47 +1119,46 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128(
int max_active_blocks_per_sm;
if (max_shared_memory < partial_sm_cg_accumulate) {
check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
Torus, params, NOSM>,
thds, 0));
thds, 0);
} else if (max_shared_memory < full_sm_cg_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
check_cuda_error(cudaFuncSetCacheConfig(
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
PARTIALSM>,
cudaFuncCachePreferShared));
check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
cudaFuncCachePreferShared);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
Torus, params, PARTIALSM>,
thds, partial_sm_cg_accumulate));
thds, partial_sm_cg_accumulate);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
check_cuda_error(cudaFuncSetCacheConfig(
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
FULLSM>,
cudaFuncCachePreferShared));
check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
cudaFuncCachePreferShared);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
Torus, params, FULLSM>,
thds, full_sm_cg_accumulate));
thds, full_sm_cg_accumulate);
check_cuda_error(cudaGetLastError());
}
// Get the number of streaming multiprocessors
int number_of_sm = 0;
check_cuda_error(
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}
@@ -1293,96 +1199,4 @@ supports_cooperative_groups_on_multibit_programmable_bootstrap_128(
}
}
// Noise-test flavour of host_cg_multi_bit_programmable_bootstrap_128: the
// only difference is that each chunk's keybundle is produced by the noise-test
// keybundle launcher (runs_noise_test=true) rather than the standard one.
//
// The input groups (lwe_dimension / grouping_factor of them) are walked in
// chunks of buffer->lwe_chunk_size; for every chunk the keybundle is computed
// and then the cooperative-groups external product loop is run.
template <typename InputTorus, class params>
__host__ void host_cg_multi_bit_programmable_bootstrap_noise_tests_128(
    cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
    InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
    InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
    __uint128_t const *bootstrapping_key,
    pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_many_lut, uint32_t lut_stride) {

  auto chunk_stride = buffer->lwe_chunk_size;
  uint32_t const num_groups = lwe_dimension / grouping_factor;

  uint32_t lwe_offset = 0;
  while (lwe_offset < num_groups) {
    // Keybundle for this chunk, read from precomputed modswitched values
    // (noise-test kernel variant, runs_noise_test=true).
    execute_compute_keybundle_noise_tests_128<InputTorus, params>(
        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
        grouping_factor, level_count, lwe_offset);

    // Accumulation step over the chunk that was just bundled.
    execute_cg_external_product_loop_128<InputTorus, params>(
        stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
        lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension,
        glwe_dimension, polynomial_size, grouping_factor, base_log,
        level_count, lwe_offset, num_many_lut, lut_stride);

    lwe_offset += chunk_stride;
  }
}
// Noise-test flavour of the default (non cooperative-groups) 128-bit
// multi-bit PBS host loop.
//
// The input groups (lwe_dimension / grouping_factor of them) are processed in
// chunks of buffer->lwe_chunk_size. For each chunk:
//   1. the keybundle is computed with the noise-test kernel variant
//      (runs_noise_test=true), which reads precomputed modswitched values;
//   2. every group of the chunk goes through step one then step two. The
//      is_first_iter / is_last_iter template flags are set only for the very
//      first and very last group overall.
template <typename InputTorus, class params>
__host__ void host_multi_bit_programmable_bootstrap_noise_tests_128(
    cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
    InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
    InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
    __uint128_t const *bootstrapping_key,
    pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_many_lut, uint32_t lut_stride) {

  auto chunk_stride = buffer->lwe_chunk_size;
  uint32_t const num_groups = lwe_dimension / grouping_factor;

  uint32_t lwe_offset = 0;
  while (lwe_offset < num_groups) {
    // Keybundle for this chunk (noise-test kernel variant).
    execute_compute_keybundle_noise_tests_128<InputTorus, params>(
        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
        grouping_factor, level_count, lwe_offset);

    // Accumulate, exactly as the standard path does. The last chunk may be
    // shorter than chunk_stride.
    uint64_t chunk_size =
        std::min((uint32_t)chunk_stride, num_groups - lwe_offset);

    for (uint32_t j = 0; j < chunk_size; j++) {
      uint32_t const iteration = j + lwe_offset;

      // The flags are template arguments, so each value needs its own call.
      if (iteration != 0) {
        execute_step_one_128<InputTorus, params, false>(
            stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
            buffer, num_samples, lwe_dimension, glwe_dimension,
            polynomial_size, base_log, level_count);
      } else {
        execute_step_one_128<InputTorus, params, true>(
            stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
            buffer, num_samples, lwe_dimension, glwe_dimension,
            polynomial_size, base_log, level_count);
      }

      if (iteration + 1 != num_groups) {
        execute_step_two_128<InputTorus, params, false>(
            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
            num_samples, glwe_dimension, polynomial_size, level_count, j,
            num_many_lut, lut_stride);
      } else {
        execute_step_two_128<InputTorus, params, true>(
            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
            num_samples, glwe_dimension, polynomial_size, level_count, j,
            num_many_lut, lut_stride);
      }
    }

    lwe_offset += chunk_stride;
  }
}
#endif // PROGRAMMABLE_BOOTSTRAP_MULTIBIT_128_CUH

View File

@@ -739,8 +739,7 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
// Get the number of streaming multiprocessors
int number_of_sm = 0;
check_cuda_error(
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}

View File

@@ -795,40 +795,6 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap_2_2_specialized(
MultiBitTbcLaunchMode::SPECIALIZED_2_2);
}
// Noise-test flavour of the TBC multi-bit PBS host loop. The keybundle step
// goes through execute_compute_keybundle_noise_tests (NOISE_TESTS keybundle
// mode), while the TBC accumulate loop keeps its standard behaviour and is
// launched with MultiBitTbcLaunchMode::AUTO.
//
// The input groups (lwe_dimension / grouping_factor of them) are walked in
// chunks of buffer->lwe_chunk_size.
template <typename Torus, class params>
__host__ void host_tbc_multi_bit_programmable_bootstrap_noise_tests(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_many_lut, uint32_t lut_stride) {

  cuda_set_device(gpu_index);

  auto chunk_stride = buffer->lwe_chunk_size;
  uint32_t const num_groups = lwe_dimension / grouping_factor;

  uint32_t lwe_offset = 0;
  while (lwe_offset < num_groups) {
    // Keybundle in NOISE_TESTS mode.
    execute_compute_keybundle_noise_tests<Torus, params>(
        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
        grouping_factor, level_count, lwe_offset);

    // Accumulate: same call as the standard TBC path, AUTO launch mode.
    execute_tbc_external_product_loop<Torus, params>(
        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
        grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
        lut_stride, MultiBitTbcLaunchMode::AUTO);

    lwe_offset += chunk_stride;
  }
}
template <typename Torus>
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size, uint32_t max_shared_memory) {

View File

@@ -119,73 +119,71 @@ __host__ void host_expand_without_verification(
streams.stream(0), streams.gpu_index(0), true);
if (mem_ptr->expand_kind == EXPAND_KIND::NO_CASTING) {
// This path is added to mimic the CPU fallback behaviour for the no_casting
// expand, which is needed for the noise sanity checks.
host_lwe_expand<Torus, params>(streams.stream(0), streams.gpu_index(0),
lwe_array_out, d_expand_jobs, num_lwes);
} else {
// This is our default path for the expand with casting if needed.
host_lwe_expand<Torus, params>(streams.stream(0), streams.gpu_index(0),
expanded_lwes, d_expand_jobs, num_lwes);
auto lwe_array_input = expanded_lwes;
auto ksks = casting_keys;
auto message_and_carry_extract_luts =
mem_ptr->message_and_carry_extract_luts;
auto lut = mem_ptr->message_and_carry_extract_luts;
if (casting_key_type == SMALL_TO_BIG) {
if (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK) {
PANIC("SANITY_CHECK not supported for SMALL_TO_BIG casting");
}
// Keyswitch from small to big key if needed
auto ksed_small_to_big_expanded_lwes =
mem_ptr->tmp_ksed_small_to_big_expanded_lwes;
std::vector<Torus *> lwe_trivial_indexes_vec =
lut->lwe_trivial_indexes_vec;
auto casting_params = mem_ptr->casting_params;
auto casting_output_dimension = casting_params.big_lwe_dimension;
auto casting_input_dimension = casting_params.small_lwe_dimension;
auto casting_ks_level = casting_params.ks_level;
auto casting_ks_base_log = casting_params.ks_base_log;
// apply keyswitch to BIG
execute_keyswitch_async<Torus>(
streams.get_ith(0), ksed_small_to_big_expanded_lwes,
lwe_trivial_indexes_vec[0], expanded_lwes, lwe_trivial_indexes_vec[0],
casting_keys, casting_input_dimension, casting_output_dimension,
casting_ks_base_log, casting_ks_level, num_lwes,
lut->using_trivial_lwe_indexes, lut->ks_tmp_buf_vec);
// In this case, the next keyswitch will use the compute ksk
ksks = compute_ksks;
lwe_array_input = ksed_small_to_big_expanded_lwes;
}
// Apply LUT
cuda_memset_async(lwe_array_out, 0,
safe_mul_sizeof<Torus>((size_t)(lwe_dimension + 1),
(size_t)num_lwes, (size_t)2),
streams.stream(0), streams.gpu_index(0));
CudaRadixCiphertextFFI output;
into_radix_ciphertext(&output, lwe_array_out, 2 * num_lwes, lwe_dimension);
CudaRadixCiphertextFFI input;
into_radix_ciphertext(&input, lwe_array_input, 2 * num_lwes, lwe_dimension);
// This is a special case only for our noise sanity checks
// If we are doing a SANITY_CHECK expand, we just apply the identity LUT
// This replicates the CPU fallback behaviour of the casting expand
auto final_lut = (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK
? mem_ptr->identity_lut
: message_and_carry_extract_luts);
integer_radix_apply_univariate_lookup_table<Torus>(
streams, &output, &input, bsks, ksks, final_lut, 2 * num_lwes);
release_cpu_radix_ciphertext_async(&input);
release_cpu_radix_ciphertext_async(&output);
return;
}
host_lwe_expand<Torus, params>(streams.stream(0), streams.gpu_index(0),
expanded_lwes, d_expand_jobs, num_lwes);
auto lwe_array_input = expanded_lwes;
auto ksks = casting_keys;
auto message_and_carry_extract_luts = mem_ptr->message_and_carry_extract_luts;
auto lut = mem_ptr->message_and_carry_extract_luts;
if (casting_key_type == SMALL_TO_BIG) {
if (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK) {
PANIC("SANITY_CHECK not supported for SMALL_TO_BIG casting");
}
// Keyswitch from small to big key if needed
auto ksed_small_to_big_expanded_lwes =
mem_ptr->tmp_ksed_small_to_big_expanded_lwes;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
auto casting_params = mem_ptr->casting_params;
auto casting_output_dimension = casting_params.big_lwe_dimension;
auto casting_input_dimension = casting_params.small_lwe_dimension;
auto casting_ks_level = casting_params.ks_level;
auto casting_ks_base_log = casting_params.ks_base_log;
// apply keyswitch to BIG
execute_keyswitch_async<Torus>(
streams.get_ith(0), ksed_small_to_big_expanded_lwes,
lwe_trivial_indexes_vec[0], expanded_lwes, lwe_trivial_indexes_vec[0],
casting_keys, casting_input_dimension, casting_output_dimension,
casting_ks_base_log, casting_ks_level, num_lwes,
lut->using_trivial_lwe_indexes, lut->ks_tmp_buf_vec);
// In this case, the next keyswitch will use the compute ksk
ksks = compute_ksks;
lwe_array_input = ksed_small_to_big_expanded_lwes;
}
// Apply LUT
cuda_memset_async(lwe_array_out, 0,
safe_mul_sizeof<Torus>((size_t)(lwe_dimension + 1),
(size_t)num_lwes, (size_t)2),
streams.stream(0), streams.gpu_index(0));
CudaRadixCiphertextFFI output;
into_radix_ciphertext(&output, lwe_array_out, 2 * num_lwes, lwe_dimension);
CudaRadixCiphertextFFI input;
into_radix_ciphertext(&input, lwe_array_input, 2 * num_lwes, lwe_dimension);
// This is a special case only for our noise sanity checks
// If we are doing a SANITY_CHECK expand, we just apply the identity LUT
// This replicates the CPU fallback behaviour of the casting expand
if (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK) {
integer_radix_apply_univariate_lookup_table<Torus>(
streams, &output, &input, bsks, ksks, mem_ptr->identity_lut,
2 * num_lwes);
return;
}
integer_radix_apply_univariate_lookup_table<Torus>(
streams, &output, &input, bsks, ksks, message_and_carry_extract_luts,
2 * num_lwes);
release_cpu_radix_ciphertext_async(&input);
release_cpu_radix_ciphertext_async(&output);
compact_lwe_lists.release();
}

View File

@@ -79,30 +79,6 @@ unsafe extern "C" {
polynomial_size: u32,
);
}
unsafe extern "C" {
// Foreign (C ABI) declaration; no Rust body exists for this symbol.
// Arguments: an opaque stream pointer, the GPU index, mutable pointers to the
// output and input LWE arrays, then `size`, `log_modulus`, `degree` and
// `grouping_factor` as u32. Returns nothing.
// NOTE(review): presumably enqueues the 64-bit multi-bit modulus switch on
// `stream` without waiting for completion (per the `_async` suffix) - confirm
// against the C header. Pointer validity is the caller's responsibility.
pub fn cuda_modulus_switch_multi_bit_64_async(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
lwe_array_in: *mut ffi::c_void,
size: u32,
log_modulus: u32,
degree: u32,
grouping_factor: u32,
);
}
unsafe extern "C" {
// Foreign (C ABI) declaration with the same parameter list as
// `cuda_modulus_switch_multi_bit_64_async`: opaque stream pointer, GPU index,
// mutable output/input array pointers, and four u32 values (`size`,
// `log_modulus`, `degree`, `grouping_factor`). Returns nothing.
// NOTE(review): presumably the 128-bit counterpart of the 64-bit entry point
// - confirm element width and buffer sizes against the C header.
pub fn cuda_modulus_switch_multi_bit_128_async(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
lwe_array_in: *mut ffi::c_void,
size: u32,
log_modulus: u32,
degree: u32,
grouping_factor: u32,
);
}
pub const PBS_TYPE_MULTI_BIT: PBS_TYPE = 0;
pub const PBS_TYPE_CLASSICAL: PBS_TYPE = 1;
pub type PBS_TYPE = ffi::c_uint;
@@ -136,6 +112,9 @@ pub type Direction = ffi::c_uint;
pub const BitValue_Zero: BitValue = 0;
pub const BitValue_One: BitValue = 1;
pub type BitValue = ffi::c_uint;
pub const RERAND_MODE_RERAND_WITH_KS: RERAND_MODE = 0;
pub const RERAND_MODE_RERAND_WITHOUT_KS: RERAND_MODE = 1;
pub type RERAND_MODE = ffi::c_uint;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaStreamsFFI {
@@ -2476,9 +2455,6 @@ unsafe extern "C" {
glwe_index: u32,
);
}
pub const RERAND_MODE_RERAND_WITH_KS: RERAND_MODE = 0;
pub const RERAND_MODE_RERAND_WITHOUT_KS: RERAND_MODE = 1;
pub type RERAND_MODE = ffi::c_uint;
unsafe extern "C" {
pub fn scratch_cuda_rerand_64_async(
streams: CudaStreamsFFI,
@@ -2491,7 +2467,7 @@ unsafe extern "C" {
message_modulus: u32,
carry_modulus: u32,
allocate_gpu_memory: bool,
rerand_type: RERAND_MODE,
rerand_type: u32,
) -> u64;
}
unsafe extern "C" {
@@ -3391,48 +3367,6 @@ unsafe extern "C" {
pbs_buffer: *mut *mut i8,
);
}
unsafe extern "C" {
// Foreign (C ABI) declaration. Receives an opaque stream pointer, the GPU
// index, an out-pointer (`*mut *mut i8`) through which the PBS buffer handle
// is exchanged, the GLWE dimension, polynomial size, level count, the number
// of input LWE ciphertexts and an `allocate_gpu_memory` flag. Returns a u64.
// NOTE(review): the u64 is presumably the scratch size in bytes, and
// `allocate_gpu_memory == false` presumably performs a size-only dry run -
// confirm against the C header.
pub fn scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
stream: *mut ffi::c_void,
gpu_index: u32,
pbs_buffer: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
level_count: u32,
input_lwe_ciphertext_count: u32,
allocate_gpu_memory: bool,
) -> u64;
}
unsafe extern "C" {
// Foreign (C ABI) declaration. Takes an opaque stream pointer, the GPU index
// and the `*mut *mut i8` PBS buffer handle; returns nothing.
// NOTE(review): presumably releases the buffer created by the matching
// `scratch_..._noise_tests_64_async` call - confirm against the C header.
pub fn cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_64(
stream: *mut ffi::c_void,
gpu_index: u32,
pbs_buffer: *mut *mut i8,
);
}
unsafe extern "C" {
// Foreign (C ABI) declaration. Pointer arguments: a mutable output LWE array,
// then read-only output indexes, LUT vector, LUT vector indexes, input LWE
// array, input indexes and bootstrapping key, plus the `*mut i8` PBS buffer.
// Scalar arguments (all u32): LWE/GLWE dimensions, polynomial size, grouping
// factor, decomposition base log and level count, sample count, and the
// many-LUT count and stride. Returns nothing.
// NOTE(review): presumably `buffer` must come from the matching
// `scratch_..._noise_tests_64_async` call and the work is only enqueued on
// `stream` - confirm against the C header.
pub fn cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
lwe_output_indexes: *const ffi::c_void,
lut_vector: *const ffi::c_void,
lut_vector_indexes: *const ffi::c_void,
lwe_array_in: *const ffi::c_void,
lwe_input_indexes: *const ffi::c_void,
bootstrapping_key: *const ffi::c_void,
buffer: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
grouping_factor: u32,
base_log: u32,
level_count: u32,
num_samples: u32,
num_many_lut: u32,
lut_stride: u32,
);
}
unsafe extern "C" {
pub fn scratch_cuda_multi_bit_programmable_bootstrap_128_async(
stream: *mut ffi::c_void,
@@ -3474,44 +3408,3 @@ unsafe extern "C" {
buffer: *mut *mut i8,
);
}
unsafe extern "C" {
// Foreign (C ABI) declaration with the same parameter list as the `_64`
// scratch entry point: opaque stream pointer, GPU index, `*mut *mut i8`
// buffer handle, GLWE dimension, polynomial size, level count, input LWE
// ciphertext count and an `allocate_gpu_memory` flag. Returns a u64.
// NOTE(review): the u64 is presumably the scratch size in bytes - confirm
// against the C header.
pub fn scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
stream: *mut ffi::c_void,
gpu_index: u32,
pbs_buffer: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
level_count: u32,
input_lwe_ciphertext_count: u32,
allocate_gpu_memory: bool,
) -> u64;
}
unsafe extern "C" {
// Foreign (C ABI) declaration. Takes an opaque stream pointer, the GPU index
// and the `*mut *mut i8` PBS buffer handle; returns nothing.
// NOTE(review): presumably releases the buffer created by the matching
// `scratch_..._noise_tests_128_async` call - confirm against the C header.
pub fn cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_128(
stream: *mut ffi::c_void,
gpu_index: u32,
pbs_buffer: *mut *mut i8,
);
}
unsafe extern "C" {
// Foreign (C ABI) declaration. Unlike the `_64` variant, this signature has
// no `lut_vector_indexes` parameter. Pointer arguments: a mutable output LWE
// array, then read-only output indexes, LUT vector, input LWE array, input
// indexes and bootstrapping key, plus the `*mut i8` PBS buffer. Scalar
// arguments (all u32): LWE/GLWE dimensions, polynomial size, grouping factor,
// base log, level count, sample count, many-LUT count and LUT stride.
// Returns nothing.
// NOTE(review): presumably `buffer` must come from the matching
// `scratch_..._noise_tests_128_async` call - confirm against the C header.
pub fn cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
lwe_output_indexes: *const ffi::c_void,
lut_vector: *const ffi::c_void,
lwe_array_in: *const ffi::c_void,
lwe_input_indexes: *const ffi::c_void,
bootstrapping_key: *const ffi::c_void,
buffer: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
grouping_factor: u32,
base_log: u32,
level_count: u32,
num_samples: u32,
num_many_lut: u32,
lut_stride: u32,
);
}

View File

@@ -1,6 +1,6 @@
[package]
name = "tfhe-hpu-backend"
version = "0.5.0"
version = "0.4.0"
edition = "2021"
license = "BSD-3-Clause-Clear"
description = "HPU implementation on FPGA of TFHE-rs primitives."
@@ -36,7 +36,7 @@ thiserror = "1.0.61"
bytemuck = { workspace = true }
anyhow = "1.0.82"
lazy_static = "1.4.0"
rand = "0.10.1"
rand = "0.8.5"
regex = "1.10.4"
bitflags = { version = "2.5.0", features = ["serde"] }
itertools = "0.11.0"

View File

@@ -1,6 +1,6 @@
BSD 3-Clause Clear License
Copyright © 2026 ZAMA.
Copyright © 2025 ZAMA.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,

View File

@@ -297,8 +297,8 @@ source setup_hpu.sh --config v80 -p
# Run hlapi benches
make test_high_level_api_hpu
# Run hlapi erc7984 benches
make bench_hlapi_erc7984_hpu
# Run hlapi erc20 benches
make bench_hlapi_erc20_hpu
# Run integer level benches
make bench_integer_hpu

View File

@@ -109,7 +109,7 @@
flush_behaviour = "Patient"
flush = true
[firmware.op_cfg.by_op.ERC_7984]
[firmware.op_cfg.by_op.ERC_20]
fill_batch_fifo = true
min_batch_size = false
use_tiers = true

View File

@@ -121,7 +121,7 @@
flush_behaviour = "Patient"
flush = true
[firmware.op_cfg.by_op.ERC_7984]
[firmware.op_cfg.by_op.ERC_20]
fill_batch_fifo = true
min_batch_size = false
use_tiers = true

View File

@@ -230,7 +230,7 @@ iop!(
[IOP_CMP -> "CMP_NEQ", opcode::CMP_NEQ],
[IOP_CT_F_CT_BOOL -> "IF_THEN_ZERO", opcode::IF_THEN_ZERO],
[IOP_CT_F_2CT_BOOL -> "IF_THEN_ELSE", opcode::IF_THEN_ELSE],
[IOP_2CT_F_3CT -> "ERC_7984", opcode::ERC_7984],
[IOP_2CT_F_3CT -> "ERC_20", opcode::ERC_20],
[IOP_CT_F_CT -> "MEMCPY", opcode::MEMCPY],
[IOP_CT_F_CT -> "ILOG2", opcode::ILOG2],
[IOP_CT_F_CT -> "COUNT0", opcode::COUNT0],
@@ -240,5 +240,5 @@ iop!(
[IOP_CT_F_CT -> "TRAIL0", opcode::TRAIL0],
[IOP_CT_F_CT -> "TRAIL1", opcode::TRAIL1],
[IOP_NCT_F_2NCT -> "ADD_SIMD", opcode::ADD_SIMD],
[IOP_2NCT_F_3NCT -> "ERC_7984_SIMD", opcode::ERC_7984_SIMD],
[IOP_2NCT_F_3NCT -> "ERC_20_SIMD", opcode::ERC_20_SIMD],
);

View File

@@ -74,9 +74,9 @@ pub const IF_THEN_ZERO: u8 = 0xCA;
pub const IF_THEN_ELSE: u8 = 0xCB;
// Custom algorithm
// ERC7984 -> Fund xfer algorithm
// ERC20 -> Fund xfer algorithm
// 2Ct <- func(3Ct)
pub const ERC_7984: u8 = 0x80;
pub const ERC_20: u8 = 0x80;
// Count bits
pub const COUNT0: u8 = 0x81;
@@ -89,7 +89,7 @@ pub const TRAIL1: u8 = 0x87;
// SIMD for maximum throughput
pub const ADD_SIMD: u8 = 0xF0;
pub const ERC_7984_SIMD: u8 = 0xF1;
pub const ERC_20_SIMD: u8 = 0xF1;
//
// Utility operations
// Used to handle real clone of ciphertext already uploaded in the Hpu memory

View File

@@ -24,7 +24,7 @@ use mem_alloc::{MemAlloc, MemChunk};
mod qdma;
use qdma::QdmaDriver;
use rand::RngExt;
use rand::Rng;
const DMA_XFER_ALIGN: usize = 4096_usize;
@@ -148,8 +148,8 @@ impl HpuHw {
tracing::debug!("Load stage1 through JTAG");
let pdi_stg1_tmp = format!(
"hpu_stg1_{}.pdi",
rand::rng()
.sample_iter(rand::distr::Alphanumeric)
rand::thread_rng()
.sample_iter(rand::distributions::Alphanumeric)
.take(5)
.map(char::from)
.collect::<String>()

View File

@@ -31,7 +31,7 @@ crate::impl_fw!("Demo" [
IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;
ERC_7984 => fw_impl::ilp::iop_erc_7984;
ERC_20 => fw_impl::ilp::iop_erc_20;
CMP_GT => cmp_gt;
CMP_GTE => cmp_gte;

View File

@@ -61,7 +61,7 @@ crate::impl_fw!("Ilp" [
IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;
ERC_7984 => fw_impl::ilp::iop_erc_7984;
ERC_20 => fw_impl::ilp::iop_erc_20;
MEMCPY => fw_impl::ilp::iop_memcpy;
@@ -74,7 +74,7 @@ crate::impl_fw!("Ilp" [
TRAIL1 => fw_impl::ilp_log::iop_trail1;
// SIMD Implementations
ADD_SIMD => fw_impl::llt::iop_add_simd;
ERC_7984_SIMD => fw_impl::llt::iop_erc_7984_simd;
ERC_20_SIMD => fw_impl::llt::iop_erc_20_simd;
]);
#[instrument(level = "trace", skip(prog))]
@@ -1296,13 +1296,13 @@ pub fn iop_if_then_else(prog: &mut Program) {
});
}
/// Implement erc_7984 fund xfer
/// Implement erc_20 fund xfer
/// Targeted algorithm is as follows:
/// 1. Check that from has enough funds
/// 2. Compute real_amount to xfer (i.e. amount or 0)
/// 3. Compute new amount (from - new_amount, to + new_amount)
#[instrument(level = "info", skip(prog))]
pub fn iop_erc_7984(prog: &mut Program) {
pub fn iop_erc_20(prog: &mut Program) {
// Allocate metavariables:
// Dest -> Operand
let mut dst_from = prog.iop_template_var(OperandKind::Dst, 0);
@@ -1314,7 +1314,7 @@ pub fn iop_erc_7984(prog: &mut Program) {
let src_amount = prog.iop_template_var(OperandKind::Src, 2);
// Add Comment header
prog.push_comment("ERC_7984 (new_from, new_to) <- (from, to, amount)".to_string());
prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
let props = prog.params();
let tfhe_params: asm::DigitParameters = props.clone().into();

View File

@@ -70,7 +70,7 @@ crate::impl_fw!("Llt" [
IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;
ERC_7984 => fw_impl::llt::iop_erc_7984;
ERC_20 => fw_impl::llt::iop_erc_20;
MEMCPY => fw_impl::ilp::iop_memcpy;
COUNT0 => fw_impl::ilp_log::iop_count0;
@@ -83,7 +83,7 @@ crate::impl_fw!("Llt" [
// SIMD Implementations
ADD_SIMD => fw_impl::llt::iop_add_simd;
ERC_7984_SIMD => fw_impl::llt::iop_erc_7984_simd;
ERC_20_SIMD => fw_impl::llt::iop_erc_20_simd;
]);
// ----------------------------------------------------------------------------
@@ -225,24 +225,24 @@ pub fn iop_muls(prog: &mut Program) {
}
#[instrument(level = "trace", skip(prog))]
pub fn iop_erc_7984(prog: &mut Program) {
pub fn iop_erc_20(prog: &mut Program) {
// Add Comment header
prog.push_comment("ERC_7984 (new_from, new_to) <- (from, to, amount)".to_string());
prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
// TODO: Make sweep of kogge_blk_w
// All these little parameters would be very handy to write an
// exploration/compilation program which would try to minimize latency by
// playing with these.
iop_erc_7984_rtl(prog, 0, Some(10)).add_to_prog(prog);
iop_erc_20_rtl(prog, 0, Some(10)).add_to_prog(prog);
}
#[instrument(level = "trace", skip(prog))]
pub fn iop_erc_7984_simd(prog: &mut Program) {
pub fn iop_erc_20_simd(prog: &mut Program) {
// Add Comment header
prog.push_comment("ERC_7984_SIMD (new_from, new_to) <- (from, to, amount)".to_string());
prog.push_comment("ERC_20_SIMD (new_from, new_to) <- (from, to, amount)".to_string());
simd(
prog,
crate::asm::iop::SIMD_N,
fw_impl::llt::iop_erc_7984_rtl,
fw_impl::llt::iop_erc_20_rtl,
None,
);
}
@@ -379,7 +379,7 @@ pub fn iop_rotate_scalar_left(prog: &mut Program) {
// Helper Functions
// ----------------------------------------------------------------------------
/// Implement erc_7984 fund xfer
/// Implement erc_20 fund xfer
/// Targeted algorithm is as follows:
/// 1. Check that from has enough funds
/// 2. Compute real_amount to xfer (i.e. amount or 0)
@@ -391,7 +391,7 @@ pub fn iop_rotate_scalar_left(prog: &mut Program) {
/// (dst_from[0], dst_to[0], ..., dst_from[N-1], dst_to[N-1])
/// Where N is the batch size
#[instrument(level = "trace", skip(prog))]
pub fn iop_erc_7984_rtl(prog: &mut Program, batch_index: u8, kogge_blk_w: Option<usize>) -> Rtl {
pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8, kogge_blk_w: Option<usize>) -> Rtl {
// Allocate metavariables:
// Dest -> Operand
let dst_from = prog.iop_template_var(OperandKind::Dst, 2 * batch_index);

View File

@@ -24,7 +24,7 @@ bindgen.workspace = true
[dependencies]
ark-ec.workspace = true
ark-ff.workspace = true
tfhe-cuda-backend = { version = "0.14.0", path = "../tfhe-cuda-backend" }
tfhe-cuda-backend = { version = "=0.14.0", path = "../tfhe-cuda-backend" }
[features]
default = []

View File

@@ -97,23 +97,28 @@ size_t pippenger_scratch_size_g2(uint32_t n, uint32_t gpu_index);
// d_scalars: Device pointer to input BigInt scalars (array of n scalars)
// n: Number of points/scalars
// d_scratch: Caller-provided device scratch buffer for intermediate results
// size_tracker: Reference for tracking GPU memory allocation sizes
void point_msm_g1_async(cudaStream_t stream, uint32_t gpu_index,
G1Projective *h_result, const G1Affine *d_points,
const Scalar *d_scalars, uint32_t n,
G1Projective *d_scratch);
G1Projective *d_scratch, uint64_t &size_tracker,
bool gpu_memory_allocated);
void point_msm_g1(cudaStream_t stream, uint32_t gpu_index,
G1Projective *h_result, const G1Affine *d_points,
const Scalar *d_scalars, uint32_t n, G1Projective *d_scratch);
const Scalar *d_scalars, uint32_t n, G1Projective *d_scratch,
uint64_t &size_tracker, bool gpu_memory_allocated);
// MSM for G2 points with BigInt scalars (projective result)
// Result is written directly to a host pointer.
void point_msm_g2_async(cudaStream_t stream, uint32_t gpu_index,
G2ProjectivePoint *h_result, const G2Point *d_points,
const Scalar *d_scalars, uint32_t n,
G2ProjectivePoint *d_scratch);
G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
bool gpu_memory_allocated);
void point_msm_g2(cudaStream_t stream, uint32_t gpu_index,
G2ProjectivePoint *h_result, const G2Point *d_points,
const Scalar *d_scalars, uint32_t n,
G2ProjectivePoint *d_scratch);
G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
bool gpu_memory_allocated);

View File

@@ -8,16 +8,17 @@
// Multi-Scalar Multiplication (MSM) using Pippenger algorithm for BLS12-446
// Forward declarations for Pippenger implementations
void point_msm_g1_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
G1Projective *h_result,
const G1Affine *d_points,
const Scalar *d_scalars, uint32_t n,
G1Projective *d_scratch);
void point_msm_g1_pippenger_async(
cudaStream_t stream, uint32_t gpu_index, G1Projective *h_result,
const G1Affine *d_points, const Scalar *d_scalars, uint32_t n,
G1Projective *d_scratch, uint64_t &size_tracker, bool gpu_memory_allocated);
void point_msm_g2_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
G2ProjectivePoint *h_result,
const G2Point *d_points,
const Scalar *d_scalars, uint32_t n,
G2ProjectivePoint *d_scratch);
G2ProjectivePoint *d_scratch,
uint64_t &size_tracker,
bool gpu_memory_allocated);
// ============================================================================
// Public MSM API for BigInt scalars
@@ -28,9 +29,11 @@ void point_msm_g2_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
void point_msm_g1_async(cudaStream_t stream, uint32_t gpu_index,
G1Projective *h_result, const G1Affine *d_points,
const Scalar *d_scalars, uint32_t n,
G1Projective *d_scratch) {
G1Projective *d_scratch, uint64_t &size_tracker,
bool gpu_memory_allocated) {
point_msm_g1_pippenger_async(stream, gpu_index, h_result, d_points, d_scalars,
n, d_scratch);
n, d_scratch, size_tracker,
gpu_memory_allocated);
}
// MSM with BigInt scalars for G2 (projective coordinates internally)
@@ -38,17 +41,19 @@ void point_msm_g1_async(cudaStream_t stream, uint32_t gpu_index,
void point_msm_g2_async(cudaStream_t stream, uint32_t gpu_index,
G2ProjectivePoint *h_result, const G2Point *d_points,
const Scalar *d_scalars, uint32_t n,
G2ProjectivePoint *d_scratch) {
G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
bool gpu_memory_allocated) {
point_msm_g2_pippenger_async(stream, gpu_index, h_result, d_points, d_scalars,
n, d_scratch);
n, d_scratch, size_tracker,
gpu_memory_allocated);
}
void point_msm_g1(cudaStream_t stream, uint32_t gpu_index,
G1Projective *h_result, const G1Affine *d_points,
const Scalar *d_scalars, uint32_t n,
G1Projective *d_scratch) {
const Scalar *d_scalars, uint32_t n, G1Projective *d_scratch,
uint64_t &size_tracker, bool gpu_memory_allocated) {
point_msm_g1_async(stream, gpu_index, h_result, d_points, d_scalars, n,
d_scratch);
d_scratch, size_tracker, gpu_memory_allocated);
// The async impl already syncs internally before the CPU-side Horner phase,
// so the stream is idle here. This sync is kept for defensive correctness.
cuda_synchronize_stream(stream, gpu_index);
@@ -57,9 +62,10 @@ void point_msm_g1(cudaStream_t stream, uint32_t gpu_index,
// Blocking G2 MSM: runs point_msm_g2_async on `stream` and then waits for
// that stream with cuda_synchronize_stream before returning.
// NOTE(review): rendered diff without +/- markers; the shorter signature and
// call (no size_tracker) appear to be the pre-change side -- confirm.
void point_msm_g2(cudaStream_t stream, uint32_t gpu_index,
G2ProjectivePoint *h_result, const G2Point *d_points,
const Scalar *d_scalars, uint32_t n,
G2ProjectivePoint *d_scratch) {
G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
bool gpu_memory_allocated) {
// All arguments are forwarded unchanged to the async variant.
point_msm_g2_async(stream, gpu_index, h_result, d_points, d_scalars, n,
d_scratch);
d_scratch, size_tracker, gpu_memory_allocated);
// See comment in point_msm_g1 above.
cuda_synchronize_stream(stream, gpu_index);
}

View File

@@ -493,13 +493,12 @@ void horner_combine_cpu(ProjectiveType &result,
// window sums. The caller is responsible for allocating and freeing this
// buffer.
template <typename AffineType, typename ProjectiveType>
void point_msm_pippenger_impl_async(cudaStream_t stream, uint32_t gpu_index,
ProjectiveType *h_result,
const AffineType *d_points,
const Scalar *d_scalars, uint32_t n,
uint32_t threads_per_block,
uint32_t window_size, uint32_t bucket_count,
ProjectiveType *d_scratch) {
void point_msm_pippenger_impl_async(
cudaStream_t stream, uint32_t gpu_index, ProjectiveType *h_result,
const AffineType *d_points, const Scalar *d_scalars, uint32_t n,
uint32_t threads_per_block, uint32_t window_size, uint32_t bucket_count,
ProjectiveType *d_scratch, uint64_t &size_tracker,
bool gpu_memory_allocated) {
using ProjectivePoint = Projective<ProjectiveType>;
if (n == 0) {
@@ -706,13 +705,16 @@ void point_msm_g1_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
G1Projective *h_result,
const G1Affine *d_points,
const Scalar *d_scalars, uint32_t n,
G1Projective *d_scratch) {
G1Projective *d_scratch,
uint64_t &size_tracker,
bool gpu_memory_allocated) {
uint32_t window_size, bucket_count;
get_g1_window_params(n, window_size, bucket_count);
point_msm_pippenger_impl_async<G1Affine, G1Projective>(
stream, gpu_index, h_result, d_points, d_scalars, n,
msm_threads_per_block<G1Affine>(n), window_size, bucket_count, d_scratch);
msm_threads_per_block<G1Affine>(n), window_size, bucket_count, d_scratch,
size_tracker, gpu_memory_allocated);
}
// MSM with BigInt scalars for G2 (projective coordinates internally)
@@ -722,11 +724,14 @@ void point_msm_g2_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
G2ProjectivePoint *h_result,
const G2Point *d_points,
const Scalar *d_scalars, uint32_t n,
G2ProjectivePoint *d_scratch) {
G2ProjectivePoint *d_scratch,
uint64_t &size_tracker,
bool gpu_memory_allocated) {
uint32_t window_size, bucket_count;
get_g2_window_params(n, window_size, bucket_count);
point_msm_pippenger_impl_async<G2Point, G2ProjectivePoint>(
stream, gpu_index, h_result, d_points, d_scalars, n,
msm_threads_per_block<G2Point>(n), window_size, bucket_count, d_scratch);
msm_threads_per_block<G2Point>(n), window_size, bucket_count, d_scratch,
size_tracker, gpu_memory_allocated);
}

View File

@@ -187,82 +187,37 @@ __host__ __device__ void fp_copy(Fp &dst, const Fp &src) {
// "Raw" means without modular reduction - performs a + b and returns carry.
// This is an internal helper used by fp_add() which handles reduction.
// Adds a and b limb by limb starting at limb[0], writes the sum to c and
// returns the carry out of the last limb. No modular reduction happens here.
// NOTE(review): rendered diff without +/- markers. Going by the hunk header
// (82 old lines -> 37 new), the #if/PTX branch appears to be the removed side
// and only the portable loop remains afterwards -- confirm.
__host__ __device__ UNSIGNED_LIMB fp_add_raw(Fp &c, const Fp &a, const Fp &b) {
#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
// PTX carry-chain: add.cc sets the hardware carry flag, addc.cc propagates
// it. This replaces 2 software carry-detect comparisons per limb (~14 extra
// instructions across 7 limbs) with zero-cost hardware flag propagation.
// Operand map: %0-%6 = c.limb[0..6], %7 = carry_out,
// %8-%14 = a.limb[0..6], %15-%21 = b.limb[0..6].
// The chain is written out for exactly 7 limbs, whereas the portable path
// loops over FP_LIMBS -- assumes FP_LIMBS == 7 in this configuration, TODO
// confirm.
uint64_t carry_out;
asm("add.cc.u64 %0, %8, %15;\n\t" // c[0] = a[0] + b[0], set CF
"addc.cc.u64 %1, %9, %16;\n\t" // c[1] = a[1] + b[1] + CF
"addc.cc.u64 %2, %10, %17;\n\t" // c[2] = a[2] + b[2] + CF
"addc.cc.u64 %3, %11, %18;\n\t" // c[3] = a[3] + b[3] + CF
"addc.cc.u64 %4, %12, %19;\n\t" // c[4] = a[4] + b[4] + CF
"addc.cc.u64 %5, %13, %20;\n\t" // c[5] = a[5] + b[5] + CF
"addc.cc.u64 %6, %14, %21;\n\t" // c[6] = a[6] + b[6] + CF
"addc.u64 %7, 0, 0;\n\t" // carry_out = 0 + 0 + CF
: "=l"(c.limb[0]), "=l"(c.limb[1]), "=l"(c.limb[2]), "=l"(c.limb[3]),
"=l"(c.limb[4]), "=l"(c.limb[5]), "=l"(c.limb[6]), "=l"(carry_out)
: "l"(a.limb[0]), "l"(a.limb[1]), "l"(a.limb[2]), "l"(a.limb[3]),
"l"(a.limb[4]), "l"(a.limb[5]), "l"(a.limb[6]), "l"(b.limb[0]),
"l"(b.limb[1]), "l"(b.limb[2]), "l"(b.limb[3]), "l"(b.limb[4]),
"l"(b.limb[5]), "l"(b.limb[6]));
return carry_out;
#else
// Host path: portable software carry detection
UNSIGNED_LIMB carry = 0;
for (int i = 0; i < FP_LIMBS; i++) {
// Add with carry: c = a + b + carry
// The incoming carry is folded into a.limb[i] first, then b.limb[i] is
// added. An unsigned wrap shows up as "result smaller than an operand".
// The two wrap checks cannot both fire for one limb (if a + carry wraps
// the partial sum is 0, so adding b cannot wrap), so carry stays 0 or 1.
UNSIGNED_LIMB sum = a.limb[i] + carry;
// The next two statements are the old/new diff forms of the same line.
carry = (sum < a.limb[i]) ? 1 : 0;
carry = (sum < a.limb[i]) ? 1 : 0; // Check for overflow
sum += b.limb[i];
// Same here: old/new diff forms of a single statement.
carry += (sum < b.limb[i]) ? 1 : 0;
carry += (sum < b.limb[i]) ? 1 : 0; // Check for overflow
c.limb[i] = sum;
}
return carry;
#endif
}
// Subtraction with borrow propagation
// "Raw" means without modular reduction - performs a - b and returns borrow.
// This is an internal helper used by fp_sub() which handles reduction.
// Computes a - b limb by limb starting at limb[0], writes the difference to c
// and returns the borrow out of the last limb (0 or 1 on both paths). No
// modular correction happens here.
// NOTE(review): rendered diff without +/- markers. Going by the hunk header
// (82 old lines -> 37 new), the #if/PTX branch appears to be the removed side
// -- confirm.
__host__ __device__ UNSIGNED_LIMB fp_sub_raw(Fp &c, const Fp &a, const Fp &b) {
#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
// PTX borrow-chain: sub.cc sets the hardware borrow flag, subc.cc propagates
// it. Same benefit as fp_add_raw -- eliminates 2 comparisons per limb.
// Operand map: %0-%6 = c.limb[0..6], %7 = borrow_out,
// %8-%14 = a.limb[0..6], %15-%21 = b.limb[0..6].
// The chain is written out for exactly 7 limbs -- assumes FP_LIMBS == 7 in
// this configuration, TODO confirm.
uint64_t borrow_out;
asm("sub.cc.u64 %0, %8, %15;\n\t" // c[0] = a[0] - b[0], set CF
"subc.cc.u64 %1, %9, %16;\n\t" // c[1] = a[1] - b[1] - CF
"subc.cc.u64 %2, %10, %17;\n\t" // c[2] = a[2] - b[2] - CF
"subc.cc.u64 %3, %11, %18;\n\t" // c[3] = a[3] - b[3] - CF
"subc.cc.u64 %4, %12, %19;\n\t" // c[4] = a[4] - b[4] - CF
"subc.cc.u64 %5, %13, %20;\n\t" // c[5] = a[5] - b[5] - CF
"subc.cc.u64 %6, %14, %21;\n\t" // c[6] = a[6] - b[6] - CF
"subc.u64 %7, 0, 0;\n\t" // borrow_out = 0 - 0 - CF
: "=l"(c.limb[0]), "=l"(c.limb[1]), "=l"(c.limb[2]), "=l"(c.limb[3]),
"=l"(c.limb[4]), "=l"(c.limb[5]), "=l"(c.limb[6]), "=l"(borrow_out)
: "l"(a.limb[0]), "l"(a.limb[1]), "l"(a.limb[2]), "l"(a.limb[3]),
"l"(a.limb[4]), "l"(a.limb[5]), "l"(a.limb[6]), "l"(b.limb[0]),
"l"(b.limb[1]), "l"(b.limb[2]), "l"(b.limb[3]), "l"(b.limb[4]),
"l"(b.limb[5]), "l"(b.limb[6]));
// subc.u64 with 0-0-CF produces 0 if no borrow, or 0xFFFFFFFFFFFFFFFF if
// borrow. Normalize to 0/1 for callers that check (borrow != 0) or add it.
return borrow_out & 1;
#else
// Host path: portable software borrow detection
UNSIGNED_LIMB borrow = 0;
for (int i = 0; i < FP_LIMBS; i++) {
// Subtract with borrow: c = a - b - borrow
// The incoming borrow is taken from a.limb[i] first, then b.limb[i] is
// subtracted. An unsigned wrap shows up as "result larger than what we
// subtracted from". The two checks cannot both fire for one limb (if
// a - borrow wraps the partial result is all-ones, so subtracting b cannot
// wrap), so borrow stays 0 or 1.
UNSIGNED_LIMB diff = a.limb[i] - borrow;
// The next two statements are the old/new diff forms of the same line.
borrow = (diff > a.limb[i]) ? 1 : 0;
borrow = (diff > a.limb[i]) ? 1 : 0; // Check for underflow
// Snapshot taken so the second wrap check compares against the value
// before b.limb[i] was subtracted.
UNSIGNED_LIMB old_diff = diff;
diff -= b.limb[i];
// Same here: old/new diff forms of a single statement.
borrow += (diff > old_diff) ? 1 : 0;
borrow += (diff > old_diff) ? 1 : 0; // Check for underflow
c.limb[i] = diff;
}
return borrow;
#endif
}
// Addition with modular reduction: c = (a + b) mod p
@@ -271,27 +226,7 @@ __host__ __device__ void fp_add(Fp &c, const Fp &a, const Fp &b) {
Fp sum;
UNSIGNED_LIMB carry = fp_add_raw(sum, a, b);
#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
// Branchless reduction: always compute sum - p, then select based on
// whether reduction was needed. This avoids divergent branches that stall
// warps when some threads need reduction and others don't.
//
// Decision logic:
// carry=1 -> sum overflowed 448 bits, definitely >= p -> use reduced
// carry=0, borrow=0 -> sum >= p in 448 bits -> use reduced
// carry=0, borrow=1 -> sum < p -> use original sum
// So: use_original = (!carry) & borrow
Fp reduced;
UNSIGNED_LIMB borrow = fp_sub_raw(reduced, sum, fp_modulus());
UNSIGNED_LIMB use_original = ((carry ^ 1) & borrow);
UNSIGNED_LIMB mask =
-use_original; // all-ones if keep sum, all-zeros if keep reduced
for (int i = 0; i < FP_LIMBS; i++) {
c.limb[i] = (sum.limb[i] & mask) | (reduced.limb[i] & ~mask);
}
#else
// Host path: branching is fine on CPU (branch predictor handles it well)
// If there's a carry or sum >= MODULUS, we need to reduce
const Fp &p = fp_modulus();
if (carry || fp_cmp(sum, p) != ComparisonType::Less) {
Fp reduced;
@@ -300,7 +235,6 @@ __host__ __device__ void fp_add(Fp &c, const Fp &a, const Fp &b) {
} else {
fp_copy(c, sum);
}
#endif
}
// Subtraction with modular reduction: c = (a - b) mod p
@@ -309,28 +243,13 @@ __host__ __device__ void fp_sub(Fp &c, const Fp &a, const Fp &b) {
Fp diff;
UNSIGNED_LIMB borrow = fp_sub_raw(diff, a, b);
#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
// Branchless correction: always compute diff + p, select based on borrow.
// Same rationale as fp_add -- avoids warp divergence.
// borrow=1 -> a < b, need to add p -> use corrected
// borrow=0 -> a >= b, result is valid -> use diff
Fp corrected;
fp_add_raw(corrected, diff, fp_modulus());
UNSIGNED_LIMB mask =
-borrow; // all-ones if borrow (use corrected), all-zeros if not
for (int i = 0; i < FP_LIMBS; i++) {
c.limb[i] = (corrected.limb[i] & mask) | (diff.limb[i] & ~mask);
}
#else
// Host path: branching is fine on CPU
// If there was a borrow, we need to add MODULUS
const Fp &p = fp_modulus();
if (borrow) {
fp_add_raw(c, diff, p);
} else {
fp_copy(c, diff);
}
#endif
}
// Small-constant multiplication via addition chains.
@@ -534,223 +453,23 @@ __host__ __device__ void fp_mont_reduce(Fp &c, const UNSIGNED_LIMB *a) {
}
}
// ============================================================================
// PTX-accelerated CIOS Montgomery multiplication (device path)
// ============================================================================
// The CIOS algorithm for 7 x 64-bit limbs executes 98 multiply-accumulate
// steps across 7 outer iterations. Each step computes:
// (carry, t[j]) = t[j] + a[j] * b_i + carry
// which is a 64x64->128 multiply plus a three-operand addition with carry.
//
// The C++ path uses software carry detection: carry = (sum < old) ? 1 : 0.
// The PTX path below uses hardware carry flags via the .cc suffix:
// - mul.lo.u64 / mul.hi.u64 : 64x64->128 wide multiply
// - add.cc.u64 / addc.u64 : addition chain with hardware carry flag
//
// Each multiply-accumulate step uses 6 PTX instructions instead of ~10+ in
// the software-carry version. The 7 outer iterations are fully unrolled, and
// the limb-shift loop (t[j] = t[j+1]) is eliminated by register renaming.
//
// REGISTER ALIASING NOTE: All PTX temporaries (_lo, _hi) are declared as
// .reg inside the asm block. This prevents nvcc's register allocator from
// aliasing them with C operands (t_j, carry), which was the root cause of
// previous correctness bugs where "+l" outputs could share registers with
// "l" inputs in the same asm statement.
// ============================================================================
#ifdef __CUDA_ARCH__
#if LIMB_BITS_CONFIG == 64
// Multiply-accumulate one limb: (carry_out, t_j) = t_j + a_j * b_i + carry_in
//
// All intermediates (_lo, _hi) are PTX .reg temporaries inside a { } scope
// block to avoid: (1) nvcc register aliasing between C operands, and (2)
// duplicate .reg definitions when the macro is expanded multiple times.
// The 6-instruction sequence:
// mul.lo.u64 _lo, a_j, b_i -- low 64 bits of product
// mul.hi.u64 _hi, a_j, b_i -- high 64 bits of product
// add.cc.u64 t_j, t_j, _lo -- t_j += _lo, set CF
// addc.u64 _hi, _hi, 0 -- _hi += CF
// add.cc.u64 t_j, t_j, carry -- t_j += carry_in, set CF
// addc.u64 carry, _hi, 0 -- carry_out = _hi + CF
// Operand map for the asm statement below:
//   %0 = t_j   (read-write, "+l")   %1 = carry (read-write, "+l")
//   %2 = a_j   (input, "l")         %3 = b_i   (input, "l")
// t_j and carry must therefore be modifiable 64-bit lvalues; on exit t_j holds
// the low 64 bits of t_j + a_j * b_i + carry and carry holds the high part.
// The statement is `asm volatile`, and the carry flag is only relied upon
// between instructions inside this single asm block.
// NOTE(review): per the hunk header this macro appears to be on the removed
// side of the diff -- confirm.
#define LIMB_MACC(t_j, carry, a_j, b_i) \
asm volatile("{\n\t" \
".reg .u64 _lo, _hi;\n\t" \
"mul.lo.u64 _lo, %2, %3;\n\t" \
"mul.hi.u64 _hi, %2, %3;\n\t" \
"add.cc.u64 %0, %0, _lo;\n\t" \
"addc.u64 _hi, _hi, 0;\n\t" \
"add.cc.u64 %0, %0, %1;\n\t" \
"addc.u64 %1, _hi, 0;\n\t" \
"}\n\t" \
: "+l"(t_j), "+l"(carry) \
: "l"(a_j), "l"(b_i))
// Single CIOS iteration: multiply-accumulate, reduce, and shift.
//
// Computes:
// 1. t += a * b_i (7 limb multiply-accumulate with carry chain)
// 2. m = t[0] * p_prime (Montgomery reduction factor)
// 3. t += m * p (reduction, zeros out t[0])
// 4. Shift t right by one limb (via register renaming into r0..r7)
//
// The macro lets the compiler allocate registers across all 7 unrolled
// iterations, avoiding spills to local memory.
// Parameter groups:
//   t0..t7   accumulator limbs, read and updated in place (t7 is the extra
//            limb that receives the carries of both multiply-accumulate passes)
//   a0..a6   limbs of the multiplicand a
//   b_i      the limb of b handled by this iteration
//   p0..p6   limbs of the modulus
//   p_prime  constant multiplied with t0 to obtain the reduction factor _m
//   r0..r7   destinations that receive the accumulator shifted down one limb
//            (r0 = t1, ..., r6 = t7, r7 = 0)
// The call sites in fp_mont_mul_cios_ptx pass the same t0..t7 variables for
// both the t and r groups; the shift assignments run in ascending order, so
// each source is read before it is overwritten and the in-place use is safe.
// _carry, _overflow and _m are locals of the do { } while (0) scope.
// NOTE(review): per the hunk header this macro appears to be on the removed
// side of the diff -- confirm.
#define CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, \
a5, a6, b_i, p0, p1, p2, p3, p4, p5, p6, p_prime, \
r0, r1, r2, r3, r4, r5, r6, r7) \
do { \
uint64_t _carry = 0; \
/* Step 1: t += a * b_i */ \
LIMB_MACC(t0, _carry, a0, b_i); \
LIMB_MACC(t1, _carry, a1, b_i); \
LIMB_MACC(t2, _carry, a2, b_i); \
LIMB_MACC(t3, _carry, a3, b_i); \
LIMB_MACC(t4, _carry, a4, b_i); \
LIMB_MACC(t5, _carry, a5, b_i); \
LIMB_MACC(t6, _carry, a6, b_i); \
/* Accumulate final carry into overflow limb t7 */ \
uint64_t _overflow; \
asm("add.cc.u64 %0, %0, %2;\n\t" \
"addc.u64 %1, 0, 0;\n\t" \
: "+l"(t7), "=l"(_overflow) \
: "l"(_carry)); \
\
/* Step 2: m = t0 * p_prime mod 2^64 */ \
uint64_t _m = t0 * p_prime; \
\
/* Step 3: t += m * p (zeros out t0) */ \
_carry = 0; \
LIMB_MACC(t0, _carry, _m, p0); \
LIMB_MACC(t1, _carry, _m, p1); \
LIMB_MACC(t2, _carry, _m, p2); \
LIMB_MACC(t3, _carry, _m, p3); \
LIMB_MACC(t4, _carry, _m, p4); \
LIMB_MACC(t5, _carry, _m, p5); \
LIMB_MACC(t6, _carry, _m, p6); \
/* Finalize overflow: t7 = t7 + _carry + _overflow */ \
/* Plain adds (no carry chain) -- the CIOS invariant guarantees this */ \
/* sum fits in 64 bits so intermediate overflow does not matter. */ \
t7 += _carry; \
t7 += _overflow; \
\
/* Step 4: Shift right by one limb via register renaming */ \
/* t0 is now zero (by construction of m), discard it */ \
r0 = t1; \
r1 = t2; \
r2 = t3; \
r3 = t4; \
r4 = t5; \
r5 = t6; \
r6 = t7; \
r7 = 0; \
} while (0)
// Device-only Montgomery multiplication: runs seven CIOS_ITERATION_PTX steps
// (one per limb of b) over a zero-initialised accumulator, then performs one
// branchless conditional subtraction of the modulus and stores the 7-limb
// result in c.
// Declared __noinline__ -- presumably to keep the fully unrolled body from
// being duplicated at every call site; confirm the intent with the author.
// NOTE(review): per the hunk header this function appears to be on the removed
// side of the diff -- confirm.
__device__ __noinline__ void fp_mont_mul_cios_ptx(Fp &c, const Fp &a,
const Fp &b) {
// Modulus limbs and the Montgomery constant are read once from
// DEVICE_MODULUS / DEVICE_P_PRIME into locals.
const uint64_t p0 = DEVICE_MODULUS.limb[0];
const uint64_t p1 = DEVICE_MODULUS.limb[1];
const uint64_t p2 = DEVICE_MODULUS.limb[2];
const uint64_t p3 = DEVICE_MODULUS.limb[3];
const uint64_t p4 = DEVICE_MODULUS.limb[4];
const uint64_t p5 = DEVICE_MODULUS.limb[5];
const uint64_t p6 = DEVICE_MODULUS.limb[6];
const uint64_t pp = DEVICE_P_PRIME;
// All limbs of a are copied into locals before c is written at the end.
const uint64_t a0 = a.limb[0], a1 = a.limb[1], a2 = a.limb[2];
const uint64_t a3 = a.limb[3], a4 = a.limb[4], a5 = a.limb[5];
const uint64_t a6 = a.limb[6];
// Accumulator: 7 limbs + 1 overflow, initialized to zero
uint64_t t0 = 0, t1 = 0, t2 = 0, t3 = 0;
uint64_t t4 = 0, t5 = 0, t6 = 0, t7 = 0;
// 7 fully-unrolled CIOS iterations with register renaming for the shift.
// Each iteration processes one limb of b, accumulates a*b[i], reduces,
// and shifts. The output registers become the input for the next iteration.
// The same t0..t7 are passed as both the input and the output group, so the
// shift is done in place; b.limb[i] is read directly from b at each step.
CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
b.limb[0], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
t4, t5, t6, t7);
CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
b.limb[1], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
t4, t5, t6, t7);
CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
b.limb[2], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
t4, t5, t6, t7);
CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
b.limb[3], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
t4, t5, t6, t7);
CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
b.limb[4], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
t4, t5, t6, t7);
CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
b.limb[5], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
t4, t5, t6, t7);
CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
b.limb[6], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
t4, t5, t6, t7);
// Final reduction: if t[0..7] >= p (extended to 8 limbs), subtract p.
// Compute (t[0..6] - p[0..6]) with borrow, then subtract borrow from t7.
// If t7 after subtraction is non-negative, the reduced result is valid;
// otherwise the original t[0..6] is already in [0, p).
// Operand map: %0-%6 = r0..r6, %7 = mask, %8-%14 = t0..t6,
// %15-%21 = p0..p6, %22 = t7.
uint64_t r0, r1, r2, r3, r4, r5, r6, mask;
asm("sub.cc.u64 %0, %8, %15;\n\t" // r0 = t0 - p0
"subc.cc.u64 %1, %9, %16;\n\t" // r1 = t1 - p1 - borrow
"subc.cc.u64 %2, %10, %17;\n\t" // r2 = t2 - p2 - borrow
"subc.cc.u64 %3, %11, %18;\n\t" // r3 = t3 - p3 - borrow
"subc.cc.u64 %4, %12, %19;\n\t" // r4 = t4 - p4 - borrow
"subc.cc.u64 %5, %13, %20;\n\t" // r5 = t5 - p5 - borrow
"subc.cc.u64 %6, %14, %21;\n\t" // r6 = t6 - p6 - borrow
"subc.u64 %7, %22, 0;\n\t" // mask_src = t7 - 0 - borrow
"shr.s64 %7, %7, 63;\n\t" // mask = sign-extend: -1 if negative, 0 if
// >= 0
: "=l"(r0), "=l"(r1), "=l"(r2), "=l"(r3), "=l"(r4), "=l"(r5), "=l"(r6),
"=l"(mask)
: "l"(t0), "l"(t1), "l"(t2), "l"(t3), "l"(t4), "l"(t5), "l"(t6), "l"(p0),
"l"(p1), "l"(p2), "l"(p3), "l"(p4), "l"(p5), "l"(p6), "l"(t7));
// Branchless selection:
// mask = 0 -> t >= p (use reduced r[0..6])
// mask = -1 -> t < p (keep original t[0..6])
// c is written only here, after every read of a and b has completed.
c.limb[0] = (t0 & mask) | (r0 & ~mask);
c.limb[1] = (t1 & mask) | (r1 & ~mask);
c.limb[2] = (t2 & mask) | (r2 & ~mask);
c.limb[3] = (t3 & mask) | (r3 & ~mask);
c.limb[4] = (t4 & mask) | (r4 & ~mask);
c.limb[5] = (t5 & mask) | (r5 & ~mask);
c.limb[6] = (t6 & mask) | (r6 & ~mask);
}
#undef LIMB_MACC
#undef CIOS_ITERATION_PTX
#endif // LIMB_BITS_CONFIG == 64
#endif // __CUDA_ARCH__
// CIOS (Coarsely Integrated Operand Scanning) Montgomery multiplication
// Fuses multiplication and reduction in a single pass for better efficiency.
// Uses only FP_LIMBS+1 limbs of working space instead of 2*FP_LIMBS.
// Both a and b are in Montgomery form, result is in Montgomery form.
__host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
// Device path: fully unrolled PTX with hardware carry flags
fp_mont_mul_cios_ptx(c, a, b);
#else
// Host path: portable C++ implementation
const Fp &p = fp_modulus();
UNSIGNED_LIMB p_prime = fp_p_prime();
// Working array: only n+1 limbs needed (vs 2n for separate mul+reduce)
UNSIGNED_LIMB t[FP_LIMBS + 1];
#ifdef __CUDA_ARCH__
for (int i = 0; i < FP_LIMBS + 1; i++) {
t[i] = 0;
}
#else
memset(t, 0, (FP_LIMBS + 1) * sizeof(UNSIGNED_LIMB));
#endif
// Main CIOS loop: for each limb of b
for (int i = 0; i < FP_LIMBS; i++) {
@@ -810,7 +529,14 @@ __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
}
// Copy result to output
#ifdef __CUDA_ARCH__
#pragma unroll
for (int i = 0; i < FP_LIMBS; i++) {
c.limb[i] = t[i];
}
#else
memcpy(&c.limb[0], t, FP_LIMBS * sizeof(UNSIGNED_LIMB));
#endif
// Final reduction: if result >= p or there's overflow, subtract p
if (t[FP_LIMBS] != 0 || fp_cmp(c, p) != ComparisonType::Less) {
@@ -819,7 +545,6 @@ __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
fp_copy(c, reduced);
}
// Result is in Montgomery form
#endif
}
// Montgomery multiplication: c = (a * b * R_INV) mod p

View File

@@ -23,8 +23,7 @@ set(ZK_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../src)
set(ZK_PRIMITIVES_DIR ${ZK_SRC_DIR}/primitives)
# Build device library from tfhe-cuda-backend
add_library(tfhe_device_bench STATIC ${TFHE_CUDA_BACKEND_DIR}/src/device.cu
${TFHE_CUDA_BACKEND_DIR}/src/utils/helper_profile.cu)
add_library(tfhe_device_bench STATIC ${TFHE_CUDA_BACKEND_DIR}/src/device.cu)
set_target_properties(
tfhe_device_bench
PROPERTIES CUDA_SEPARABLE_COMPILATION ON

View File

@@ -140,14 +140,14 @@ static void BM_G1_MSM(benchmark::State &state) {
// Warm-up iterations
for (int i = 0; i < WARMUP_ITERATIONS; i++) {
point_msm_g1_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
d_scalars, n, d_scratch);
d_scalars, n, d_scratch, size_tracker, true);
}
cuda_synchronize_stream(g_benchmark_stream, g_gpu_index);
// Benchmark loop: only measure the MSM computation, no memory operations
for (auto _ : state) {
point_msm_g1_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
d_scalars, n, d_scratch);
d_scalars, n, d_scratch, size_tracker, true);
benchmark::ClobberMemory();
}
@@ -221,14 +221,14 @@ static void BM_G2_MSM(benchmark::State &state) {
// Warm-up iterations
for (int i = 0; i < WARMUP_ITERATIONS; i++) {
point_msm_g2_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
d_scalars, n, d_scratch);
d_scalars, n, d_scratch, size_tracker, true);
}
cuda_synchronize_stream(g_benchmark_stream, g_gpu_index);
// Benchmark loop: only measure the MSM computation, no memory operations
for (auto _ : state) {
point_msm_g2_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
d_scalars, n, d_scratch);
d_scalars, n, d_scratch, size_tracker, true);
benchmark::ClobberMemory();
}

View File

@@ -20,8 +20,7 @@ set(ZK_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../src)
set(ZK_PRIMITIVES_DIR ${ZK_SRC_DIR}/primitives)
# Build device library from tfhe-cuda-backend
add_library(tfhe_device STATIC ${TFHE_CUDA_BACKEND_DIR}/src/device.cu
${TFHE_CUDA_BACKEND_DIR}/src/utils/helper_profile.cu)
add_library(tfhe_device STATIC ${TFHE_CUDA_BACKEND_DIR}/src/device.cu)
set_target_properties(
tfhe_device
PROPERTIES CUDA_SEPARABLE_COMPILATION ON

View File

@@ -13,8 +13,8 @@
// ./build/tests_and_benchmarks/tests/basic/basic_curve_ops
#include "curve.h"
#include "device.h"
#include "fp.h"
#include <cassert>
#include <cstdio>
#include <cstring>
@@ -24,7 +24,7 @@ int main() {
// (non-Montgomery) form. Convert to Montgomery, then lift to projective for
// host-side arithmetic.
const G1Affine &gen_normal = g1_generator();
PANIC_IF_FALSE(!g1_is_infinity(gen_normal), "generator must not be infinity");
assert(!g1_is_infinity(gen_normal));
G1Affine gen_affine = gen_normal;
point_to_montgomery_inplace(gen_affine);
@@ -37,21 +37,21 @@ int main() {
// G + (-G) = identity (Z = 0 in the projective convention)
G1Projective identity = G + neg_G;
PANIC_IF_FALSE(fp_is_zero(identity.Z), "G + (-G) must be identity (Z = 0)");
assert(fp_is_zero(identity.Z));
printf("Negation (-G) and G + (-G) = identity: OK\n");
// ---- Addition: 2*G = G + G, 3*G = 2*G + G ----
G1Projective two_G = G + G;
PANIC_IF_FALSE(!(two_G == G1Projective{}), "2*G must not be identity");
assert(!(two_G == G1Projective())); // not the identity
G1Projective three_G = two_G + G;
PANIC_IF_FALSE(!(three_G == G1Projective{}), "3*G must not be identity");
assert(!(three_G == G1Projective()));
printf("Addition (2*G, 3*G): OK\n");
// ---- Compound assignment: G += G ----
G1Projective acc = G;
acc += G; // acc = 2*G
PANIC_IF_FALSE(acc == two_G, "G += G must equal 2*G");
assert(acc == two_G);
printf("Compound assignment (+=): OK\n");
// ---- Scalar multiplication: 3*G using Scalar type ----
@@ -61,22 +61,19 @@ int main() {
scalar_3.limb[0] = 3;
G1Projective three_G_via_scalar = G * scalar_3;
PANIC_IF_FALSE(!(three_G_via_scalar == G1Projective{}),
"3*G via scalar must not be identity");
assert(!(three_G_via_scalar == G1Projective()));
// Normalise both to Z = 1 (Montgomery) before comparing coordinates.
normalize_projective_g1(three_G);
normalize_projective_g1(three_G_via_scalar);
PANIC_IF_FALSE(three_G == three_G_via_scalar,
"3*G via addition must equal 3*G via scalar multiply");
assert(three_G == three_G_via_scalar);
printf("Scalar multiplication (3*G == G + G + G): OK\n");
// ---- Projective -> affine conversion ----
// projective_to_affine_g1 keeps coordinates in Montgomery form.
G1Affine three_G_affine;
projective_to_affine_g1(three_G_affine, three_G);
PANIC_IF_FALSE(!g1_is_infinity(three_G_affine),
"3*G in affine must not be infinity");
assert(!g1_is_infinity(three_G_affine));
printf("Projective -> affine conversion: OK\n");
// ---- Convert to normal-form coordinates ----
@@ -85,8 +82,7 @@ int main() {
G1Projective result = three_G_via_scalar;
normalize_from_montgomery_g1(
result); // coordinates now in normal (non-Montgomery) form
PANIC_IF_FALSE(!fp_is_zero(result.Z),
"normalized result must have non-zero Z");
assert(!fp_is_zero(result.Z)); // Z = 1 (non-zero)
printf("Conversion to normal-form projective: OK\n");
printf("All G1 curve operations passed.\n");

View File

@@ -11,8 +11,8 @@
// cmake --build build --target basic_fp_ops
// ./build/tests_and_benchmarks/tests/basic/basic_fp_ops
#include "device.h"
#include "fp.h"
#include <cassert>
#include <cstdio>
int main() {
@@ -25,16 +25,16 @@ int main() {
fp_one(b); // b = 1
c = a + b; // c = 2
PANIC_IF_FALSE(c.limb[0] == 2, "1 + 1 must equal 2");
assert(c.limb[0] == 2);
c = c - a; // c = 1
PANIC_IF_FALSE(fp_is_one(c), "2 - 1 must equal 1");
assert(fp_is_one(c));
// Compound assignment
c += a; // c = 2
PANIC_IF_FALSE(c.limb[0] == 2, "1 += 1 must equal 2");
assert(c.limb[0] == 2);
c -= b; // c = 1
PANIC_IF_FALSE(fp_is_one(c), "2 -= 1 must equal 1");
assert(fp_is_one(c));
printf("Addition/subtraction: OK\n");
@@ -43,7 +43,7 @@ int main() {
// form, but for add/sub/neg small normal-form values also work correctly.
Fp neg_a = -a; // neg_a = -1 mod p
Fp sum = a + neg_a;
PANIC_IF_FALSE(fp_is_zero(sum), "1 + (-1) must equal 0");
assert(fp_is_zero(sum)); // 1 + (-1) = 0
printf("Negation: OK\n");
// ---- Multiplication (Montgomery form required) ----
@@ -56,17 +56,17 @@ int main() {
result_m = one_m * two_m; // result_m = 2 in Montgomery form
fp_from_montgomery(result, result_m);
PANIC_IF_FALSE(result.limb[0] == 2, "1 * 2 must equal 2");
assert(result.limb[0] == 2);
result_m = two_m * two_m; // result_m = 4 in Montgomery form
fp_from_montgomery(result, result_m);
PANIC_IF_FALSE(result.limb[0] == 4, "2 * 2 must equal 4");
assert(result.limb[0] == 4);
// Compound multiplication
result_m = two_m;
result_m *= two_m; // result_m = 4
fp_from_montgomery(result, result_m);
PANIC_IF_FALSE(result.limb[0] == 4, "2 *= 2 must equal 4");
assert(result.limb[0] == 4);
// Convert an arbitrary normal-form value to Montgomery before multiplying
Fp five_normal, five_m, twenty_five_m, twenty_five;
@@ -76,7 +76,7 @@ int main() {
fp_mont_mul(twenty_five_m, five_m, five_m); // 5 * 5 = 25
fp_from_montgomery(twenty_five, twenty_five_m);
PANIC_IF_FALSE(twenty_five.limb[0] == 25, "5 * 5 must equal 25");
assert(twenty_five.limb[0] == 25);
printf("Multiplication: OK\n");
@@ -88,7 +88,7 @@ int main() {
Fp one_check;
fp_div(one_check, five_normal, five_normal); // 5 / 5 = 1
PANIC_IF_FALSE(fp_is_one(one_check), "5 / 5 must equal 1");
assert(fp_is_one(one_check));
// Verify: 5 * 5^{-1} == 1 (using fp_div as a cross-check)
Fp product;
@@ -98,7 +98,7 @@ int main() {
fp_zero(two_normal);
two_normal.limb[0] = 2;
fp_div(product, two_normal, two_normal); // 2 / 2 = 1
PANIC_IF_FALSE(fp_is_one(product), "2 / 2 must equal 1");
assert(fp_is_one(product));
printf("Inversion/division: OK\n");

View File

@@ -20,6 +20,7 @@
#include "device.h"
#include "fp.h"
#include "msm.h"
#include <cassert>
#include <cstdio>
#include <cstring>
#include <vector>
@@ -32,6 +33,7 @@ int main() {
const uint32_t gpu_index = 0;
const uint32_t n = 4; // number of points / scalars
uint64_t size_tracker = 0;
// ---- Prepare host-side points in Montgomery form ----
// Use n doublings of the G1 generator: G, 2*G, 4*G, 8*G.
@@ -74,7 +76,8 @@ int main() {
// ---- Run MSM (synchronous wrapper; result written directly to host) ----
G1Projective h_result;
point_msm_g1(stream, gpu_index, &h_result, d_points, d_scalars, n, d_scratch);
point_msm_g1(stream, gpu_index, &h_result, d_points, d_scalars, n, d_scratch,
size_tracker, true);
// ---- Verify against naive sequential computation on the host ----
// Expected = sum over i of (scalar[i] * point[i]).
@@ -92,8 +95,7 @@ int main() {
// Normalise to Z = 1 (Montgomery) before comparing projective coordinates.
normalize_projective_g1(h_result);
normalize_projective_g1(expected);
PANIC_IF_FALSE(h_result == expected,
"MSM result must match naive sequential computation");
assert(h_result == expected);
printf("MSM result matches naive sequential computation.\n");
// ---- Cleanup ----

View File

@@ -3,7 +3,6 @@
#include "fp.h"
#include "fp_helpers.h" // Include test-only batch operations and kernels
#include <chrono>
#include <cinttypes>
#include <cstdint>
#include <cstring>
#include <cuda_runtime.h>
@@ -298,7 +297,7 @@ protected:
// Test basic addition (on GPU)
TEST_F(FpArithmeticTest, Addition) {
uint64_t size_tracker = 0;
Fp a, b, c, c_cpu;
// Test: 1 + 1 = 2
@@ -321,7 +320,7 @@ TEST_F(FpArithmeticTest, Addition) {
// Test subtraction (on GPU)
TEST_F(FpArithmeticTest, Subtraction) {
uint64_t size_tracker = 0;
Fp a, b, c, a_cpu;
// Test: 2 - 1 = 1
@@ -342,7 +341,7 @@ TEST_F(FpArithmeticTest, Subtraction) {
// Test multiplication (on GPU)
TEST_F(FpArithmeticTest, Multiplication) {
uint64_t size_tracker = 0;
Fp five, three, result, expected;
fp_zero(five);
@@ -371,7 +370,7 @@ TEST_F(FpArithmeticTest, Multiplication) {
// Test negation (on GPU)
TEST_F(FpArithmeticTest, Negation) {
uint64_t size_tracker = 0;
Fp a, neg_a, result;
fp_zero(a);
@@ -396,7 +395,7 @@ TEST_F(FpArithmeticTest, Negation) {
// Test Montgomery conversion round-trip (on GPU)
TEST_F(FpArithmeticTest, MontgomeryRoundTrip) {
uint64_t size_tracker = 0;
Fp value, mont_form, back, mont_form_cpu, back_cpu;
fp_zero(value);
@@ -422,7 +421,7 @@ TEST_F(FpArithmeticTest, MontgomeryRoundTrip) {
// Test Montgomery multiplication (on GPU)
TEST_F(FpArithmeticTest, MontgomeryMultiplication) {
uint64_t size_tracker = 0;
Fp five, three, five_m, three_m, result_m, result, expected, result_cpu;
fp_zero(five);
@@ -461,7 +460,7 @@ TEST_F(FpArithmeticTest, MontgomeryMultiplication) {
// Test comparison operations (on GPU)
TEST_F(FpArithmeticTest, Comparison) {
uint64_t size_tracker = 0;
Fp five, three;
fp_zero(five);
@@ -482,7 +481,7 @@ TEST_F(FpArithmeticTest, Comparison) {
// Test zero and one (on GPU)
TEST_F(FpArithmeticTest, ZeroAndOne) {
uint64_t size_tracker = 0;
Fp zero, one;
fp_zero(zero);
@@ -500,7 +499,7 @@ TEST_F(FpArithmeticTest, ZeroAndOne) {
// Test copy (on GPU)
TEST_F(FpArithmeticTest, Copy) {
uint64_t size_tracker = 0;
Fp a, b, b_cpu;
fp_zero(a);
@@ -523,7 +522,7 @@ TEST_F(FpArithmeticTest, Copy) {
// Test conditional move (on GPU)
TEST_F(FpArithmeticTest, ConditionalMove) {
uint64_t size_tracker = 0;
Fp a, b, result, result_cpu;
fp_zero(a);
@@ -564,7 +563,7 @@ TEST_F(FpArithmeticTest, ConditionalMove) {
// Test multiplication by zero (on GPU)
TEST_F(FpArithmeticTest, MultiplicationByZero) {
uint64_t size_tracker = 0;
Fp a, zero, result, result_cpu;
fp_zero(zero);
@@ -592,7 +591,7 @@ TEST_F(FpArithmeticTest, MultiplicationByZero) {
// Test inversion (on GPU)
TEST_F(FpArithmeticTest, Inversion) {
uint64_t size_tracker = 0;
Fp a, a_inv, result, a_inv_cpu;
fp_zero(a);
@@ -624,7 +623,7 @@ TEST_F(FpArithmeticTest, Inversion) {
// Test inversion of one (on GPU)
TEST_F(FpArithmeticTest, InversionOfOne) {
uint64_t size_tracker = 0;
Fp one, one_inv, one_inv_cpu;
fp_one(one);
@@ -646,7 +645,7 @@ TEST_F(FpArithmeticTest, InversionOfOne) {
// Test division (on GPU)
TEST_F(FpArithmeticTest, Division) {
uint64_t size_tracker = 0;
Fp a, b, quotient, result;
fp_zero(a);
@@ -679,7 +678,7 @@ TEST_F(FpArithmeticTest, Division) {
// Test division by one (on GPU)
TEST_F(FpArithmeticTest, DivisionByOne) {
uint64_t size_tracker = 0;
Fp a, one, result;
fp_one(one);
@@ -708,7 +707,7 @@ TEST_F(FpArithmeticTest, DivisionByOne) {
// Test exponentiation with small exponent (on GPU)
TEST_F(FpArithmeticTest, ExponentiationSmall) {
uint64_t size_tracker = 0;
Fp base, result, expected, result_cpu;
fp_zero(base);
@@ -735,7 +734,7 @@ TEST_F(FpArithmeticTest, ExponentiationSmall) {
// Test exponentiation to power of one (on GPU)
TEST_F(FpArithmeticTest, ExponentiationToPowerOfOne) {
uint64_t size_tracker = 0;
Fp base, result, result_cpu;
fp_zero(base);
@@ -759,7 +758,7 @@ TEST_F(FpArithmeticTest, ExponentiationToPowerOfOne) {
// Test exponentiation to power of zero (on GPU)
TEST_F(FpArithmeticTest, ExponentiationToPowerOfZero) {
uint64_t size_tracker = 0;
Fp base, result, one, result_cpu;
fp_zero(base);
@@ -783,7 +782,7 @@ TEST_F(FpArithmeticTest, ExponentiationToPowerOfZero) {
// Test exponentiation with large exponent (Fermat's little theorem)
TEST_F(FpArithmeticTest, ExponentiationFermat) {
uint64_t size_tracker = 0;
Fp a, result;
fp_zero(a);
@@ -799,7 +798,7 @@ TEST_F(FpArithmeticTest, ExponentiationFermat) {
// Test exponentiation: a^(p-1) = 1 mod p
TEST_F(FpArithmeticTest, ExponentiationFermatInverse) {
uint64_t size_tracker = 0;
Fp a, result, one;
fp_zero(a);
@@ -821,7 +820,7 @@ TEST_F(FpArithmeticTest, ExponentiationFermatInverse) {
// Test square root (on GPU)
TEST_F(FpArithmeticTest, SquareRoot) {
uint64_t size_tracker = 0;
Fp a, square, sqrt_result, verify, square_cpu, sqrt_result_cpu, verify_cpu;
// Test: sqrt(a^2) = a or -a
@@ -873,12 +872,6 @@ TEST_F(FpArithmeticTest, SquareRoot) {
// Also test on CPU for comparison
Fp neg_a_cpu = -a;
// Verify GPU negation matches CPU negation
EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &neg_a, &neg_a_cpu),
ComparisonType::Equal)
<< "GPU negation should match CPU negation";
cuda_synchronize_stream(stream, gpu_index);
bool matches_a = (fp_cmp_gpu(stream, gpu_index, &sqrt_result, &a) ==
ComparisonType::Equal);
cuda_synchronize_stream(stream, gpu_index);
@@ -898,7 +891,7 @@ TEST_F(FpArithmeticTest, SquareRoot) {
// Test square root of zero (on GPU)
TEST_F(FpArithmeticTest, SquareRootOfZero) {
uint64_t size_tracker = 0;
Fp zero, result, result_cpu;
fp_zero(zero);
@@ -920,7 +913,7 @@ TEST_F(FpArithmeticTest, SquareRootOfZero) {
// Test square root of one (on GPU)
TEST_F(FpArithmeticTest, SquareRootOfOne) {
uint64_t size_tracker = 0;
Fp one, result, result_cpu;
fp_one(one);
@@ -942,7 +935,7 @@ TEST_F(FpArithmeticTest, SquareRootOfOne) {
// Test quadratic residue check (on GPU)
TEST_F(FpArithmeticTest, IsQuadraticResidue) {
uint64_t size_tracker = 0;
Fp a, square, square_cpu, zero;
fp_zero(a);
@@ -978,7 +971,7 @@ TEST_F(FpArithmeticTest, IsQuadraticResidue) {
// device. For now, we test individual conversions on GPU and verify with GPU
// comparisons
TEST_F(FpArithmeticTest, BatchMontgomeryConversion) {
uint64_t size_tracker = 0;
const int n = 10;
Fp normal[n], montgomery[n], back[n];
@@ -1018,7 +1011,7 @@ TEST_F(FpArithmeticTest, BatchMontgomeryConversion) {
// Test 1: Addition that doesn't overflow (on GPU)
TEST_F(FpArithmeticTest, LargeAddition1) {
uint64_t size_tracker = 0;
// a = large value
Fp a = test_utils::make_fp(0x18e00013555855ULL, 0x2b772294629DAULL,
0x412736E1F11D66ULL, 0x87BAD325DD638ULL,
@@ -1051,7 +1044,7 @@ TEST_F(FpArithmeticTest, LargeAddition1) {
// Test 2: Addition that triggers reduction (sum > p) (on GPU)
TEST_F(FpArithmeticTest, LargeAddition2WithReduction) {
uint64_t size_tracker = 0;
// Use two large numbers that will trigger reduction
// a + b should wrap around modulus
Fp a = test_utils::make_fp(0x311c0026aab0aaaaULL, 0x56ee4528c573b5ccULL,
@@ -1083,7 +1076,7 @@ TEST_F(FpArithmeticTest, LargeAddition2WithReduction) {
// Test 3: Subtraction without borrow (on GPU)
TEST_F(FpArithmeticTest, LargeSubtraction1) {
uint64_t size_tracker = 0;
// a = large value
Fp a = test_utils::make_fp(0x18e00013555855ULL, 0x2b772294629DAULL,
0x412736E1F11D66ULL, 0x87BAD325DD638ULL,
@@ -1113,15 +1106,11 @@ TEST_F(FpArithmeticTest, LargeSubtraction1) {
ComparisonType::Equal)
<< "GPU result should match CPU result";
cuda_synchronize_stream(stream, gpu_index);
EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &verify, &verify_cpu),
ComparisonType::Equal)
<< "GPU subtraction roundtrip should match CPU roundtrip";
cuda_synchronize_stream(stream, gpu_index);
}
// Test 4: Subtraction with borrow (a < b) (on GPU)
TEST_F(FpArithmeticTest, LargeSubtraction2WithBorrow) {
uint64_t size_tracker = 0;
// a = 50
Fp a = test_utils::make_fp(0x32ULL, 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL,
0x0ULL);
@@ -1155,7 +1144,7 @@ TEST_F(FpArithmeticTest, LargeSubtraction2WithBorrow) {
// Test 5: Multiplication of large values (triggers reduction) (on GPU)
TEST_F(FpArithmeticTest, LargeMultiplication1) {
uint64_t size_tracker = 0;
// a = 2^200 (bit 200 set)
Fp a;
fp_zero(a);
@@ -1201,7 +1190,7 @@ TEST_F(FpArithmeticTest, LargeMultiplication1) {
// Test 6: (p-1) * (p-1) = 1 (mod p) (on GPU)
TEST_F(FpArithmeticTest, LargeMultiplication2ModulusMinus1) {
uint64_t size_tracker = 0;
// a = p - 1
Fp a = test_utils::make_fp(0x311c0026aab0aaaaULL, 0x56ee4528c573b5ccULL,
0x824e6dc3e23acdeeULL, 0xf75a64bbac71602ULL,
@@ -1239,7 +1228,7 @@ TEST_F(FpArithmeticTest, LargeMultiplication2ModulusMinus1) {
// Test 7: Multiplication with 2: a * 2 = a + a (on GPU)
TEST_F(FpArithmeticTest, LargeMultiplication3Half) {
uint64_t size_tracker = 0;
// a = large value
Fp a = test_utils::make_fp(0x18e00013555855ULL, 0x2b772294629DAE6ULL,
0x412736E1F11D66F7ULL, 0x7BAD325DD638B01ULL,
@@ -1275,15 +1264,11 @@ TEST_F(FpArithmeticTest, LargeMultiplication3Half) {
ComparisonType::Equal)
<< "GPU result should match CPU result";
cuda_synchronize_stream(stream, gpu_index);
EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &expected, &expected_cpu),
ComparisonType::Equal)
<< "GPU addition should match CPU addition";
cuda_synchronize_stream(stream, gpu_index);
}
// Test 8: Large number squared (on GPU)
TEST_F(FpArithmeticTest, LargeMultiplication4Square) {
uint64_t size_tracker = 0;
// a = large value
Fp a = test_utils::make_fp(0x123456789ABCDEFULL, 0xFEDCBA9876543210ULL,
0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL);
@@ -1320,7 +1305,7 @@ TEST_F(FpArithmeticTest, LargeMultiplication4Square) {
// Test 9: Addition chain near modulus (on GPU)
TEST_F(FpArithmeticTest, LargeAddition3Chain) {
uint64_t size_tracker = 0;
// Start with p-1
Fp a = test_utils::make_fp(0x311c0026aab0aaaaULL, 0x56ee4528c573b5ccULL,
0x824e6dc3e23acdeeULL, 0x0f75a64bbac71602ULL,
@@ -1347,7 +1332,7 @@ TEST_F(FpArithmeticTest, LargeAddition3Chain) {
// Test 10: Complex multiplication with reduction (on GPU)
TEST_F(FpArithmeticTest, LargeMultiplication5Complex) {
uint64_t size_tracker = 0;
// a = large prime-like number
Fp a = test_utils::make_fp(0x123456789ABCDEFULL, 0xFEDCBA9876543210ULL,
0x0123456789ABCDEFULL, 0xFEDCBA9876543210ULL,
@@ -1401,7 +1386,7 @@ TEST_F(FpArithmeticTest, LargeMultiplication5Complex) {
// Test addition associativity: (a + b) + c = a + (b + c) (on GPU)
TEST_F(FpPropertyTest, AdditionAssociativity) {
uint64_t size_tracker = 0;
for (int i = 0; i < 100; i++) {
Fp a = random_value();
Fp b = random_value();
@@ -1427,7 +1412,7 @@ TEST_F(FpPropertyTest, AdditionAssociativity) {
// Test multiplication associativity: (a * b) * c = a * (b * c) (on GPU)
TEST_F(FpPropertyTest, MultiplicationAssociativity) {
uint64_t size_tracker = 0;
for (int i = 0; i < 50; i++) { // Fewer iterations due to multiplication cost
Fp a = random_value();
Fp b = random_value();
@@ -1453,7 +1438,7 @@ TEST_F(FpPropertyTest, MultiplicationAssociativity) {
// Test distributivity: a * (b + c) = a*b + a*c (on GPU)
TEST_F(FpPropertyTest, MultiplicationDistributivity) {
uint64_t size_tracker = 0;
for (int i = 0; i < 50; i++) {
Fp a = random_value();
Fp b = random_value();
@@ -1480,7 +1465,7 @@ TEST_F(FpPropertyTest, MultiplicationDistributivity) {
// Test addition commutativity with random values (on GPU)
TEST_F(FpPropertyTest, AdditionCommutativityRandom) {
uint64_t size_tracker = 0;
for (int i = 0; i < 100; i++) {
Fp a = random_value();
Fp b = random_value();
@@ -1498,7 +1483,7 @@ TEST_F(FpPropertyTest, AdditionCommutativityRandom) {
// Test multiplication commutativity with random values (on GPU)
TEST_F(FpPropertyTest, MultiplicationCommutativityRandom) {
uint64_t size_tracker = 0;
for (int i = 0; i < 50; i++) {
Fp a = random_value();
Fp b = random_value();
@@ -1516,7 +1501,7 @@ TEST_F(FpPropertyTest, MultiplicationCommutativityRandom) {
// Test additive identity: a + 0 = a (on GPU)
TEST_F(FpPropertyTest, AdditiveIdentity) {
uint64_t size_tracker = 0;
for (int i = 0; i < 100; i++) {
Fp a = random_value();
Fp result;
@@ -1532,7 +1517,7 @@ TEST_F(FpPropertyTest, AdditiveIdentity) {
// Test multiplicative identity: a * 1 = a (on GPU)
TEST_F(FpPropertyTest, MultiplicativeIdentity) {
uint64_t size_tracker = 0;
for (int i = 0; i < 100; i++) {
Fp a = random_value();
Fp result;
@@ -1548,7 +1533,7 @@ TEST_F(FpPropertyTest, MultiplicativeIdentity) {
// Test additive inverse: a + (-a) = 0 (on GPU)
TEST_F(FpPropertyTest, AdditiveInverse) {
uint64_t size_tracker = 0;
for (int i = 0; i < 100; i++) {
Fp a = random_value();
Fp neg_a, result;
@@ -1565,7 +1550,7 @@ TEST_F(FpPropertyTest, AdditiveInverse) {
// Test double negation: -(-a) = a (on GPU)
TEST_F(FpPropertyTest, DoubleNegation) {
uint64_t size_tracker = 0;
for (int i = 0; i < 100; i++) {
Fp a = random_value();
Fp neg_a, neg_neg_a;
@@ -1583,7 +1568,7 @@ TEST_F(FpPropertyTest, DoubleNegation) {
// Test subtraction as addition of negation: a - b = a + (-b) (on GPU)
TEST_F(FpPropertyTest, SubtractionAsNegation) {
uint64_t size_tracker = 0;
for (int i = 0; i < 100; i++) {
Fp a = random_value();
Fp b = random_value();
@@ -1605,7 +1590,7 @@ TEST_F(FpPropertyTest, SubtractionAsNegation) {
// Test Montgomery form round-trip with random values (on GPU)
TEST_F(FpPropertyTest, MontgomeryRoundTripRandom) {
uint64_t size_tracker = 0;
for (int i = 0; i < 100; i++) {
Fp a = random_value();
Fp mont_form, back;
@@ -1622,7 +1607,7 @@ TEST_F(FpPropertyTest, MontgomeryRoundTripRandom) {
// Test multiplicative inverse: a * a^(-1) = 1 (on GPU)
TEST_F(FpPropertyTest, MultiplicativeInverse) {
uint64_t size_tracker = 0;
for (int i = 0; i < 50; i++) {
Fp a = random_value();
// Skip zero (on GPU)
@@ -1645,7 +1630,7 @@ TEST_F(FpPropertyTest, MultiplicativeInverse) {
// Test division: (a / b) * b = a (on GPU)
TEST_F(FpPropertyTest, DivisionProperty) {
uint64_t size_tracker = 0;
for (int i = 0; i < 50; i++) {
Fp a = random_value();
Fp b = random_value();
@@ -1669,7 +1654,7 @@ TEST_F(FpPropertyTest, DivisionProperty) {
// Test division as multiplication by inverse: a / b = a * b^(-1) (on GPU)
TEST_F(FpPropertyTest, DivisionAsInverse) {
uint64_t size_tracker = 0;
for (int i = 0; i < 50; i++) {
Fp a = random_value();
Fp b = random_value();
@@ -1697,7 +1682,7 @@ TEST_F(FpPropertyTest, DivisionAsInverse) {
// Test exponentiation: (a^e1)^e2 = a^(e1*e2) for small exponents (on GPU)
TEST_F(FpPropertyTest, ExponentiationPowerOfPower) {
uint64_t size_tracker = 0;
for (int i = 0; i < 20; i++) { // Fewer iterations due to cost
Fp a = random_value();
// Skip zero (on GPU)
@@ -1731,7 +1716,7 @@ TEST_F(FpPropertyTest, ExponentiationPowerOfPower) {
// Test exponentiation: a^e1 * a^e2 = a^(e1+e2) (on GPU)
TEST_F(FpPropertyTest, ExponentiationProduct) {
uint64_t size_tracker = 0;
for (int i = 0; i < 20; i++) { // Fewer iterations due to cost
Fp a = random_value();
// Skip zero (on GPU)
@@ -1766,7 +1751,7 @@ TEST_F(FpPropertyTest, ExponentiationProduct) {
// Test inversion of inversion: (a^(-1))^(-1) = a (on GPU)
TEST_F(FpPropertyTest, DoubleInversion) {
uint64_t size_tracker = 0;
for (int i = 0; i < 50; i++) {
Fp a = random_value();
// Skip zero (on GPU)
@@ -1790,7 +1775,7 @@ TEST_F(FpPropertyTest, DoubleInversion) {
// Test square root property: sqrt(a^2) = a (for random a) (on GPU)
TEST_F(FpPropertyTest, SquareRootProperty) {
uint64_t size_tracker = 0;
for (int i = 0; i < 50; i++) {
Fp a = random_value();
Fp square, sqrt_result, verify;
@@ -1834,7 +1819,7 @@ TEST_F(FpPropertyTest, SquareRootProperty) {
// Test quadratic residue property: squares are always quadratic residues (on
// GPU)
TEST_F(FpPropertyTest, QuadraticResidueProperty) {
uint64_t size_tracker = 0;
for (int i = 0; i < 100; i++) {
Fp a = random_value();
Fp square;
@@ -1856,7 +1841,7 @@ TEST_F(FpPropertyTest, QuadraticResidueProperty) {
// Test operations with p-1 (on GPU)
TEST_F(FpEdgeCaseTest, OperationsWithModulusMinusOne) {
uint64_t size_tracker = 0;
// (p-1) + 1 = 0 (on GPU)
Fp result;
fp_add_gpu(stream, gpu_index, &result, &modulus_minus_one, &one);
@@ -1883,7 +1868,7 @@ TEST_F(FpEdgeCaseTest, OperationsWithModulusMinusOne) {
// Test operations with p-2 (on GPU)
TEST_F(FpEdgeCaseTest, OperationsWithModulusMinusTwo) {
uint64_t size_tracker = 0;
// (p-2) + 1 = p-1 (on GPU)
Fp result;
fp_add_gpu(stream, gpu_index, &result, &modulus_minus_two, &one);
@@ -1903,7 +1888,7 @@ TEST_F(FpEdgeCaseTest, OperationsWithModulusMinusTwo) {
// Test operations with very small values (on GPU)
TEST_F(FpEdgeCaseTest, VerySmallValues) {
uint64_t size_tracker = 0;
Fp zero_val, one_val, two_val, three_val;
fp_zero(zero_val);
fp_one(one_val);
@@ -1945,7 +1930,7 @@ TEST_F(FpEdgeCaseTest, VerySmallValues) {
// Test operations with max limb values (on GPU)
TEST_F(FpEdgeCaseTest, MaxLimbValues) {
uint64_t size_tracker = 0;
// Test that max_limb_value is valid
EXPECT_TRUE(test_utils::is_valid_fp(max_limb_value))
<< "max_limb_value should be < p";
@@ -1968,7 +1953,7 @@ TEST_F(FpEdgeCaseTest, MaxLimbValues) {
// Test operations with alternating bit patterns (on GPU)
TEST_F(FpEdgeCaseTest, AlternatingBitPatterns) {
uint64_t size_tracker = 0;
// Test that alternating_bits is valid
EXPECT_TRUE(test_utils::is_valid_fp(alternating_bits))
<< "alternating_bits should be < p";
@@ -1991,7 +1976,7 @@ TEST_F(FpEdgeCaseTest, AlternatingBitPatterns) {
// Test edge case: zero operations (on GPU)
TEST_F(FpEdgeCaseTest, ZeroOperations) {
uint64_t size_tracker = 0;
// 0 + 0 = 0 (on GPU)
Fp result;
fp_add_gpu(stream, gpu_index, &result, &zero, &zero);
@@ -2021,7 +2006,7 @@ TEST_F(FpEdgeCaseTest, ZeroOperations) {
// Test edge case: one operations (on GPU)
TEST_F(FpEdgeCaseTest, OneOperations) {
uint64_t size_tracker = 0;
// 1 + 1 = 2 (on GPU)
Fp result;
fp_add_gpu(stream, gpu_index, &result, &one, &one);
@@ -2048,7 +2033,7 @@ TEST_F(FpEdgeCaseTest, OneOperations) {
// Test fp_one_montgomery (on GPU)
TEST_F(FpEdgeCaseTest, OneMontgomery) {
uint64_t size_tracker = 0;
Fp one_mont, one_normal;
fp_one(one_normal);
fp_one_montgomery(one_mont);
@@ -2064,7 +2049,7 @@ TEST_F(FpEdgeCaseTest, OneMontgomery) {
// Test repeated operations (stress test) (on GPU)
TEST_F(FpEdgeCaseTest, RepeatedOperations) {
uint64_t size_tracker = 0;
Fp a = test_utils::random_fp(rng);
Fp result = a;
@@ -2106,7 +2091,7 @@ TEST_F(FpEdgeCaseTest, RepeatedOperations) {
// Test CUDA kernel: array addition
TEST_F(FpCudaKernelTest, CudaKernelArrayAdd) {
uint64_t size_tracker = 0;
const int n = 1000;
Fp *h_a = new Fp[n];
Fp *h_b = new Fp[n];
@@ -2141,7 +2126,7 @@ TEST_F(FpCudaKernelTest, CudaKernelArrayAdd) {
// Test CUDA kernel: array multiplication
TEST_F(FpCudaKernelTest, CudaKernelArrayMul) {
uint64_t size_tracker = 0;
const int n = 1000;
Fp *h_a = new Fp[n];
Fp *h_b = new Fp[n];
@@ -2181,7 +2166,7 @@ TEST_F(FpCudaKernelTest, CudaKernelArrayMul) {
// Test CUDA kernel: array addition with edge cases
TEST_F(FpCudaKernelTest, CudaKernelArrayAddEdgeCases) {
uint64_t size_tracker = 0;
const int n = 100;
Fp *h_a = new Fp[n];
Fp *h_b = new Fp[n];
@@ -2231,7 +2216,7 @@ TEST_F(FpCudaKernelTest, CudaKernelArrayAddEdgeCases) {
// Test CUDA kernel: array multiplication with edge cases
TEST_F(FpCudaKernelTest, CudaKernelArrayMulEdgeCases) {
uint64_t size_tracker = 0;
const int n = 100;
Fp *h_a = new Fp[n];
Fp *h_b = new Fp[n];
@@ -2286,7 +2271,7 @@ TEST_F(FpCudaKernelTest, CudaKernelArrayMulEdgeCases) {
// Test CUDA kernel: large array
TEST_F(FpCudaKernelTest, CudaKernelLargeArray) {
uint64_t size_tracker = 0;
const int n = 10000;
Fp *h_a = new Fp[n];
Fp *h_b = new Fp[n];
@@ -2327,7 +2312,7 @@ TEST_F(FpCudaKernelTest, CudaKernelLargeArray) {
// Test CUDA kernel: boundary conditions for launch configuration
// Tests that the "if (idx < n)" check works correctly at block boundaries
TEST_F(FpCudaKernelTest, CudaKernelBoundaryConditions) {
uint64_t size_tracker = 0;
// Test sizes that stress the launch configuration
// threadsPerBlock = 256, so test around block boundaries
std::vector<int> test_sizes = {1, 255, 256, 257, 511,
@@ -2367,7 +2352,7 @@ TEST_F(FpCudaKernelTest, CudaKernelBoundaryConditions) {
// Test CUDA kernel: verify kernel actually launches (not just CPU fallback)
TEST_F(FpCudaKernelTest, CudaKernelActuallyLaunches) {
uint64_t size_tracker = 0;
const int n = 1000;
Fp *h_a = new Fp[n];
Fp *h_b = new Fp[n];
@@ -2399,7 +2384,7 @@ TEST_F(FpCudaKernelTest, CudaKernelActuallyLaunches) {
// Test CUDA kernel: verify device constant memory is accessible
TEST_F(FpCudaKernelTest, CudaKernelDeviceConstants) {
uint64_t size_tracker = 0;
// This test verifies that DEVICE_MODULUS is properly initialized
// by running a kernel that uses it (multiplication uses Montgomery which
// needs modulus)
@@ -2443,7 +2428,7 @@ TEST_F(FpCudaKernelTest, CudaKernelDeviceConstants) {
// Test CUDA kernel: empty array (edge case)
TEST_F(FpCudaKernelTest, CudaKernelEmptyArray) {
uint64_t size_tracker = 0;
const int n = 0;
Fp *h_a = nullptr;
Fp *h_b = nullptr;
@@ -2460,7 +2445,7 @@ TEST_F(FpCudaKernelTest, CudaKernelEmptyArray) {
// Test CUDA kernel: single element
TEST_F(FpCudaKernelTest, CudaKernelSingleElement) {
uint64_t size_tracker = 0;
const int n = 1;
Fp *h_a = new Fp[n];
Fp *h_b = new Fp[n];
@@ -2486,53 +2471,44 @@ TEST_F(FpCudaKernelTest, CudaKernelSingleElement) {
// ============================================================================
// Test to print generator values (for hardcoding)
// PRIx64 format specifiers require 64-bit limbs
#if LIMB_BITS_CONFIG == 64
TEST_F(FpArithmeticTest, PrintGenerators) {
uint64_t size_tracker = 0;
const G1Affine &g1 = g1_generator();
const G2Affine &g2 = g2_generator();
printf("\n=== G1 Generator (Montgomery form) ===\n");
printf("x: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
"0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
"0x%" PRIx64 "ULL}\n",
printf("x: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
"0x%llxULL, 0x%llxULL}\n",
g1.x.limb[0], g1.x.limb[1], g1.x.limb[2], g1.x.limb[3], g1.x.limb[4],
g1.x.limb[5], g1.x.limb[6]);
printf("y: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
"0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
"0x%" PRIx64 "ULL}\n",
printf("y: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
"0x%llxULL, 0x%llxULL}\n",
g1.y.limb[0], g1.y.limb[1], g1.y.limb[2], g1.y.limb[3], g1.y.limb[4],
g1.y.limb[5], g1.y.limb[6]);
printf("\n=== G2 Generator (Montgomery form) ===\n");
printf("x.c0: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
"0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
"0x%" PRIx64 "ULL}\n",
printf("x.c0: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
"0x%llxULL, 0x%llxULL}\n",
g2.x.c0.limb[0], g2.x.c0.limb[1], g2.x.c0.limb[2], g2.x.c0.limb[3],
g2.x.c0.limb[4], g2.x.c0.limb[5], g2.x.c0.limb[6]);
printf("x.c1: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
"0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
"0x%" PRIx64 "ULL}\n",
printf("x.c1: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
"0x%llxULL, 0x%llxULL}\n",
g2.x.c1.limb[0], g2.x.c1.limb[1], g2.x.c1.limb[2], g2.x.c1.limb[3],
g2.x.c1.limb[4], g2.x.c1.limb[5], g2.x.c1.limb[6]);
printf("y.c0: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
"0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
"0x%" PRIx64 "ULL}\n",
printf("y.c0: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
"0x%llxULL, 0x%llxULL}\n",
g2.y.c0.limb[0], g2.y.c0.limb[1], g2.y.c0.limb[2], g2.y.c0.limb[3],
g2.y.c0.limb[4], g2.y.c0.limb[5], g2.y.c0.limb[6]);
printf("y.c1: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
"0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
"0x%" PRIx64 "ULL}\n",
printf("y.c1: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
"0x%llxULL, 0x%llxULL}\n",
g2.y.c1.limb[0], g2.y.c1.limb[1], g2.y.c1.limb[2], g2.y.c1.limb[3],
g2.y.c1.limb[4], g2.y.c1.limb[5], g2.y.c1.limb[6]);
printf("\n");
}
#endif
// Test is_on_curve_g1 with point at infinity
TEST_F(FpArithmeticTest, CurveG1PointAtInfinity) {
uint64_t size_tracker = 0;
G1Affine point;
g1_point_at_infinity(point);
@@ -2544,7 +2520,7 @@ TEST_F(FpArithmeticTest, CurveG1PointAtInfinity) {
// We'll create a point by starting with a valid y and computing x
// Or use a known valid point
TEST_F(FpArithmeticTest, CurveG1ValidPoint) {
uint64_t size_tracker = 0;
G1Affine point;
point.infinity = false;
@@ -2617,7 +2593,7 @@ TEST_F(FpArithmeticTest, CurveG1ValidPoint) {
// Test is_on_curve_g1 with invalid point
TEST_F(FpArithmeticTest, CurveG1InvalidPoint) {
uint64_t size_tracker = 0;
G1Affine point;
point.infinity = false;
@@ -2630,7 +2606,7 @@ TEST_F(FpArithmeticTest, CurveG1InvalidPoint) {
// Test that negating y preserves curve validity (on GPU)
TEST_F(FpArithmeticTest, CurveG1FieldOperationsConsistency) {
uint64_t size_tracker = 0;
G1Affine point;
point.infinity = false;
@@ -2674,7 +2650,7 @@ TEST_F(FpArithmeticTest, CurveG1FieldOperationsConsistency) {
// Test is_on_curve_g2 with point at infinity
TEST_F(FpArithmeticTest, CurveG2PointAtInfinity) {
uint64_t size_tracker = 0;
G2Affine point;
g2_point_at_infinity(point);

View File

@@ -152,7 +152,7 @@ protected:
// Test basic addition (on GPU)
TEST_F(Fp2ArithmeticTest, Addition) {
uint64_t size_tracker = 0;
Fp2 a, b, c, c_cpu;
// Test: (1 + 0*i) + (1 + 0*i) = (2 + 0*i)
@@ -175,7 +175,7 @@ TEST_F(Fp2ArithmeticTest, Addition) {
// Test subtraction (on GPU)
TEST_F(Fp2ArithmeticTest, Subtraction) {
uint64_t size_tracker = 0;
Fp2 a, b, c, a_cpu;
// Test: (2 + 0*i) - (1 + 0*i) = (1 + 0*i)
@@ -197,7 +197,7 @@ TEST_F(Fp2ArithmeticTest, Subtraction) {
// Test multiplication (on GPU)
TEST_F(Fp2ArithmeticTest, Multiplication) {
uint64_t size_tracker = 0;
Fp2 a, b, result, expected, result_cpu;
// Test: (1 + 1*i) * (1 + 1*i) = (0 + 2*i)
@@ -224,7 +224,7 @@ TEST_F(Fp2ArithmeticTest, Multiplication) {
// Test i * i = -1 (on GPU)
TEST_F(Fp2ArithmeticTest, I_Squared) {
uint64_t size_tracker = 0;
Fp2 i_val, result, expected, result_cpu;
// i = 0 + 1*i
@@ -250,7 +250,7 @@ TEST_F(Fp2ArithmeticTest, I_Squared) {
// Test negation (on GPU)
TEST_F(Fp2ArithmeticTest, Negation) {
uint64_t size_tracker = 0;
Fp2 a, neg_a, result, neg_a_cpu, result_cpu;
a = test_utils_fp2::make_fp2_simple(5, 3);
@@ -273,7 +273,7 @@ TEST_F(Fp2ArithmeticTest, Negation) {
// Test conjugation (on GPU)
TEST_F(Fp2ArithmeticTest, Conjugation) {
uint64_t size_tracker = 0;
Fp2 a, conj, result, conj_cpu, result_cpu;
a = test_utils_fp2::make_fp2_simple(5, 3);
@@ -304,7 +304,7 @@ TEST_F(Fp2ArithmeticTest, Conjugation) {
// Test squaring (on GPU)
TEST_F(Fp2ArithmeticTest, Squaring) {
uint64_t size_tracker = 0;
Fp2 a, square, square_cpu;
// Test: (1 + 1*i)^2 = 2*i
@@ -327,7 +327,7 @@ TEST_F(Fp2ArithmeticTest, Squaring) {
// Test zero and one (on GPU)
TEST_F(Fp2ArithmeticTest, ZeroAndOne) {
uint64_t size_tracker = 0;
Fp2 zero_val, one_val;
fp2_zero(zero_val);
@@ -349,7 +349,7 @@ TEST_F(Fp2ArithmeticTest, ZeroAndOne) {
// Test copy (on GPU)
TEST_F(Fp2ArithmeticTest, Copy) {
uint64_t size_tracker = 0;
Fp2 a, b, b_cpu;
a = test_utils_fp2::make_fp2_simple(42, 123);
@@ -370,7 +370,7 @@ TEST_F(Fp2ArithmeticTest, Copy) {
// Test conditional move (on GPU)
TEST_F(Fp2ArithmeticTest, ConditionalMove) {
uint64_t size_tracker = 0;
Fp2 a, b, result, result_cpu;
a = test_utils_fp2::make_fp2_simple(10, 20);
@@ -411,7 +411,7 @@ TEST_F(Fp2ArithmeticTest, ConditionalMove) {
// Test multiplication by zero (on GPU)
TEST_F(Fp2ArithmeticTest, MultiplicationByZero) {
uint64_t size_tracker = 0;
Fp2 a, zero_val, result, result_cpu;
fp2_zero(zero_val);
@@ -432,7 +432,7 @@ TEST_F(Fp2ArithmeticTest, MultiplicationByZero) {
// Test inversion (on GPU)
TEST_F(Fp2ArithmeticTest, Inversion) {
uint64_t size_tracker = 0;
Fp2 a, a_inv, result, a_inv_cpu, result_cpu;
a = test_utils_fp2::make_fp2_simple(5, 3);
@@ -456,7 +456,7 @@ TEST_F(Fp2ArithmeticTest, Inversion) {
// Test division (on GPU)
TEST_F(Fp2ArithmeticTest, Division) {
uint64_t size_tracker = 0;
Fp2 a, b, quotient, result, quotient_cpu, result_cpu;
a = test_utils_fp2::make_fp2_simple(10, 6);
@@ -482,7 +482,7 @@ TEST_F(Fp2ArithmeticTest, Division) {
// Test multiply by i (on GPU)
TEST_F(Fp2ArithmeticTest, MultiplyByI) {
uint64_t size_tracker = 0;
Fp2 a, result, result_cpu;
// Test: (a + b*i) * i = -b + a*i
@@ -509,7 +509,7 @@ TEST_F(Fp2ArithmeticTest, MultiplyByI) {
// Test Frobenius map (on GPU)
TEST_F(Fp2ArithmeticTest, Frobenius) {
uint64_t size_tracker = 0;
Fp2 a, frob, conj, frob_cpu, conj_cpu;
a = test_utils_fp2::make_fp2_simple(5, 3);
@@ -541,7 +541,7 @@ TEST_F(Fp2ArithmeticTest, Frobenius) {
// Test addition associativity: (a + b) + c = a + (b + c) (on GPU)
TEST_F(Fp2PropertyTest, AdditionAssociativity) {
uint64_t size_tracker = 0;
for (int i = 0; i < 100; i++) {
Fp2 a = random_value();
Fp2 b = random_value();
@@ -567,7 +567,7 @@ TEST_F(Fp2PropertyTest, AdditionAssociativity) {
// Test multiplication associativity: (a * b) * c = a * (b * c) (on GPU)
TEST_F(Fp2PropertyTest, MultiplicationAssociativity) {
uint64_t size_tracker = 0;
for (int i = 0; i < 50; i++) {
Fp2 a = random_value();
Fp2 b = random_value();
@@ -593,7 +593,7 @@ TEST_F(Fp2PropertyTest, MultiplicationAssociativity) {
// Test distributivity: a * (b + c) = a*b + a*c (on GPU)
TEST_F(Fp2PropertyTest, MultiplicationDistributivity) {
uint64_t size_tracker = 0;
for (int i = 0; i < 50; i++) {
Fp2 a = random_value();
Fp2 b = random_value();
@@ -620,7 +620,7 @@ TEST_F(Fp2PropertyTest, MultiplicationDistributivity) {
// Test addition commutativity (on GPU)
TEST_F(Fp2PropertyTest, AdditionCommutativity) {
uint64_t size_tracker = 0;
for (int i = 0; i < 100; i++) {
Fp2 a = random_value();
Fp2 b = random_value();
@@ -638,7 +638,7 @@ TEST_F(Fp2PropertyTest, AdditionCommutativity) {
// Test multiplication commutativity (on GPU)
TEST_F(Fp2PropertyTest, MultiplicationCommutativity) {
uint64_t size_tracker = 0;
for (int i = 0; i < 50; i++) {
Fp2 a = random_value();
Fp2 b = random_value();
@@ -656,7 +656,7 @@ TEST_F(Fp2PropertyTest, MultiplicationCommutativity) {
// Test additive identity: a + 0 = a (on GPU)
TEST_F(Fp2PropertyTest, AdditiveIdentity) {
uint64_t size_tracker = 0;
for (int i = 0; i < 100; i++) {
Fp2 a = random_value();
Fp2 result;
@@ -672,7 +672,7 @@ TEST_F(Fp2PropertyTest, AdditiveIdentity) {
// Test multiplicative identity: a * 1 = a (on GPU)
TEST_F(Fp2PropertyTest, MultiplicativeIdentity) {
uint64_t size_tracker = 0;
for (int i = 0; i < 100; i++) {
Fp2 a = random_value();
Fp2 result;
@@ -688,7 +688,7 @@ TEST_F(Fp2PropertyTest, MultiplicativeIdentity) {
// Test additive inverse: a + (-a) = 0 (on GPU)
TEST_F(Fp2PropertyTest, AdditiveInverse) {
uint64_t size_tracker = 0;
for (int i = 0; i < 100; i++) {
Fp2 a = random_value();
Fp2 neg_a, result;
@@ -705,7 +705,7 @@ TEST_F(Fp2PropertyTest, AdditiveInverse) {
// Test multiplicative inverse: a * a^(-1) = 1 (on GPU)
TEST_F(Fp2PropertyTest, MultiplicativeInverse) {
uint64_t size_tracker = 0;
for (int i = 0; i < 50; i++) {
Fp2 a = random_value();
// Skip zero
@@ -726,7 +726,7 @@ TEST_F(Fp2PropertyTest, MultiplicativeInverse) {
// Test square vs multiply by self: a^2 = a * a (on GPU)
TEST_F(Fp2PropertyTest, SquareVsMultiply) {
uint64_t size_tracker = 0;
for (int i = 0; i < 50; i++) {
Fp2 a = random_value();
@@ -747,7 +747,7 @@ TEST_F(Fp2PropertyTest, SquareVsMultiply) {
// Test CUDA kernel: array addition
TEST_F(Fp2CudaKernelTest, CudaKernelArrayAdd) {
uint64_t size_tracker = 0;
const int n = 1000;
Fp2 *h_a = new Fp2[n];
Fp2 *h_b = new Fp2[n];
@@ -784,7 +784,7 @@ TEST_F(Fp2CudaKernelTest, CudaKernelArrayAdd) {
// Test CUDA kernel: array multiplication
TEST_F(Fp2CudaKernelTest, CudaKernelArrayMul) {
uint64_t size_tracker = 0;
const int n = 1000;
Fp2 *h_a = new Fp2[n];
Fp2 *h_b = new Fp2[n];
@@ -825,7 +825,7 @@ TEST_F(Fp2CudaKernelTest, CudaKernelArrayMul) {
// Test is_on_curve_g2 with point at infinity
TEST_F(Fp2ArithmeticTest, CurveG2PointAtInfinity) {
uint64_t size_tracker = 0;
G2Affine point;
g2_point_at_infinity(point);
@@ -835,7 +835,7 @@ TEST_F(Fp2ArithmeticTest, CurveG2PointAtInfinity) {
// Test is_on_curve_g2 with valid point construction
TEST_F(Fp2ArithmeticTest, CurveG2ValidPointCheck) {
uint64_t size_tracker = 0;
G2Affine point;
point.infinity = false;
@@ -860,7 +860,7 @@ TEST_F(Fp2ArithmeticTest, CurveG2ValidPointCheck) {
// Test that field operations maintain curve validity for G2
TEST_F(Fp2ArithmeticTest, CurveG2FieldOperationsConsistency) {
uint64_t size_tracker = 0;
// Create a point (we'll test the consistency check works)
G2Affine point;
point.infinity = false;

Some files were not shown because too many files have changed in this diff Show More