mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-04-28 03:01:21 -04:00
Compare commits
51 Commits
dt/ci/scal
...
jb/doc/upd
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
23c9bc8f92 | ||
|
|
4790f8ba1c | ||
|
|
79a54df25b | ||
|
|
50d6be121a | ||
|
|
7cd966d8a7 | ||
|
|
6ca929051d | ||
|
|
871cc8f772 | ||
|
|
b938473788 | ||
|
|
74869f5e2f | ||
|
|
326dd6a5c7 | ||
|
|
1abc69751a | ||
|
|
3c2cb273d5 | ||
|
|
b18060e5c8 | ||
|
|
c8827a21a7 | ||
|
|
a7476d0aaa | ||
|
|
10d104e500 | ||
|
|
dbb1f151c8 | ||
|
|
9cb8ad9bff | ||
|
|
d970210ae4 | ||
|
|
5236c21733 | ||
|
|
7598725c7e | ||
|
|
f0cff6176d | ||
|
|
8bb38d4e70 | ||
|
|
35fe71cc07 | ||
|
|
62429da859 | ||
|
|
8a4b3c35f4 | ||
|
|
641fec028f | ||
|
|
8d8379409b | ||
|
|
d547e67f66 | ||
|
|
4cf03c063d | ||
|
|
9372c761dd | ||
|
|
d9dec879e7 | ||
|
|
6a0fb21fd0 | ||
|
|
95058c9b00 | ||
|
|
e19c5826c0 | ||
|
|
adf27ab700 | ||
|
|
32c6db381f | ||
|
|
d79801b340 | ||
|
|
0a4e4cf9e2 | ||
|
|
5d6b3146b1 | ||
|
|
e4ea44c571 | ||
|
|
de331f322a | ||
|
|
844c345e18 | ||
|
|
a9520f8930 | ||
|
|
98a9baf7a8 | ||
|
|
c621c1fc77 | ||
|
|
41fffb0306 | ||
|
|
258db8906a | ||
|
|
70f356d0c6 | ||
|
|
81797a6645 | ||
|
|
bb59dc64fc |
@@ -4,6 +4,9 @@ ignore = [
|
||||
"RUSTSEC-2024-0436",
|
||||
# Ignoring unmaintained 'bincode' crate. Getting rid of it would be too complex on the short term.
|
||||
"RUSTSEC-2025-0141",
|
||||
# Ignoring unsoundness in 'rand' with custom logger. Rand update is currently blocked by
|
||||
# arkworks and we do not use custom loggers.
|
||||
"RUSTSEC-2026-0097",
|
||||
]
|
||||
|
||||
[output]
|
||||
|
||||
3
.github/actions/gpu_setup/action.yml
vendored
3
.github/actions/gpu_setup/action.yml
vendored
@@ -82,11 +82,10 @@ runs:
|
||||
sudo apt update
|
||||
sudo apt -y install cuda-toolkit-"${TOOLKIT_VERSION}"
|
||||
|
||||
# Command to put back in once nvcc check is fixed
|
||||
# find /usr/local -executable -name "nvcc"
|
||||
- name: Export CUDA variables
|
||||
shell: bash
|
||||
run: |
|
||||
find /usr/local -executable -name "nvcc"
|
||||
CUDA_PATH=/usr/local/cuda-"${CUDA_VERSION}"
|
||||
{
|
||||
echo "CUDA_PATH=$CUDA_PATH";
|
||||
|
||||
4
.github/workflows/aws_data_tests.yml
vendored
4
.github/workflows/aws_data_tests.yml
vendored
@@ -54,7 +54,7 @@ jobs:
|
||||
|
||||
- name: Retrieve data from cache
|
||||
id: retrieve-data-cache
|
||||
uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
with:
|
||||
path: |
|
||||
utils/tfhe-backward-compat-data/**/*.cbor
|
||||
@@ -89,7 +89,7 @@ jobs:
|
||||
- name: Store data in cache
|
||||
if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
|
||||
continue-on-error: true
|
||||
uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
with:
|
||||
path: |
|
||||
utils/tfhe-backward-compat-data/**/*.cbor
|
||||
|
||||
19
.github/workflows/aws_tfhe_fast_tests.yml
vendored
19
.github/workflows/aws_tfhe_fast_tests.yml
vendored
@@ -16,7 +16,6 @@ env:
|
||||
PULL_REQUEST_MD_LINK: ""
|
||||
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
@@ -37,6 +36,7 @@ jobs:
|
||||
csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
|
||||
zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
|
||||
versionable_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.versionable_any_changed }}
|
||||
safe_serialize_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.safe_serialize_any_changed }}
|
||||
core_crypto_test: ${{ env.IS_PULL_REQUEST == 'false' ||
|
||||
steps.changed-files.outputs.core_crypto_any_changed ||
|
||||
steps.changed-files.outputs.dependencies_any_changed }}
|
||||
@@ -64,7 +64,7 @@ jobs:
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
persist-credentials: "false"
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Check for file changes
|
||||
@@ -79,6 +79,7 @@ jobs:
|
||||
- tfhe-zk-pok/**
|
||||
- utils/tfhe-versionable/**
|
||||
- utils/tfhe-versionable-derive/**
|
||||
- utils/tfhe-safe-serialize/**
|
||||
csprng:
|
||||
- tfhe-csprng/**
|
||||
zk_pok:
|
||||
@@ -86,6 +87,8 @@ jobs:
|
||||
versionable:
|
||||
- utils/tfhe-versionable/**
|
||||
- utils/tfhe-versionable-derive/**
|
||||
safe_serialize:
|
||||
- utils/tfhe-safe-serialize/**
|
||||
core_crypto:
|
||||
- tfhe/src/core_crypto/**
|
||||
boolean:
|
||||
@@ -122,6 +125,7 @@ jobs:
|
||||
steps.changed-files.outputs.csprng_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.versionable_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.safe_serialize_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.boolean_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.shortint_any_changed == 'true' ||
|
||||
@@ -145,7 +149,7 @@ jobs:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
persist-credentials: "false"
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
@@ -170,6 +174,11 @@ jobs:
|
||||
run: |
|
||||
make test_versionable
|
||||
|
||||
- name: Run tfhe-safe-serialize tests
|
||||
if: needs.should-run.outputs.safe_serialize_test == 'true'
|
||||
run: |
|
||||
make test_safe_serialize
|
||||
|
||||
- name: Run core tests
|
||||
if: needs.should-run.outputs.core_crypto_test == 'true'
|
||||
run: |
|
||||
@@ -191,7 +200,7 @@ jobs:
|
||||
|
||||
- name: Node cache restoration
|
||||
id: node-cache
|
||||
uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
with:
|
||||
path: |
|
||||
~/.nvm
|
||||
@@ -204,7 +213,7 @@ jobs:
|
||||
make install_node
|
||||
|
||||
- name: Node cache save
|
||||
uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
if: steps.node-cache.outputs.cache-hit != 'true'
|
||||
with:
|
||||
path: |
|
||||
|
||||
4
.github/workflows/aws_tfhe_noise_checks.yml
vendored
4
.github/workflows/aws_tfhe_noise_checks.yml
vendored
@@ -34,7 +34,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -99,7 +99,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
30
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
30
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
@@ -14,12 +14,11 @@ env:
|
||||
PULL_REQUEST_MD_LINK: ""
|
||||
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
types: [labeled]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
@@ -32,16 +31,16 @@ jobs:
|
||||
if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved')
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: read # Needed to check for file change
|
||||
pull-requests: read # Needed to check for file change
|
||||
outputs:
|
||||
wasm_test: ${{ github.event_name == 'workflow_dispatch' ||
|
||||
steps.changed-files.outputs.wasm_any_changed }}
|
||||
steps.changed-files.outputs.wasm_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
persist-credentials: "false"
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Check for file changes
|
||||
@@ -63,6 +62,7 @@ jobs:
|
||||
- tfhe/js_on_wasm_tests/**
|
||||
- tfhe/web_wasm_parallel_tests/**
|
||||
- utils/tfhe-versionable/**
|
||||
- utils/tfhe-safe-serialize/**
|
||||
- .github/workflows/aws_tfhe_wasm_tests.yml
|
||||
|
||||
wasm-tests:
|
||||
@@ -78,7 +78,7 @@ jobs:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
persist-credentials: "false"
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
@@ -92,7 +92,7 @@ jobs:
|
||||
|
||||
- name: Node cache restoration
|
||||
id: node-cache
|
||||
uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
with:
|
||||
path: |
|
||||
~/.nvm
|
||||
@@ -105,7 +105,7 @@ jobs:
|
||||
make install_node
|
||||
|
||||
- name: Node cache save
|
||||
uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
if: steps.node-cache.outputs.cache-hit != 'true'
|
||||
with:
|
||||
path: |
|
||||
@@ -128,15 +128,21 @@ jobs:
|
||||
run: |
|
||||
make test_nodejs_wasm_api_ci
|
||||
|
||||
- name: Run parallel wasm tests
|
||||
run: |
|
||||
make test_web_js_api_parallel_chrome_ci
|
||||
|
||||
- name: Run wasm_par_mq tests
|
||||
run: |
|
||||
make test_wasm_par_mq_chrome_ci
|
||||
make test_wasm_par_mq_firefox_ci
|
||||
|
||||
- name: Run parallel wasm tests
|
||||
run: |
|
||||
make test_web_js_api_parallel_chrome_ci
|
||||
make test_web_js_api_parallel_firefox_ci
|
||||
|
||||
- name: Run cross origin wasm tests
|
||||
run: |
|
||||
make test_web_js_api_cross_origin_chrome_ci
|
||||
make test_web_js_api_cross_origin_firefox_ci
|
||||
|
||||
- name: Run x86_64/wasm zk compatibility tests
|
||||
run: |
|
||||
make test_zk_wasm_x86_compat_ci
|
||||
|
||||
@@ -6,6 +6,9 @@ name: backward_compat_pr_change_report
|
||||
on:
|
||||
pull_request:
|
||||
|
||||
env:
|
||||
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
@@ -14,9 +17,35 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
name: backward_compat_pr_change_report/should-run
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: read # Needed to check for file change
|
||||
outputs:
|
||||
backward_report: ${{ steps.changed-files.outputs.backward_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
|
||||
with:
|
||||
files_yaml: |
|
||||
backward:
|
||||
- utils/tfhe-lints/snapshots/*.json
|
||||
|
||||
change-report:
|
||||
name: backward_compat_pr_change_report/change-report (bpr)
|
||||
runs-on: ubuntu-latest
|
||||
needs: should-run
|
||||
if:
|
||||
needs.should-run.outputs.backward_report == 'true'
|
||||
permissions:
|
||||
pull-requests: write # To send and modify message in the PR
|
||||
steps:
|
||||
|
||||
7
.github/workflows/benchmark_cpu.yml
vendored
7
.github/workflows/benchmark_cpu.yml
vendored
@@ -14,11 +14,12 @@ on:
|
||||
- signed_integer
|
||||
- integer_compression
|
||||
- integer_zk
|
||||
- msm_zk
|
||||
- shortint
|
||||
- shortint_oprf
|
||||
- hlapi_unsigned
|
||||
- hlapi_signed
|
||||
- hlapi_erc20
|
||||
- hlapi_erc7984
|
||||
- hlapi_dex
|
||||
- hlapi_noise_squash
|
||||
- hlapi_kvstore
|
||||
@@ -92,8 +93,8 @@ jobs:
|
||||
|
||||
if inputs_command == "integer_zk":
|
||||
files_to_parse.append("pke_zk_crs_sizes.csv")
|
||||
elif inputs_command == "hlapi_erc20":
|
||||
files_to_parse.append("erc20_pbs_count.csv")
|
||||
elif inputs_command == "hlapi_erc7984":
|
||||
files_to_parse.append("erc7984_pbs_count.csv")
|
||||
elif inputs_command == "hlapi_dex":
|
||||
files_to_parse.extend(
|
||||
[
|
||||
|
||||
6
.github/workflows/benchmark_cpu_common.yml
vendored
6
.github/workflows/benchmark_cpu_common.yml
vendored
@@ -107,7 +107,7 @@ jobs:
|
||||
]:
|
||||
f.write(f"""{env_name}=["{'", "'.join(values_to_join)}"]\n""")
|
||||
|
||||
- name: Set martix arguments outputs
|
||||
- name: Set matrix arguments outputs
|
||||
id: set_matrix_args
|
||||
run: | # zizmor: ignore[template-injection] these env variable are safe
|
||||
{
|
||||
@@ -126,7 +126,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -261,7 +261,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
8
.github/workflows/benchmark_cpu_weekly.yml
vendored
8
.github/workflows/benchmark_cpu_weekly.yml
vendored
@@ -108,14 +108,14 @@ jobs:
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
|
||||
|
||||
run-benchmarks-hlapi-erc20:
|
||||
name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc20
|
||||
run-benchmarks-hlapi-erc7984:
|
||||
name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc7984
|
||||
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
|
||||
needs: prepare-inputs
|
||||
uses: ./.github/workflows/benchmark_cpu_common.yml
|
||||
with:
|
||||
command: hlapi_erc20
|
||||
additional_file_to_parse: erc20_pbs_count.csv
|
||||
command: hlapi_erc7984
|
||||
additional_file_to_parse: erc7984_pbs_count.csv
|
||||
secrets:
|
||||
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
|
||||
4
.github/workflows/benchmark_ct_key_sizes.yml
vendored
4
.github/workflows/benchmark_ct_key_sizes.yml
vendored
@@ -33,7 +33,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -137,7 +137,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
49
.github/workflows/benchmark_documentation.yml
vendored
49
.github/workflows/benchmark_documentation.yml
vendored
@@ -17,6 +17,10 @@ on:
|
||||
description: "Run GPU core-crypto benchmarks"
|
||||
type: boolean
|
||||
default: true
|
||||
run-gpu-zk-benchmarks:
|
||||
description: "Run GPU ZK benchmarks"
|
||||
type: boolean
|
||||
default: true
|
||||
run-hpu-benchmarks:
|
||||
description: "Run HPU benchmarks"
|
||||
type: boolean
|
||||
@@ -36,7 +40,7 @@ jobs:
|
||||
uses: ./.github/workflows/benchmark_cpu_common.yml
|
||||
if: inputs.run-cpu-benchmarks
|
||||
with:
|
||||
command: integer,hlapi_erc20
|
||||
command: integer,hlapi_erc7984
|
||||
op_flavor: fast_default
|
||||
bench_type: both
|
||||
precisions_set: documentation
|
||||
@@ -89,10 +93,9 @@ jobs:
|
||||
uses: ./.github/workflows/benchmark_gpu_common.yml
|
||||
if: inputs.run-gpu-integer-benchmarks
|
||||
with:
|
||||
backend: terraform
|
||||
profile: scaleway-multi-h100-sxm5
|
||||
profile: multi-h100-sxm5
|
||||
hardware_name: n3-H100-SXM5x8
|
||||
command: integer_multi_bit,hlapi_erc20
|
||||
command: integer_multi_bit,hlapi_erc7984
|
||||
op_flavor: fast_default
|
||||
bench_type: both
|
||||
precisions_set: documentation
|
||||
@@ -104,14 +107,14 @@ jobs:
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
|
||||
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
|
||||
|
||||
run-benchmarks-hpu-integer:
|
||||
name: benchmark_documentation/run-benchmarks-hpu-integer
|
||||
uses: ./.github/workflows/benchmark_hpu_common.yml
|
||||
if: inputs.run-hpu-benchmarks
|
||||
with:
|
||||
command: integer,hlapi_erc20
|
||||
command: integer,hlapi_erc7984
|
||||
op_flavor: default
|
||||
bench_type: both
|
||||
precisions_set: documentation
|
||||
@@ -151,8 +154,7 @@ jobs:
|
||||
uses: ./.github/workflows/benchmark_gpu_common.yml
|
||||
if: inputs.run-gpu-core-crypto-benchmarks
|
||||
with:
|
||||
backend: terraform
|
||||
profile: scaleway-multi-h100-sxm5
|
||||
profile: multi-h100-sxm5
|
||||
hardware_name: n3-H100-SXM5x8
|
||||
command: pbs, ks_pbs
|
||||
bench_type: latency
|
||||
@@ -165,23 +167,44 @@ jobs:
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
|
||||
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
|
||||
|
||||
run-benchmarks-gpu-zk-server:
|
||||
name: benchmark_documentation/run-benchmarks-gpu-zk-server
|
||||
uses: ./.github/workflows/benchmark_gpu_common.yml
|
||||
if: inputs.run-gpu-zk-benchmarks
|
||||
with:
|
||||
profile: multi-h100-sxm5
|
||||
hardware_name: n3-H100-SXM5x8
|
||||
command: integer_zk
|
||||
op_flavor: default
|
||||
bench_type: both
|
||||
secrets:
|
||||
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
|
||||
|
||||
generate-svgs-with-benchmarks-run:
|
||||
name: benchmark-documentation/generate-svgs-with-benchmarks-run
|
||||
if: ${{ always() &&
|
||||
(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks ||inputs.run-hpu-benchmarks) &&
|
||||
(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks || inputs.run-hpu-benchmarks) &&
|
||||
inputs.generate-svgs }}
|
||||
needs: [
|
||||
run-benchmarks-cpu-integer, run-benchmarks-gpu-integer, run-benchmarks-hpu-integer,
|
||||
run-benchmarks-cpu-zk-server, run-benchmarks-cpu-zk-client,
|
||||
run-benchmarks-cpu-core-crypto, run-benchmarks-gpu-core-crypto
|
||||
run-benchmarks-cpu-core-crypto, run-benchmarks-gpu-core-crypto,
|
||||
run-benchmarks-gpu-zk-server
|
||||
]
|
||||
uses: ./.github/workflows/generate_svgs.yml
|
||||
with:
|
||||
time_span_days: 5
|
||||
generate-cpu-svgs: ${{ inputs.run-cpu-benchmarks }}
|
||||
generate-gpu-svgs: ${{ inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks }}
|
||||
generate-gpu-svgs: ${{ inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks }}
|
||||
generate-hpu-svgs: ${{ inputs.run-hpu-benchmarks }}
|
||||
secrets:
|
||||
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
|
||||
@@ -190,7 +213,7 @@ jobs:
|
||||
|
||||
generate-svgs-without-benchmarks-run:
|
||||
name: benchmark-documentation/generate-svgs-without-benchmarks-run
|
||||
if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-hpu-benchmarks) &&
|
||||
if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks || inputs.run-hpu-benchmarks) &&
|
||||
inputs.generate-svgs }}
|
||||
uses: ./.github/workflows/generate_svgs.yml
|
||||
with:
|
||||
|
||||
9
.github/workflows/benchmark_gpu.yml
vendored
9
.github/workflows/benchmark_gpu.yml
vendored
@@ -31,10 +31,13 @@ on:
|
||||
- pbs128
|
||||
- ks
|
||||
- ks_pbs
|
||||
- tfhe_zk_pok
|
||||
- msm_zk
|
||||
- integer_zk
|
||||
- integer_zk_experimental
|
||||
- integer_aes
|
||||
- integer_aes256
|
||||
- hlapi_erc20
|
||||
- hlapi_erc7984
|
||||
- hlapi_dex
|
||||
- hlapi_noise_squash
|
||||
op_flavor:
|
||||
@@ -120,8 +123,8 @@ jobs:
|
||||
|
||||
if inputs_command == "integer_zk":
|
||||
files_to_parse.append("pke_zk_crs_sizes.csv")
|
||||
elif inputs_command == "hlapi_erc20":
|
||||
files_to_parse.append("erc20_pbs_count.csv")
|
||||
elif inputs_command == "hlapi_erc7984":
|
||||
files_to_parse.append("erc7984_pbs_count.csv")
|
||||
elif inputs_command == "hlapi_dex":
|
||||
files_to_parse.extend(
|
||||
[
|
||||
|
||||
40
.github/workflows/benchmark_gpu_common.yml
vendored
40
.github/workflows/benchmark_gpu_common.yml
vendored
@@ -111,7 +111,7 @@ jobs:
|
||||
]:
|
||||
f.write(f"""{env_name}=["{'", "'.join(values_to_join)}"]\n""")
|
||||
|
||||
- name: Set martix arguments outputs
|
||||
- name: Set matrix arguments outputs
|
||||
id: set_matrix_args
|
||||
run: | # zizmor: ignore[template-injection] these env variable are safe
|
||||
{
|
||||
@@ -126,17 +126,11 @@ jobs:
|
||||
needs: prepare-matrix
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
|
||||
# If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
|
||||
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
|
||||
# otherwise we'll try to run the next job on a non-existing on-demand instance.
|
||||
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
|
||||
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
continue-on-error: true
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -145,25 +139,6 @@ jobs:
|
||||
backend: ${{ inputs.backend }}
|
||||
profile: ${{ inputs.profile }}
|
||||
|
||||
- name: Acknowledge remote instance failure
|
||||
if: steps.start-remote-instance.outcome == 'failure' &&
|
||||
inputs.profile != 'single-h100'
|
||||
run: |
|
||||
echo "Remote instance instance has failed to start (profile provided: '${INPUTS_PROFILE}')"
|
||||
echo "Permanent instance instance cannot be used as a substitute (profile needed: 'single-h100')"
|
||||
exit 1
|
||||
env:
|
||||
INPUTS_PROFILE: ${{ inputs.profile }}
|
||||
|
||||
# This will allow to fallback on permanent instances running on Hyperstack.
|
||||
- name: Use permanent remote instance
|
||||
id: use-permanent-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true' &&
|
||||
steps.start-remote-instance.outcome == 'failure' &&
|
||||
inputs.profile == 'single-h100'
|
||||
run: |
|
||||
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# Install dependencies only once since cuda-benchmarks uses a matrix strategy, thus running multiple times.
|
||||
install-dependencies:
|
||||
name: benchmark_gpu_common/install-dependencies
|
||||
@@ -184,7 +159,6 @@ jobs:
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Setup Hyperstack dependencies
|
||||
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
|
||||
uses: ./.github/actions/gpu_setup
|
||||
with:
|
||||
cuda-version: ${{ matrix.cuda }}
|
||||
@@ -333,13 +307,13 @@ jobs:
|
||||
|
||||
teardown-instance:
|
||||
name: benchmark_gpu_common/teardown-instance
|
||||
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
|
||||
if: ${{ always() && needs.setup-instance.result == 'success' }}
|
||||
needs: [ setup-instance, cuda-benchmarks, slack-notify ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
18
.github/workflows/benchmark_gpu_coprocessor.yml
vendored
18
.github/workflows/benchmark_gpu_coprocessor.yml
vendored
@@ -42,7 +42,7 @@ env:
|
||||
OPTIMIZATION_TARGET: "throughput"
|
||||
BATCH_SIZE: "5000"
|
||||
SCHEDULING_POLICY: "MAX_PARALLELISM"
|
||||
BENCHMARKS: "erc20"
|
||||
BENCHMARKS: "erc7984"
|
||||
BRANCH_NAME: ${{ github.ref_name }}
|
||||
COMMIT_SHA: ${{ github.sha }}
|
||||
SLAB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
@@ -77,7 +77,7 @@ jobs:
|
||||
if [[ ${IS_MANUAL_RUN} == true ]]; then
|
||||
PROFILE_RAW="${PROFILE_MANUAL_RUN}"
|
||||
else
|
||||
PROFILE_RAW="${PROFILE}"
|
||||
PROFILE_RAW="${PROFILE_SCHEDULED_RUN}"
|
||||
fi
|
||||
# shellcheck disable=SC2001
|
||||
PROFILE_VAL=$(echo "${PROFILE_RAW}" | sed 's|.*[[:space:]](\(.*\))|\1|')
|
||||
@@ -94,7 +94,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -204,7 +204,7 @@ jobs:
|
||||
uses: foundry-rs/foundry-toolchain@8789b3e21e6c11b2697f5eb56eddae542f746c10
|
||||
|
||||
- name: Cache cargo
|
||||
uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
|
||||
uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
|
||||
with:
|
||||
path: |
|
||||
~/.cargo/registry
|
||||
@@ -214,14 +214,14 @@ jobs:
|
||||
restore-keys: ${{ runner.os }}-cargo-
|
||||
|
||||
- name: Login to GitHub Container Registry
|
||||
uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0
|
||||
uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Login to Chainguard Registry
|
||||
uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0
|
||||
uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
|
||||
with:
|
||||
registry: cgr.dev
|
||||
username: ${{ secrets.CGR_USERNAME }}
|
||||
@@ -248,13 +248,13 @@ jobs:
|
||||
npm install && npm run deploy:emptyProxies && npx hardhat compile
|
||||
working-directory: fhevm/
|
||||
|
||||
- name: Profile erc20 no-cmux benchmark on GPU
|
||||
- name: Profile erc7984 no-cmux benchmark on GPU
|
||||
run: |
|
||||
BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" \
|
||||
FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" \
|
||||
BENCHMARK_TYPE="THROUGHPUT_200" \
|
||||
OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" \
|
||||
make -e "profile_erc20_gpu"
|
||||
make -e "profile_erc7984_gpu"
|
||||
working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
|
||||
|
||||
- name: Get nsys profile name
|
||||
@@ -333,7 +333,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
2
.github/workflows/benchmark_hpu.yml
vendored
2
.github/workflows/benchmark_hpu.yml
vendored
@@ -14,7 +14,7 @@ on:
|
||||
- integer
|
||||
- hlapi_unsigned
|
||||
- hlapi_signed
|
||||
- hlapi_erc20
|
||||
- hlapi_erc7984
|
||||
op_flavor:
|
||||
description: "Operations set to run"
|
||||
type: choice
|
||||
|
||||
2
.github/workflows/benchmark_hpu_common.yml
vendored
2
.github/workflows/benchmark_hpu_common.yml
vendored
@@ -95,7 +95,7 @@ jobs:
|
||||
]:
|
||||
f.write(f"""{env_name}=["{'", "'.join(values_to_join)}"]\n""")
|
||||
|
||||
- name: Set martix arguments outputs
|
||||
- name: Set matrix arguments outputs
|
||||
id: set_matrix_args
|
||||
run: | # zizmor: ignore[template-injection] these env variable are safe
|
||||
{
|
||||
|
||||
@@ -143,7 +143,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -387,7 +387,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/benchmark_tfhe_fft.yml
vendored
4
.github/workflows/benchmark_tfhe_fft.yml
vendored
@@ -40,7 +40,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -137,7 +137,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/benchmark_tfhe_ntt.yml
vendored
4
.github/workflows/benchmark_tfhe_ntt.yml
vendored
@@ -40,7 +40,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -137,7 +137,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -63,7 +63,7 @@ jobs:
|
||||
with open(env_file, "a") as f:
|
||||
f.write(f"""BROWSER=["{'", "'.join(split_browser)}"]\n""")
|
||||
|
||||
- name: Set martix arguments output
|
||||
- name: Set matrix arguments output
|
||||
id: set_matrix_arg
|
||||
run: | # zizmor: ignore[template-injection] this env variable is safe
|
||||
echo "browser=${{ toJSON(env.BROWSER) }}" >> "${GITHUB_OUTPUT}"
|
||||
@@ -77,7 +77,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -124,7 +124,7 @@ jobs:
|
||||
|
||||
- name: Node cache restoration
|
||||
id: node-cache
|
||||
uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
with:
|
||||
path: |
|
||||
~/.nvm
|
||||
@@ -137,7 +137,7 @@ jobs:
|
||||
make install_node
|
||||
|
||||
- name: Node cache save
|
||||
uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
if: steps.node-cache.outputs.cache-hit != 'true'
|
||||
with:
|
||||
path: |
|
||||
@@ -158,9 +158,9 @@ jobs:
|
||||
env:
|
||||
BROWSER: ${{ matrix.browser }}
|
||||
|
||||
- name: Run benchmarks (unsafe coop)
|
||||
- name: Run benchmarks (cross origin)
|
||||
run: |
|
||||
make bench_web_js_api_unsafe_coop_"${BROWSER}"_ci
|
||||
make bench_web_js_api_cross_origin_"${BROWSER}"_ci
|
||||
env:
|
||||
BROWSER: ${{ matrix.browser }}
|
||||
|
||||
@@ -218,7 +218,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
6
.github/workflows/cargo_build_common.yml
vendored
6
.github/workflows/cargo_build_common.yml
vendored
@@ -94,7 +94,7 @@ jobs:
|
||||
with open(env_file, "a") as f:
|
||||
f.write(f"""RUNNERS=["{'", "'.join(runners)}"]\n""")
|
||||
|
||||
- name: Set martix runners outputs
|
||||
- name: Set matrix runners outputs
|
||||
id: set_matrix_runners
|
||||
run: | # zizmor: ignore[template-injection] these env variable are safe
|
||||
echo "runners=${{ toJSON(env.RUNNERS) }}" >> "${GITHUB_OUTPUT}"
|
||||
@@ -138,7 +138,7 @@ jobs:
|
||||
- name: Node cache restoration
|
||||
if: inputs.run-pcc-cpu-batch == 'pcc_batch_2'
|
||||
id: node-cache
|
||||
uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
with:
|
||||
path: |
|
||||
~/.nvm
|
||||
@@ -151,7 +151,7 @@ jobs:
|
||||
make install_node
|
||||
|
||||
- name: Node cache save
|
||||
uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
if: inputs.run-pcc-cpu-batch == 'pcc_batch_2' && steps.node-cache.outputs.cache-hit != 'true'
|
||||
with:
|
||||
path: |
|
||||
|
||||
4
.github/workflows/cargo_test_ntt.yml
vendored
4
.github/workflows/cargo_test_ntt.yml
vendored
@@ -63,7 +63,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -146,7 +146,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
2
.github/workflows/ci_lint.yml
vendored
2
.github/workflows/ci_lint.yml
vendored
@@ -50,7 +50,7 @@ jobs:
|
||||
version: ${{ steps.get_zizmor.outputs.version }}
|
||||
|
||||
- name: Ensure SHA pinned actions
|
||||
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@70c4af2ed5282c51ba40566d026d6647852ffa3e # v5.0.1
|
||||
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@ca46236c6ce584ae24bc6283ba8dcf4b3ec8a066 # v5.0.4
|
||||
with:
|
||||
allowlist: |
|
||||
slsa-framework/slsa-github-generator
|
||||
|
||||
4
.github/workflows/code_coverage.yml
vendored
4
.github/workflows/code_coverage.yml
vendored
@@ -74,7 +74,7 @@ jobs:
|
||||
make test_shortint_cov
|
||||
|
||||
- name: Upload tfhe coverage to Codecov
|
||||
uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de
|
||||
uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
@@ -88,7 +88,7 @@ jobs:
|
||||
make test_integer_cov
|
||||
|
||||
- name: Upload tfhe coverage to Codecov
|
||||
uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de
|
||||
uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
|
||||
64
.github/workflows/generate_svgs.yml
vendored
64
.github/workflows/generate_svgs.yml
vendored
@@ -209,60 +209,98 @@ jobs:
|
||||
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
|
||||
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
|
||||
|
||||
gpu-zk-server-latency-table:
|
||||
name: generate_documentation_svgs/gpu-zk-server-latency-table
|
||||
uses: ./.github/workflows/generate_svg_common.yml
|
||||
if: inputs.generate-gpu-svgs
|
||||
with:
|
||||
backend: gpu
|
||||
hardware_name: n3-H100-SXM5x8
|
||||
layer: integer
|
||||
bench_subset: zk
|
||||
pbs_kind: multi_bit
|
||||
grouping_factor: 4
|
||||
bench_type: latency
|
||||
time_span_days: ${{ inputs.time_span_days }}
|
||||
output_filename: gpu-zk-benchmark-latency
|
||||
secrets:
|
||||
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
|
||||
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
|
||||
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
|
||||
|
||||
gpu-zk-server-throughput-table:
|
||||
name: generate_documentation_svgs/gpu-zk-server-throughput-table
|
||||
uses: ./.github/workflows/generate_svg_common.yml
|
||||
if: inputs.generate-gpu-svgs
|
||||
with:
|
||||
backend: gpu
|
||||
hardware_name: n3-H100-SXM5x8
|
||||
layer: integer
|
||||
bench_subset: zk
|
||||
pbs_kind: multi_bit
|
||||
grouping_factor: 4
|
||||
bench_type: throughput
|
||||
time_span_days: ${{ inputs.time_span_days }}
|
||||
output_filename: gpu-zk-benchmark-throughput
|
||||
secrets:
|
||||
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
|
||||
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
|
||||
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
|
||||
|
||||
# -----------------------------------------------------------
|
||||
# ERC20 benchmarks tables
|
||||
# ERC7984 benchmarks tables
|
||||
# -----------------------------------------------------------
|
||||
|
||||
cpu-erc20-latency-throughput-table:
|
||||
name: generate_documentation_svgs/cpu-erc20-latency-throughput-table
|
||||
cpu-erc7984-latency-throughput-table:
|
||||
name: generate_documentation_svgs/cpu-erc7984-latency-throughput-table
|
||||
uses: ./.github/workflows/generate_svg_common.yml
|
||||
if: inputs.generate-cpu-svgs
|
||||
with:
|
||||
backend: cpu
|
||||
hardware_name: hpc7a.96xlarge
|
||||
layer: hlapi
|
||||
bench_subset: erc20
|
||||
bench_subset: erc7984
|
||||
pbs_kind: classical
|
||||
bench_type: both
|
||||
time_span_days: ${{ inputs.time_span_days }}
|
||||
output_filename: cpu-hlapi-erc20-benchmark-latency-throughput
|
||||
output_filename: cpu-hlapi-erc7984-benchmark-latency-throughput
|
||||
secrets:
|
||||
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
|
||||
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
|
||||
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
|
||||
|
||||
gpu-erc20-latency-throughput-table:
|
||||
name: generate_documentation_svgs/gpu-erc20-latency-throughput-table
|
||||
gpu-erc7984-latency-throughput-table:
|
||||
name: generate_documentation_svgs/gpu-erc7984-latency-throughput-table
|
||||
uses: ./.github/workflows/generate_svg_common.yml
|
||||
if: inputs.generate-gpu-svgs
|
||||
with:
|
||||
backend: gpu
|
||||
hardware_name: n3-H100-SXM5x8
|
||||
layer: hlapi
|
||||
bench_subset: erc20
|
||||
bench_subset: erc7984
|
||||
pbs_kind: multi_bit
|
||||
grouping_factor: 4
|
||||
bench_type: both
|
||||
time_span_days: ${{ inputs.time_span_days }}
|
||||
output_filename: gpu-hlapi-erc20-benchmark-h100x8-sxm5-latency-throughput
|
||||
output_filename: gpu-hlapi-erc7984-benchmark-h100x8-sxm5-latency-throughput
|
||||
secrets:
|
||||
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
|
||||
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
|
||||
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
|
||||
|
||||
hpu-erc20-latency-throughput-table:
|
||||
name: generate_documentation_svgs/hpu-erc20-latency-throughput-table
|
||||
hpu-erc7984-latency-throughput-table:
|
||||
name: generate_documentation_svgs/hpu-erc7984-latency-throughput-table
|
||||
uses: ./.github/workflows/generate_svg_common.yml
|
||||
if: inputs.generate-hpu-svgs
|
||||
with:
|
||||
backend: hpu
|
||||
hardware_name: hpu_x1
|
||||
layer: hlapi
|
||||
bench_subset: erc20
|
||||
bench_subset: erc7984
|
||||
pbs_kind: classical
|
||||
bench_type: both
|
||||
time_span_days: ${{ inputs.time_span_days }}
|
||||
output_filename: hpu-hlapi-erc20-benchmark-hpux1-latency-throughput.svg
|
||||
output_filename: hpu-hlapi-erc7984-benchmark-hpux1-latency-throughput.svg
|
||||
secrets:
|
||||
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
|
||||
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
|
||||
|
||||
@@ -43,7 +43,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -149,7 +149,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
22
.github/workflows/gpu_core_h100_tests.yml
vendored
22
.github/workflows/gpu_core_h100_tests.yml
vendored
@@ -73,18 +73,12 @@ jobs:
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
|
||||
# If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
|
||||
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
|
||||
# otherwise we'll try to run the next job on a non-existing on-demand instance.
|
||||
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
|
||||
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
steps:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
continue-on-error: true
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -93,13 +87,6 @@ jobs:
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
|
||||
# This will allow to fallback on permanent instances running on Hyperstack.
|
||||
- name: Use permanent remote instance
|
||||
id: use-permanent-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
|
||||
run: |
|
||||
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# This instance will be spawned especially for pull-request from forked repository
|
||||
- name: Start GitHub instance
|
||||
id: start-github-instance
|
||||
@@ -132,7 +119,6 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Setup Hyperstack dependencies
|
||||
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
|
||||
uses: ./.github/actions/gpu_setup
|
||||
with:
|
||||
cuda-version: ${{ matrix.cuda }}
|
||||
@@ -176,14 +162,14 @@ jobs:
|
||||
|
||||
teardown-instance:
|
||||
name: gpu_core_h100_tests/teardown-instance
|
||||
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
|
||||
if: ${{ always() && needs.setup-instance.result == 'success' }}
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/gpu_fast_tests.yml
vendored
4
.github/workflows/gpu_fast_tests.yml
vendored
@@ -77,7 +77,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -182,7 +182,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
26
.github/workflows/gpu_full_h100_tests.yml
vendored
26
.github/workflows/gpu_full_h100_tests.yml
vendored
@@ -25,17 +25,11 @@ jobs:
|
||||
name: gpu_full_h100_tests/setup-instance
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
|
||||
# If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
|
||||
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
|
||||
# otherwise we'll try to run the next job on a non-existing on-demand instance.
|
||||
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
|
||||
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
continue-on-error: true
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -44,13 +38,6 @@ jobs:
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
|
||||
# This will allow to fallback on permanent instances running on Hyperstack.
|
||||
- name: Use permanent remote instance
|
||||
id: use-permanent-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
|
||||
run: |
|
||||
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
|
||||
|
||||
cuda-tests-linux:
|
||||
name: gpu_full_h100_tests/cuda-tests-linux
|
||||
needs: [ setup-instance ]
|
||||
@@ -74,7 +61,6 @@ jobs:
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Setup Hyperstack dependencies
|
||||
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
|
||||
uses: ./.github/actions/gpu_setup
|
||||
with:
|
||||
cuda-version: ${{ matrix.cuda }}
|
||||
@@ -118,13 +104,13 @@ jobs:
|
||||
|
||||
teardown-instance:
|
||||
name: gpu_full_h100_tests/teardown-instance
|
||||
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
|
||||
if: ${{ always() && needs.setup-instance.result == 'success' }}
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -80,7 +80,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -186,7 +186,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
22
.github/workflows/gpu_hlapi_h100_tests.yml
vendored
22
.github/workflows/gpu_hlapi_h100_tests.yml
vendored
@@ -74,18 +74,12 @@ jobs:
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
|
||||
# If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
|
||||
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
|
||||
# otherwise we'll try to run the next job on a non-existing on-demand instance.
|
||||
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
|
||||
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
steps:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
continue-on-error: true
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -94,13 +88,6 @@ jobs:
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
|
||||
# This will allow to fallback on permanent instances running on Hyperstack.
|
||||
- name: Use permanent remote instance
|
||||
id: use-permanent-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
|
||||
run: |
|
||||
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# This instance will be spawned especially for pull-request from forked repository
|
||||
- name: Start GitHub instance
|
||||
id: start-github-instance
|
||||
@@ -133,7 +120,6 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Setup Hyperstack dependencies
|
||||
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
|
||||
uses: ./.github/actions/gpu_setup
|
||||
with:
|
||||
cuda-version: ${{ matrix.cuda }}
|
||||
@@ -184,14 +170,14 @@ jobs:
|
||||
|
||||
teardown-instance:
|
||||
name: gpu_hlapi_h100_tests/teardown-instance
|
||||
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
|
||||
if: ${{ always() && needs.setup-instance.result == 'success' }}
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -38,7 +38,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -112,7 +112,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/gpu_memory_sanitizer.yml
vendored
4
.github/workflows/gpu_memory_sanitizer.yml
vendored
@@ -74,7 +74,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -166,7 +166,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -74,7 +74,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -166,7 +166,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/gpu_pcc.yml
vendored
4
.github/workflows/gpu_pcc.yml
vendored
@@ -38,7 +38,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -176,7 +176,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -80,7 +80,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -169,7 +169,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -75,18 +75,12 @@ jobs:
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
|
||||
# If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
|
||||
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
|
||||
# otherwise we'll try to run the next job on a non-existing on-demand instance.
|
||||
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
|
||||
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
steps:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
continue-on-error: true
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -95,13 +89,6 @@ jobs:
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
|
||||
# This will allow to fallback on permanent instances running on Hyperstack.
|
||||
- name: Use permanent remote instance
|
||||
id: use-permanent-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
|
||||
run: |
|
||||
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# This instance will be spawned especially for pull-request from forked repository
|
||||
- name: Start GitHub instance
|
||||
id: start-github-instance
|
||||
@@ -134,7 +121,6 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Setup Hyperstack dependencies
|
||||
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
|
||||
uses: ./.github/actions/gpu_setup
|
||||
with:
|
||||
cuda-version: ${{ matrix.cuda }}
|
||||
@@ -176,14 +162,14 @@ jobs:
|
||||
|
||||
teardown-instance:
|
||||
name: gpu_signed_integer_h100_tests/teardown-instance
|
||||
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
|
||||
if: ${{ always() && needs.setup-instance.result == 'success' }}
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -81,7 +81,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -178,7 +178,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -80,7 +80,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -169,7 +169,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -75,18 +75,12 @@ jobs:
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
|
||||
# If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
|
||||
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
|
||||
# otherwise we'll try to run the next job on a non-existing on-demand instance.
|
||||
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
|
||||
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
steps:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
continue-on-error: true
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -95,13 +89,6 @@ jobs:
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
|
||||
# This will allow to fallback on permanent instances running on Hyperstack.
|
||||
- name: Use permanent remote instance
|
||||
id: use-permanent-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
|
||||
run: |
|
||||
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# This instance will be spawned especially for pull-request from forked repository
|
||||
- name: Start GitHub instance
|
||||
id: start-github-instance
|
||||
@@ -134,7 +121,6 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Setup Hyperstack dependencies
|
||||
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
|
||||
uses: ./.github/actions/gpu_setup
|
||||
with:
|
||||
cuda-version: ${{ matrix.cuda }}
|
||||
@@ -176,14 +162,14 @@ jobs:
|
||||
|
||||
teardown-instance:
|
||||
name: gpu_unsigned_integer_h100_tests/teardown-instance
|
||||
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
|
||||
if: ${{ always() && needs.setup-instance.result == 'success' }}
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -81,7 +81,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -178,7 +178,7 @@ jobs:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
16
.github/workflows/gpu_zk_tests.yml
vendored
16
.github/workflows/gpu_zk_tests.yml
vendored
@@ -51,7 +51,16 @@ jobs:
|
||||
with:
|
||||
files_yaml: |
|
||||
gpu:
|
||||
- tfhe/Cargo.toml
|
||||
- tfhe/build.rs
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- backends/zk-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/zk/**
|
||||
- tfhe-zk-pok/**
|
||||
- 'tfhe/docs/**/**.md'
|
||||
- '.github/workflows/gpu_zk_tests.yml'
|
||||
- ci/slab.toml
|
||||
|
||||
@@ -67,7 +76,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -126,6 +135,9 @@ jobs:
|
||||
- name: Run zk-cuda-backend integration tests
|
||||
run: |
|
||||
make test_zk_cuda_backend
|
||||
make test_zk_pok_experimental_gpu
|
||||
make test_integer_zk_gpu
|
||||
make test_integer_zk_experimental_gpu
|
||||
|
||||
slack-notify:
|
||||
name: gpu_zk_tests/slack-notify
|
||||
@@ -158,7 +170,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
2
.github/workflows/make_release_common.yml
vendored
2
.github/workflows/make_release_common.yml
vendored
@@ -107,7 +107,7 @@ jobs:
|
||||
path: target/package
|
||||
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@b7e9a28eded4986ec6b1fa40eeee8f8f165559ec # v1.0.3
|
||||
uses: rust-lang/crates-io-auth-action@bbd81622f20ce9e2dd9622e3218b975523e45bbe # v1.0.4
|
||||
id: auth
|
||||
|
||||
- name: Publish crate.io package
|
||||
|
||||
@@ -1,12 +1,36 @@
|
||||
name: make_release_cuda
|
||||
# Common workflow to make crate release for CUDA backend
|
||||
name: make_release_common_cuda
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
workflow_call:
|
||||
inputs:
|
||||
dry_run:
|
||||
description: "Dry-run"
|
||||
package-name:
|
||||
type: string
|
||||
required: true
|
||||
dry-run:
|
||||
type: boolean
|
||||
default: true
|
||||
secrets:
|
||||
REPO_CHECKOUT_TOKEN:
|
||||
required: true
|
||||
SLAB_ACTION_TOKEN:
|
||||
required: true
|
||||
SLAB_BASE_URL:
|
||||
required: true
|
||||
SLAB_URL:
|
||||
required: true
|
||||
JOB_SECRET:
|
||||
required: true
|
||||
SLACK_CHANNEL:
|
||||
required: true
|
||||
BOT_USERNAME:
|
||||
required: true
|
||||
SLACK_WEBHOOK:
|
||||
required: true
|
||||
ALLOWED_TEAM:
|
||||
required: true
|
||||
READ_ORG_TOKEN:
|
||||
required: true
|
||||
|
||||
env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
@@ -21,15 +45,15 @@ permissions: {}
|
||||
|
||||
jobs:
|
||||
verify-triggering-actor:
|
||||
name: make_release_cuda/verify-triggering-actor
|
||||
name: make_release_common_cuda/verify-triggering-actor
|
||||
if: startsWith(github.ref, 'refs/tags/')
|
||||
uses: ./.github/workflows/verify_triggering_actor.yml
|
||||
secrets:
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
ALLOWED_TEAM: ${{ secrets.ALLOWED_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
setup-instance:
|
||||
name: make_release_cuda/setup-instance
|
||||
name: make_release_common_cuda/setup-instance
|
||||
needs: verify-triggering-actor
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
@@ -37,7 +61,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -47,7 +71,7 @@ jobs:
|
||||
profile: gpu-build
|
||||
|
||||
package:
|
||||
name: make_release_cuda/package
|
||||
name: make_release_common_cuda/package
|
||||
needs: setup-instance
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
outputs:
|
||||
@@ -76,7 +100,6 @@ jobs:
|
||||
toolchain: stable
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
{
|
||||
@@ -89,7 +112,6 @@ jobs:
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
|
||||
@@ -101,12 +123,14 @@ jobs:
|
||||
GCC_VERSION: ${{ matrix.gcc }}
|
||||
|
||||
- name: Prepare package
|
||||
env:
|
||||
PACKAGE: ${{ inputs.package-name }}
|
||||
run: |
|
||||
cargo package -p tfhe-cuda-backend
|
||||
cargo package -p "${PACKAGE}"
|
||||
|
||||
- uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
|
||||
with:
|
||||
name: crate-tfhe-cuda-backend
|
||||
name: crate-${{ inputs.package-name }}
|
||||
path: target/package/*.crate
|
||||
|
||||
- name: generate hash
|
||||
@@ -114,8 +138,8 @@ jobs:
|
||||
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
provenance:
|
||||
name: make_release_cuda/provenance
|
||||
if: ${{ !inputs.dry_run }}
|
||||
name: make_release_common_cuda/provenance
|
||||
if: ${{ !inputs.dry-run }}
|
||||
needs: [package]
|
||||
# This action cannot be pinned to a specific commit (see https://github.com/slsa-framework/slsa-github-generator/blob/main/README.md#referencing-slsa-builders-and-generators)
|
||||
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0 # zizmor: ignore[unpinned-uses] as said above SLSA cannot be pinned by tag today
|
||||
@@ -128,7 +152,7 @@ jobs:
|
||||
base64-subjects: ${{ needs.package.outputs.hash }}
|
||||
|
||||
publish-cuda-release:
|
||||
name: make_release_cuda/publish-cuda-release
|
||||
name: make_release_common_cuda/publish-cuda-release
|
||||
needs: [setup-instance, package] # for comparing hashes
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
permissions:
|
||||
@@ -150,7 +174,6 @@ jobs:
|
||||
toolchain: stable
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
{
|
||||
@@ -163,7 +186,6 @@ jobs:
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
|
||||
@@ -177,22 +199,23 @@ jobs:
|
||||
- name: Download artifact
|
||||
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
|
||||
with:
|
||||
name: crate-tfhe-cuda-backend
|
||||
name: crate-${{ inputs.package-name }}
|
||||
path: target/package
|
||||
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@b7e9a28eded4986ec6b1fa40eeee8f8f165559ec # v1.0.3
|
||||
uses: rust-lang/crates-io-auth-action@bbd81622f20ce9e2dd9622e3218b975523e45bbe # v1.0.4
|
||||
id: auth
|
||||
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
|
||||
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
|
||||
PACKAGE: ${{ inputs.package-name }}
|
||||
DRY-RUN: ${{ inputs.dry-run && '--dry-run' || '' }}
|
||||
run: |
|
||||
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
|
||||
# would fail. This is safe since DRY_RUN is handled in the env section above.
|
||||
# dry-run expansion cannot be double quoted when variable contains empty string otherwise cargo publish
|
||||
# would fail. This is safe since dry-run is handled in the env section above.
|
||||
# shellcheck disable=SC2086
|
||||
cargo publish -p tfhe-cuda-backend ${DRY_RUN}
|
||||
cargo publish -p "${PACKAGE}" ${DRY-RUN}
|
||||
|
||||
- name: Generate hash
|
||||
id: published_hash
|
||||
@@ -204,7 +227,7 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: failure
|
||||
SLACK_MESSAGE: "SLSA tfhe-cuda-backend crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_MESSAGE: "SLSA ${{ inputs.package-name }} crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
@@ -212,17 +235,17 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "tfhe-cuda-backend release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_MESSAGE: "${{ inputs.package-name }} release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: make_release_cuda/teardown-instance
|
||||
name: make_release_common_cuda/teardown-instance
|
||||
if: ${{ always() && needs.setup-instance.result == 'success' }}
|
||||
needs: [setup-instance, publish-cuda-release]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -235,4 +258,4 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (publish-cuda-release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_MESSAGE: "Instance teardown (${{ inputs.package-name }} release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
21
.github/workflows/make_release_tfhe.yml
vendored
21
.github/workflows/make_release_tfhe.yml
vendored
@@ -16,6 +16,10 @@ on:
|
||||
description: "Push web js package"
|
||||
type: boolean
|
||||
default: true
|
||||
push_web_compat_package:
|
||||
description: "Push web compat (cross-origin) js package"
|
||||
type: boolean
|
||||
default: true
|
||||
push_node_package:
|
||||
description: "Push node js package"
|
||||
type: boolean
|
||||
@@ -99,6 +103,23 @@ jobs:
|
||||
tag: ${{ env.NPM_TAG }}
|
||||
provenance: true
|
||||
|
||||
- name: Build web compat (cross-origin) package
|
||||
if: ${{ inputs.push_web_compat_package }}
|
||||
run: |
|
||||
rm -rf tfhe/pkg
|
||||
|
||||
make build_web_js_api
|
||||
sed -i 's/"tfhe"/"tfhe-compat"/g' tfhe/pkg/package.json
|
||||
|
||||
- name: Publish web compat (cross-origin) package
|
||||
if: ${{ inputs.push_web_compat_package }}
|
||||
uses: JS-DevTools/npm-publish@0fd2f4369c5d6bcfcde6091a7c527d810b9b5c3f
|
||||
with:
|
||||
package: tfhe/pkg/package.json
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
tag: ${{ env.NPM_TAG }}
|
||||
provenance: true
|
||||
|
||||
- name: Build Node package
|
||||
if: ${{ inputs.push_node_package }}
|
||||
run: |
|
||||
|
||||
44
.github/workflows/make_release_tfhe_cuda.yml
vendored
Normal file
44
.github/workflows/make_release_tfhe_cuda.yml
vendored
Normal file
@@ -0,0 +1,44 @@
|
||||
# Publish new release of tfhe-rs CUDA backend on crates.io.
|
||||
name: make_release_tfhe_cuda
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
dry_run:
|
||||
description: "Dry-run"
|
||||
type: boolean
|
||||
default: true
|
||||
|
||||
env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
permissions: {}
|
||||
|
||||
# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
|
||||
|
||||
jobs:
|
||||
make-release:
|
||||
name: make_release_tfhe_cuda/make-release
|
||||
uses: ./.github/workflows/make_release_common_cuda.yml
|
||||
with:
|
||||
package-name: "tfhe-cuda-backend"
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
permissions:
|
||||
actions: read # Needed to detect the GitHub Actions environment
|
||||
id-token: write # Needed to create the provenance via GitHub OIDC
|
||||
contents: write # Needed to upload assets/artifacts
|
||||
secrets:
|
||||
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
32
.github/workflows/make_release_tfhe_safe_serialize.yml
vendored
Normal file
32
.github/workflows/make_release_tfhe_safe_serialize.yml
vendored
Normal file
@@ -0,0 +1,32 @@
|
||||
name: make_release_tfhe_safe_serialize
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
dry_run:
|
||||
description: "Dry-run"
|
||||
type: boolean
|
||||
default: true
|
||||
|
||||
permissions: {}
|
||||
|
||||
# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
|
||||
|
||||
jobs:
|
||||
make-release:
|
||||
name: make_release_tfhe_safe_serialize/make-release
|
||||
uses: ./.github/workflows/make_release_common.yml
|
||||
with:
|
||||
package-name: "tfhe-safe-serialize"
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
permissions:
|
||||
actions: read # Needed to detect the GitHub Actions environment
|
||||
id-token: write # Needed to create the provenance via GitHub OIDC
|
||||
contents: write # Needed to upload assets/artifacts
|
||||
secrets:
|
||||
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
44
.github/workflows/make_release_zk_cuda.yml
vendored
Normal file
44
.github/workflows/make_release_zk_cuda.yml
vendored
Normal file
@@ -0,0 +1,44 @@
|
||||
# Publish new release of CUDA Zero-Knowledge primitives on crates.io.
|
||||
name: make_release_zk_cuda
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
dry_run:
|
||||
description: "Dry-run"
|
||||
type: boolean
|
||||
default: true
|
||||
|
||||
env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
permissions: {}
|
||||
|
||||
# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
|
||||
|
||||
jobs:
|
||||
make-release:
|
||||
name: make_release_zk_cuda/make-release
|
||||
uses: ./.github/workflows/make_release_common_cuda.yml
|
||||
with:
|
||||
package-name: "zk-cuda-backend"
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
permissions:
|
||||
actions: read # Needed to detect the GitHub Actions environment
|
||||
id-token: write # Needed to create the provenance via GitHub OIDC
|
||||
contents: write # Needed to upload assets/artifacts
|
||||
secrets:
|
||||
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
4
.github/workflows/parameters_check.yml
vendored
4
.github/workflows/parameters_check.yml
vendored
@@ -53,7 +53,7 @@ jobs:
|
||||
|
||||
- name: Restore Sagemath image from cache
|
||||
id: docker-cache
|
||||
uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
with:
|
||||
path: /tmp/sagemath_image
|
||||
key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}
|
||||
@@ -76,7 +76,7 @@ jobs:
|
||||
- name: Store Sagemath image in cache
|
||||
if: steps.docker-cache.outputs.cache-hit != 'true'
|
||||
continue-on-error: true
|
||||
uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
with:
|
||||
path: /tmp/sagemath_image
|
||||
key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}
|
||||
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -25,6 +25,7 @@ dieharder_run.log
|
||||
|
||||
# Cuda local build
|
||||
backends/tfhe-cuda-backend/cuda/cmake-build-debug/
|
||||
backends/tfhe-cuda-backend/cuda/build/
|
||||
|
||||
# WASM tests
|
||||
tfhe/web_wasm_parallel_tests/server.PID
|
||||
@@ -34,6 +35,9 @@ node_modules/
|
||||
package-lock.json
|
||||
utils/wasm-par-mq/examples/*/pkg/
|
||||
|
||||
# Commit lock files of backward data generation crates
|
||||
!utils/tfhe-backward-compat-data/crates/generate_*/Cargo.lock
|
||||
|
||||
# Python .env
|
||||
.env
|
||||
__pycache__
|
||||
|
||||
@@ -14,10 +14,12 @@ members = [
|
||||
"tfhe-fft",
|
||||
"tfhe-ntt",
|
||||
"tfhe-zk-pok",
|
||||
"utils/benchmark_spec",
|
||||
"utils/param_dedup",
|
||||
"utils/tfhe-backward-compat-checker",
|
||||
"utils/tfhe-backward-compat-data",
|
||||
"utils/tfhe-backward-compat-data/crates/add_new_version",
|
||||
"utils/tfhe-safe-serialize",
|
||||
"utils/tfhe-versionable",
|
||||
"utils/tfhe-versionable-derive",
|
||||
"utils/wasm-par-mq",
|
||||
@@ -43,6 +45,7 @@ rand = "0.8"
|
||||
rayon = "1.11"
|
||||
serde = { version = "1.0", default-features = false }
|
||||
wasm-bindgen = { version = "0.2.114" }
|
||||
wasm-bindgen-futures = { version = "0.4.56" }
|
||||
# js-sys (at this point in time) automatically enables the unsafe-eval feature which we do not want
|
||||
# this does not prevent other deps from enabling it, but it at least conveys our need to not have it
|
||||
# we still enable std, which was part of default before
|
||||
|
||||
2
LICENSE
2
LICENSE
@@ -1,6 +1,6 @@
|
||||
BSD 3-Clause Clear License
|
||||
|
||||
Copyright © 2025 ZAMA.
|
||||
Copyright © 2026 ZAMA.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
|
||||
240
Makefile
240
Makefile
@@ -122,6 +122,12 @@ install_build_wasm32_target:
|
||||
( echo "Unable to install wasm32-unknown-unknown target toolchain, check your rustup installation. \
|
||||
Rustup can be downloaded at https://rustup.rs/" && exit 1 )
|
||||
|
||||
.PHONY: install_check_wasm32_target # Install the wasm32 toolchain used for checks
|
||||
install_check_wasm32_target:
|
||||
rustup target add wasm32-unknown-unknown --toolchain "$(RS_CHECK_TOOLCHAIN)" || \
|
||||
( echo "Unable to install wasm32-unknown-unknown target toolchain, check your rustup installation. \
|
||||
Rustup can be downloaded at https://rustup.rs/" && exit 1 )
|
||||
|
||||
.PHONY: install_cargo_nextest # Install cargo nextest used for shortint tests
|
||||
install_cargo_nextest:
|
||||
@cargo nextest --version > /dev/null 2>&1 || \
|
||||
@@ -350,23 +356,23 @@ check_fmt_js: check_nvm_installed
|
||||
.PHONY: check_fmt_toml # Check TOML files format
|
||||
check_fmt_toml: install_taplo
|
||||
@RUST_LOG=warn taplo fmt --check || \
|
||||
echo "TOML files format check failed. Please run 'make fmt_toml'"
|
||||
{ echo "TOML files format check failed. Please run 'make fmt_toml'"; exit 1; }
|
||||
|
||||
.PHONY: check_typos # Check for typos in codebase
|
||||
check_typos: install_typos_checker
|
||||
@typos && echo "No typos found"
|
||||
@git ls-files ":!*.png" ":!*.cbor" ":!*.bcode" ":!*.ico" ":!*/twiddles.cu" | typos --file-list - && echo "No typos found"
|
||||
|
||||
.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
|
||||
clippy_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats,extended-types,zk-pok \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats,extended-types,zk-pok \
|
||||
--all-targets \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
|
||||
.PHONY: check_gpu # Run check on tfhe with "gpu" enabled
|
||||
check_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats \
|
||||
--all-targets \
|
||||
-p tfhe
|
||||
|
||||
@@ -380,7 +386,7 @@ clippy_hpu: install_rs_check_toolchain
|
||||
.PHONY: clippy_gpu_hpu # Run clippy lints on tfhe with "gpu" and "hpu" enabled
|
||||
clippy_gpu_hpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,hpu,pbs-stats,extended-types,zk-pok \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,hpu,pbs-stats,extended-types,zk-pok \
|
||||
--all-targets \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
|
||||
@@ -473,7 +479,7 @@ clippy_rustdoc_gpu: install_rs_check_toolchain
|
||||
fi && \
|
||||
CARGO_TERM_QUIET=true CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
|
||||
--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu \
|
||||
--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu,gpu-experimental-zk \
|
||||
-p tfhe -- --nocapture
|
||||
|
||||
.PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
|
||||
@@ -484,11 +490,17 @@ clippy_c_api: install_rs_check_toolchain
|
||||
|
||||
.PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
|
||||
clippy_js_wasm_api: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,extended-types \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok,extended-types \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,extended-types \
|
||||
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok,extended-types,parallel-wasm-api \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok,extended-types,cross-origin-wasm-api \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_tasks # Run clippy lints on helper tasks crate.
|
||||
@@ -529,6 +541,15 @@ clippy_zk_pok: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
-p tfhe-zk-pok --features=experimental -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_zk_pok_wasm # Run clippy lints on tfhe-zk-pok for wasm32 target
|
||||
clippy_zk_pok_wasm: install_rs_check_toolchain install_check_wasm32_target
|
||||
RUSTFLAGS="$(WASM_RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--target wasm32-unknown-unknown \
|
||||
-p tfhe-zk-pok -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(WASM_RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--target wasm32-unknown-unknown \
|
||||
-p tfhe-zk-pok --features cross-origin-wasm -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_versionable # Run clippy lints on tfhe-versionable
|
||||
clippy_versionable: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
@@ -536,6 +557,11 @@ clippy_versionable: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
-p tfhe-versionable -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_safe_serialize # Run clippy lints on tfhe-safe-serialize
|
||||
clippy_safe_serialize: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
-p tfhe-safe-serialize -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_param_dedup # Run clippy lints on param_dedup tool
|
||||
clippy_param_dedup: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
@@ -566,10 +592,12 @@ clippy_test_vectors: install_rs_check_toolchain
|
||||
cd apps/test-vectors; RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
-p tfhe-test-vectors -- --no-deps -D warnings
|
||||
|
||||
# WARNING: This target is not directly run in CI. When adding a subtarget here,
|
||||
# MAKE SURE TO ALSO ADD IT TO A PCC BATCH BELOW
|
||||
.PHONY: clippy_all # Run all clippy targets
|
||||
clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
|
||||
clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_trivium \
|
||||
clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
|
||||
clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_zk_pok_wasm clippy_trivium \
|
||||
clippy_versionable clippy_safe_serialize clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
|
||||
clippy_test_vectors clippy_backward_compat_data clippy_wasm_par_mq
|
||||
|
||||
.PHONY: clippy_fast # Run main clippy targets
|
||||
@@ -666,7 +694,7 @@ build_c_api: install_rs_check_toolchain
|
||||
.PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
|
||||
build_c_api_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu \
|
||||
--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu,gpu-experimental-zk \
|
||||
-p tfhe
|
||||
|
||||
.PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
|
||||
@@ -675,11 +703,14 @@ build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
|
||||
--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,experimental-force_fft_algo_dif4 \
|
||||
-p tfhe
|
||||
|
||||
.PHONY: build_web_js_api # Build the js API targeting the web browser
|
||||
.PHONY: build_web_js_api # Build the js API targeting the web browser, in sequential or cross origin parallelism modes.
|
||||
build_web_js_api: install_wasm_pack
|
||||
cd tfhe && \
|
||||
RUSTFLAGS="$(WASM_RUSTFLAGS)" wasm-pack build --release --target=web \
|
||||
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok,extended-types
|
||||
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok,extended-types,cross-origin-wasm-api && \
|
||||
find pkg/snippets -type f -iname worker_helpers.js -exec sed -i 's|import("../../..")|import("../../../tfhe.js")|g' {} \;
|
||||
cp utils/wasm-par-mq/js/coordinator.js tfhe/pkg/
|
||||
jq '.files += ["snippets"]' tfhe/pkg/package.json > tmp_pkg.json && mv -f tmp_pkg.json tfhe/pkg/package.json
|
||||
|
||||
.PHONY: build_web_js_api_parallel # Build the js API targeting the web browser with parallelism support
|
||||
# parallel wasm requires specific build options, see https://github.com/rust-lang/rust/pull/147225
|
||||
@@ -765,7 +796,7 @@ test_zk_cuda_backend:
|
||||
|
||||
|
||||
.PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
|
||||
test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
|
||||
test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend test_zk_cuda_backend
|
||||
|
||||
.PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
|
||||
test_core_crypto_gpu:
|
||||
@@ -1201,12 +1232,31 @@ test_tfhe_csprng_big_endian: install_cargo_cross
|
||||
RUSTFLAGS="" cross test --profile $(CARGO_PROFILE) \
|
||||
-p tfhe-csprng --target=powerpc64-unknown-linux-gnu
|
||||
|
||||
|
||||
.PHONY: test_zk_pok # Run tfhe-zk-pok tests
|
||||
test_zk_pok:
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
|
||||
-p tfhe-zk-pok --features experimental
|
||||
|
||||
.PHONY: test_zk_pok_experimental_gpu # Run tfhe-zk-pok GPU-accelerated tests
|
||||
test_zk_pok_experimental_gpu:
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
|
||||
-p tfhe-zk-pok --features experimental,gpu-experimental -- gpu
|
||||
|
||||
.PHONY: test_integer_zk_gpu # Run tfhe-zk-pok tests
|
||||
test_integer_zk_gpu:
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
|
||||
--features=integer,zk-pok,gpu -p tfhe -- \
|
||||
integer::gpu::zk::
|
||||
|
||||
.PHONY: test_integer_zk_experimental_gpu # Run tfhe-zk-pok tests
|
||||
test_integer_zk_experimental_gpu:
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
|
||||
--features=integer,zk-pok,gpu,gpu-experimental-zk -p tfhe -- \
|
||||
integer::gpu::zk::
|
||||
|
||||
.PHONY: test_zk_cuda # Run all GPU MSM integration tests (CPU vs GPU comparison + integration test)
|
||||
test_zk_cuda: test_zk_cuda_backend test_zk_pok_experimental_gpu test_integer_zk_gpu test_integer_zk_experimental_gpu
|
||||
|
||||
.PHONY: test_zk_wasm_x86_compat_ci
|
||||
test_zk_wasm_x86_compat_ci: check_nvm_installed
|
||||
source ~/.nvm/nvm.sh && \
|
||||
@@ -1225,6 +1275,11 @@ test_versionable:
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
|
||||
--all-targets -p tfhe-versionable
|
||||
|
||||
.PHONY: test_safe_serialize # Run tests for tfhe-safe-serialize subcrate
|
||||
test_safe_serialize:
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
|
||||
--all-targets -p tfhe-safe-serialize
|
||||
|
||||
# The backward compat data folder holds historical binary data but also rust code to generate and load them.
|
||||
.PHONY: gen_backward_compat_data # Re-generate backward compatibility data
|
||||
gen_backward_compat_data:
|
||||
@@ -1359,6 +1414,19 @@ test_nodejs_wasm_api_ci: build_node_js_api
|
||||
|
||||
# This is an internal target, not meant to be called on its own.
|
||||
run_web_js_api_parallel: build_web_js_api_parallel setup_venv
|
||||
cd $(WEB_SERVER_DIR) && npm install && npm run build
|
||||
source venv/bin/activate && \
|
||||
python ci/webdriver.py \
|
||||
--browser-path $(browser_path) \
|
||||
--driver-path $(driver_path) \
|
||||
--browser-kind $(browser_kind) \
|
||||
--server-cmd $(server_cmd) \
|
||||
--server-workdir "$(WEB_SERVER_DIR)" \
|
||||
--id-pattern $(filter) \
|
||||
--id-exclude-pattern asyncMainThread
|
||||
|
||||
# This is an internal target, not meant to be called on its own.
|
||||
run_web_js_api_cross_origin: build_web_js_api setup_venv
|
||||
cd $(WEB_SERVER_DIR) && npm install && npm run build
|
||||
source venv/bin/activate && \
|
||||
python ci/webdriver.py \
|
||||
@@ -1401,6 +1469,38 @@ test_web_js_api_parallel_firefox_ci: setup_venv
|
||||
nvm use $(NODE_VERSION) && \
|
||||
$(MAKE) test_web_js_api_parallel_firefox
|
||||
|
||||
test_web_js_api_cross_origin_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
|
||||
test_web_js_api_cross_origin_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
|
||||
test_web_js_api_cross_origin_chrome: browser_kind = chrome
|
||||
test_web_js_api_cross_origin_chrome: server_cmd = "npm run server:cross-origin"
|
||||
test_web_js_api_cross_origin_chrome: filter = ZeroKnowledgeTest # Only run zk proof tests in cross-origin mode
|
||||
|
||||
.PHONY: test_web_js_api_cross_origin_chrome # Run tests for the web wasm api in cross-origin mode on Chrome
|
||||
test_web_js_api_cross_origin_chrome: run_web_js_api_cross_origin
|
||||
|
||||
.PHONY: test_web_js_api_cross_origin_chrome_ci # Run tests for the web wasm api in cross-origin mode on Chrome
|
||||
test_web_js_api_cross_origin_chrome_ci: setup_venv
|
||||
source ~/.nvm/nvm.sh && \
|
||||
nvm install $(NODE_VERSION) && \
|
||||
nvm use $(NODE_VERSION) && \
|
||||
$(MAKE) test_web_js_api_cross_origin_chrome
|
||||
|
||||
test_web_js_api_cross_origin_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
|
||||
test_web_js_api_cross_origin_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
|
||||
test_web_js_api_cross_origin_firefox: browser_kind = firefox
|
||||
test_web_js_api_cross_origin_firefox: server_cmd = "npm run server:cross-origin"
|
||||
test_web_js_api_cross_origin_firefox: filter = ZeroKnowledgeTest # Only run zk proof tests in cross-origin mode
|
||||
|
||||
.PHONY: test_web_js_api_cross_origin_firefox # Run tests for the web wasm api in cross-origin mode on Firefox
|
||||
test_web_js_api_cross_origin_firefox: run_web_js_api_cross_origin
|
||||
|
||||
.PHONY: test_web_js_api_cross_origin_firefox_ci # Run tests for the web wasm api in cross-origin mode on Firefox
|
||||
test_web_js_api_cross_origin_firefox_ci: setup_venv
|
||||
source ~/.nvm/nvm.sh && \
|
||||
nvm install $(NODE_VERSION) && \
|
||||
nvm use $(NODE_VERSION) && \
|
||||
$(MAKE) test_web_js_api_cross_origin_firefox
|
||||
|
||||
WASM_PAR_MQ_TEST_DIR=utils/wasm-par-mq/web_tests
|
||||
|
||||
.PHONY: build_wasm_par_mq_tests # Build the wasm-par-mq test WASM package
|
||||
@@ -1564,27 +1664,50 @@ bench_integer_rerand_gpu: install_rs_check_toolchain
|
||||
--bench integer-rerand \
|
||||
--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --profile release_lto_off --
|
||||
|
||||
.PHONY: bench_msm_zk
|
||||
bench_msm_zk: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench zk-msm \
|
||||
--features=zk-pok -p tfhe-benchmark --profile release --
|
||||
|
||||
# GPU benchmarks need --profile release for correct measurements
|
||||
.PHONY: bench_msm_zk_gpu
|
||||
bench_msm_zk_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench zk-msm \
|
||||
--features=gpu,gpu-experimental-zk,zk-pok -p tfhe-benchmark --profile release -- zk::cuda::msm
|
||||
|
||||
# GPU benchmarks need --profile release for correct measurements
|
||||
.PHONY: bench_integer_zk_gpu
|
||||
bench_integer_zk_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-zk-pke \
|
||||
--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release_lto_off --
|
||||
--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release --
|
||||
|
||||
# GPU benchmarks need --profile release for correct measurements
|
||||
.PHONY: bench_integer_zk_experimental_gpu
|
||||
bench_integer_zk_experimental_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-zk-pke \
|
||||
--features=integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats,zk-pok -p tfhe-benchmark --profile release --
|
||||
|
||||
.PHONY: bench_integer_aes_gpu # Run benchmarks for AES on GPU backend
|
||||
bench_integer_aes_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-aes \
|
||||
--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
|
||||
--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --
|
||||
|
||||
.PHONY: bench_integer_aes256_gpu # Run benchmarks for AES256 on GPU backend
|
||||
bench_integer_aes256_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-aes256 \
|
||||
--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
|
||||
--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --
|
||||
|
||||
.PHONY: bench_integer_trivium_gpu # Run benchmarks for trivium on GPU backend
|
||||
bench_integer_trivium_gpu: install_rs_check_toolchain
|
||||
@@ -1748,37 +1871,37 @@ bench_web_js_api_parallel_firefox_ci: setup_venv
|
||||
nvm use $(NODE_VERSION) && \
|
||||
$(MAKE) bench_web_js_api_parallel_firefox
|
||||
|
||||
bench_web_js_api_unsafe_coop_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
|
||||
bench_web_js_api_unsafe_coop_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
|
||||
bench_web_js_api_unsafe_coop_chrome: browser_kind = chrome
|
||||
bench_web_js_api_unsafe_coop_chrome: server_cmd = "npm run server:unsafe-coop"
|
||||
bench_web_js_api_unsafe_coop_chrome: filter = ZeroKnowledgeBench # Only bench zk with unsafe coop
|
||||
bench_web_js_api_cross_origin_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
|
||||
bench_web_js_api_cross_origin_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
|
||||
bench_web_js_api_cross_origin_chrome: browser_kind = chrome
|
||||
bench_web_js_api_cross_origin_chrome: server_cmd = "npm run server:cross-origin"
|
||||
bench_web_js_api_cross_origin_chrome: filter = ZeroKnowledgeBench # Only bench zk with cross-origin workers
|
||||
|
||||
.PHONY: bench_web_js_api_unsafe_coop_chrome # Run benchmarks for the web wasm api without cross-origin isolation
|
||||
bench_web_js_api_unsafe_coop_chrome: run_web_js_api_parallel
|
||||
.PHONY: bench_web_js_api_cross_origin_chrome # Run benchmarks for the web wasm api without cross-origin isolation
|
||||
bench_web_js_api_cross_origin_chrome: run_web_js_api_cross_origin
|
||||
|
||||
.PHONY: bench_web_js_api_unsafe_coop_chrome_ci # Run benchmarks for the web wasm api without cross-origin isolation
|
||||
bench_web_js_api_unsafe_coop_chrome_ci: setup_venv
|
||||
.PHONY: bench_web_js_api_cross_origin_chrome_ci # Run benchmarks for the web wasm api without cross-origin isolation
|
||||
bench_web_js_api_cross_origin_chrome_ci: setup_venv
|
||||
source ~/.nvm/nvm.sh && \
|
||||
nvm install $(NODE_VERSION) && \
|
||||
nvm use $(NODE_VERSION) && \
|
||||
$(MAKE) bench_web_js_api_unsafe_coop_chrome
|
||||
$(MAKE) bench_web_js_api_cross_origin_chrome
|
||||
|
||||
bench_web_js_api_unsafe_coop_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
|
||||
bench_web_js_api_unsafe_coop_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
|
||||
bench_web_js_api_unsafe_coop_firefox: browser_kind = firefox
|
||||
bench_web_js_api_unsafe_coop_firefox: server_cmd = "npm run server:unsafe-coop"
|
||||
bench_web_js_api_unsafe_coop_firefox: filter = ZeroKnowledgeBench # Only bench zk with unsafe coop
|
||||
bench_web_js_api_cross_origin_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
|
||||
bench_web_js_api_cross_origin_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
|
||||
bench_web_js_api_cross_origin_firefox: browser_kind = firefox
|
||||
bench_web_js_api_cross_origin_firefox: server_cmd = "npm run server:cross-origin"
|
||||
bench_web_js_api_cross_origin_firefox: filter = ZeroKnowledgeBench # Only bench zk with cross-origin workers
|
||||
|
||||
.PHONY: bench_web_js_api_unsafe_coop_firefox # Run benchmarks for the web wasm api without cross-origin isolation
|
||||
bench_web_js_api_unsafe_coop_firefox: run_web_js_api_parallel
|
||||
.PHONY: bench_web_js_api_cross_origin_firefox # Run benchmarks for the web wasm api without cross-origin isolation
|
||||
bench_web_js_api_cross_origin_firefox: run_web_js_api_cross_origin
|
||||
|
||||
.PHONY: bench_web_js_api_unsafe_coop_firefox_ci # Run benchmarks for the web wasm api without cross-origin isolation
|
||||
bench_web_js_api_unsafe_coop_firefox_ci: setup_venv
|
||||
.PHONY: bench_web_js_api_cross_origin_firefox_ci # Run benchmarks for the web wasm api without cross-origin isolation
|
||||
bench_web_js_api_cross_origin_firefox_ci: setup_venv
|
||||
source ~/.nvm/nvm.sh && \
|
||||
nvm install $(NODE_VERSION) && \
|
||||
nvm use $(NODE_VERSION) && \
|
||||
$(MAKE) bench_web_js_api_unsafe_coop_firefox
|
||||
$(MAKE) bench_web_js_api_cross_origin_firefox
|
||||
|
||||
.PHONY: bench_hlapi_unsigned # Run benchmarks for integer operations
|
||||
bench_hlapi_unsigned: install_rs_check_toolchain
|
||||
@@ -1811,25 +1934,25 @@ bench_hlapi_hpu: install_rs_check_toolchain
|
||||
--bench hlapi \
|
||||
--features=integer,internal-keycache,hpu,hpu-v80,pbs-stats -p tfhe-benchmark --
|
||||
|
||||
.PHONY: bench_hlapi_erc20 # Run benchmarks for ERC20 operations
|
||||
bench_hlapi_erc20: install_rs_check_toolchain
|
||||
.PHONY: bench_hlapi_erc7984 # Run benchmarks for ERC7984 operations
|
||||
bench_hlapi_erc7984: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc20 \
|
||||
--bench hlapi-erc7984 \
|
||||
--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
|
||||
|
||||
.PHONY: bench_hlapi_erc20_gpu # Run benchmarks for ERC20 operations on GPU
|
||||
bench_hlapi_erc20_gpu: install_rs_check_toolchain
|
||||
.PHONY: bench_hlapi_erc7984_gpu # Run benchmarks for ERC7984 operations on GPU
|
||||
bench_hlapi_erc7984_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc20 \
|
||||
--bench hlapi-erc7984 \
|
||||
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
|
||||
|
||||
.PHONY: bench_hlapi_erc20_gpu_classical # Run benchmarks for ERC20 operations on GPU with classical parameters
|
||||
bench_hlapi_erc20_gpu_classical: install_rs_check_toolchain
|
||||
.PHONY: bench_hlapi_erc7984_gpu_classical # Run benchmarks for ERC7984 operations on GPU with classical parameters
|
||||
bench_hlapi_erc7984_gpu_classical: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=classical \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc20 \
|
||||
--bench hlapi-erc7984 \
|
||||
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
|
||||
|
||||
.PHONY: bench_hlapi_dex # Run benchmarks for DEX operations
|
||||
@@ -1853,13 +1976,13 @@ bench_hlapi_dex_gpu_classical: install_rs_check_toolchain
|
||||
--bench hlapi-dex \
|
||||
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
|
||||
|
||||
.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ECR20 operations on HPU
|
||||
bench_hlapi_erc20_hpu: install_rs_check_toolchain
|
||||
.PHONY: bench_hlapi_erc7984_hpu # Run benchmarks for ECR20 operations on HPU
|
||||
bench_hlapi_erc7984_hpu: install_rs_check_toolchain
|
||||
source ./setup_hpu.sh --config $(HPU_CONFIG); \
|
||||
export V80_PCIE_DEV=${V80_PCIE_DEV}; \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc20 \
|
||||
--bench hlapi-erc7984 \
|
||||
--features=integer,internal-keycache,hpu,hpu-v80,pbs-stats -p tfhe-benchmark --
|
||||
|
||||
.PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
|
||||
@@ -1867,6 +1990,13 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --
|
||||
|
||||
.PHONY: bench_tfhe_zk_pok_gpu # Run benchmarks for the tfhe_zk_pok crate using GPU acceleration
|
||||
bench_tfhe_zk_pok_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--package tfhe-zk-pok \
|
||||
--features=gpu-experimental --profile release
|
||||
|
||||
.PHONY: bench_hlapi_noise_squash # Run benchmarks for noise squash operation
|
||||
bench_hlapi_noise_squash: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
|
||||
@@ -1908,10 +2038,10 @@ bench_summary: install_rs_check_toolchain
|
||||
--bench hlapi-noise-squash \
|
||||
--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::decomp_noise_squash_comp::'
|
||||
|
||||
# ERC20
|
||||
# ERC7984
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc20 \
|
||||
--bench hlapi-erc7984 \
|
||||
--features=integer,internal-keycache -p tfhe-benchmark -- '::transfer::overflow'
|
||||
|
||||
# DEX
|
||||
@@ -1953,10 +2083,10 @@ bench_summary_gpu: install_rs_check_toolchain
|
||||
--bench hlapi-noise-squash \
|
||||
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::decomp_noise_squash_comp::'
|
||||
|
||||
# ERC20
|
||||
# ERC7984
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc20 \
|
||||
--bench hlapi-erc7984 \
|
||||
--features=integer,gpu,internal-keycache -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow'
|
||||
|
||||
# DEX
|
||||
@@ -2143,8 +2273,10 @@ pcc_batch_6:
|
||||
$(call run_recipe_with_details,clippy_tasks)
|
||||
$(call run_recipe_with_details,clippy_tfhe_csprng)
|
||||
$(call run_recipe_with_details,clippy_zk_pok)
|
||||
$(call run_recipe_with_details,clippy_zk_pok_wasm)
|
||||
$(call run_recipe_with_details,clippy_trivium)
|
||||
$(call run_recipe_with_details,clippy_versionable)
|
||||
$(call run_recipe_with_details,clippy_safe_serialize)
|
||||
$(call run_recipe_with_details,clippy_param_dedup)
|
||||
$(call run_recipe_with_details,docs)
|
||||
|
||||
|
||||
@@ -15,12 +15,3 @@ extend-ignore-identifiers-re = [
|
||||
"0x[0-9a-fA-F]+",
|
||||
"xrt_coreutil",
|
||||
]
|
||||
|
||||
[files]
|
||||
extend-exclude = [
|
||||
"backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu",
|
||||
"backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu",
|
||||
"backends/tfhe-hpu-backend/config_store/**/*.link_summary",
|
||||
"*.cbor",
|
||||
"*.bcode",
|
||||
]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
BSD 3-Clause Clear License
|
||||
|
||||
Copyright © 2025 ZAMA.
|
||||
Copyright © 2026 ZAMA.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
|
||||
@@ -62,3 +62,29 @@ rules:
|
||||
cuda_synchronize_stream(...);
|
||||
...
|
||||
}
|
||||
|
||||
- id: tfhe-cuda-unwrapped-cuda-runtime-call
|
||||
message: "CUDA runtime API call is not wrapped in `check_cuda_error(...)`."
|
||||
severity: WARNING
|
||||
languages: [c, cpp]
|
||||
options:
|
||||
generic_ellipsis_max_span: 500
|
||||
paths:
|
||||
include:
|
||||
- "*.cu"
|
||||
- "*.cuh"
|
||||
- "*.cpp"
|
||||
- "*.h"
|
||||
exclude:
|
||||
- backends/tfhe-cuda-backend/cuda/check_cuda.cu # contains cuda checking functions
|
||||
- backends/tfhe-cuda-backend/cuda/include/device.h # contains the cuda_check_error macro (and others)
|
||||
patterns:
|
||||
- pattern: $FUNC(...)
|
||||
- metavariable-regex:
|
||||
metavariable: $FUNC
|
||||
regex: "^cuda[A-Z][A-Za-z0-9]*$" # matches cudaMalloc/cudaMemcpy/... (not project helpers like cuda_set_device)
|
||||
- pattern-not-inside: check_cuda_error(...)
|
||||
- pattern-not-inside: |
|
||||
$FUNC(...);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
- pattern-not-inside: $FUNC(...) == $VAL
|
||||
|
||||
@@ -36,5 +36,19 @@ void cuda_glwe_sample_extract_128_async(
|
||||
void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
|
||||
uint32_t num_lwes_to_extract_per_glwe, uint32_t num_lwes_stored_per_glwe,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size);
|
||||
|
||||
void cuda_modulus_switch_multi_bit_64_async(void *stream, uint32_t gpu_index,
|
||||
void *lwe_array_out,
|
||||
void *lwe_array_in, uint32_t size,
|
||||
uint32_t log_modulus,
|
||||
uint32_t degree,
|
||||
uint32_t grouping_factor);
|
||||
|
||||
void cuda_modulus_switch_multi_bit_128_async(void *stream, uint32_t gpu_index,
|
||||
void *lwe_array_out,
|
||||
void *lwe_array_in, uint32_t size,
|
||||
uint32_t log_modulus,
|
||||
uint32_t degree,
|
||||
uint32_t grouping_factor);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -382,14 +382,17 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
|
||||
->use_sequential_algorithm_to_resolve_group_carries;
|
||||
|
||||
cuda_set_device(0);
|
||||
cudaEventCreateWithFlags(&create_indexes_done, cudaEventDisableTiming);
|
||||
check_cuda_error(
|
||||
cudaEventCreateWithFlags(&create_indexes_done, cudaEventDisableTiming));
|
||||
create_indexes_for_overflow_sub(streams.get_ith(0), num_blocks, group_size,
|
||||
use_seq, allocate_gpu_memory, size_tracker);
|
||||
cudaEventRecord(create_indexes_done, streams.stream(0));
|
||||
check_cuda_error(cudaEventRecord(create_indexes_done, streams.stream(0)));
|
||||
cuda_set_device(1);
|
||||
cudaStreamWaitEvent(streams.stream(1), create_indexes_done, 0);
|
||||
check_cuda_error(
|
||||
cudaStreamWaitEvent(streams.stream(1), create_indexes_done, 0));
|
||||
cuda_set_device(2);
|
||||
cudaStreamWaitEvent(streams.stream(2), create_indexes_done, 0);
|
||||
check_cuda_error(
|
||||
cudaStreamWaitEvent(streams.stream(2), create_indexes_done, 0));
|
||||
|
||||
scatter_indexes_for_overflowing_sub(
|
||||
streams.stream(1), streams.gpu_index(1),
|
||||
@@ -842,7 +845,7 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
|
||||
free(second_indexes_for_overflow_sub_gpu_2);
|
||||
free(scalars_for_overflow_sub_gpu_2);
|
||||
|
||||
cudaEventDestroy(create_indexes_done);
|
||||
check_cuda_error(cudaEventDestroy(create_indexes_done));
|
||||
|
||||
// release sub streams
|
||||
sub_streams_1.release();
|
||||
|
||||
@@ -39,6 +39,28 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_64(void *stream,
|
||||
uint32_t gpu_index,
|
||||
int8_t **pbs_buffer);
|
||||
|
||||
// Noise-tests-namespaced wrappers for scratch/cleanup, so that callers
|
||||
// working with the noise-tests PBS variant use a consistent naming scheme.
|
||||
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer);
|
||||
|
||||
// Noise tests variant: 64-bit torus, polynomial_size=2048 only. Uses the
|
||||
// NOISE_TESTS keybundle mode for noise analysis purposes.
|
||||
void cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lut_vector_indexes, void const *lwe_array_in,
|
||||
void const *lwe_input_indexes, void const *bootstrapping_key,
|
||||
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
|
||||
uint32_t lut_stride);
|
||||
|
||||
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_async(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
@@ -56,6 +78,23 @@ void cuda_multi_bit_programmable_bootstrap_128_async(
|
||||
void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
|
||||
const uint32_t gpu_index,
|
||||
int8_t **buffer);
|
||||
|
||||
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_128(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer);
|
||||
|
||||
void cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lwe_array_in, void const *lwe_input_indexes,
|
||||
void const *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_many_lut, uint32_t lut_stride);
|
||||
}
|
||||
|
||||
#endif // CUDA_MULTI_BIT_H
|
||||
|
||||
@@ -105,11 +105,11 @@ template <typename Torus> struct zk_expand_mem {
|
||||
uint32_t num_lwes;
|
||||
uint32_t num_compact_lists;
|
||||
|
||||
int_radix_lut<Torus> *message_and_carry_extract_luts;
|
||||
int_radix_lut<Torus> *identity_lut;
|
||||
int_radix_lut<Torus> *message_and_carry_extract_luts = nullptr;
|
||||
int_radix_lut<Torus> *identity_lut = nullptr;
|
||||
|
||||
Torus *tmp_expanded_lwes;
|
||||
Torus *tmp_ksed_small_to_big_expanded_lwes;
|
||||
Torus *tmp_expanded_lwes = nullptr;
|
||||
Torus *tmp_ksed_small_to_big_expanded_lwes = nullptr;
|
||||
|
||||
bool gpu_memory_allocated;
|
||||
|
||||
@@ -148,66 +148,6 @@ template <typename Torus> struct zk_expand_mem {
|
||||
PANIC("GPU backend requires carry_modulus equal to message_modulus")
|
||||
}
|
||||
|
||||
// We create the identity LUT only if we are doing a SANITY_CHECK
|
||||
if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
|
||||
identity_lut =
|
||||
new int_radix_lut<Torus>(streams, computing_params, 1, 2 * num_lwes,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
auto identity_lut_f = [](Torus x) -> Torus { return x; };
|
||||
|
||||
identity_lut->generate_and_broadcast_lut(streams, {0}, {identity_lut_f},
|
||||
LUT_0_FOR_ALL_BLOCKS);
|
||||
}
|
||||
|
||||
auto message_extract_lut_f = [casting_params](Torus x) -> Torus {
|
||||
return x % casting_params.message_modulus;
|
||||
};
|
||||
auto carry_extract_lut_f = [casting_params](Torus x) -> Torus {
|
||||
return (x / casting_params.carry_modulus) %
|
||||
casting_params.message_modulus;
|
||||
};
|
||||
|
||||
// Booleans have to be sanitized
|
||||
auto sanitize_bool_f = [](Torus x) -> Torus { return x == 0 ? 0 : 1; };
|
||||
auto message_extract_and_sanitize_bool_lut_f =
|
||||
[message_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
|
||||
return sanitize_bool_f(message_extract_lut_f(x));
|
||||
};
|
||||
auto carry_extract_and_sanitize_bool_lut_f =
|
||||
[carry_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
|
||||
return sanitize_bool_f(carry_extract_lut_f(x));
|
||||
};
|
||||
|
||||
/** In case the casting key casts from BIG to SMALL key we run a single KS
|
||||
to expand using the casting key as ksk. Otherwise, in case the casting key
|
||||
casts from SMALL to BIG key, we first keyswitch from SMALL to BIG using
|
||||
the casting key as ksk, then we keyswitch from BIG to SMALL using the
|
||||
computing ksk, and lastly we apply the PBS. The output is always on the
|
||||
BIG key.
|
||||
**/
|
||||
auto params = casting_params;
|
||||
if (casting_key_type == SMALL_TO_BIG) {
|
||||
params = computing_params;
|
||||
}
|
||||
message_and_carry_extract_luts = new int_radix_lut<Torus>(
|
||||
streams, params, 4, 2 * num_lwes, allocate_gpu_memory, size_tracker);
|
||||
|
||||
// We are always packing two LWEs. We just need to be sure we have enough
|
||||
// space in the carry part to store a message of the same size as is in the
|
||||
// message part.
|
||||
if (params.carry_modulus < params.message_modulus)
|
||||
PANIC("Carry modulus must be at least as large as message modulus");
|
||||
auto num_packed_msgs = 2;
|
||||
|
||||
// Adjust indexes to permute the output and access the correct LUT
|
||||
auto h_indexes_in = static_cast<Torus *>(
|
||||
malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
|
||||
auto h_indexes_out = static_cast<Torus *>(
|
||||
malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
|
||||
auto h_lut_indexes = static_cast<Torus *>(
|
||||
malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
|
||||
|
||||
d_expand_jobs =
|
||||
static_cast<expand_job<Torus> *>(cuda_malloc_with_size_tracking_async(
|
||||
safe_mul_sizeof<expand_job<Torus>>(num_lwes), streams.stream(0),
|
||||
@@ -216,144 +156,202 @@ template <typename Torus> struct zk_expand_mem {
|
||||
h_expand_jobs = static_cast<expand_job<Torus> *>(
|
||||
malloc(safe_mul_sizeof<expand_job<Torus>>(num_lwes)));
|
||||
|
||||
/*
|
||||
* Each LWE contains encrypted data in both carry and message spaces
|
||||
* that needs to be extracted.
|
||||
*
|
||||
* The loop processes each compact list (k) and for each LWE within that
|
||||
* list:
|
||||
* 1. Sets input indexes to read each LWE twice (for carry and message
|
||||
* extraction)
|
||||
* 2. Creates output indexes to properly reorder the results
|
||||
* 3. Selects appropriate LUT index based on whether boolean sanitization is
|
||||
* needed
|
||||
*
|
||||
* We want the output to have always first the content of the message part
|
||||
* and then the content of the carry part of each LWE.
|
||||
*
|
||||
* i.e. msg_extract(LWE_0), carry_extract(LWE_0), msg_extract(LWE_1),
|
||||
* carry_extract(LWE_1), ...
|
||||
*
|
||||
* Aiming that behavior, with 4 LWEs we would have:
|
||||
*
|
||||
* // Each LWE is processed twice
|
||||
* h_indexes_in = {0, 1, 2, 3, 0, 1, 2, 3}
|
||||
*
|
||||
* // First 4 use message LUT, last 4 use carry LUT
|
||||
* h_lut_indexes = {0, 0, 0, 0, 1, 1, 1, 1}
|
||||
*
|
||||
* // Reorders output so message and carry for each LWE appear together
|
||||
* h_indexes_out = {0, 2, 4, 6, 1, 3, 5, 7}
|
||||
*
|
||||
* If an LWE contains a boolean value, its LUT index is shifted by
|
||||
* num_packed_msgs to use the sanitization LUT (which ensures output is
|
||||
* exactly 0 or 1).
|
||||
*/
|
||||
auto offset = 0;
|
||||
for (int k = 0; k < num_compact_lists; k++) {
|
||||
auto num_lwes_in_kth = this->num_lwes_per_compact_list[k];
|
||||
for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
|
||||
auto lwe_index = i + num_packed_msgs * offset;
|
||||
auto lwe_index_in_list = i % num_lwes_in_kth;
|
||||
PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
|
||||
"Cuda error: index %d is beyond the max value %d",
|
||||
lwe_index, num_packed_msgs * num_lwes);
|
||||
h_indexes_in[lwe_index] = lwe_index_in_list + offset;
|
||||
h_indexes_out[lwe_index] =
|
||||
num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
|
||||
PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
|
||||
"Cuda error: index %lu is beyond the max value %lu",
|
||||
(unsigned long)h_indexes_in[lwe_index],
|
||||
(unsigned long)(num_packed_msgs * num_lwes));
|
||||
PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
|
||||
"Cuda error: index %lu is beyond the max value %lu",
|
||||
(unsigned long)h_indexes_out[lwe_index],
|
||||
(unsigned long)(num_packed_msgs * num_lwes));
|
||||
// is_boolean_array tells us which input is a boolean and thus the
|
||||
// related output needs boolean sanitization. It naturally has
|
||||
// total_blocks entries, but h_indexes_out reaches
|
||||
// message_modulus * ceil(total_blocks/2) - 1. When total_blocks is odd,
|
||||
// the ceiling causes out-of-bounds access. Reading garbage "true" would
|
||||
// set h_lut_indexes to an invalid index pointing to uninitialized
|
||||
// memory instead of a real LUT. Rust pads is_boolean_array with FALSE
|
||||
// to match.
|
||||
PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
|
||||
"Cuda error: index %lu for is_boolean_array is out of "
|
||||
"bounds (len is %lu)",
|
||||
(unsigned long)h_indexes_out[lwe_index],
|
||||
(unsigned long)is_boolean_array_len);
|
||||
// NO_CASTING expands directly into the output buffer — no LUTs, no PBS,
|
||||
// no intermediate buffers needed.
|
||||
if (expand_kind != EXPAND_KIND::NO_CASTING) {
|
||||
/** In case the casting key casts from BIG to SMALL key we run a single KS
|
||||
to expand using the casting key as ksk. Otherwise, in case the casting key
|
||||
casts from SMALL to BIG key, we first keyswitch from SMALL to BIG using
|
||||
the casting key as ksk, then we keyswitch from BIG to SMALL using the
|
||||
computing ksk, and lastly we apply the PBS. The output is always on the
|
||||
BIG key.
|
||||
**/
|
||||
auto params = casting_params;
|
||||
if (casting_key_type == SMALL_TO_BIG) {
|
||||
params = computing_params;
|
||||
}
|
||||
offset += num_lwes_in_kth;
|
||||
}
|
||||
|
||||
message_and_carry_extract_luts->set_lwe_indexes(
|
||||
streams.stream(0), streams.gpu_index(0), h_indexes_in, h_indexes_out);
|
||||
// We always pack two LWEs (message and carry parts per LWE)
|
||||
auto num_packed_msgs = 2;
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
|
||||
// Adjust indexes to permute the output and access the correct LUT.
|
||||
//
|
||||
// The loop below fills h_indexes_in and h_indexes_out so that the output
|
||||
// is ordered as: msg_extract(LWE_0), carry_extract(LWE_0),
|
||||
// msg_extract(LWE_1), carry_extract(LWE_1), ...
|
||||
//
|
||||
// With 4 LWEs the arrays look like:
|
||||
// h_indexes_in = {0, 1, 2, 3, 0, 1, 2, 3} (each LWE read twice)
|
||||
// h_lut_indexes = {0, 0, 0, 0, 1, 1, 1, 1} (msg LUT then carry LUT)
|
||||
// h_indexes_out = {0, 2, 4, 6, 1, 3, 5, 7} (interleaved output)
|
||||
//
|
||||
// If an LWE contains a boolean its LUT index is shifted by
|
||||
// num_packed_msgs to use the sanitization LUT (output clamped to {0, 1}).
|
||||
auto h_indexes_in = static_cast<Torus *>(
|
||||
malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
|
||||
auto h_indexes_out = static_cast<Torus *>(
|
||||
malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
|
||||
|
||||
// Index generator for message/carry extraction LUTs
|
||||
auto index_gen = [num_compact_lists,
|
||||
num_lwes_per_compact_list =
|
||||
this->num_lwes_per_compact_list,
|
||||
num_packed_msgs, is_boolean_array,
|
||||
h_indexes_out](Torus *h_lut_indexes, uint32_t) {
|
||||
auto offset = 0;
|
||||
for (int k = 0; k < num_compact_lists; k++) {
|
||||
auto num_lwes_in_kth = num_lwes_per_compact_list[k];
|
||||
auto num_lwes_in_kth = this->num_lwes_per_compact_list[k];
|
||||
for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
|
||||
auto lwe_index = i + num_packed_msgs * offset;
|
||||
auto boolean_offset =
|
||||
is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
|
||||
h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
|
||||
auto lwe_index_in_list = i % num_lwes_in_kth;
|
||||
PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
|
||||
"Cuda error: index %d is beyond the max value %d",
|
||||
lwe_index, num_packed_msgs * num_lwes);
|
||||
h_indexes_in[lwe_index] = lwe_index_in_list + offset;
|
||||
h_indexes_out[lwe_index] =
|
||||
num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
|
||||
PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
|
||||
"Cuda error: index %lu is beyond the max value %lu",
|
||||
(unsigned long)h_indexes_in[lwe_index],
|
||||
(unsigned long)(num_packed_msgs * num_lwes));
|
||||
PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
|
||||
"Cuda error: index %lu is beyond the max value %lu",
|
||||
(unsigned long)h_indexes_out[lwe_index],
|
||||
(unsigned long)(num_packed_msgs * num_lwes));
|
||||
// is_boolean_array tells us which input is a boolean and thus the
|
||||
// related output needs boolean sanitization. It naturally has
|
||||
// total_blocks entries, but h_indexes_out reaches
|
||||
// message_modulus * ceil(total_blocks/2) - 1. When total_blocks is
|
||||
// odd, the ceiling causes out-of-bounds access. Reading garbage
|
||||
// "true" would set h_lut_indexes to an invalid index pointing to
|
||||
// uninitialized memory instead of a real LUT. Rust pads
|
||||
// is_boolean_array with FALSE to match.
|
||||
PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
|
||||
"Cuda error: index %lu for is_boolean_array is out of "
|
||||
"bounds (len is %lu)",
|
||||
(unsigned long)h_indexes_out[lwe_index],
|
||||
(unsigned long)is_boolean_array_len);
|
||||
}
|
||||
offset += num_lwes_in_kth;
|
||||
}
|
||||
};
|
||||
|
||||
message_and_carry_extract_luts->generate_and_broadcast_lut(
|
||||
active_streams, {0, 1, 2, 3},
|
||||
{message_extract_lut_f, carry_extract_lut_f,
|
||||
message_extract_and_sanitize_bool_lut_f,
|
||||
carry_extract_and_sanitize_bool_lut_f},
|
||||
index_gen, true, {}, h_lut_indexes);
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
|
||||
|
||||
message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
|
||||
active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
|
||||
// The expanded LWEs will always be on the casting key format
|
||||
tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
|
||||
safe_mul_sizeof<Torus>(num_lwes, casting_params.big_lwe_dimension + 1),
|
||||
streams.stream(0), streams.gpu_index(0), size_tracker,
|
||||
allocate_gpu_memory);
|
||||
// SANITY_CHECK uses identity_lut (skipping the full message/carry
|
||||
// extraction LUT and the SMALL_TO_BIG intermediate buffer).
|
||||
if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
|
||||
identity_lut =
|
||||
new int_radix_lut<Torus>(streams, casting_params, 1, 2 * num_lwes,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
tmp_ksed_small_to_big_expanded_lwes =
|
||||
(Torus *)cuda_malloc_with_size_tracking_async(
|
||||
safe_mul_sizeof<Torus>(num_lwes,
|
||||
casting_params.big_lwe_dimension + 1),
|
||||
streams.stream(0), streams.gpu_index(0), size_tracker,
|
||||
allocate_gpu_memory);
|
||||
auto identity_lut_f = [](Torus x) -> Torus { return x; };
|
||||
identity_lut->generate_and_broadcast_lut(streams, {0}, {identity_lut_f},
|
||||
LUT_0_FOR_ALL_BLOCKS);
|
||||
identity_lut->set_lwe_indexes(streams.stream(0), streams.gpu_index(0),
|
||||
h_indexes_in, h_indexes_out);
|
||||
identity_lut->allocate_lwe_vector_for_non_trivial_indexes(
|
||||
active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
|
||||
} else {
|
||||
// We are always packing two LWEs. We just need to be sure we have
|
||||
// enough space in the carry part to store a message of the same size
|
||||
// as is in the message part.
|
||||
if (params.carry_modulus < params.message_modulus)
|
||||
PANIC("Carry modulus must be at least as large as message modulus");
|
||||
|
||||
message_and_carry_extract_luts =
|
||||
new int_radix_lut<Torus>(streams, params, 4, 2 * num_lwes,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
message_and_carry_extract_luts->set_lwe_indexes(
|
||||
streams.stream(0), streams.gpu_index(0), h_indexes_in,
|
||||
h_indexes_out);
|
||||
|
||||
auto message_extract_lut_f = [casting_params](Torus x) -> Torus {
|
||||
return x % casting_params.message_modulus;
|
||||
};
|
||||
auto carry_extract_lut_f = [casting_params](Torus x) -> Torus {
|
||||
return (x / casting_params.carry_modulus) %
|
||||
casting_params.message_modulus;
|
||||
};
|
||||
auto sanitize_bool_f = [](Torus x) -> Torus { return x == 0 ? 0 : 1; };
|
||||
auto message_extract_and_sanitize_bool_lut_f =
|
||||
[message_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
|
||||
return sanitize_bool_f(message_extract_lut_f(x));
|
||||
};
|
||||
auto carry_extract_and_sanitize_bool_lut_f =
|
||||
[carry_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
|
||||
return sanitize_bool_f(carry_extract_lut_f(x));
|
||||
};
|
||||
|
||||
auto h_lut_indexes = static_cast<Torus *>(
|
||||
malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
|
||||
|
||||
auto index_gen = [num_compact_lists,
|
||||
num_lwes_per_compact_list =
|
||||
this->num_lwes_per_compact_list,
|
||||
num_packed_msgs, is_boolean_array,
|
||||
h_indexes_out](Torus *h_lut_indexes, uint32_t) {
|
||||
auto offset = 0;
|
||||
for (int k = 0; k < num_compact_lists; k++) {
|
||||
auto num_lwes_in_kth = num_lwes_per_compact_list[k];
|
||||
for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
|
||||
auto lwe_index = i + num_packed_msgs * offset;
|
||||
auto boolean_offset = is_boolean_array[h_indexes_out[lwe_index]]
|
||||
? num_packed_msgs
|
||||
: 0;
|
||||
h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
|
||||
}
|
||||
offset += num_lwes_in_kth;
|
||||
}
|
||||
};
|
||||
|
||||
message_and_carry_extract_luts->generate_and_broadcast_lut(
|
||||
active_streams, {0, 1, 2, 3},
|
||||
{message_extract_lut_f, carry_extract_lut_f,
|
||||
message_extract_and_sanitize_bool_lut_f,
|
||||
carry_extract_and_sanitize_bool_lut_f},
|
||||
index_gen, true, {}, h_lut_indexes);
|
||||
message_and_carry_extract_luts
|
||||
->allocate_lwe_vector_for_non_trivial_indexes(
|
||||
active_streams, 2 * num_lwes, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
free(h_lut_indexes);
|
||||
|
||||
// SANITY_CHECK panics on SMALL_TO_BIG, so this buffer is only needed
|
||||
// on the full casting path.
|
||||
tmp_ksed_small_to_big_expanded_lwes =
|
||||
(Torus *)cuda_malloc_with_size_tracking_async(
|
||||
safe_mul_sizeof<Torus>(num_lwes,
|
||||
casting_params.big_lwe_dimension + 1),
|
||||
streams.stream(0), streams.gpu_index(0), size_tracker,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
// The expanded LWEs will always be on the casting key format
|
||||
tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
|
||||
safe_mul_sizeof<Torus>(num_lwes,
|
||||
casting_params.big_lwe_dimension + 1),
|
||||
streams.stream(0), streams.gpu_index(0), size_tracker,
|
||||
allocate_gpu_memory);
|
||||
|
||||
free(h_indexes_in);
|
||||
free(h_indexes_out);
|
||||
}
|
||||
|
||||
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
||||
free(h_indexes_in);
|
||||
free(h_indexes_out);
|
||||
free(h_lut_indexes);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
message_and_carry_extract_luts->release(streams);
|
||||
delete message_and_carry_extract_luts;
|
||||
|
||||
if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
|
||||
identity_lut->release(streams);
|
||||
delete identity_lut;
|
||||
if (expand_kind != EXPAND_KIND::NO_CASTING) {
|
||||
if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
|
||||
identity_lut->release(streams);
|
||||
delete identity_lut;
|
||||
} else {
|
||||
message_and_carry_extract_luts->release(streams);
|
||||
delete message_and_carry_extract_luts;
|
||||
cuda_drop_with_size_tracking_async(
|
||||
tmp_ksed_small_to_big_expanded_lwes, streams.stream(0),
|
||||
streams.gpu_index(0), gpu_memory_allocated);
|
||||
}
|
||||
cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams.stream(0),
|
||||
streams.gpu_index(0),
|
||||
gpu_memory_allocated);
|
||||
}
|
||||
|
||||
cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams.stream(0),
|
||||
streams.gpu_index(0),
|
||||
gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(tmp_ksed_small_to_big_expanded_lwes,
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(d_expand_jobs, streams.stream(0),
|
||||
streams.gpu_index(0),
|
||||
gpu_memory_allocated);
|
||||
|
||||
@@ -150,3 +150,31 @@ void cuda_glwe_sample_extract_128_async(
|
||||
"N's are powers of two in the interval [256..4096].")
|
||||
}
|
||||
}
|
||||
|
||||
void cuda_modulus_switch_multi_bit_64_async(void *stream, uint32_t gpu_index,
|
||||
void *lwe_array_out,
|
||||
void *lwe_array_in, uint32_t size,
|
||||
uint32_t log_modulus,
|
||||
uint32_t degree,
|
||||
uint32_t grouping_factor) {
|
||||
|
||||
host_modulus_switch_multi_bit<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in), size, log_modulus, degree,
|
||||
grouping_factor);
|
||||
}
|
||||
|
||||
void cuda_modulus_switch_multi_bit_128_async(void *stream, uint32_t gpu_index,
|
||||
void *lwe_array_out,
|
||||
void *lwe_array_in, uint32_t size,
|
||||
uint32_t log_modulus,
|
||||
uint32_t degree,
|
||||
uint32_t grouping_factor) {
|
||||
|
||||
host_modulus_switch_multi_bit<__uint128_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<__uint128_t *>(lwe_array_out),
|
||||
static_cast<__uint128_t *>(lwe_array_in), size, log_modulus, degree,
|
||||
grouping_factor);
|
||||
}
|
||||
|
||||
@@ -463,5 +463,48 @@ __global__ void __launch_bounds__(512)
|
||||
return;
|
||||
}
|
||||
}
|
||||
// This function is only used for noise tests, it follows the same logic
|
||||
// that is embedded in the keybundle just we need a global function to
|
||||
// be able to test it individually.
|
||||
template <typename Torus, class params>
|
||||
__global__ void
|
||||
modulus_switch_multi_bit(Torus *array_out, const Torus *array_in, int size,
|
||||
uint32_t log_modulus, uint32_t grouping_factor) {
|
||||
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (tid < size) {
|
||||
int num_monomials = 1 << grouping_factor;
|
||||
int input_offset = tid * grouping_factor;
|
||||
int output_offset = tid * num_monomials;
|
||||
// We calculate all monomials even if the first one is never used.
|
||||
for (int ggsw_idx = 0; ggsw_idx < num_monomials; ggsw_idx++) {
|
||||
array_out[ggsw_idx + output_offset] =
|
||||
calculates_monomial_degree<Torus, params>(&array_in[input_offset],
|
||||
ggsw_idx, grouping_factor);
|
||||
}
|
||||
}
|
||||
}
|
||||
// This aims to be launched only from the noise tests.
|
||||
// That is why we support a specific set of parameters
|
||||
template <typename Torus>
|
||||
__host__ void host_modulus_switch_multi_bit(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *array_out, Torus *array_in,
|
||||
int size, uint32_t log_modulus, uint32_t degree, uint32_t grouping_factor) {
|
||||
cudaSetDevice(gpu_index);
|
||||
int multibit_size = size / grouping_factor;
|
||||
int num_threads = 0, num_blocks = 0;
|
||||
getNumBlocksAndThreads(multibit_size, 1024, num_blocks, num_threads);
|
||||
switch (degree) {
|
||||
case 2048:
|
||||
modulus_switch_multi_bit<Torus, Degree<2048>>
|
||||
<<<num_blocks, num_threads, 0, stream>>>(
|
||||
array_out, array_in, multibit_size, log_modulus, grouping_factor);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error: unsupported polynomial size. Supported "
|
||||
"N's are powers of two in the interval [2048].")
|
||||
};
|
||||
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
#endif // CNCRT_TORUS_H
|
||||
|
||||
@@ -326,6 +326,10 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
|
||||
uint32_t gpu_index) {
|
||||
if (size == 0)
|
||||
return;
|
||||
|
||||
GPU_ASSERT(src != nullptr, "Cuda error: null device ptr");
|
||||
GPU_ASSERT(dest != nullptr, "Cuda error: null device ptr");
|
||||
|
||||
cudaPointerAttributes attr_dest;
|
||||
check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
|
||||
PANIC_IF_FALSE(
|
||||
|
||||
@@ -373,7 +373,8 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
|
||||
|
||||
// Get the number of streaming multiprocessors
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
check_cuda_error(
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
|
||||
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
|
||||
}
|
||||
|
||||
|
||||
@@ -420,6 +420,39 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
|
||||
}
|
||||
}
|
||||
|
||||
// Noise tests variant: identical to host_cg_multi_bit_programmable_bootstrap
|
||||
// but uses NOISE_TESTS keybundle mode.
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_cg_multi_bit_programmable_bootstrap_noise_tests(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, uint64_t const *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
auto lwe_chunk_size = buffer->lwe_chunk_size;
|
||||
|
||||
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
|
||||
lwe_offset += lwe_chunk_size) {
|
||||
|
||||
// Compute a keybundle with NOISE_TESTS mode instead of GENERIC
|
||||
execute_compute_keybundle_noise_tests<Torus, params>(
|
||||
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, level_count, lwe_offset);
|
||||
|
||||
execute_cg_external_product_loop<Torus, params>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
|
||||
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
|
||||
lut_stride);
|
||||
}
|
||||
}
|
||||
|
||||
// Verify if the grid size satisfies the cooperative group constraints
|
||||
template <typename Torus, class params>
|
||||
__host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
|
||||
@@ -484,7 +517,8 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
|
||||
|
||||
// Get the number of streaming multiprocessors
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
check_cuda_error(
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
|
||||
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
|
||||
}
|
||||
|
||||
|
||||
@@ -784,9 +784,9 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc_128(
|
||||
device_programmable_bootstrap_tbc_128<InputTorus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
full_sm)); // full_sm + minimum_sm_tbc));
|
||||
cudaFuncSetCacheConfig(
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_tbc_128<InputTorus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
cudaFuncCachePreferShared));
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_tbc_128<InputTorus, params, FULLSM>,
|
||||
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
|
||||
@@ -1271,7 +1271,8 @@ __host__ bool verify_cuda_programmable_bootstrap_128_cg_grid_size(
|
||||
|
||||
// Get the number of streaming multiprocessors
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
check_cuda_error(
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
|
||||
|
||||
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
|
||||
}
|
||||
|
||||
@@ -645,6 +645,103 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_64(void *stream,
|
||||
*buffer = nullptr;
|
||||
}
|
||||
|
||||
// Noise-tests-namespaced wrappers: delegate to the standard scratch/cleanup so
|
||||
// that callers using the noise-tests PBS variant have a consistent API.
|
||||
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
return scratch_cuda_multi_bit_programmable_bootstrap_64_async(
|
||||
stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer) {
|
||||
cleanup_cuda_multi_bit_programmable_bootstrap_64(stream, gpu_index,
|
||||
pbs_buffer);
|
||||
}
|
||||
|
||||
// Noise tests variant of the 64-bit multi-bit PBS, restricted to
|
||||
// polynomial_size=2048. The main difference is that the input
|
||||
// is assumed to be modulus switched before bootstrapping.
|
||||
void cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lut_vector_indexes, void const *lwe_array_in,
|
||||
void const *lwe_input_indexes, void const *bootstrapping_key,
|
||||
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
|
||||
uint32_t lut_stride) {
|
||||
|
||||
PANIC_IF_FALSE(num_samples == 1,
|
||||
"Cuda error (multi-bit PBS): num_samples (%d) should be 1",
|
||||
num_samples);
|
||||
|
||||
PANIC_IF_FALSE(base_log <= 64,
|
||||
"Cuda error (multi-bit PBS): base log (%d) should be <= 64",
|
||||
base_log);
|
||||
PANIC_IF_FALSE(polynomial_size == 2048,
|
||||
"Cuda error (multi-bit PBS noise tests): only polynomial "
|
||||
"size 2048 is supported, got %d.",
|
||||
polynomial_size);
|
||||
|
||||
pbs_buffer<uint64_t, MULTI_BIT> *buffer =
|
||||
(pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
|
||||
|
||||
switch (buffer->pbs_variant) {
|
||||
case PBS_VARIANT::TBC:
|
||||
#if CUDA_ARCH >= 900
|
||||
{
|
||||
host_tbc_multi_bit_programmable_bootstrap_noise_tests<uint64_t,
|
||||
Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lut_vector_indexes),
|
||||
static_cast<const uint64_t *>(lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const uint64_t *>(bootstrapping_key), buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_many_lut, lut_stride);
|
||||
} break;
|
||||
#else
|
||||
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
|
||||
#endif
|
||||
case PBS_VARIANT::CG:
|
||||
host_cg_multi_bit_programmable_bootstrap_noise_tests<uint64_t,
|
||||
Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lut_vector_indexes),
|
||||
static_cast<const uint64_t *>(lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const uint64_t *>(bootstrapping_key), buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case PBS_VARIANT::DEFAULT:
|
||||
host_multi_bit_programmable_bootstrap_noise_tests<uint64_t, Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lut_vector_indexes),
|
||||
static_cast<const uint64_t *>(lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const uint64_t *>(bootstrapping_key), buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes divisors of the product of num_sms (streaming multiprocessors on the
|
||||
* GPU) and max_blocks_per_sm (maximum active blocks per SM to launch
|
||||
|
||||
@@ -25,7 +25,8 @@ get_start_ith_ggsw_offset(uint32_t polynomial_size, int glwe_dimension,
|
||||
level_count;
|
||||
}
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
template <typename Torus, class params, sharedMemDegree SMD,
|
||||
bool runs_noise_test = false>
|
||||
__global__ void device_multi_bit_programmable_bootstrap_keybundle(
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
|
||||
@@ -55,9 +56,6 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
|
||||
|
||||
if (lwe_iteration < (lwe_dimension / grouping_factor)) {
|
||||
|
||||
const Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
|
||||
|
||||
double2 *keybundle = keybundle_array +
|
||||
// select the input
|
||||
input_idx * keybundle_size_per_input;
|
||||
@@ -86,10 +84,40 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
|
||||
// Precalculate the monomial degrees and store them in shared memory
|
||||
uint32_t *monomial_degrees = (uint32_t *)selected_memory;
|
||||
if (threadIdx.x < (1 << grouping_factor)) {
|
||||
const Torus *lwe_array_group =
|
||||
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
|
||||
monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
|
||||
lwe_array_group, threadIdx.x, grouping_factor);
|
||||
if constexpr (runs_noise_test == true) {
|
||||
// For noise tests the input array contains the input lwe but also the
|
||||
// modswitched results. This allows to avoid changing the accumulation
|
||||
// kernel for the noise tests since the input body will stay in the same
|
||||
// position. The layout of the input array is the following:
|
||||
// | input lwe | modswitched inputs |
|
||||
// | lwe size | lwe_size*grouping_factor |
|
||||
|
||||
// This offset allows to jump directly to the modswitched inputs,
|
||||
// skipping the input lwe
|
||||
const Torus modswitched_offset = lwe_dimension + 1;
|
||||
|
||||
const Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[input_idx] *
|
||||
(lwe_dimension / grouping_factor) *
|
||||
(1 << grouping_factor) +
|
||||
modswitched_offset];
|
||||
|
||||
const Torus *lwe_array_group =
|
||||
block_lwe_array_in + rev_lwe_iteration * (1 << grouping_factor);
|
||||
monomial_degrees[threadIdx.x] = lwe_array_group[threadIdx.x];
|
||||
|
||||
} else {
|
||||
// In production we calculate the monomial degrees on the fly, since
|
||||
// they are not stored in the input array.
|
||||
const Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
|
||||
|
||||
const Torus *lwe_array_group =
|
||||
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
|
||||
monomial_degrees[threadIdx.x] =
|
||||
calculates_monomial_degree<Torus, params>(
|
||||
lwe_array_group, threadIdx.x, grouping_factor);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
@@ -145,7 +173,8 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
|
||||
// Then we can just calculate the offset needed to apply this coefficients, and
|
||||
// the operation transforms into a pointwise vector multiplication, avoiding to
|
||||
// perform extra instructions other than MADD
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
template <typename Torus, class params, sharedMemDegree SMD,
|
||||
bool runs_noise_test = false>
|
||||
__global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
|
||||
@@ -219,10 +248,40 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
|
||||
uint32_t *monomial_degrees = (uint32_t *)selected_memory;
|
||||
|
||||
if (threadIdx.x < (1 << grouping_factor)) {
|
||||
const Torus *lwe_array_group =
|
||||
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
|
||||
monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
|
||||
lwe_array_group, threadIdx.x, grouping_factor);
|
||||
if constexpr (runs_noise_test == true) {
|
||||
// For noise tests the input array contains the input lwe but also the
|
||||
// modswitched results. This allows to avoid changing the accumulation
|
||||
// kernel for the noise tests since the input body will stay in the same
|
||||
// position. The layout of the input array is the following:
|
||||
// | input lwe | modswitched inputs |
|
||||
// | lwe size | lwe_size*grouping_factor |
|
||||
|
||||
// This offset allows to jump directly to the modswitched inputs,
|
||||
// skipping the input lwe
|
||||
const Torus modswitched_offset = lwe_dimension + 1;
|
||||
|
||||
const Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[input_idx] *
|
||||
(lwe_dimension / grouping_factor) *
|
||||
(1 << grouping_factor) +
|
||||
modswitched_offset];
|
||||
|
||||
const Torus *lwe_array_group =
|
||||
block_lwe_array_in + rev_lwe_iteration * (1 << grouping_factor);
|
||||
monomial_degrees[threadIdx.x] = lwe_array_group[threadIdx.x];
|
||||
|
||||
} else {
|
||||
// In production we calculate the monomial degrees on the fly, since
|
||||
// they are not stored in the input array.
|
||||
const Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
|
||||
|
||||
const Torus *lwe_array_group =
|
||||
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
|
||||
monomial_degrees[threadIdx.x] =
|
||||
calculates_monomial_degree<Torus, params>(
|
||||
lwe_array_group, threadIdx.x, grouping_factor);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
@@ -662,6 +721,7 @@ enum class MultiBitKeybundleLaunchMode {
|
||||
AUTO,
|
||||
GENERIC,
|
||||
SPECIALIZED_2_2,
|
||||
NOISE_TESTS,
|
||||
};
|
||||
|
||||
template <typename Torus, class params>
|
||||
@@ -726,30 +786,65 @@ __host__ void execute_compute_keybundle_with_mode(
|
||||
bool use_specialized =
|
||||
launch_mode == MultiBitKeybundleLaunchMode::SPECIALIZED_2_2 ||
|
||||
(launch_mode == MultiBitKeybundleLaunchMode::AUTO &&
|
||||
can_use_specialized) ||
|
||||
(launch_mode == MultiBitKeybundleLaunchMode::NOISE_TESTS &&
|
||||
can_use_specialized);
|
||||
bool use_noise_test_template =
|
||||
launch_mode == MultiBitKeybundleLaunchMode::NOISE_TESTS;
|
||||
if (use_specialized) {
|
||||
dim3 thds_new_keybundle(512, 1, 1);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, 3 * full_sm_keybundle));
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
check_cuda_error(cudaGetLastError());
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM><<<grid_keybundle, thds_new_keybundle,
|
||||
3 * full_sm_keybundle, stream>>>(
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
|
||||
lwe_dimension, lwe_offset, chunk_size, keybundle_size_per_input);
|
||||
if (use_noise_test_template) {
|
||||
// Set up the noise-test variant of the specialized 2_2 kernel
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM, true>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
3 * full_sm_keybundle));
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM, true>,
|
||||
cudaFuncCachePreferShared));
|
||||
check_cuda_error(cudaGetLastError());
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM, true>
|
||||
<<<grid_keybundle, thds_new_keybundle, 3 * full_sm_keybundle,
|
||||
stream>>>(lwe_array_in, lwe_input_indexes, keybundle_fft,
|
||||
bootstrapping_key, lwe_dimension, lwe_offset,
|
||||
chunk_size, keybundle_size_per_input);
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
3 * full_sm_keybundle));
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
check_cuda_error(cudaGetLastError());
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM><<<grid_keybundle, thds_new_keybundle,
|
||||
3 * full_sm_keybundle, stream>>>(
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
|
||||
lwe_dimension, lwe_offset, chunk_size, keybundle_size_per_input);
|
||||
}
|
||||
} else {
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM>
|
||||
<<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
|
||||
level_count, lwe_offset, chunk_size, keybundle_size_per_input,
|
||||
d_mem, 0);
|
||||
if (use_noise_test_template) {
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM,
|
||||
true>
|
||||
<<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft,
|
||||
bootstrapping_key, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, grouping_factor, level_count, lwe_offset,
|
||||
chunk_size, keybundle_size_per_input, d_mem, 0);
|
||||
} else {
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM>
|
||||
<<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft,
|
||||
bootstrapping_key, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, grouping_factor, level_count, lwe_offset,
|
||||
chunk_size, keybundle_size_per_input, d_mem, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
check_cuda_error(cudaGetLastError());
|
||||
@@ -796,6 +891,20 @@ __host__ void execute_compute_keybundle_2_2_specialized(
|
||||
grouping_factor, level_count, lwe_offset,
|
||||
MultiBitKeybundleLaunchMode::SPECIALIZED_2_2);
|
||||
}
|
||||
// Used only to run noise tests
|
||||
template <typename Torus, class params>
|
||||
__host__ void execute_compute_keybundle_noise_tests(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
|
||||
execute_compute_keybundle_with_mode<Torus, params>(
|
||||
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, level_count, lwe_offset,
|
||||
MultiBitKeybundleLaunchMode::NOISE_TESTS);
|
||||
}
|
||||
|
||||
template <typename Torus, class params, bool is_first_iter>
|
||||
__host__ void execute_step_one(
|
||||
@@ -955,4 +1064,62 @@ __host__ void host_multi_bit_programmable_bootstrap(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_multi_bit_programmable_bootstrap_noise_tests(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
auto lwe_chunk_size = buffer->lwe_chunk_size;
|
||||
|
||||
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
|
||||
lwe_offset += lwe_chunk_size) {
|
||||
|
||||
// Compute a keybundle with NOISE_TESTS mode to enable the specialized
|
||||
// runs_noise_test=true kernel variant for noise measurement
|
||||
execute_compute_keybundle_with_mode<Torus, params>(
|
||||
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, level_count, lwe_offset,
|
||||
MultiBitKeybundleLaunchMode::NOISE_TESTS);
|
||||
// Accumulate (same as standard path)
|
||||
uint32_t chunk_size =
|
||||
std::min((uint32_t)lwe_chunk_size,
|
||||
(lwe_dimension / grouping_factor) - lwe_offset);
|
||||
for (uint32_t j = 0; j < chunk_size; j++) {
|
||||
bool is_first_iter = (j + lwe_offset) == 0;
|
||||
bool is_last_iter =
|
||||
(j + lwe_offset) + 1 == (lwe_dimension / grouping_factor);
|
||||
if (is_first_iter) {
|
||||
execute_step_one<Torus, params, true>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, buffer, num_samples, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count);
|
||||
} else {
|
||||
execute_step_one<Torus, params, false>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, buffer, num_samples, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count);
|
||||
}
|
||||
|
||||
if (is_last_iter) {
|
||||
execute_step_two<Torus, params, true>(
|
||||
stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
|
||||
num_samples, glwe_dimension, polynomial_size, level_count, j,
|
||||
num_many_lut, lut_stride);
|
||||
} else {
|
||||
execute_step_two<Torus, params, false>(
|
||||
stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
|
||||
num_samples, glwe_dimension, polynomial_size, level_count, j,
|
||||
num_many_lut, lut_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // MULTIBIT_PBS_H
|
||||
|
||||
@@ -293,6 +293,80 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
|
||||
*buffer = nullptr;
|
||||
}
|
||||
|
||||
// Noise-tests-namespaced wrappers: delegate to the standard scratch/cleanup so
|
||||
// that callers using the noise-tests PBS128 variant have a consistent API.
|
||||
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
return scratch_cuda_multi_bit_programmable_bootstrap_128_async(
|
||||
stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_128(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer) {
|
||||
cleanup_cuda_multi_bit_programmable_bootstrap_128(stream, gpu_index,
|
||||
pbs_buffer);
|
||||
}
|
||||
|
||||
// Noise tests variant of the 128-bit multi-bit PBS, restricted to
|
||||
// polynomial_size=2048. The input is assumed to contain precomputed
|
||||
// modswitched values in the extended input array layout.
|
||||
void cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lwe_array_in, void const *lwe_input_indexes,
|
||||
void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
PANIC_IF_FALSE(num_samples == 1,
|
||||
"Cuda error (multi-bit PBS): num_samples (%d) should be 1",
|
||||
num_samples);
|
||||
PANIC_IF_FALSE(base_log <= 64,
|
||||
"Cuda error (multi-bit PBS): base log (%d) should be <= 64",
|
||||
base_log);
|
||||
PANIC_IF_FALSE(polynomial_size == 2048,
|
||||
"Cuda error (multi-bit PBS128 noise tests): only polynomial "
|
||||
"size 2048 is supported, got %d.",
|
||||
polynomial_size);
|
||||
|
||||
auto *buffer =
|
||||
reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(mem_ptr);
|
||||
switch (buffer->pbs_variant) {
|
||||
case PBS_VARIANT::CG:
|
||||
host_cg_multi_bit_programmable_bootstrap_noise_tests_128<uint64_t,
|
||||
Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<__uint128_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const __uint128_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const __uint128_t *>(bootstrapping_key), buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case PBS_VARIANT::DEFAULT:
|
||||
host_multi_bit_programmable_bootstrap_noise_tests_128<uint64_t,
|
||||
Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<__uint128_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const __uint128_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const __uint128_t *>(bootstrapping_key), buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes divisors of the product of num_sms (streaming multiprocessors on the
|
||||
* GPU) and max_blocks_per_sm (maximum active blocks per SM to launch
|
||||
|
||||
@@ -18,7 +18,8 @@ uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle(
|
||||
(size_t)2); // accumulator
|
||||
}
|
||||
|
||||
template <typename InputTorus, class params, sharedMemDegree SMD>
|
||||
template <typename InputTorus, class params, sharedMemDegree SMD,
|
||||
bool runs_noise_test = false>
|
||||
__global__ void device_multi_bit_programmable_bootstrap_keybundle_128(
|
||||
const InputTorus *__restrict__ lwe_array_in,
|
||||
const InputTorus *__restrict__ lwe_input_indexes, double *keybundle_array,
|
||||
@@ -80,11 +81,35 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_128(
|
||||
// Precalculate the monomial degrees and store them in shared memory
|
||||
uint32_t *monomial_degrees = (uint32_t *)selected_memory;
|
||||
if (threadIdx.x < (1 << grouping_factor)) {
|
||||
auto lwe_array_group =
|
||||
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
|
||||
monomial_degrees[threadIdx.x] =
|
||||
calculates_monomial_degree<InputTorus, params>(
|
||||
lwe_array_group, threadIdx.x, grouping_factor);
|
||||
if constexpr (runs_noise_test == true) {
|
||||
// For noise tests the input array contains the input lwe but also the
|
||||
// modswitched results. This allows to avoid changing the accumulation
|
||||
// kernel for the noise tests since the input body will stay in the same
|
||||
// position. The layout of the input array is the following:
|
||||
// | input lwe | modswitched inputs |
|
||||
// | lwe size | lwe_size*grouping_factor |
|
||||
|
||||
// This offset allows to jump directly to the modswitched inputs,
|
||||
// skipping the input lwe
|
||||
const InputTorus modswitched_offset = lwe_dimension + 1;
|
||||
|
||||
const InputTorus *block_lwe_array_in_noise =
|
||||
&lwe_array_in[lwe_input_indexes[input_idx] *
|
||||
(lwe_dimension / grouping_factor) *
|
||||
(1 << grouping_factor) +
|
||||
modswitched_offset];
|
||||
|
||||
const InputTorus *lwe_array_group =
|
||||
block_lwe_array_in_noise +
|
||||
rev_lwe_iteration * (1 << grouping_factor);
|
||||
monomial_degrees[threadIdx.x] = lwe_array_group[threadIdx.x];
|
||||
} else {
|
||||
auto lwe_array_group =
|
||||
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
|
||||
monomial_degrees[threadIdx.x] =
|
||||
calculates_monomial_degree<InputTorus, params>(
|
||||
lwe_array_group, threadIdx.x, grouping_factor);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
@@ -588,6 +613,74 @@ __host__ void execute_compute_keybundle_128(
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
// Used only to run noise tests: launches the keybundle kernel with the
|
||||
// runs_noise_test=true variant, which reads modswitched inputs from the
|
||||
// extended input array layout instead of computing them on-the-fly
|
||||
template <typename InputTorus, class params>
|
||||
__host__ void execute_compute_keybundle_noise_tests_128(
|
||||
cudaStream_t stream, uint32_t gpu_index, InputTorus const *lwe_array_in,
|
||||
InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
|
||||
pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t num_samples,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
auto lwe_chunk_size = buffer->lwe_chunk_size;
|
||||
uint64_t chunk_size = std::min(
|
||||
lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
|
||||
|
||||
uint64_t keybundle_size_per_input =
|
||||
lwe_chunk_size * level_count * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) * (polynomial_size / 2) * 4;
|
||||
|
||||
uint64_t full_sm_keybundle =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle<
|
||||
__uint128_t>(polynomial_size);
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
|
||||
auto d_mem = buffer->d_mem_keybundle;
|
||||
auto keybundle_fft = buffer->keybundle_fft;
|
||||
|
||||
dim3 grid_keybundle(num_samples * chunk_size,
|
||||
(glwe_dimension + 1) * (glwe_dimension + 1), level_count);
|
||||
dim3 thds(polynomial_size / params::opt, 1, 1);
|
||||
|
||||
if (max_shared_memory < full_sm_keybundle) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_keybundle_128<
|
||||
InputTorus, params, NOSM, true>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_keybundle_128<
|
||||
InputTorus, params, NOSM, true>,
|
||||
cudaFuncCachePreferShared));
|
||||
device_multi_bit_programmable_bootstrap_keybundle_128<InputTorus, params,
|
||||
NOSM, true>
|
||||
<<<grid_keybundle, thds, 0, stream>>>(
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
|
||||
level_count, lwe_offset, chunk_size, keybundle_size_per_input,
|
||||
d_mem, full_sm_keybundle);
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_keybundle_128<
|
||||
InputTorus, params, FULLSM, true>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_keybundle_128<
|
||||
InputTorus, params, FULLSM, true>,
|
||||
cudaFuncCachePreferShared));
|
||||
device_multi_bit_programmable_bootstrap_keybundle_128<InputTorus, params,
|
||||
FULLSM, true>
|
||||
<<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
|
||||
level_count, lwe_offset, chunk_size, keybundle_size_per_input,
|
||||
d_mem, 0);
|
||||
}
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename InputTorus, class params, bool is_first_iter>
|
||||
__host__ void execute_step_one_128(
|
||||
cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
|
||||
@@ -1119,46 +1212,47 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128(
|
||||
int max_active_blocks_per_sm;
|
||||
|
||||
if (max_shared_memory < partial_sm_cg_accumulate) {
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
|
||||
Torus, params, NOSM>,
|
||||
thds, 0);
|
||||
thds, 0));
|
||||
} else if (max_shared_memory < full_sm_cg_accumulate) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
|
||||
PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
|
||||
cudaFuncSetCacheConfig(
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
|
||||
PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
cudaFuncCachePreferShared));
|
||||
check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
|
||||
Torus, params, PARTIALSM>,
|
||||
thds, partial_sm_cg_accumulate);
|
||||
thds, partial_sm_cg_accumulate));
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
|
||||
FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
|
||||
cudaFuncSetCacheConfig(
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
|
||||
FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
cudaFuncCachePreferShared));
|
||||
check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
|
||||
Torus, params, FULLSM>,
|
||||
thds, full_sm_cg_accumulate);
|
||||
thds, full_sm_cg_accumulate));
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
// Get the number of streaming multiprocessors
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
check_cuda_error(
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
|
||||
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
|
||||
}
|
||||
|
||||
@@ -1199,4 +1293,96 @@ supports_cooperative_groups_on_multibit_programmable_bootstrap_128(
|
||||
}
|
||||
}
|
||||
|
||||
// Noise tests variant: identical to
|
||||
// host_cg_multi_bit_programmable_bootstrap_128 but uses the noise-test
|
||||
// keybundle (runs_noise_test=true) instead of the standard one.
|
||||
template <typename InputTorus, class params>
|
||||
__host__ void host_cg_multi_bit_programmable_bootstrap_noise_tests_128(
|
||||
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
|
||||
InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
|
||||
InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
|
||||
__uint128_t const *bootstrapping_key,
|
||||
pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
auto lwe_chunk_size = buffer->lwe_chunk_size;
|
||||
|
||||
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
|
||||
lwe_offset += lwe_chunk_size) {
|
||||
|
||||
// Compute a keybundle with the noise-test kernel variant
|
||||
// (runs_noise_test=true) to read precomputed modswitched values
|
||||
execute_compute_keybundle_noise_tests_128<InputTorus, params>(
|
||||
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, level_count, lwe_offset);
|
||||
|
||||
execute_cg_external_product_loop_128<InputTorus, params>(
|
||||
stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
lwe_offset, num_many_lut, lut_stride);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputTorus, class params>
|
||||
__host__ void host_multi_bit_programmable_bootstrap_noise_tests_128(
|
||||
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
|
||||
InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
|
||||
InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
|
||||
__uint128_t const *bootstrapping_key,
|
||||
pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
auto lwe_chunk_size = buffer->lwe_chunk_size;
|
||||
|
||||
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
|
||||
lwe_offset += lwe_chunk_size) {
|
||||
|
||||
// Compute a keybundle with the noise-test kernel variant
|
||||
// (runs_noise_test=true) to read precomputed modswitched values
|
||||
execute_compute_keybundle_noise_tests_128<InputTorus, params>(
|
||||
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, level_count, lwe_offset);
|
||||
|
||||
// Accumulate (same as standard path)
|
||||
uint64_t chunk_size =
|
||||
std::min((uint32_t)lwe_chunk_size,
|
||||
(lwe_dimension / grouping_factor) - lwe_offset);
|
||||
for (uint32_t j = 0; j < chunk_size; j++) {
|
||||
bool is_first_iter = (j + lwe_offset) == 0;
|
||||
bool is_last_iter =
|
||||
(j + lwe_offset) + 1 == (lwe_dimension / grouping_factor);
|
||||
if (is_first_iter) {
|
||||
execute_step_one_128<InputTorus, params, true>(
|
||||
stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
base_log, level_count);
|
||||
} else {
|
||||
execute_step_one_128<InputTorus, params, false>(
|
||||
stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
base_log, level_count);
|
||||
}
|
||||
|
||||
if (is_last_iter) {
|
||||
execute_step_two_128<InputTorus, params, true>(
|
||||
stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
|
||||
num_samples, glwe_dimension, polynomial_size, level_count, j,
|
||||
num_many_lut, lut_stride);
|
||||
} else {
|
||||
execute_step_two_128<InputTorus, params, false>(
|
||||
stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
|
||||
num_samples, glwe_dimension, polynomial_size, level_count, j,
|
||||
num_many_lut, lut_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif // PROGRAMMABLE_BOOTSTRAP_MULTIBIT_128_CUH
|
||||
|
||||
@@ -739,7 +739,8 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
|
||||
|
||||
// Get the number of streaming multiprocessors
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
check_cuda_error(
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
|
||||
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
|
||||
}
|
||||
|
||||
|
||||
@@ -795,6 +795,40 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap_2_2_specialized(
|
||||
MultiBitTbcLaunchMode::SPECIALIZED_2_2);
|
||||
}
|
||||
|
||||
// Noise tests variant: uses NOISE_TESTS keybundle mode for the keybundle step
|
||||
// while keeping the standard AUTO accumulate behaviour for the TBC loop.
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_tbc_multi_bit_programmable_bootstrap_noise_tests(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
auto lwe_chunk_size = buffer->lwe_chunk_size;
|
||||
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
|
||||
lwe_offset += lwe_chunk_size) {
|
||||
|
||||
// Keybundle with NOISE_TESTS mode; the TBC accumulate uses AUTO as usual
|
||||
execute_compute_keybundle_noise_tests<Torus, params>(
|
||||
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, level_count, lwe_offset);
|
||||
|
||||
// Accumulate (unchanged from standard TBC path)
|
||||
execute_tbc_external_product_loop<Torus, params>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
|
||||
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
|
||||
lut_stride, MultiBitTbcLaunchMode::AUTO);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size, uint32_t max_shared_memory) {
|
||||
|
||||
@@ -119,71 +119,73 @@ __host__ void host_expand_without_verification(
|
||||
streams.stream(0), streams.gpu_index(0), true);
|
||||
|
||||
if (mem_ptr->expand_kind == EXPAND_KIND::NO_CASTING) {
|
||||
// This path is added to mimic the CPU fallback behaviour for the no_casting
|
||||
// expand, which is needed for the noise sanity checks.
|
||||
host_lwe_expand<Torus, params>(streams.stream(0), streams.gpu_index(0),
|
||||
lwe_array_out, d_expand_jobs, num_lwes);
|
||||
return;
|
||||
}
|
||||
|
||||
host_lwe_expand<Torus, params>(streams.stream(0), streams.gpu_index(0),
|
||||
expanded_lwes, d_expand_jobs, num_lwes);
|
||||
} else {
|
||||
// This is our default path for the expand with casting if needed.
|
||||
host_lwe_expand<Torus, params>(streams.stream(0), streams.gpu_index(0),
|
||||
expanded_lwes, d_expand_jobs, num_lwes);
|
||||
|
||||
auto lwe_array_input = expanded_lwes;
|
||||
auto ksks = casting_keys;
|
||||
auto message_and_carry_extract_luts = mem_ptr->message_and_carry_extract_luts;
|
||||
auto lwe_array_input = expanded_lwes;
|
||||
auto ksks = casting_keys;
|
||||
auto message_and_carry_extract_luts =
|
||||
mem_ptr->message_and_carry_extract_luts;
|
||||
|
||||
auto lut = mem_ptr->message_and_carry_extract_luts;
|
||||
if (casting_key_type == SMALL_TO_BIG) {
|
||||
if (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK) {
|
||||
PANIC("SANITY_CHECK not supported for SMALL_TO_BIG casting");
|
||||
auto lut = mem_ptr->message_and_carry_extract_luts;
|
||||
if (casting_key_type == SMALL_TO_BIG) {
|
||||
if (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK) {
|
||||
PANIC("SANITY_CHECK not supported for SMALL_TO_BIG casting");
|
||||
}
|
||||
// Keyswitch from small to big key if needed
|
||||
auto ksed_small_to_big_expanded_lwes =
|
||||
mem_ptr->tmp_ksed_small_to_big_expanded_lwes;
|
||||
std::vector<Torus *> lwe_trivial_indexes_vec =
|
||||
lut->lwe_trivial_indexes_vec;
|
||||
|
||||
auto casting_params = mem_ptr->casting_params;
|
||||
auto casting_output_dimension = casting_params.big_lwe_dimension;
|
||||
auto casting_input_dimension = casting_params.small_lwe_dimension;
|
||||
auto casting_ks_level = casting_params.ks_level;
|
||||
auto casting_ks_base_log = casting_params.ks_base_log;
|
||||
|
||||
// apply keyswitch to BIG
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams.get_ith(0), ksed_small_to_big_expanded_lwes,
|
||||
lwe_trivial_indexes_vec[0], expanded_lwes, lwe_trivial_indexes_vec[0],
|
||||
casting_keys, casting_input_dimension, casting_output_dimension,
|
||||
casting_ks_base_log, casting_ks_level, num_lwes,
|
||||
lut->using_trivial_lwe_indexes, lut->ks_tmp_buf_vec);
|
||||
|
||||
// In this case, the next keyswitch will use the compute ksk
|
||||
ksks = compute_ksks;
|
||||
lwe_array_input = ksed_small_to_big_expanded_lwes;
|
||||
}
|
||||
// Keyswitch from small to big key if needed
|
||||
auto ksed_small_to_big_expanded_lwes =
|
||||
mem_ptr->tmp_ksed_small_to_big_expanded_lwes;
|
||||
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
|
||||
|
||||
auto casting_params = mem_ptr->casting_params;
|
||||
auto casting_output_dimension = casting_params.big_lwe_dimension;
|
||||
auto casting_input_dimension = casting_params.small_lwe_dimension;
|
||||
auto casting_ks_level = casting_params.ks_level;
|
||||
auto casting_ks_base_log = casting_params.ks_base_log;
|
||||
// Apply LUT
|
||||
cuda_memset_async(lwe_array_out, 0,
|
||||
safe_mul_sizeof<Torus>((size_t)(lwe_dimension + 1),
|
||||
(size_t)num_lwes, (size_t)2),
|
||||
streams.stream(0), streams.gpu_index(0));
|
||||
CudaRadixCiphertextFFI output;
|
||||
into_radix_ciphertext(&output, lwe_array_out, 2 * num_lwes, lwe_dimension);
|
||||
CudaRadixCiphertextFFI input;
|
||||
into_radix_ciphertext(&input, lwe_array_input, 2 * num_lwes, lwe_dimension);
|
||||
// This is a special case only for our noise sanity checks
|
||||
// If we are doing a SANITY_CHECK expand, we just apply the identity LUT
|
||||
// This replicates the CPU fallback behaviour of the casting expand
|
||||
auto final_lut = (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK
|
||||
? mem_ptr->identity_lut
|
||||
: message_and_carry_extract_luts);
|
||||
|
||||
// apply keyswitch to BIG
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams.get_ith(0), ksed_small_to_big_expanded_lwes,
|
||||
lwe_trivial_indexes_vec[0], expanded_lwes, lwe_trivial_indexes_vec[0],
|
||||
casting_keys, casting_input_dimension, casting_output_dimension,
|
||||
casting_ks_base_log, casting_ks_level, num_lwes,
|
||||
lut->using_trivial_lwe_indexes, lut->ks_tmp_buf_vec);
|
||||
|
||||
// In this case, the next keyswitch will use the compute ksk
|
||||
ksks = compute_ksks;
|
||||
lwe_array_input = ksed_small_to_big_expanded_lwes;
|
||||
}
|
||||
|
||||
// Apply LUT
|
||||
cuda_memset_async(lwe_array_out, 0,
|
||||
safe_mul_sizeof<Torus>((size_t)(lwe_dimension + 1),
|
||||
(size_t)num_lwes, (size_t)2),
|
||||
streams.stream(0), streams.gpu_index(0));
|
||||
CudaRadixCiphertextFFI output;
|
||||
into_radix_ciphertext(&output, lwe_array_out, 2 * num_lwes, lwe_dimension);
|
||||
CudaRadixCiphertextFFI input;
|
||||
into_radix_ciphertext(&input, lwe_array_input, 2 * num_lwes, lwe_dimension);
|
||||
// This is a special case only for our noise sanity checks
|
||||
// If we are doing a SANITY_CHECK expand, we just apply the identity LUT
|
||||
// This replicates the CPU fallback behaviour of the casting expand
|
||||
if (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK) {
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, &output, &input, bsks, ksks, mem_ptr->identity_lut,
|
||||
2 * num_lwes);
|
||||
return;
|
||||
}
|
||||
streams, &output, &input, bsks, ksks, final_lut, 2 * num_lwes);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, &output, &input, bsks, ksks, message_and_carry_extract_luts,
|
||||
2 * num_lwes);
|
||||
release_cpu_radix_ciphertext_async(&input);
|
||||
release_cpu_radix_ciphertext_async(&output);
|
||||
release_cpu_radix_ciphertext_async(&input);
|
||||
release_cpu_radix_ciphertext_async(&output);
|
||||
}
|
||||
compact_lwe_lists.release();
|
||||
}
|
||||
|
||||
|
||||
@@ -79,6 +79,30 @@ unsafe extern "C" {
|
||||
polynomial_size: u32,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_modulus_switch_multi_bit_64_async(
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
lwe_array_out: *mut ffi::c_void,
|
||||
lwe_array_in: *mut ffi::c_void,
|
||||
size: u32,
|
||||
log_modulus: u32,
|
||||
degree: u32,
|
||||
grouping_factor: u32,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_modulus_switch_multi_bit_128_async(
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
lwe_array_out: *mut ffi::c_void,
|
||||
lwe_array_in: *mut ffi::c_void,
|
||||
size: u32,
|
||||
log_modulus: u32,
|
||||
degree: u32,
|
||||
grouping_factor: u32,
|
||||
);
|
||||
}
|
||||
pub const PBS_TYPE_MULTI_BIT: PBS_TYPE = 0;
|
||||
pub const PBS_TYPE_CLASSICAL: PBS_TYPE = 1;
|
||||
pub type PBS_TYPE = ffi::c_uint;
|
||||
@@ -112,9 +136,6 @@ pub type Direction = ffi::c_uint;
|
||||
pub const BitValue_Zero: BitValue = 0;
|
||||
pub const BitValue_One: BitValue = 1;
|
||||
pub type BitValue = ffi::c_uint;
|
||||
pub const RERAND_MODE_RERAND_WITH_KS: RERAND_MODE = 0;
|
||||
pub const RERAND_MODE_RERAND_WITHOUT_KS: RERAND_MODE = 1;
|
||||
pub type RERAND_MODE = ffi::c_uint;
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
pub struct CudaStreamsFFI {
|
||||
@@ -2455,6 +2476,9 @@ unsafe extern "C" {
|
||||
glwe_index: u32,
|
||||
);
|
||||
}
|
||||
pub const RERAND_MODE_RERAND_WITH_KS: RERAND_MODE = 0;
|
||||
pub const RERAND_MODE_RERAND_WITHOUT_KS: RERAND_MODE = 1;
|
||||
pub type RERAND_MODE = ffi::c_uint;
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_rerand_64_async(
|
||||
streams: CudaStreamsFFI,
|
||||
@@ -2467,7 +2491,7 @@ unsafe extern "C" {
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
rerand_type: u32,
|
||||
rerand_type: RERAND_MODE,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -3367,6 +3391,48 @@ unsafe extern "C" {
|
||||
pbs_buffer: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
pbs_buffer: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
level_count: u32,
|
||||
input_lwe_ciphertext_count: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_64(
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
pbs_buffer: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
lwe_array_out: *mut ffi::c_void,
|
||||
lwe_output_indexes: *const ffi::c_void,
|
||||
lut_vector: *const ffi::c_void,
|
||||
lut_vector_indexes: *const ffi::c_void,
|
||||
lwe_array_in: *const ffi::c_void,
|
||||
lwe_input_indexes: *const ffi::c_void,
|
||||
bootstrapping_key: *const ffi::c_void,
|
||||
buffer: *mut i8,
|
||||
lwe_dimension: u32,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
grouping_factor: u32,
|
||||
base_log: u32,
|
||||
level_count: u32,
|
||||
num_samples: u32,
|
||||
num_many_lut: u32,
|
||||
lut_stride: u32,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_multi_bit_programmable_bootstrap_128_async(
|
||||
stream: *mut ffi::c_void,
|
||||
@@ -3408,3 +3474,44 @@ unsafe extern "C" {
|
||||
buffer: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
pbs_buffer: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
level_count: u32,
|
||||
input_lwe_ciphertext_count: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_128(
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
pbs_buffer: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
lwe_array_out: *mut ffi::c_void,
|
||||
lwe_output_indexes: *const ffi::c_void,
|
||||
lut_vector: *const ffi::c_void,
|
||||
lwe_array_in: *const ffi::c_void,
|
||||
lwe_input_indexes: *const ffi::c_void,
|
||||
bootstrapping_key: *const ffi::c_void,
|
||||
buffer: *mut i8,
|
||||
lwe_dimension: u32,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
grouping_factor: u32,
|
||||
base_log: u32,
|
||||
level_count: u32,
|
||||
num_samples: u32,
|
||||
num_many_lut: u32,
|
||||
lut_stride: u32,
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
BSD 3-Clause Clear License
|
||||
|
||||
Copyright © 2025 ZAMA.
|
||||
Copyright © 2026 ZAMA.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
|
||||
@@ -297,8 +297,8 @@ source setup_hpu.sh --config v80 -p
|
||||
# Run hlapi benches
|
||||
make test_high_level_api_hpu
|
||||
|
||||
# Run hlapi erc20 benches
|
||||
make bench_hlapi_erc20_hpu
|
||||
# Run hlapi erc7984 benches
|
||||
make bench_hlapi_erc7984_hpu
|
||||
|
||||
# Run integer level benches
|
||||
make bench_integer_hpu
|
||||
|
||||
@@ -109,7 +109,7 @@
|
||||
flush_behaviour = "Patient"
|
||||
flush = true
|
||||
|
||||
[firmware.op_cfg.by_op.ERC_20]
|
||||
[firmware.op_cfg.by_op.ERC_7984]
|
||||
fill_batch_fifo = true
|
||||
min_batch_size = false
|
||||
use_tiers = true
|
||||
|
||||
@@ -121,7 +121,7 @@
|
||||
flush_behaviour = "Patient"
|
||||
flush = true
|
||||
|
||||
[firmware.op_cfg.by_op.ERC_20]
|
||||
[firmware.op_cfg.by_op.ERC_7984]
|
||||
fill_batch_fifo = true
|
||||
min_batch_size = false
|
||||
use_tiers = true
|
||||
|
||||
@@ -230,7 +230,7 @@ iop!(
|
||||
[IOP_CMP -> "CMP_NEQ", opcode::CMP_NEQ],
|
||||
[IOP_CT_F_CT_BOOL -> "IF_THEN_ZERO", opcode::IF_THEN_ZERO],
|
||||
[IOP_CT_F_2CT_BOOL -> "IF_THEN_ELSE", opcode::IF_THEN_ELSE],
|
||||
[IOP_2CT_F_3CT -> "ERC_20", opcode::ERC_20],
|
||||
[IOP_2CT_F_3CT -> "ERC_7984", opcode::ERC_7984],
|
||||
[IOP_CT_F_CT -> "MEMCPY", opcode::MEMCPY],
|
||||
[IOP_CT_F_CT -> "ILOG2", opcode::ILOG2],
|
||||
[IOP_CT_F_CT -> "COUNT0", opcode::COUNT0],
|
||||
@@ -240,5 +240,5 @@ iop!(
|
||||
[IOP_CT_F_CT -> "TRAIL0", opcode::TRAIL0],
|
||||
[IOP_CT_F_CT -> "TRAIL1", opcode::TRAIL1],
|
||||
[IOP_NCT_F_2NCT -> "ADD_SIMD", opcode::ADD_SIMD],
|
||||
[IOP_2NCT_F_3NCT -> "ERC_20_SIMD", opcode::ERC_20_SIMD],
|
||||
[IOP_2NCT_F_3NCT -> "ERC_7984_SIMD", opcode::ERC_7984_SIMD],
|
||||
);
|
||||
|
||||
@@ -74,9 +74,9 @@ pub const IF_THEN_ZERO: u8 = 0xCA;
|
||||
pub const IF_THEN_ELSE: u8 = 0xCB;
|
||||
|
||||
// Custom algorithm
|
||||
// ERC20 -> Found xfer algorithm
|
||||
// ERC7984 -> Found xfer algorithm
|
||||
// 2Ct <- func(3Ct)
|
||||
pub const ERC_20: u8 = 0x80;
|
||||
pub const ERC_7984: u8 = 0x80;
|
||||
|
||||
// Count bits
|
||||
pub const COUNT0: u8 = 0x81;
|
||||
@@ -89,7 +89,7 @@ pub const TRAIL1: u8 = 0x87;
|
||||
|
||||
// SIMD for maximum throughput
|
||||
pub const ADD_SIMD: u8 = 0xF0;
|
||||
pub const ERC_20_SIMD: u8 = 0xF1;
|
||||
pub const ERC_7984_SIMD: u8 = 0xF1;
|
||||
//
|
||||
// Utility operations
|
||||
// Used to handle real clone of ciphertext already uploaded in the Hpu memory
|
||||
|
||||
@@ -31,7 +31,7 @@ crate::impl_fw!("Demo" [
|
||||
IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
|
||||
IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;
|
||||
|
||||
ERC_20 => fw_impl::ilp::iop_erc_20;
|
||||
ERC_7984 => fw_impl::ilp::iop_erc_7984;
|
||||
|
||||
CMP_GT => cmp_gt;
|
||||
CMP_GTE => cmp_gte;
|
||||
|
||||
@@ -61,7 +61,7 @@ crate::impl_fw!("Ilp" [
|
||||
IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
|
||||
IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;
|
||||
|
||||
ERC_20 => fw_impl::ilp::iop_erc_20;
|
||||
ERC_7984 => fw_impl::ilp::iop_erc_7984;
|
||||
|
||||
MEMCPY => fw_impl::ilp::iop_memcpy;
|
||||
|
||||
@@ -74,7 +74,7 @@ crate::impl_fw!("Ilp" [
|
||||
TRAIL1 => fw_impl::ilp_log::iop_trail1;
|
||||
// SIMD Implementations
|
||||
ADD_SIMD => fw_impl::llt::iop_add_simd;
|
||||
ERC_20_SIMD => fw_impl::llt::iop_erc_20_simd;
|
||||
ERC_7984_SIMD => fw_impl::llt::iop_erc_7984_simd;
|
||||
]);
|
||||
|
||||
#[instrument(level = "trace", skip(prog))]
|
||||
@@ -1296,13 +1296,13 @@ pub fn iop_if_then_else(prog: &mut Program) {
|
||||
});
|
||||
}
|
||||
|
||||
/// Implement erc_20 fund xfer
|
||||
/// Implement erc_7984 fund xfer
|
||||
/// Targeted algorithm is as follow:
|
||||
/// 1. Check that from has enough funds
|
||||
/// 2. Compute real_amount to xfer (i.e. amount or 0)
|
||||
/// 3. Compute new amount (from - new_amount, to + new_amount)
|
||||
#[instrument(level = "info", skip(prog))]
|
||||
pub fn iop_erc_20(prog: &mut Program) {
|
||||
pub fn iop_erc_7984(prog: &mut Program) {
|
||||
// Allocate metavariables:
|
||||
// Dest -> Operand
|
||||
let mut dst_from = prog.iop_template_var(OperandKind::Dst, 0);
|
||||
@@ -1314,7 +1314,7 @@ pub fn iop_erc_20(prog: &mut Program) {
|
||||
let src_amount = prog.iop_template_var(OperandKind::Src, 2);
|
||||
|
||||
// Add Comment header
|
||||
prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
|
||||
prog.push_comment("ERC_7984 (new_from, new_to) <- (from, to, amount)".to_string());
|
||||
|
||||
let props = prog.params();
|
||||
let tfhe_params: asm::DigitParameters = props.clone().into();
|
||||
|
||||
@@ -70,7 +70,7 @@ crate::impl_fw!("Llt" [
|
||||
IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
|
||||
IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;
|
||||
|
||||
ERC_20 => fw_impl::llt::iop_erc_20;
|
||||
ERC_7984 => fw_impl::llt::iop_erc_7984;
|
||||
MEMCPY => fw_impl::ilp::iop_memcpy;
|
||||
|
||||
COUNT0 => fw_impl::ilp_log::iop_count0;
|
||||
@@ -83,7 +83,7 @@ crate::impl_fw!("Llt" [
|
||||
|
||||
// SIMD Implementations
|
||||
ADD_SIMD => fw_impl::llt::iop_add_simd;
|
||||
ERC_20_SIMD => fw_impl::llt::iop_erc_20_simd;
|
||||
ERC_7984_SIMD => fw_impl::llt::iop_erc_7984_simd;
|
||||
]);
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
@@ -225,24 +225,24 @@ pub fn iop_muls(prog: &mut Program) {
|
||||
}
|
||||
|
||||
#[instrument(level = "trace", skip(prog))]
|
||||
pub fn iop_erc_20(prog: &mut Program) {
|
||||
pub fn iop_erc_7984(prog: &mut Program) {
|
||||
// Add Comment header
|
||||
prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
|
||||
prog.push_comment("ERC_7984 (new_from, new_to) <- (from, to, amount)".to_string());
|
||||
// TODO: Make sweep of kogge_blk_w
|
||||
// All these little parameters would be very handy to write an
|
||||
// exploration/compilation program which would try to minimize latency by
|
||||
// playing with these.
|
||||
iop_erc_20_rtl(prog, 0, Some(10)).add_to_prog(prog);
|
||||
iop_erc_7984_rtl(prog, 0, Some(10)).add_to_prog(prog);
|
||||
}
|
||||
|
||||
#[instrument(level = "trace", skip(prog))]
|
||||
pub fn iop_erc_20_simd(prog: &mut Program) {
|
||||
pub fn iop_erc_7984_simd(prog: &mut Program) {
|
||||
// Add Comment header
|
||||
prog.push_comment("ERC_20_SIMD (new_from, new_to) <- (from, to, amount)".to_string());
|
||||
prog.push_comment("ERC_7984_SIMD (new_from, new_to) <- (from, to, amount)".to_string());
|
||||
simd(
|
||||
prog,
|
||||
crate::asm::iop::SIMD_N,
|
||||
fw_impl::llt::iop_erc_20_rtl,
|
||||
fw_impl::llt::iop_erc_7984_rtl,
|
||||
None,
|
||||
);
|
||||
}
|
||||
@@ -379,7 +379,7 @@ pub fn iop_rotate_scalar_left(prog: &mut Program) {
|
||||
// Helper Functions
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/// Implement erc_20 fund xfer
|
||||
/// Implement erc_7984 fund xfer
|
||||
/// Targeted algorithm is as follow:
|
||||
/// 1. Check that from has enough funds
|
||||
/// 2. Compute real_amount to xfer (i.e. amount or 0)
|
||||
@@ -391,7 +391,7 @@ pub fn iop_rotate_scalar_left(prog: &mut Program) {
|
||||
/// (dst_from[0], dst_to[0], ..., dst_from[N-1], dst_to[N-1])
|
||||
/// Where N is the batch size
|
||||
#[instrument(level = "trace", skip(prog))]
|
||||
pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8, kogge_blk_w: Option<usize>) -> Rtl {
|
||||
pub fn iop_erc_7984_rtl(prog: &mut Program, batch_index: u8, kogge_blk_w: Option<usize>) -> Rtl {
|
||||
// Allocate metavariables:
|
||||
// Dest -> Operand
|
||||
let dst_from = prog.iop_template_var(OperandKind::Dst, 2 * batch_index);
|
||||
|
||||
@@ -24,7 +24,7 @@ bindgen.workspace = true
|
||||
[dependencies]
|
||||
ark-ec.workspace = true
|
||||
ark-ff.workspace = true
|
||||
tfhe-cuda-backend = { version = "=0.14.0", path = "../tfhe-cuda-backend" }
|
||||
tfhe-cuda-backend = { version = "0.14.0", path = "../tfhe-cuda-backend" }
|
||||
|
||||
[features]
|
||||
default = []
|
||||
|
||||
@@ -97,28 +97,23 @@ size_t pippenger_scratch_size_g2(uint32_t n, uint32_t gpu_index);
|
||||
// d_scalars: Device pointer to input BigInt scalars (array of n scalars)
|
||||
// n: Number of points/scalars
|
||||
// d_scratch: Caller-provided device scratch buffer for intermediate results
|
||||
// size_tracker: Reference for tracking GPU memory allocation sizes
|
||||
void point_msm_g1_async(cudaStream_t stream, uint32_t gpu_index,
|
||||
G1Projective *h_result, const G1Affine *d_points,
|
||||
const Scalar *d_scalars, uint32_t n,
|
||||
G1Projective *d_scratch, uint64_t &size_tracker,
|
||||
bool gpu_memory_allocated);
|
||||
G1Projective *d_scratch);
|
||||
|
||||
void point_msm_g1(cudaStream_t stream, uint32_t gpu_index,
|
||||
G1Projective *h_result, const G1Affine *d_points,
|
||||
const Scalar *d_scalars, uint32_t n, G1Projective *d_scratch,
|
||||
uint64_t &size_tracker, bool gpu_memory_allocated);
|
||||
const Scalar *d_scalars, uint32_t n, G1Projective *d_scratch);
|
||||
|
||||
// MSM for G2 points with BigInt scalars (projective result)
|
||||
// Result is written directly to a host pointer.
|
||||
void point_msm_g2_async(cudaStream_t stream, uint32_t gpu_index,
|
||||
G2ProjectivePoint *h_result, const G2Point *d_points,
|
||||
const Scalar *d_scalars, uint32_t n,
|
||||
G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
|
||||
bool gpu_memory_allocated);
|
||||
G2ProjectivePoint *d_scratch);
|
||||
|
||||
void point_msm_g2(cudaStream_t stream, uint32_t gpu_index,
|
||||
G2ProjectivePoint *h_result, const G2Point *d_points,
|
||||
const Scalar *d_scalars, uint32_t n,
|
||||
G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
|
||||
bool gpu_memory_allocated);
|
||||
G2ProjectivePoint *d_scratch);
|
||||
|
||||
@@ -8,17 +8,16 @@
|
||||
// Multi-Scalar Multiplication (MSM) using Pippenger algorithm for BLS12-446
|
||||
|
||||
// Forward declarations for Pippenger implementations
|
||||
void point_msm_g1_pippenger_async(
|
||||
cudaStream_t stream, uint32_t gpu_index, G1Projective *h_result,
|
||||
const G1Affine *d_points, const Scalar *d_scalars, uint32_t n,
|
||||
G1Projective *d_scratch, uint64_t &size_tracker, bool gpu_memory_allocated);
|
||||
void point_msm_g1_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
|
||||
G1Projective *h_result,
|
||||
const G1Affine *d_points,
|
||||
const Scalar *d_scalars, uint32_t n,
|
||||
G1Projective *d_scratch);
|
||||
void point_msm_g2_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
|
||||
G2ProjectivePoint *h_result,
|
||||
const G2Point *d_points,
|
||||
const Scalar *d_scalars, uint32_t n,
|
||||
G2ProjectivePoint *d_scratch,
|
||||
uint64_t &size_tracker,
|
||||
bool gpu_memory_allocated);
|
||||
G2ProjectivePoint *d_scratch);
|
||||
|
||||
// ============================================================================
|
||||
// Public MSM API for BigInt scalars
|
||||
@@ -29,11 +28,9 @@ void point_msm_g2_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
|
||||
void point_msm_g1_async(cudaStream_t stream, uint32_t gpu_index,
|
||||
G1Projective *h_result, const G1Affine *d_points,
|
||||
const Scalar *d_scalars, uint32_t n,
|
||||
G1Projective *d_scratch, uint64_t &size_tracker,
|
||||
bool gpu_memory_allocated) {
|
||||
G1Projective *d_scratch) {
|
||||
point_msm_g1_pippenger_async(stream, gpu_index, h_result, d_points, d_scalars,
|
||||
n, d_scratch, size_tracker,
|
||||
gpu_memory_allocated);
|
||||
n, d_scratch);
|
||||
}
|
||||
|
||||
// MSM with BigInt scalars for G2 (projective coordinates internally)
|
||||
@@ -41,19 +38,17 @@ void point_msm_g1_async(cudaStream_t stream, uint32_t gpu_index,
|
||||
void point_msm_g2_async(cudaStream_t stream, uint32_t gpu_index,
|
||||
G2ProjectivePoint *h_result, const G2Point *d_points,
|
||||
const Scalar *d_scalars, uint32_t n,
|
||||
G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
|
||||
bool gpu_memory_allocated) {
|
||||
G2ProjectivePoint *d_scratch) {
|
||||
point_msm_g2_pippenger_async(stream, gpu_index, h_result, d_points, d_scalars,
|
||||
n, d_scratch, size_tracker,
|
||||
gpu_memory_allocated);
|
||||
n, d_scratch);
|
||||
}
|
||||
|
||||
void point_msm_g1(cudaStream_t stream, uint32_t gpu_index,
|
||||
G1Projective *h_result, const G1Affine *d_points,
|
||||
const Scalar *d_scalars, uint32_t n, G1Projective *d_scratch,
|
||||
uint64_t &size_tracker, bool gpu_memory_allocated) {
|
||||
const Scalar *d_scalars, uint32_t n,
|
||||
G1Projective *d_scratch) {
|
||||
point_msm_g1_async(stream, gpu_index, h_result, d_points, d_scalars, n,
|
||||
d_scratch, size_tracker, gpu_memory_allocated);
|
||||
d_scratch);
|
||||
// The async impl already syncs internally before the CPU-side Horner phase,
|
||||
// so the stream is idle here. This sync is kept for defensive correctness.
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
@@ -62,10 +57,9 @@ void point_msm_g1(cudaStream_t stream, uint32_t gpu_index,
|
||||
void point_msm_g2(cudaStream_t stream, uint32_t gpu_index,
|
||||
G2ProjectivePoint *h_result, const G2Point *d_points,
|
||||
const Scalar *d_scalars, uint32_t n,
|
||||
G2ProjectivePoint *d_scratch, uint64_t &size_tracker,
|
||||
bool gpu_memory_allocated) {
|
||||
G2ProjectivePoint *d_scratch) {
|
||||
point_msm_g2_async(stream, gpu_index, h_result, d_points, d_scalars, n,
|
||||
d_scratch, size_tracker, gpu_memory_allocated);
|
||||
d_scratch);
|
||||
// See comment in point_msm_g1 above.
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
}
|
||||
|
||||
@@ -493,12 +493,13 @@ void horner_combine_cpu(ProjectiveType &result,
|
||||
// window sums. The caller is responsible for allocating and freeing this
|
||||
// buffer.
|
||||
template <typename AffineType, typename ProjectiveType>
|
||||
void point_msm_pippenger_impl_async(
|
||||
cudaStream_t stream, uint32_t gpu_index, ProjectiveType *h_result,
|
||||
const AffineType *d_points, const Scalar *d_scalars, uint32_t n,
|
||||
uint32_t threads_per_block, uint32_t window_size, uint32_t bucket_count,
|
||||
ProjectiveType *d_scratch, uint64_t &size_tracker,
|
||||
bool gpu_memory_allocated) {
|
||||
void point_msm_pippenger_impl_async(cudaStream_t stream, uint32_t gpu_index,
|
||||
ProjectiveType *h_result,
|
||||
const AffineType *d_points,
|
||||
const Scalar *d_scalars, uint32_t n,
|
||||
uint32_t threads_per_block,
|
||||
uint32_t window_size, uint32_t bucket_count,
|
||||
ProjectiveType *d_scratch) {
|
||||
using ProjectivePoint = Projective<ProjectiveType>;
|
||||
|
||||
if (n == 0) {
|
||||
@@ -705,16 +706,13 @@ void point_msm_g1_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
|
||||
G1Projective *h_result,
|
||||
const G1Affine *d_points,
|
||||
const Scalar *d_scalars, uint32_t n,
|
||||
G1Projective *d_scratch,
|
||||
uint64_t &size_tracker,
|
||||
bool gpu_memory_allocated) {
|
||||
G1Projective *d_scratch) {
|
||||
uint32_t window_size, bucket_count;
|
||||
get_g1_window_params(n, window_size, bucket_count);
|
||||
|
||||
point_msm_pippenger_impl_async<G1Affine, G1Projective>(
|
||||
stream, gpu_index, h_result, d_points, d_scalars, n,
|
||||
msm_threads_per_block<G1Affine>(n), window_size, bucket_count, d_scratch,
|
||||
size_tracker, gpu_memory_allocated);
|
||||
msm_threads_per_block<G1Affine>(n), window_size, bucket_count, d_scratch);
|
||||
}
|
||||
|
||||
// MSM with BigInt scalars for G2 (projective coordinates internally)
|
||||
@@ -724,14 +722,11 @@ void point_msm_g2_pippenger_async(cudaStream_t stream, uint32_t gpu_index,
|
||||
G2ProjectivePoint *h_result,
|
||||
const G2Point *d_points,
|
||||
const Scalar *d_scalars, uint32_t n,
|
||||
G2ProjectivePoint *d_scratch,
|
||||
uint64_t &size_tracker,
|
||||
bool gpu_memory_allocated) {
|
||||
G2ProjectivePoint *d_scratch) {
|
||||
uint32_t window_size, bucket_count;
|
||||
get_g2_window_params(n, window_size, bucket_count);
|
||||
|
||||
point_msm_pippenger_impl_async<G2Point, G2ProjectivePoint>(
|
||||
stream, gpu_index, h_result, d_points, d_scalars, n,
|
||||
msm_threads_per_block<G2Point>(n), window_size, bucket_count, d_scratch,
|
||||
size_tracker, gpu_memory_allocated);
|
||||
msm_threads_per_block<G2Point>(n), window_size, bucket_count, d_scratch);
|
||||
}
|
||||
|
||||
@@ -187,37 +187,82 @@ __host__ __device__ void fp_copy(Fp &dst, const Fp &src) {
|
||||
// "Raw" means without modular reduction - performs a + b and returns carry.
|
||||
// This is an internal helper used by fp_add() which handles reduction.
|
||||
__host__ __device__ UNSIGNED_LIMB fp_add_raw(Fp &c, const Fp &a, const Fp &b) {
|
||||
#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
|
||||
// PTX carry-chain: add.cc sets the hardware carry flag, addc.cc propagates
|
||||
// it. This replaces 2 software carry-detect comparisons per limb (~14 extra
|
||||
// instructions across 7 limbs) with zero-cost hardware flag propagation.
|
||||
uint64_t carry_out;
|
||||
asm("add.cc.u64 %0, %8, %15;\n\t" // c[0] = a[0] + b[0], set CF
|
||||
"addc.cc.u64 %1, %9, %16;\n\t" // c[1] = a[1] + b[1] + CF
|
||||
"addc.cc.u64 %2, %10, %17;\n\t" // c[2] = a[2] + b[2] + CF
|
||||
"addc.cc.u64 %3, %11, %18;\n\t" // c[3] = a[3] + b[3] + CF
|
||||
"addc.cc.u64 %4, %12, %19;\n\t" // c[4] = a[4] + b[4] + CF
|
||||
"addc.cc.u64 %5, %13, %20;\n\t" // c[5] = a[5] + b[5] + CF
|
||||
"addc.cc.u64 %6, %14, %21;\n\t" // c[6] = a[6] + b[6] + CF
|
||||
"addc.u64 %7, 0, 0;\n\t" // carry_out = 0 + 0 + CF
|
||||
: "=l"(c.limb[0]), "=l"(c.limb[1]), "=l"(c.limb[2]), "=l"(c.limb[3]),
|
||||
"=l"(c.limb[4]), "=l"(c.limb[5]), "=l"(c.limb[6]), "=l"(carry_out)
|
||||
: "l"(a.limb[0]), "l"(a.limb[1]), "l"(a.limb[2]), "l"(a.limb[3]),
|
||||
"l"(a.limb[4]), "l"(a.limb[5]), "l"(a.limb[6]), "l"(b.limb[0]),
|
||||
"l"(b.limb[1]), "l"(b.limb[2]), "l"(b.limb[3]), "l"(b.limb[4]),
|
||||
"l"(b.limb[5]), "l"(b.limb[6]));
|
||||
return carry_out;
|
||||
#else
|
||||
// Host path: portable software carry detection
|
||||
UNSIGNED_LIMB carry = 0;
|
||||
|
||||
for (int i = 0; i < FP_LIMBS; i++) {
|
||||
// Add with carry: c = a + b + carry
|
||||
UNSIGNED_LIMB sum = a.limb[i] + carry;
|
||||
carry = (sum < a.limb[i]) ? 1 : 0; // Check for overflow
|
||||
carry = (sum < a.limb[i]) ? 1 : 0;
|
||||
sum += b.limb[i];
|
||||
carry += (sum < b.limb[i]) ? 1 : 0; // Check for overflow
|
||||
carry += (sum < b.limb[i]) ? 1 : 0;
|
||||
c.limb[i] = sum;
|
||||
}
|
||||
|
||||
return carry;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Subtraction with borrow propagation
|
||||
// "Raw" means without modular reduction - performs a - b and returns borrow.
|
||||
// This is an internal helper used by fp_sub() which handles reduction.
|
||||
__host__ __device__ UNSIGNED_LIMB fp_sub_raw(Fp &c, const Fp &a, const Fp &b) {
|
||||
#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
|
||||
// PTX borrow-chain: sub.cc sets the hardware borrow flag, subc.cc propagates
|
||||
// it. Same benefit as fp_add_raw -- eliminates 2 comparisons per limb.
|
||||
uint64_t borrow_out;
|
||||
asm("sub.cc.u64 %0, %8, %15;\n\t" // c[0] = a[0] - b[0], set CF
|
||||
"subc.cc.u64 %1, %9, %16;\n\t" // c[1] = a[1] - b[1] - CF
|
||||
"subc.cc.u64 %2, %10, %17;\n\t" // c[2] = a[2] - b[2] - CF
|
||||
"subc.cc.u64 %3, %11, %18;\n\t" // c[3] = a[3] - b[3] - CF
|
||||
"subc.cc.u64 %4, %12, %19;\n\t" // c[4] = a[4] - b[4] - CF
|
||||
"subc.cc.u64 %5, %13, %20;\n\t" // c[5] = a[5] - b[5] - CF
|
||||
"subc.cc.u64 %6, %14, %21;\n\t" // c[6] = a[6] - b[6] - CF
|
||||
"subc.u64 %7, 0, 0;\n\t" // borrow_out = 0 - 0 - CF
|
||||
: "=l"(c.limb[0]), "=l"(c.limb[1]), "=l"(c.limb[2]), "=l"(c.limb[3]),
|
||||
"=l"(c.limb[4]), "=l"(c.limb[5]), "=l"(c.limb[6]), "=l"(borrow_out)
|
||||
: "l"(a.limb[0]), "l"(a.limb[1]), "l"(a.limb[2]), "l"(a.limb[3]),
|
||||
"l"(a.limb[4]), "l"(a.limb[5]), "l"(a.limb[6]), "l"(b.limb[0]),
|
||||
"l"(b.limb[1]), "l"(b.limb[2]), "l"(b.limb[3]), "l"(b.limb[4]),
|
||||
"l"(b.limb[5]), "l"(b.limb[6]));
|
||||
// subc.u64 with 0-0-CF produces 0 if no borrow, or 0xFFFFFFFFFFFFFFFF if
|
||||
// borrow. Normalize to 0/1 for callers that check (borrow != 0) or add it.
|
||||
return borrow_out & 1;
|
||||
#else
|
||||
// Host path: portable software borrow detection
|
||||
UNSIGNED_LIMB borrow = 0;
|
||||
|
||||
for (int i = 0; i < FP_LIMBS; i++) {
|
||||
// Subtract with borrow: c = a - b - borrow
|
||||
UNSIGNED_LIMB diff = a.limb[i] - borrow;
|
||||
borrow = (diff > a.limb[i]) ? 1 : 0; // Check for underflow
|
||||
borrow = (diff > a.limb[i]) ? 1 : 0;
|
||||
UNSIGNED_LIMB old_diff = diff;
|
||||
diff -= b.limb[i];
|
||||
borrow += (diff > old_diff) ? 1 : 0; // Check for underflow
|
||||
borrow += (diff > old_diff) ? 1 : 0;
|
||||
c.limb[i] = diff;
|
||||
}
|
||||
|
||||
return borrow;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Addition with modular reduction: c = (a + b) mod p
|
||||
@@ -226,7 +271,27 @@ __host__ __device__ void fp_add(Fp &c, const Fp &a, const Fp &b) {
|
||||
Fp sum;
|
||||
UNSIGNED_LIMB carry = fp_add_raw(sum, a, b);
|
||||
|
||||
// If there's a carry or sum >= MODULUS, we need to reduce
|
||||
#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
|
||||
// Branchless reduction: always compute sum - p, then select based on
|
||||
// whether reduction was needed. This avoids divergent branches that stall
|
||||
// warps when some threads need reduction and others don't.
|
||||
//
|
||||
// Decision logic:
|
||||
// carry=1 -> sum overflowed 448 bits, definitely >= p -> use reduced
|
||||
// carry=0, borrow=0 -> sum >= p in 448 bits -> use reduced
|
||||
// carry=0, borrow=1 -> sum < p -> use original sum
|
||||
// So: use_original = (!carry) & borrow
|
||||
Fp reduced;
|
||||
UNSIGNED_LIMB borrow = fp_sub_raw(reduced, sum, fp_modulus());
|
||||
UNSIGNED_LIMB use_original = ((carry ^ 1) & borrow);
|
||||
UNSIGNED_LIMB mask =
|
||||
-use_original; // all-ones if keep sum, all-zeros if keep reduced
|
||||
|
||||
for (int i = 0; i < FP_LIMBS; i++) {
|
||||
c.limb[i] = (sum.limb[i] & mask) | (reduced.limb[i] & ~mask);
|
||||
}
|
||||
#else
|
||||
// Host path: branching is fine on CPU (branch predictor handles it well)
|
||||
const Fp &p = fp_modulus();
|
||||
if (carry || fp_cmp(sum, p) != ComparisonType::Less) {
|
||||
Fp reduced;
|
||||
@@ -235,6 +300,7 @@ __host__ __device__ void fp_add(Fp &c, const Fp &a, const Fp &b) {
|
||||
} else {
|
||||
fp_copy(c, sum);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Subtraction with modular reduction: c = (a - b) mod p
|
||||
@@ -243,13 +309,28 @@ __host__ __device__ void fp_sub(Fp &c, const Fp &a, const Fp &b) {
|
||||
Fp diff;
|
||||
UNSIGNED_LIMB borrow = fp_sub_raw(diff, a, b);
|
||||
|
||||
// If there was a borrow, we need to add MODULUS
|
||||
#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
|
||||
// Branchless correction: always compute diff + p, select based on borrow.
|
||||
// Same rationale as fp_add -- avoids warp divergence.
|
||||
// borrow=1 -> a < b, need to add p -> use corrected
|
||||
// borrow=0 -> a >= b, result is valid -> use diff
|
||||
Fp corrected;
|
||||
fp_add_raw(corrected, diff, fp_modulus());
|
||||
UNSIGNED_LIMB mask =
|
||||
-borrow; // all-ones if borrow (use corrected), all-zeros if not
|
||||
|
||||
for (int i = 0; i < FP_LIMBS; i++) {
|
||||
c.limb[i] = (corrected.limb[i] & mask) | (diff.limb[i] & ~mask);
|
||||
}
|
||||
#else
|
||||
// Host path: branching is fine on CPU
|
||||
const Fp &p = fp_modulus();
|
||||
if (borrow) {
|
||||
fp_add_raw(c, diff, p);
|
||||
} else {
|
||||
fp_copy(c, diff);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Small-constant multiplication via addition chains.
|
||||
@@ -453,23 +534,223 @@ __host__ __device__ void fp_mont_reduce(Fp &c, const UNSIGNED_LIMB *a) {
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// PTX-accelerated CIOS Montgomery multiplication (device path)
|
||||
// ============================================================================
|
||||
// The CIOS algorithm for 7 x 64-bit limbs executes 98 multiply-accumulate
|
||||
// steps across 7 outer iterations. Each step computes:
|
||||
// (carry, t[j]) = t[j] + a[j] * b_i + carry
|
||||
// which is a 64x64->128 multiply plus a three-operand addition with carry.
|
||||
//
|
||||
// The C++ path uses software carry detection: carry = (sum < old) ? 1 : 0.
|
||||
// The PTX path below uses hardware carry flags via the .cc suffix:
|
||||
// - mul.lo.u64 / mul.hi.u64 : 64x64->128 wide multiply
|
||||
// - add.cc.u64 / addc.u64 : addition chain with hardware carry flag
|
||||
//
|
||||
// Each multiply-accumulate step uses 6 PTX instructions instead of ~10+ in
|
||||
// the software-carry version. The 7 outer iterations are fully unrolled, and
|
||||
// the limb-shift loop (t[j] = t[j+1]) is eliminated by register renaming.
|
||||
//
|
||||
// REGISTER ALIASING NOTE: All PTX temporaries (_lo, _hi) are declared as
|
||||
// .reg inside the asm block. This prevents nvcc's register allocator from
|
||||
// aliasing them with C operands (t_j, carry), which was the root cause of
|
||||
// previous correctness bugs where "+l" outputs could share registers with
|
||||
// "l" inputs in the same asm statement.
|
||||
// ============================================================================
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
#if LIMB_BITS_CONFIG == 64
|
||||
|
||||
// Multiply-accumulate one limb: (carry_out, t_j) = t_j + a_j * b_i + carry_in
|
||||
//
|
||||
// All intermediates (_lo, _hi) are PTX .reg temporaries inside a { } scope
|
||||
// block to avoid: (1) nvcc register aliasing between C operands, and (2)
|
||||
// duplicate .reg definitions when the macro is expanded multiple times.
|
||||
// The 6-instruction sequence:
|
||||
// mul.lo.u64 _lo, a_j, b_i -- low 64 bits of product
|
||||
// mul.hi.u64 _hi, a_j, b_i -- high 64 bits of product
|
||||
// add.cc.u64 t_j, t_j, _lo -- t_j += _lo, set CF
|
||||
// addc.u64 _hi, _hi, 0 -- _hi += CF
|
||||
// add.cc.u64 t_j, t_j, carry -- t_j += carry_in, set CF
|
||||
// addc.u64 carry, _hi, 0 -- carry_out = _hi + CF
|
||||
#define LIMB_MACC(t_j, carry, a_j, b_i) \
|
||||
asm volatile("{\n\t" \
|
||||
".reg .u64 _lo, _hi;\n\t" \
|
||||
"mul.lo.u64 _lo, %2, %3;\n\t" \
|
||||
"mul.hi.u64 _hi, %2, %3;\n\t" \
|
||||
"add.cc.u64 %0, %0, _lo;\n\t" \
|
||||
"addc.u64 _hi, _hi, 0;\n\t" \
|
||||
"add.cc.u64 %0, %0, %1;\n\t" \
|
||||
"addc.u64 %1, _hi, 0;\n\t" \
|
||||
"}\n\t" \
|
||||
: "+l"(t_j), "+l"(carry) \
|
||||
: "l"(a_j), "l"(b_i))
|
||||
|
||||
// Single CIOS iteration: multiply-accumulate, reduce, and shift.
|
||||
//
|
||||
// Computes:
|
||||
// 1. t += a * b_i (7 limb multiply-accumulate with carry chain)
|
||||
// 2. m = t[0] * p_prime (Montgomery reduction factor)
|
||||
// 3. t += m * p (reduction, zeros out t[0])
|
||||
// 4. Shift t right by one limb (via register renaming into r0..r7)
|
||||
//
|
||||
// The macro lets the compiler allocate registers across all 7 unrolled
|
||||
// iterations, avoiding spills to local memory.
|
||||
#define CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, \
|
||||
a5, a6, b_i, p0, p1, p2, p3, p4, p5, p6, p_prime, \
|
||||
r0, r1, r2, r3, r4, r5, r6, r7) \
|
||||
do { \
|
||||
uint64_t _carry = 0; \
|
||||
/* Step 1: t += a * b_i */ \
|
||||
LIMB_MACC(t0, _carry, a0, b_i); \
|
||||
LIMB_MACC(t1, _carry, a1, b_i); \
|
||||
LIMB_MACC(t2, _carry, a2, b_i); \
|
||||
LIMB_MACC(t3, _carry, a3, b_i); \
|
||||
LIMB_MACC(t4, _carry, a4, b_i); \
|
||||
LIMB_MACC(t5, _carry, a5, b_i); \
|
||||
LIMB_MACC(t6, _carry, a6, b_i); \
|
||||
/* Accumulate final carry into overflow limb t7 */ \
|
||||
uint64_t _overflow; \
|
||||
asm("add.cc.u64 %0, %0, %2;\n\t" \
|
||||
"addc.u64 %1, 0, 0;\n\t" \
|
||||
: "+l"(t7), "=l"(_overflow) \
|
||||
: "l"(_carry)); \
|
||||
\
|
||||
/* Step 2: m = t0 * p_prime mod 2^64 */ \
|
||||
uint64_t _m = t0 * p_prime; \
|
||||
\
|
||||
/* Step 3: t += m * p (zeros out t0) */ \
|
||||
_carry = 0; \
|
||||
LIMB_MACC(t0, _carry, _m, p0); \
|
||||
LIMB_MACC(t1, _carry, _m, p1); \
|
||||
LIMB_MACC(t2, _carry, _m, p2); \
|
||||
LIMB_MACC(t3, _carry, _m, p3); \
|
||||
LIMB_MACC(t4, _carry, _m, p4); \
|
||||
LIMB_MACC(t5, _carry, _m, p5); \
|
||||
LIMB_MACC(t6, _carry, _m, p6); \
|
||||
/* Finalize overflow: t7 = t7 + _carry + _overflow */ \
|
||||
/* Plain adds (no carry chain) -- the CIOS invariant guarantees this */ \
|
||||
/* sum fits in 64 bits so intermediate overflow does not matter. */ \
|
||||
t7 += _carry; \
|
||||
t7 += _overflow; \
|
||||
\
|
||||
/* Step 4: Shift right by one limb via register renaming */ \
|
||||
/* t0 is now zero (by construction of m), discard it */ \
|
||||
r0 = t1; \
|
||||
r1 = t2; \
|
||||
r2 = t3; \
|
||||
r3 = t4; \
|
||||
r4 = t5; \
|
||||
r5 = t6; \
|
||||
r6 = t7; \
|
||||
r7 = 0; \
|
||||
} while (0)
|
||||
|
||||
__device__ __noinline__ void fp_mont_mul_cios_ptx(Fp &c, const Fp &a,
|
||||
const Fp &b) {
|
||||
const uint64_t p0 = DEVICE_MODULUS.limb[0];
|
||||
const uint64_t p1 = DEVICE_MODULUS.limb[1];
|
||||
const uint64_t p2 = DEVICE_MODULUS.limb[2];
|
||||
const uint64_t p3 = DEVICE_MODULUS.limb[3];
|
||||
const uint64_t p4 = DEVICE_MODULUS.limb[4];
|
||||
const uint64_t p5 = DEVICE_MODULUS.limb[5];
|
||||
const uint64_t p6 = DEVICE_MODULUS.limb[6];
|
||||
const uint64_t pp = DEVICE_P_PRIME;
|
||||
|
||||
const uint64_t a0 = a.limb[0], a1 = a.limb[1], a2 = a.limb[2];
|
||||
const uint64_t a3 = a.limb[3], a4 = a.limb[4], a5 = a.limb[5];
|
||||
const uint64_t a6 = a.limb[6];
|
||||
|
||||
// Accumulator: 7 limbs + 1 overflow, initialized to zero
|
||||
uint64_t t0 = 0, t1 = 0, t2 = 0, t3 = 0;
|
||||
uint64_t t4 = 0, t5 = 0, t6 = 0, t7 = 0;
|
||||
|
||||
// 7 fully-unrolled CIOS iterations with register renaming for the shift.
|
||||
// Each iteration processes one limb of b, accumulates a*b[i], reduces,
|
||||
// and shifts. The output registers become the input for the next iteration.
|
||||
|
||||
CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
|
||||
b.limb[0], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
|
||||
t4, t5, t6, t7);
|
||||
|
||||
CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
|
||||
b.limb[1], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
|
||||
t4, t5, t6, t7);
|
||||
|
||||
CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
|
||||
b.limb[2], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
|
||||
t4, t5, t6, t7);
|
||||
|
||||
CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
|
||||
b.limb[3], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
|
||||
t4, t5, t6, t7);
|
||||
|
||||
CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
|
||||
b.limb[4], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
|
||||
t4, t5, t6, t7);
|
||||
|
||||
CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
|
||||
b.limb[5], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
|
||||
t4, t5, t6, t7);
|
||||
|
||||
CIOS_ITERATION_PTX(t0, t1, t2, t3, t4, t5, t6, t7, a0, a1, a2, a3, a4, a5, a6,
|
||||
b.limb[6], p0, p1, p2, p3, p4, p5, p6, pp, t0, t1, t2, t3,
|
||||
t4, t5, t6, t7);
|
||||
|
||||
// Final reduction: if t[0..7] >= p (extended to 8 limbs), subtract p.
|
||||
// Compute (t[0..6] - p[0..6]) with borrow, then subtract borrow from t7.
|
||||
// If t7 after subtraction is non-negative, the reduced result is valid;
|
||||
// otherwise the original t[0..6] is already in [0, p).
|
||||
uint64_t r0, r1, r2, r3, r4, r5, r6, mask;
|
||||
asm("sub.cc.u64 %0, %8, %15;\n\t" // r0 = t0 - p0
|
||||
"subc.cc.u64 %1, %9, %16;\n\t" // r1 = t1 - p1 - borrow
|
||||
"subc.cc.u64 %2, %10, %17;\n\t" // r2 = t2 - p2 - borrow
|
||||
"subc.cc.u64 %3, %11, %18;\n\t" // r3 = t3 - p3 - borrow
|
||||
"subc.cc.u64 %4, %12, %19;\n\t" // r4 = t4 - p4 - borrow
|
||||
"subc.cc.u64 %5, %13, %20;\n\t" // r5 = t5 - p5 - borrow
|
||||
"subc.cc.u64 %6, %14, %21;\n\t" // r6 = t6 - p6 - borrow
|
||||
"subc.u64 %7, %22, 0;\n\t" // mask_src = t7 - 0 - borrow
|
||||
"shr.s64 %7, %7, 63;\n\t" // mask = sign-extend: -1 if negative, 0 if
|
||||
// >= 0
|
||||
: "=l"(r0), "=l"(r1), "=l"(r2), "=l"(r3), "=l"(r4), "=l"(r5), "=l"(r6),
|
||||
"=l"(mask)
|
||||
: "l"(t0), "l"(t1), "l"(t2), "l"(t3), "l"(t4), "l"(t5), "l"(t6), "l"(p0),
|
||||
"l"(p1), "l"(p2), "l"(p3), "l"(p4), "l"(p5), "l"(p6), "l"(t7));
|
||||
|
||||
// Branchless selection:
|
||||
// mask = 0 -> t >= p (use reduced r[0..6])
|
||||
// mask = -1 -> t < p (keep original t[0..6])
|
||||
c.limb[0] = (t0 & mask) | (r0 & ~mask);
|
||||
c.limb[1] = (t1 & mask) | (r1 & ~mask);
|
||||
c.limb[2] = (t2 & mask) | (r2 & ~mask);
|
||||
c.limb[3] = (t3 & mask) | (r3 & ~mask);
|
||||
c.limb[4] = (t4 & mask) | (r4 & ~mask);
|
||||
c.limb[5] = (t5 & mask) | (r5 & ~mask);
|
||||
c.limb[6] = (t6 & mask) | (r6 & ~mask);
|
||||
}
|
||||
|
||||
#undef LIMB_MACC
|
||||
#undef CIOS_ITERATION_PTX
|
||||
|
||||
#endif // LIMB_BITS_CONFIG == 64
|
||||
#endif // __CUDA_ARCH__
|
||||
|
||||
// CIOS (Coarsely Integrated Operand Scanning) Montgomery multiplication
|
||||
// Fuses multiplication and reduction in a single pass for better efficiency.
|
||||
// Uses only FP_LIMBS+1 limbs of working space instead of 2*FP_LIMBS.
|
||||
// Both a and b are in Montgomery form, result is in Montgomery form.
|
||||
__host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
|
||||
#if defined(__CUDA_ARCH__) && LIMB_BITS_CONFIG == 64
|
||||
// Device path: fully unrolled PTX with hardware carry flags
|
||||
fp_mont_mul_cios_ptx(c, a, b);
|
||||
#else
|
||||
// Host path: portable C++ implementation
|
||||
const Fp &p = fp_modulus();
|
||||
UNSIGNED_LIMB p_prime = fp_p_prime();
|
||||
|
||||
// Working array: only n+1 limbs needed (vs 2n for separate mul+reduce)
|
||||
UNSIGNED_LIMB t[FP_LIMBS + 1];
|
||||
#ifdef __CUDA_ARCH__
|
||||
for (int i = 0; i < FP_LIMBS + 1; i++) {
|
||||
t[i] = 0;
|
||||
}
|
||||
#else
|
||||
memset(t, 0, (FP_LIMBS + 1) * sizeof(UNSIGNED_LIMB));
|
||||
#endif
|
||||
|
||||
// Main CIOS loop: for each limb of b
|
||||
for (int i = 0; i < FP_LIMBS; i++) {
|
||||
@@ -529,14 +810,7 @@ __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
|
||||
}
|
||||
|
||||
// Copy result to output
|
||||
#ifdef __CUDA_ARCH__
|
||||
#pragma unroll
|
||||
for (int i = 0; i < FP_LIMBS; i++) {
|
||||
c.limb[i] = t[i];
|
||||
}
|
||||
#else
|
||||
memcpy(&c.limb[0], t, FP_LIMBS * sizeof(UNSIGNED_LIMB));
|
||||
#endif
|
||||
|
||||
// Final reduction: if result >= p or there's overflow, subtract p
|
||||
if (t[FP_LIMBS] != 0 || fp_cmp(c, p) != ComparisonType::Less) {
|
||||
@@ -545,6 +819,7 @@ __host__ __device__ void fp_mont_mul_cios(Fp &c, const Fp &a, const Fp &b) {
|
||||
fp_copy(c, reduced);
|
||||
}
|
||||
// Result is in Montgomery form
|
||||
#endif
|
||||
}
|
||||
|
||||
// Montgomery multiplication: c = (a * b * R_INV) mod p
|
||||
|
||||
@@ -23,7 +23,8 @@ set(ZK_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../src)
|
||||
set(ZK_PRIMITIVES_DIR ${ZK_SRC_DIR}/primitives)
|
||||
|
||||
# Build device library from tfhe-cuda-backend
|
||||
add_library(tfhe_device_bench STATIC ${TFHE_CUDA_BACKEND_DIR}/src/device.cu)
|
||||
add_library(tfhe_device_bench STATIC ${TFHE_CUDA_BACKEND_DIR}/src/device.cu
|
||||
${TFHE_CUDA_BACKEND_DIR}/src/utils/helper_profile.cu)
|
||||
set_target_properties(
|
||||
tfhe_device_bench
|
||||
PROPERTIES CUDA_SEPARABLE_COMPILATION ON
|
||||
|
||||
@@ -140,14 +140,14 @@ static void BM_G1_MSM(benchmark::State &state) {
|
||||
// Warm-up iterations
|
||||
for (int i = 0; i < WARMUP_ITERATIONS; i++) {
|
||||
point_msm_g1_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
|
||||
d_scalars, n, d_scratch, size_tracker, true);
|
||||
d_scalars, n, d_scratch);
|
||||
}
|
||||
cuda_synchronize_stream(g_benchmark_stream, g_gpu_index);
|
||||
|
||||
// Benchmark loop: only measure the MSM computation, no memory operations
|
||||
for (auto _ : state) {
|
||||
point_msm_g1_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
|
||||
d_scalars, n, d_scratch, size_tracker, true);
|
||||
d_scalars, n, d_scratch);
|
||||
benchmark::ClobberMemory();
|
||||
}
|
||||
|
||||
@@ -221,14 +221,14 @@ static void BM_G2_MSM(benchmark::State &state) {
|
||||
// Warm-up iterations
|
||||
for (int i = 0; i < WARMUP_ITERATIONS; i++) {
|
||||
point_msm_g2_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
|
||||
d_scalars, n, d_scratch, size_tracker, true);
|
||||
d_scalars, n, d_scratch);
|
||||
}
|
||||
cuda_synchronize_stream(g_benchmark_stream, g_gpu_index);
|
||||
|
||||
// Benchmark loop: only measure the MSM computation, no memory operations
|
||||
for (auto _ : state) {
|
||||
point_msm_g2_async(g_benchmark_stream, g_gpu_index, &h_result, d_points,
|
||||
d_scalars, n, d_scratch, size_tracker, true);
|
||||
d_scalars, n, d_scratch);
|
||||
benchmark::ClobberMemory();
|
||||
}
|
||||
|
||||
|
||||
@@ -20,7 +20,8 @@ set(ZK_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../src)
|
||||
set(ZK_PRIMITIVES_DIR ${ZK_SRC_DIR}/primitives)
|
||||
|
||||
# Build device library from tfhe-cuda-backend
|
||||
add_library(tfhe_device STATIC ${TFHE_CUDA_BACKEND_DIR}/src/device.cu)
|
||||
add_library(tfhe_device STATIC ${TFHE_CUDA_BACKEND_DIR}/src/device.cu
|
||||
${TFHE_CUDA_BACKEND_DIR}/src/utils/helper_profile.cu)
|
||||
set_target_properties(
|
||||
tfhe_device
|
||||
PROPERTIES CUDA_SEPARABLE_COMPILATION ON
|
||||
|
||||
@@ -13,8 +13,8 @@
|
||||
// ./build/tests_and_benchmarks/tests/basic/basic_curve_ops
|
||||
|
||||
#include "curve.h"
|
||||
#include "device.h"
|
||||
#include "fp.h"
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
|
||||
@@ -24,7 +24,7 @@ int main() {
|
||||
// (non-Montgomery) form. Convert to Montgomery, then lift to projective for
|
||||
// host-side arithmetic.
|
||||
const G1Affine &gen_normal = g1_generator();
|
||||
assert(!g1_is_infinity(gen_normal));
|
||||
PANIC_IF_FALSE(!g1_is_infinity(gen_normal), "generator must not be infinity");
|
||||
|
||||
G1Affine gen_affine = gen_normal;
|
||||
point_to_montgomery_inplace(gen_affine);
|
||||
@@ -37,21 +37,21 @@ int main() {
|
||||
|
||||
// G + (-G) = identity (Z = 0 in the projective convention)
|
||||
G1Projective identity = G + neg_G;
|
||||
assert(fp_is_zero(identity.Z));
|
||||
PANIC_IF_FALSE(fp_is_zero(identity.Z), "G + (-G) must be identity (Z = 0)");
|
||||
printf("Negation (-G) and G + (-G) = identity: OK\n");
|
||||
|
||||
// ---- Addition: 2*G = G + G, 3*G = 2*G + G ----
|
||||
G1Projective two_G = G + G;
|
||||
assert(!(two_G == G1Projective())); // not the identity
|
||||
PANIC_IF_FALSE(!(two_G == G1Projective{}), "2*G must not be identity");
|
||||
|
||||
G1Projective three_G = two_G + G;
|
||||
assert(!(three_G == G1Projective()));
|
||||
PANIC_IF_FALSE(!(three_G == G1Projective{}), "3*G must not be identity");
|
||||
printf("Addition (2*G, 3*G): OK\n");
|
||||
|
||||
// ---- Compound assignment: G += G ----
|
||||
G1Projective acc = G;
|
||||
acc += G; // acc = 2*G
|
||||
assert(acc == two_G);
|
||||
PANIC_IF_FALSE(acc == two_G, "G += G must equal 2*G");
|
||||
printf("Compound assignment (+=): OK\n");
|
||||
|
||||
// ---- Scalar multiplication: 3*G using Scalar type ----
|
||||
@@ -61,19 +61,22 @@ int main() {
|
||||
scalar_3.limb[0] = 3;
|
||||
|
||||
G1Projective three_G_via_scalar = G * scalar_3;
|
||||
assert(!(three_G_via_scalar == G1Projective()));
|
||||
PANIC_IF_FALSE(!(three_G_via_scalar == G1Projective{}),
|
||||
"3*G via scalar must not be identity");
|
||||
|
||||
// Normalise both to Z = 1 (Montgomery) before comparing coordinates.
|
||||
normalize_projective_g1(three_G);
|
||||
normalize_projective_g1(three_G_via_scalar);
|
||||
assert(three_G == three_G_via_scalar);
|
||||
PANIC_IF_FALSE(three_G == three_G_via_scalar,
|
||||
"3*G via addition must equal 3*G via scalar multiply");
|
||||
printf("Scalar multiplication (3*G == G + G + G): OK\n");
|
||||
|
||||
// ---- Projective -> affine conversion ----
|
||||
// projective_to_affine_g1 keeps coordinates in Montgomery form.
|
||||
G1Affine three_G_affine;
|
||||
projective_to_affine_g1(three_G_affine, three_G);
|
||||
assert(!g1_is_infinity(three_G_affine));
|
||||
PANIC_IF_FALSE(!g1_is_infinity(three_G_affine),
|
||||
"3*G in affine must not be infinity");
|
||||
printf("Projective -> affine conversion: OK\n");
|
||||
|
||||
// ---- Convert to normal-form coordinates ----
|
||||
@@ -82,7 +85,8 @@ int main() {
|
||||
G1Projective result = three_G_via_scalar;
|
||||
normalize_from_montgomery_g1(
|
||||
result); // coordinates now in normal (non-Montgomery) form
|
||||
assert(!fp_is_zero(result.Z)); // Z = 1 (non-zero)
|
||||
PANIC_IF_FALSE(!fp_is_zero(result.Z),
|
||||
"normalized result must have non-zero Z");
|
||||
printf("Conversion to normal-form projective: OK\n");
|
||||
|
||||
printf("All G1 curve operations passed.\n");
|
||||
|
||||
@@ -11,8 +11,8 @@
|
||||
// cmake --build build --target basic_fp_ops
|
||||
// ./build/tests_and_benchmarks/tests/basic/basic_fp_ops
|
||||
|
||||
#include "device.h"
|
||||
#include "fp.h"
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
|
||||
int main() {
|
||||
@@ -25,16 +25,16 @@ int main() {
|
||||
fp_one(b); // b = 1
|
||||
|
||||
c = a + b; // c = 2
|
||||
assert(c.limb[0] == 2);
|
||||
PANIC_IF_FALSE(c.limb[0] == 2, "1 + 1 must equal 2");
|
||||
|
||||
c = c - a; // c = 1
|
||||
assert(fp_is_one(c));
|
||||
PANIC_IF_FALSE(fp_is_one(c), "2 - 1 must equal 1");
|
||||
|
||||
// Compound assignment
|
||||
c += a; // c = 2
|
||||
assert(c.limb[0] == 2);
|
||||
PANIC_IF_FALSE(c.limb[0] == 2, "1 += 1 must equal 2");
|
||||
c -= b; // c = 1
|
||||
assert(fp_is_one(c));
|
||||
PANIC_IF_FALSE(fp_is_one(c), "2 -= 1 must equal 1");
|
||||
|
||||
printf("Addition/subtraction: OK\n");
|
||||
|
||||
@@ -43,7 +43,7 @@ int main() {
|
||||
// form, but for add/sub/neg small normal-form values also work correctly.
|
||||
Fp neg_a = -a; // neg_a = -1 mod p
|
||||
Fp sum = a + neg_a;
|
||||
assert(fp_is_zero(sum)); // 1 + (-1) = 0
|
||||
PANIC_IF_FALSE(fp_is_zero(sum), "1 + (-1) must equal 0");
|
||||
printf("Negation: OK\n");
|
||||
|
||||
// ---- Multiplication (Montgomery form required) ----
|
||||
@@ -56,17 +56,17 @@ int main() {
|
||||
|
||||
result_m = one_m * two_m; // result_m = 2 in Montgomery form
|
||||
fp_from_montgomery(result, result_m);
|
||||
assert(result.limb[0] == 2);
|
||||
PANIC_IF_FALSE(result.limb[0] == 2, "1 * 2 must equal 2");
|
||||
|
||||
result_m = two_m * two_m; // result_m = 4 in Montgomery form
|
||||
fp_from_montgomery(result, result_m);
|
||||
assert(result.limb[0] == 4);
|
||||
PANIC_IF_FALSE(result.limb[0] == 4, "2 * 2 must equal 4");
|
||||
|
||||
// Compound multiplication
|
||||
result_m = two_m;
|
||||
result_m *= two_m; // result_m = 4
|
||||
fp_from_montgomery(result, result_m);
|
||||
assert(result.limb[0] == 4);
|
||||
PANIC_IF_FALSE(result.limb[0] == 4, "2 *= 2 must equal 4");
|
||||
|
||||
// Convert an arbitrary normal-form value to Montgomery before multiplying
|
||||
Fp five_normal, five_m, twenty_five_m, twenty_five;
|
||||
@@ -76,7 +76,7 @@ int main() {
|
||||
|
||||
fp_mont_mul(twenty_five_m, five_m, five_m); // 5 * 5 = 25
|
||||
fp_from_montgomery(twenty_five, twenty_five_m);
|
||||
assert(twenty_five.limb[0] == 25);
|
||||
PANIC_IF_FALSE(twenty_five.limb[0] == 25, "5 * 5 must equal 25");
|
||||
|
||||
printf("Multiplication: OK\n");
|
||||
|
||||
@@ -88,7 +88,7 @@ int main() {
|
||||
|
||||
Fp one_check;
|
||||
fp_div(one_check, five_normal, five_normal); // 5 / 5 = 1
|
||||
assert(fp_is_one(one_check));
|
||||
PANIC_IF_FALSE(fp_is_one(one_check), "5 / 5 must equal 1");
|
||||
|
||||
// Verify: 5 * 5^{-1} == 1 (using fp_div as a cross-check)
|
||||
Fp product;
|
||||
@@ -98,7 +98,7 @@ int main() {
|
||||
fp_zero(two_normal);
|
||||
two_normal.limb[0] = 2;
|
||||
fp_div(product, two_normal, two_normal); // 2 / 2 = 1
|
||||
assert(fp_is_one(product));
|
||||
PANIC_IF_FALSE(fp_is_one(product), "2 / 2 must equal 1");
|
||||
|
||||
printf("Inversion/division: OK\n");
|
||||
|
||||
|
||||
@@ -20,7 +20,6 @@
|
||||
#include "device.h"
|
||||
#include "fp.h"
|
||||
#include "msm.h"
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
@@ -33,7 +32,6 @@ int main() {
|
||||
|
||||
const uint32_t gpu_index = 0;
|
||||
const uint32_t n = 4; // number of points / scalars
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
// ---- Prepare host-side points in Montgomery form ----
|
||||
// Use n doublings of the G1 generator: G, 2*G, 4*G, 8*G.
|
||||
@@ -76,8 +74,7 @@ int main() {
|
||||
|
||||
// ---- Run MSM (synchronous wrapper; result written directly to host) ----
|
||||
G1Projective h_result;
|
||||
point_msm_g1(stream, gpu_index, &h_result, d_points, d_scalars, n, d_scratch,
|
||||
size_tracker, true);
|
||||
point_msm_g1(stream, gpu_index, &h_result, d_points, d_scalars, n, d_scratch);
|
||||
|
||||
// ---- Verify against naive sequential computation on the host ----
|
||||
// Expected = sum over i of (scalar[i] * point[i]).
|
||||
@@ -95,7 +92,8 @@ int main() {
|
||||
// Normalise to Z = 1 (Montgomery) before comparing projective coordinates.
|
||||
normalize_projective_g1(h_result);
|
||||
normalize_projective_g1(expected);
|
||||
assert(h_result == expected);
|
||||
PANIC_IF_FALSE(h_result == expected,
|
||||
"MSM result must match naive sequential computation");
|
||||
printf("MSM result matches naive sequential computation.\n");
|
||||
|
||||
// ---- Cleanup ----
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
#include "fp.h"
|
||||
#include "fp_helpers.h" // Include test-only batch operations and kernels
|
||||
#include <chrono>
|
||||
#include <cinttypes>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <cuda_runtime.h>
|
||||
@@ -297,7 +298,7 @@ protected:
|
||||
|
||||
// Test basic addition (on GPU)
|
||||
TEST_F(FpArithmeticTest, Addition) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp a, b, c, c_cpu;
|
||||
|
||||
// Test: 1 + 1 = 2
|
||||
@@ -320,7 +321,7 @@ TEST_F(FpArithmeticTest, Addition) {
|
||||
|
||||
// Test subtraction (on GPU)
|
||||
TEST_F(FpArithmeticTest, Subtraction) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp a, b, c, a_cpu;
|
||||
|
||||
// Test: 2 - 1 = 1
|
||||
@@ -341,7 +342,7 @@ TEST_F(FpArithmeticTest, Subtraction) {
|
||||
|
||||
// Test multiplication (on GPU)
|
||||
TEST_F(FpArithmeticTest, Multiplication) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp five, three, result, expected;
|
||||
|
||||
fp_zero(five);
|
||||
@@ -370,7 +371,7 @@ TEST_F(FpArithmeticTest, Multiplication) {
|
||||
|
||||
// Test negation (on GPU)
|
||||
TEST_F(FpArithmeticTest, Negation) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp a, neg_a, result;
|
||||
|
||||
fp_zero(a);
|
||||
@@ -395,7 +396,7 @@ TEST_F(FpArithmeticTest, Negation) {
|
||||
|
||||
// Test Montgomery conversion round-trip (on GPU)
|
||||
TEST_F(FpArithmeticTest, MontgomeryRoundTrip) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp value, mont_form, back, mont_form_cpu, back_cpu;
|
||||
|
||||
fp_zero(value);
|
||||
@@ -421,7 +422,7 @@ TEST_F(FpArithmeticTest, MontgomeryRoundTrip) {
|
||||
|
||||
// Test Montgomery multiplication (on GPU)
|
||||
TEST_F(FpArithmeticTest, MontgomeryMultiplication) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp five, three, five_m, three_m, result_m, result, expected, result_cpu;
|
||||
|
||||
fp_zero(five);
|
||||
@@ -460,7 +461,7 @@ TEST_F(FpArithmeticTest, MontgomeryMultiplication) {
|
||||
|
||||
// Test comparison operations (on GPU)
|
||||
TEST_F(FpArithmeticTest, Comparison) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp five, three;
|
||||
|
||||
fp_zero(five);
|
||||
@@ -481,7 +482,7 @@ TEST_F(FpArithmeticTest, Comparison) {
|
||||
|
||||
// Test zero and one (on GPU)
|
||||
TEST_F(FpArithmeticTest, ZeroAndOne) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp zero, one;
|
||||
|
||||
fp_zero(zero);
|
||||
@@ -499,7 +500,7 @@ TEST_F(FpArithmeticTest, ZeroAndOne) {
|
||||
|
||||
// Test copy (on GPU)
|
||||
TEST_F(FpArithmeticTest, Copy) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp a, b, b_cpu;
|
||||
|
||||
fp_zero(a);
|
||||
@@ -522,7 +523,7 @@ TEST_F(FpArithmeticTest, Copy) {
|
||||
|
||||
// Test conditional move (on GPU)
|
||||
TEST_F(FpArithmeticTest, ConditionalMove) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp a, b, result, result_cpu;
|
||||
|
||||
fp_zero(a);
|
||||
@@ -563,7 +564,7 @@ TEST_F(FpArithmeticTest, ConditionalMove) {
|
||||
|
||||
// Test multiplication by zero (on GPU)
|
||||
TEST_F(FpArithmeticTest, MultiplicationByZero) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp a, zero, result, result_cpu;
|
||||
|
||||
fp_zero(zero);
|
||||
@@ -591,7 +592,7 @@ TEST_F(FpArithmeticTest, MultiplicationByZero) {
|
||||
|
||||
// Test inversion (on GPU)
|
||||
TEST_F(FpArithmeticTest, Inversion) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp a, a_inv, result, a_inv_cpu;
|
||||
|
||||
fp_zero(a);
|
||||
@@ -623,7 +624,7 @@ TEST_F(FpArithmeticTest, Inversion) {
|
||||
|
||||
// Test inversion of one (on GPU)
|
||||
TEST_F(FpArithmeticTest, InversionOfOne) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp one, one_inv, one_inv_cpu;
|
||||
|
||||
fp_one(one);
|
||||
@@ -645,7 +646,7 @@ TEST_F(FpArithmeticTest, InversionOfOne) {
|
||||
|
||||
// Test division (on GPU)
|
||||
TEST_F(FpArithmeticTest, Division) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp a, b, quotient, result;
|
||||
|
||||
fp_zero(a);
|
||||
@@ -678,7 +679,7 @@ TEST_F(FpArithmeticTest, Division) {
|
||||
|
||||
// Test division by one (on GPU)
|
||||
TEST_F(FpArithmeticTest, DivisionByOne) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp a, one, result;
|
||||
|
||||
fp_one(one);
|
||||
@@ -707,7 +708,7 @@ TEST_F(FpArithmeticTest, DivisionByOne) {
|
||||
|
||||
// Test exponentiation with small exponent (on GPU)
|
||||
TEST_F(FpArithmeticTest, ExponentiationSmall) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp base, result, expected, result_cpu;
|
||||
|
||||
fp_zero(base);
|
||||
@@ -734,7 +735,7 @@ TEST_F(FpArithmeticTest, ExponentiationSmall) {
|
||||
|
||||
// Test exponentiation to power of one (on GPU)
|
||||
TEST_F(FpArithmeticTest, ExponentiationToPowerOfOne) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp base, result, result_cpu;
|
||||
|
||||
fp_zero(base);
|
||||
@@ -758,7 +759,7 @@ TEST_F(FpArithmeticTest, ExponentiationToPowerOfOne) {
|
||||
|
||||
// Test exponentiation to power of zero (on GPU)
|
||||
TEST_F(FpArithmeticTest, ExponentiationToPowerOfZero) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp base, result, one, result_cpu;
|
||||
|
||||
fp_zero(base);
|
||||
@@ -782,7 +783,7 @@ TEST_F(FpArithmeticTest, ExponentiationToPowerOfZero) {
|
||||
|
||||
// Test exponentiation with large exponent (Fermat's little theorem)
|
||||
TEST_F(FpArithmeticTest, ExponentiationFermat) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp a, result;
|
||||
|
||||
fp_zero(a);
|
||||
@@ -798,7 +799,7 @@ TEST_F(FpArithmeticTest, ExponentiationFermat) {
|
||||
|
||||
// Test exponentiation: a^(p-1) = 1 mod p
|
||||
TEST_F(FpArithmeticTest, ExponentiationFermatInverse) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp a, result, one;
|
||||
|
||||
fp_zero(a);
|
||||
@@ -820,7 +821,7 @@ TEST_F(FpArithmeticTest, ExponentiationFermatInverse) {
|
||||
|
||||
// Test square root (on GPU)
|
||||
TEST_F(FpArithmeticTest, SquareRoot) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp a, square, sqrt_result, verify, square_cpu, sqrt_result_cpu, verify_cpu;
|
||||
|
||||
// Test: sqrt(a^2) = a or -a
|
||||
@@ -872,6 +873,12 @@ TEST_F(FpArithmeticTest, SquareRoot) {
|
||||
// Also test on CPU for comparison
|
||||
Fp neg_a_cpu = -a;
|
||||
|
||||
// Verify GPU negation matches CPU negation
|
||||
EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &neg_a, &neg_a_cpu),
|
||||
ComparisonType::Equal)
|
||||
<< "GPU negation should match CPU negation";
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
|
||||
bool matches_a = (fp_cmp_gpu(stream, gpu_index, &sqrt_result, &a) ==
|
||||
ComparisonType::Equal);
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
@@ -891,7 +898,7 @@ TEST_F(FpArithmeticTest, SquareRoot) {
|
||||
|
||||
// Test square root of zero (on GPU)
|
||||
TEST_F(FpArithmeticTest, SquareRootOfZero) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp zero, result, result_cpu;
|
||||
fp_zero(zero);
|
||||
|
||||
@@ -913,7 +920,7 @@ TEST_F(FpArithmeticTest, SquareRootOfZero) {
|
||||
|
||||
// Test square root of one (on GPU)
|
||||
TEST_F(FpArithmeticTest, SquareRootOfOne) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp one, result, result_cpu;
|
||||
fp_one(one);
|
||||
|
||||
@@ -935,7 +942,7 @@ TEST_F(FpArithmeticTest, SquareRootOfOne) {
|
||||
|
||||
// Test quadratic residue check (on GPU)
|
||||
TEST_F(FpArithmeticTest, IsQuadraticResidue) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp a, square, square_cpu, zero;
|
||||
|
||||
fp_zero(a);
|
||||
@@ -971,7 +978,7 @@ TEST_F(FpArithmeticTest, IsQuadraticResidue) {
|
||||
// device. For now, we test individual conversions on GPU and verify with GPU
|
||||
// comparisons
|
||||
TEST_F(FpArithmeticTest, BatchMontgomeryConversion) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
const int n = 10;
|
||||
Fp normal[n], montgomery[n], back[n];
|
||||
|
||||
@@ -1011,7 +1018,7 @@ TEST_F(FpArithmeticTest, BatchMontgomeryConversion) {
|
||||
|
||||
// Test 1: Addition that doesn't overflow (on GPU)
|
||||
TEST_F(FpArithmeticTest, LargeAddition1) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
// a = large value
|
||||
Fp a = test_utils::make_fp(0x18e00013555855ULL, 0x2b772294629DAULL,
|
||||
0x412736E1F11D66ULL, 0x87BAD325DD638ULL,
|
||||
@@ -1044,7 +1051,7 @@ TEST_F(FpArithmeticTest, LargeAddition1) {
|
||||
|
||||
// Test 2: Addition that triggers reduction (sum > p) (on GPU)
|
||||
TEST_F(FpArithmeticTest, LargeAddition2WithReduction) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
// Use two large numbers that will trigger reduction
|
||||
// a + b should wrap around modulus
|
||||
Fp a = test_utils::make_fp(0x311c0026aab0aaaaULL, 0x56ee4528c573b5ccULL,
|
||||
@@ -1076,7 +1083,7 @@ TEST_F(FpArithmeticTest, LargeAddition2WithReduction) {
|
||||
|
||||
// Test 3: Subtraction without borrow (on GPU)
|
||||
TEST_F(FpArithmeticTest, LargeSubtraction1) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
// a = large value
|
||||
Fp a = test_utils::make_fp(0x18e00013555855ULL, 0x2b772294629DAULL,
|
||||
0x412736E1F11D66ULL, 0x87BAD325DD638ULL,
|
||||
@@ -1106,11 +1113,15 @@ TEST_F(FpArithmeticTest, LargeSubtraction1) {
|
||||
ComparisonType::Equal)
|
||||
<< "GPU result should match CPU result";
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &verify, &verify_cpu),
|
||||
ComparisonType::Equal)
|
||||
<< "GPU subtraction roundtrip should match CPU roundtrip";
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
}
|
||||
|
||||
// Test 4: Subtraction with borrow (a < b) (on GPU)
|
||||
TEST_F(FpArithmeticTest, LargeSubtraction2WithBorrow) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
// a = 50
|
||||
Fp a = test_utils::make_fp(0x32ULL, 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL,
|
||||
0x0ULL);
|
||||
@@ -1144,7 +1155,7 @@ TEST_F(FpArithmeticTest, LargeSubtraction2WithBorrow) {
|
||||
|
||||
// Test 5: Multiplication of large values (triggers reduction) (on GPU)
|
||||
TEST_F(FpArithmeticTest, LargeMultiplication1) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
// a = 2^200 (bit 200 set)
|
||||
Fp a;
|
||||
fp_zero(a);
|
||||
@@ -1190,7 +1201,7 @@ TEST_F(FpArithmeticTest, LargeMultiplication1) {
|
||||
|
||||
// Test 6: (p-1) * (p-1) = 1 (mod p) (on GPU)
|
||||
TEST_F(FpArithmeticTest, LargeMultiplication2ModulusMinus1) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
// a = p - 1
|
||||
Fp a = test_utils::make_fp(0x311c0026aab0aaaaULL, 0x56ee4528c573b5ccULL,
|
||||
0x824e6dc3e23acdeeULL, 0xf75a64bbac71602ULL,
|
||||
@@ -1228,7 +1239,7 @@ TEST_F(FpArithmeticTest, LargeMultiplication2ModulusMinus1) {
|
||||
|
||||
// Test 7: Multiplication with 2: a * 2 = a + a (on GPU)
|
||||
TEST_F(FpArithmeticTest, LargeMultiplication3Half) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
// a = large value
|
||||
Fp a = test_utils::make_fp(0x18e00013555855ULL, 0x2b772294629DAE6ULL,
|
||||
0x412736E1F11D66F7ULL, 0x7BAD325DD638B01ULL,
|
||||
@@ -1264,11 +1275,15 @@ TEST_F(FpArithmeticTest, LargeMultiplication3Half) {
|
||||
ComparisonType::Equal)
|
||||
<< "GPU result should match CPU result";
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
EXPECT_EQ(fp_cmp_gpu(stream, gpu_index, &expected, &expected_cpu),
|
||||
ComparisonType::Equal)
|
||||
<< "GPU addition should match CPU addition";
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
}
|
||||
|
||||
// Test 8: Large number squared (on GPU)
|
||||
TEST_F(FpArithmeticTest, LargeMultiplication4Square) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
// a = large value
|
||||
Fp a = test_utils::make_fp(0x123456789ABCDEFULL, 0xFEDCBA9876543210ULL,
|
||||
0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL, 0x0ULL);
|
||||
@@ -1305,7 +1320,7 @@ TEST_F(FpArithmeticTest, LargeMultiplication4Square) {
|
||||
|
||||
// Test 9: Addition chain near modulus (on GPU)
|
||||
TEST_F(FpArithmeticTest, LargeAddition3Chain) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
// Start with p-1
|
||||
Fp a = test_utils::make_fp(0x311c0026aab0aaaaULL, 0x56ee4528c573b5ccULL,
|
||||
0x824e6dc3e23acdeeULL, 0x0f75a64bbac71602ULL,
|
||||
@@ -1332,7 +1347,7 @@ TEST_F(FpArithmeticTest, LargeAddition3Chain) {
|
||||
|
||||
// Test 10: Complex multiplication with reduction (on GPU)
|
||||
TEST_F(FpArithmeticTest, LargeMultiplication5Complex) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
// a = large prime-like number
|
||||
Fp a = test_utils::make_fp(0x123456789ABCDEFULL, 0xFEDCBA9876543210ULL,
|
||||
0x0123456789ABCDEFULL, 0xFEDCBA9876543210ULL,
|
||||
@@ -1386,7 +1401,7 @@ TEST_F(FpArithmeticTest, LargeMultiplication5Complex) {
|
||||
|
||||
// Test addition associativity: (a + b) + c = a + (b + c) (on GPU)
|
||||
TEST_F(FpPropertyTest, AdditionAssociativity) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 100; i++) {
|
||||
Fp a = random_value();
|
||||
Fp b = random_value();
|
||||
@@ -1412,7 +1427,7 @@ TEST_F(FpPropertyTest, AdditionAssociativity) {
|
||||
|
||||
// Test multiplication associativity: (a * b) * c = a * (b * c) (on GPU)
|
||||
TEST_F(FpPropertyTest, MultiplicationAssociativity) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 50; i++) { // Fewer iterations due to multiplication cost
|
||||
Fp a = random_value();
|
||||
Fp b = random_value();
|
||||
@@ -1438,7 +1453,7 @@ TEST_F(FpPropertyTest, MultiplicationAssociativity) {
|
||||
|
||||
// Test distributivity: a * (b + c) = a*b + a*c (on GPU)
|
||||
TEST_F(FpPropertyTest, MultiplicationDistributivity) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 50; i++) {
|
||||
Fp a = random_value();
|
||||
Fp b = random_value();
|
||||
@@ -1465,7 +1480,7 @@ TEST_F(FpPropertyTest, MultiplicationDistributivity) {
|
||||
|
||||
// Test addition commutativity with random values (on GPU)
|
||||
TEST_F(FpPropertyTest, AdditionCommutativityRandom) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 100; i++) {
|
||||
Fp a = random_value();
|
||||
Fp b = random_value();
|
||||
@@ -1483,7 +1498,7 @@ TEST_F(FpPropertyTest, AdditionCommutativityRandom) {
|
||||
|
||||
// Test multiplication commutativity with random values (on GPU)
|
||||
TEST_F(FpPropertyTest, MultiplicationCommutativityRandom) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 50; i++) {
|
||||
Fp a = random_value();
|
||||
Fp b = random_value();
|
||||
@@ -1501,7 +1516,7 @@ TEST_F(FpPropertyTest, MultiplicationCommutativityRandom) {
|
||||
|
||||
// Test additive identity: a + 0 = a (on GPU)
|
||||
TEST_F(FpPropertyTest, AdditiveIdentity) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 100; i++) {
|
||||
Fp a = random_value();
|
||||
Fp result;
|
||||
@@ -1517,7 +1532,7 @@ TEST_F(FpPropertyTest, AdditiveIdentity) {
|
||||
|
||||
// Test multiplicative identity: a * 1 = a (on GPU)
|
||||
TEST_F(FpPropertyTest, MultiplicativeIdentity) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 100; i++) {
|
||||
Fp a = random_value();
|
||||
Fp result;
|
||||
@@ -1533,7 +1548,7 @@ TEST_F(FpPropertyTest, MultiplicativeIdentity) {
|
||||
|
||||
// Test additive inverse: a + (-a) = 0 (on GPU)
|
||||
TEST_F(FpPropertyTest, AdditiveInverse) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 100; i++) {
|
||||
Fp a = random_value();
|
||||
Fp neg_a, result;
|
||||
@@ -1550,7 +1565,7 @@ TEST_F(FpPropertyTest, AdditiveInverse) {
|
||||
|
||||
// Test double negation: -(-a) = a (on GPU)
|
||||
TEST_F(FpPropertyTest, DoubleNegation) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 100; i++) {
|
||||
Fp a = random_value();
|
||||
Fp neg_a, neg_neg_a;
|
||||
@@ -1568,7 +1583,7 @@ TEST_F(FpPropertyTest, DoubleNegation) {
|
||||
|
||||
// Test subtraction as addition of negation: a - b = a + (-b) (on GPU)
|
||||
TEST_F(FpPropertyTest, SubtractionAsNegation) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 100; i++) {
|
||||
Fp a = random_value();
|
||||
Fp b = random_value();
|
||||
@@ -1590,7 +1605,7 @@ TEST_F(FpPropertyTest, SubtractionAsNegation) {
|
||||
|
||||
// Test Montgomery form round-trip with random values (on GPU)
|
||||
TEST_F(FpPropertyTest, MontgomeryRoundTripRandom) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 100; i++) {
|
||||
Fp a = random_value();
|
||||
Fp mont_form, back;
|
||||
@@ -1607,7 +1622,7 @@ TEST_F(FpPropertyTest, MontgomeryRoundTripRandom) {
|
||||
|
||||
// Test multiplicative inverse: a * a^(-1) = 1 (on GPU)
|
||||
TEST_F(FpPropertyTest, MultiplicativeInverse) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 50; i++) {
|
||||
Fp a = random_value();
|
||||
// Skip zero (on GPU)
|
||||
@@ -1630,7 +1645,7 @@ TEST_F(FpPropertyTest, MultiplicativeInverse) {
|
||||
|
||||
// Test division: (a / b) * b = a (on GPU)
|
||||
TEST_F(FpPropertyTest, DivisionProperty) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 50; i++) {
|
||||
Fp a = random_value();
|
||||
Fp b = random_value();
|
||||
@@ -1654,7 +1669,7 @@ TEST_F(FpPropertyTest, DivisionProperty) {
|
||||
|
||||
// Test division as multiplication by inverse: a / b = a * b^(-1) (on GPU)
|
||||
TEST_F(FpPropertyTest, DivisionAsInverse) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 50; i++) {
|
||||
Fp a = random_value();
|
||||
Fp b = random_value();
|
||||
@@ -1682,7 +1697,7 @@ TEST_F(FpPropertyTest, DivisionAsInverse) {
|
||||
|
||||
// Test exponentiation: (a^e1)^e2 = a^(e1*e2) for small exponents (on GPU)
|
||||
TEST_F(FpPropertyTest, ExponentiationPowerOfPower) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 20; i++) { // Fewer iterations due to cost
|
||||
Fp a = random_value();
|
||||
// Skip zero (on GPU)
|
||||
@@ -1716,7 +1731,7 @@ TEST_F(FpPropertyTest, ExponentiationPowerOfPower) {
|
||||
|
||||
// Test exponentiation: a^e1 * a^e2 = a^(e1+e2) (on GPU)
|
||||
TEST_F(FpPropertyTest, ExponentiationProduct) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 20; i++) { // Fewer iterations due to cost
|
||||
Fp a = random_value();
|
||||
// Skip zero (on GPU)
|
||||
@@ -1751,7 +1766,7 @@ TEST_F(FpPropertyTest, ExponentiationProduct) {
|
||||
|
||||
// Test inversion of inversion: (a^(-1))^(-1) = a (on GPU)
|
||||
TEST_F(FpPropertyTest, DoubleInversion) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 50; i++) {
|
||||
Fp a = random_value();
|
||||
// Skip zero (on GPU)
|
||||
@@ -1775,7 +1790,7 @@ TEST_F(FpPropertyTest, DoubleInversion) {
|
||||
|
||||
// Test square root property: sqrt(a^2) = a (for random a) (on GPU)
|
||||
TEST_F(FpPropertyTest, SquareRootProperty) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 50; i++) {
|
||||
Fp a = random_value();
|
||||
Fp square, sqrt_result, verify;
|
||||
@@ -1819,7 +1834,7 @@ TEST_F(FpPropertyTest, SquareRootProperty) {
|
||||
// Test quadratic residue property: squares are always quadratic residues (on
|
||||
// GPU)
|
||||
TEST_F(FpPropertyTest, QuadraticResidueProperty) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 100; i++) {
|
||||
Fp a = random_value();
|
||||
Fp square;
|
||||
@@ -1841,7 +1856,7 @@ TEST_F(FpPropertyTest, QuadraticResidueProperty) {
|
||||
|
||||
// Test operations with p-1 (on GPU)
|
||||
TEST_F(FpEdgeCaseTest, OperationsWithModulusMinusOne) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
// (p-1) + 1 = 0 (on GPU)
|
||||
Fp result;
|
||||
fp_add_gpu(stream, gpu_index, &result, &modulus_minus_one, &one);
|
||||
@@ -1868,7 +1883,7 @@ TEST_F(FpEdgeCaseTest, OperationsWithModulusMinusOne) {
|
||||
|
||||
// Test operations with p-2 (on GPU)
|
||||
TEST_F(FpEdgeCaseTest, OperationsWithModulusMinusTwo) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
// (p-2) + 1 = p-1 (on GPU)
|
||||
Fp result;
|
||||
fp_add_gpu(stream, gpu_index, &result, &modulus_minus_two, &one);
|
||||
@@ -1888,7 +1903,7 @@ TEST_F(FpEdgeCaseTest, OperationsWithModulusMinusTwo) {
|
||||
|
||||
// Test operations with very small values (on GPU)
|
||||
TEST_F(FpEdgeCaseTest, VerySmallValues) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp zero_val, one_val, two_val, three_val;
|
||||
fp_zero(zero_val);
|
||||
fp_one(one_val);
|
||||
@@ -1930,7 +1945,7 @@ TEST_F(FpEdgeCaseTest, VerySmallValues) {
|
||||
|
||||
// Test operations with max limb values (on GPU)
|
||||
TEST_F(FpEdgeCaseTest, MaxLimbValues) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
// Test that max_limb_value is valid
|
||||
EXPECT_TRUE(test_utils::is_valid_fp(max_limb_value))
|
||||
<< "max_limb_value should be < p";
|
||||
@@ -1953,7 +1968,7 @@ TEST_F(FpEdgeCaseTest, MaxLimbValues) {
|
||||
|
||||
// Test operations with alternating bit patterns (on GPU)
|
||||
TEST_F(FpEdgeCaseTest, AlternatingBitPatterns) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
// Test that alternating_bits is valid
|
||||
EXPECT_TRUE(test_utils::is_valid_fp(alternating_bits))
|
||||
<< "alternating_bits should be < p";
|
||||
@@ -1976,7 +1991,7 @@ TEST_F(FpEdgeCaseTest, AlternatingBitPatterns) {
|
||||
|
||||
// Test edge case: zero operations (on GPU)
|
||||
TEST_F(FpEdgeCaseTest, ZeroOperations) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
// 0 + 0 = 0 (on GPU)
|
||||
Fp result;
|
||||
fp_add_gpu(stream, gpu_index, &result, &zero, &zero);
|
||||
@@ -2006,7 +2021,7 @@ TEST_F(FpEdgeCaseTest, ZeroOperations) {
|
||||
|
||||
// Test edge case: one operations (on GPU)
|
||||
TEST_F(FpEdgeCaseTest, OneOperations) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
// 1 + 1 = 2 (on GPU)
|
||||
Fp result;
|
||||
fp_add_gpu(stream, gpu_index, &result, &one, &one);
|
||||
@@ -2033,7 +2048,7 @@ TEST_F(FpEdgeCaseTest, OneOperations) {
|
||||
|
||||
// Test fp_one_montgomery (on GPU)
|
||||
TEST_F(FpEdgeCaseTest, OneMontgomery) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp one_mont, one_normal;
|
||||
fp_one(one_normal);
|
||||
fp_one_montgomery(one_mont);
|
||||
@@ -2049,7 +2064,7 @@ TEST_F(FpEdgeCaseTest, OneMontgomery) {
|
||||
|
||||
// Test repeated operations (stress test) (on GPU)
|
||||
TEST_F(FpEdgeCaseTest, RepeatedOperations) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp a = test_utils::random_fp(rng);
|
||||
Fp result = a;
|
||||
|
||||
@@ -2091,7 +2106,7 @@ TEST_F(FpEdgeCaseTest, RepeatedOperations) {
|
||||
|
||||
// Test CUDA kernel: array addition
|
||||
TEST_F(FpCudaKernelTest, CudaKernelArrayAdd) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
const int n = 1000;
|
||||
Fp *h_a = new Fp[n];
|
||||
Fp *h_b = new Fp[n];
|
||||
@@ -2126,7 +2141,7 @@ TEST_F(FpCudaKernelTest, CudaKernelArrayAdd) {
|
||||
|
||||
// Test CUDA kernel: array multiplication
|
||||
TEST_F(FpCudaKernelTest, CudaKernelArrayMul) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
const int n = 1000;
|
||||
Fp *h_a = new Fp[n];
|
||||
Fp *h_b = new Fp[n];
|
||||
@@ -2166,7 +2181,7 @@ TEST_F(FpCudaKernelTest, CudaKernelArrayMul) {
|
||||
|
||||
// Test CUDA kernel: array addition with edge cases
|
||||
TEST_F(FpCudaKernelTest, CudaKernelArrayAddEdgeCases) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
const int n = 100;
|
||||
Fp *h_a = new Fp[n];
|
||||
Fp *h_b = new Fp[n];
|
||||
@@ -2216,7 +2231,7 @@ TEST_F(FpCudaKernelTest, CudaKernelArrayAddEdgeCases) {
|
||||
|
||||
// Test CUDA kernel: array multiplication with edge cases
|
||||
TEST_F(FpCudaKernelTest, CudaKernelArrayMulEdgeCases) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
const int n = 100;
|
||||
Fp *h_a = new Fp[n];
|
||||
Fp *h_b = new Fp[n];
|
||||
@@ -2271,7 +2286,7 @@ TEST_F(FpCudaKernelTest, CudaKernelArrayMulEdgeCases) {
|
||||
|
||||
// Test CUDA kernel: large array
|
||||
TEST_F(FpCudaKernelTest, CudaKernelLargeArray) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
const int n = 10000;
|
||||
Fp *h_a = new Fp[n];
|
||||
Fp *h_b = new Fp[n];
|
||||
@@ -2312,7 +2327,7 @@ TEST_F(FpCudaKernelTest, CudaKernelLargeArray) {
|
||||
// Test CUDA kernel: boundary conditions for launch configuration
|
||||
// Tests that the "if (idx < n)" check works correctly at block boundaries
|
||||
TEST_F(FpCudaKernelTest, CudaKernelBoundaryConditions) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
// Test sizes that stress the launch configuration
|
||||
// threadsPerBlock = 256, so test around block boundaries
|
||||
std::vector<int> test_sizes = {1, 255, 256, 257, 511,
|
||||
@@ -2352,7 +2367,7 @@ TEST_F(FpCudaKernelTest, CudaKernelBoundaryConditions) {
|
||||
|
||||
// Test CUDA kernel: verify kernel actually launches (not just CPU fallback)
|
||||
TEST_F(FpCudaKernelTest, CudaKernelActuallyLaunches) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
const int n = 1000;
|
||||
Fp *h_a = new Fp[n];
|
||||
Fp *h_b = new Fp[n];
|
||||
@@ -2384,7 +2399,7 @@ TEST_F(FpCudaKernelTest, CudaKernelActuallyLaunches) {
|
||||
|
||||
// Test CUDA kernel: verify device constant memory is accessible
|
||||
TEST_F(FpCudaKernelTest, CudaKernelDeviceConstants) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
// This test verifies that DEVICE_MODULUS is properly initialized
|
||||
// by running a kernel that uses it (multiplication uses Montgomery which
|
||||
// needs modulus)
|
||||
@@ -2428,7 +2443,7 @@ TEST_F(FpCudaKernelTest, CudaKernelDeviceConstants) {
|
||||
|
||||
// Test CUDA kernel: empty array (edge case)
|
||||
TEST_F(FpCudaKernelTest, CudaKernelEmptyArray) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
const int n = 0;
|
||||
Fp *h_a = nullptr;
|
||||
Fp *h_b = nullptr;
|
||||
@@ -2445,7 +2460,7 @@ TEST_F(FpCudaKernelTest, CudaKernelEmptyArray) {
|
||||
|
||||
// Test CUDA kernel: single element
|
||||
TEST_F(FpCudaKernelTest, CudaKernelSingleElement) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
const int n = 1;
|
||||
Fp *h_a = new Fp[n];
|
||||
Fp *h_b = new Fp[n];
|
||||
@@ -2471,44 +2486,53 @@ TEST_F(FpCudaKernelTest, CudaKernelSingleElement) {
|
||||
// ============================================================================
|
||||
|
||||
// Test to print generator values (for hardcoding)
|
||||
// PRIx64 format specifiers require 64-bit limbs
|
||||
#if LIMB_BITS_CONFIG == 64
|
||||
TEST_F(FpArithmeticTest, PrintGenerators) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
const G1Affine &g1 = g1_generator();
|
||||
const G2Affine &g2 = g2_generator();
|
||||
|
||||
printf("\n=== G1 Generator (Montgomery form) ===\n");
|
||||
printf("x: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
|
||||
"0x%llxULL, 0x%llxULL}\n",
|
||||
printf("x: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
|
||||
"0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
|
||||
"0x%" PRIx64 "ULL}\n",
|
||||
g1.x.limb[0], g1.x.limb[1], g1.x.limb[2], g1.x.limb[3], g1.x.limb[4],
|
||||
g1.x.limb[5], g1.x.limb[6]);
|
||||
printf("y: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
|
||||
"0x%llxULL, 0x%llxULL}\n",
|
||||
printf("y: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
|
||||
"0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
|
||||
"0x%" PRIx64 "ULL}\n",
|
||||
g1.y.limb[0], g1.y.limb[1], g1.y.limb[2], g1.y.limb[3], g1.y.limb[4],
|
||||
g1.y.limb[5], g1.y.limb[6]);
|
||||
|
||||
printf("\n=== G2 Generator (Montgomery form) ===\n");
|
||||
printf("x.c0: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
|
||||
"0x%llxULL, 0x%llxULL}\n",
|
||||
printf("x.c0: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
|
||||
"0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
|
||||
"0x%" PRIx64 "ULL}\n",
|
||||
g2.x.c0.limb[0], g2.x.c0.limb[1], g2.x.c0.limb[2], g2.x.c0.limb[3],
|
||||
g2.x.c0.limb[4], g2.x.c0.limb[5], g2.x.c0.limb[6]);
|
||||
printf("x.c1: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
|
||||
"0x%llxULL, 0x%llxULL}\n",
|
||||
printf("x.c1: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
|
||||
"0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
|
||||
"0x%" PRIx64 "ULL}\n",
|
||||
g2.x.c1.limb[0], g2.x.c1.limb[1], g2.x.c1.limb[2], g2.x.c1.limb[3],
|
||||
g2.x.c1.limb[4], g2.x.c1.limb[5], g2.x.c1.limb[6]);
|
||||
printf("y.c0: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
|
||||
"0x%llxULL, 0x%llxULL}\n",
|
||||
printf("y.c0: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
|
||||
"0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
|
||||
"0x%" PRIx64 "ULL}\n",
|
||||
g2.y.c0.limb[0], g2.y.c0.limb[1], g2.y.c0.limb[2], g2.y.c0.limb[3],
|
||||
g2.y.c0.limb[4], g2.y.c0.limb[5], g2.y.c0.limb[6]);
|
||||
printf("y.c1: {0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, 0x%llxULL, "
|
||||
"0x%llxULL, 0x%llxULL}\n",
|
||||
printf("y.c1: {0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
|
||||
"0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, 0x%" PRIx64 "ULL, "
|
||||
"0x%" PRIx64 "ULL}\n",
|
||||
g2.y.c1.limb[0], g2.y.c1.limb[1], g2.y.c1.limb[2], g2.y.c1.limb[3],
|
||||
g2.y.c1.limb[4], g2.y.c1.limb[5], g2.y.c1.limb[6]);
|
||||
printf("\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
// Test is_on_curve_g1 with point at infinity
|
||||
TEST_F(FpArithmeticTest, CurveG1PointAtInfinity) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
G1Affine point;
|
||||
g1_point_at_infinity(point);
|
||||
|
||||
@@ -2520,7 +2544,7 @@ TEST_F(FpArithmeticTest, CurveG1PointAtInfinity) {
|
||||
// We'll create a point by starting with a valid y and computing x
|
||||
// Or use a known valid point
|
||||
TEST_F(FpArithmeticTest, CurveG1ValidPoint) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
G1Affine point;
|
||||
point.infinity = false;
|
||||
|
||||
@@ -2593,7 +2617,7 @@ TEST_F(FpArithmeticTest, CurveG1ValidPoint) {
|
||||
|
||||
// Test is_on_curve_g1 with invalid point
|
||||
TEST_F(FpArithmeticTest, CurveG1InvalidPoint) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
G1Affine point;
|
||||
point.infinity = false;
|
||||
|
||||
@@ -2606,7 +2630,7 @@ TEST_F(FpArithmeticTest, CurveG1InvalidPoint) {
|
||||
|
||||
// Test that negating y preserves curve validity (on GPU)
|
||||
TEST_F(FpArithmeticTest, CurveG1FieldOperationsConsistency) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
G1Affine point;
|
||||
point.infinity = false;
|
||||
|
||||
@@ -2650,7 +2674,7 @@ TEST_F(FpArithmeticTest, CurveG1FieldOperationsConsistency) {
|
||||
|
||||
// Test is_on_curve_g2 with point at infinity
|
||||
TEST_F(FpArithmeticTest, CurveG2PointAtInfinity) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
G2Affine point;
|
||||
g2_point_at_infinity(point);
|
||||
|
||||
|
||||
@@ -152,7 +152,7 @@ protected:
|
||||
|
||||
// Test basic addition (on GPU)
|
||||
TEST_F(Fp2ArithmeticTest, Addition) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp2 a, b, c, c_cpu;
|
||||
|
||||
// Test: (1 + 0*i) + (1 + 0*i) = (2 + 0*i)
|
||||
@@ -175,7 +175,7 @@ TEST_F(Fp2ArithmeticTest, Addition) {
|
||||
|
||||
// Test subtraction (on GPU)
|
||||
TEST_F(Fp2ArithmeticTest, Subtraction) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp2 a, b, c, a_cpu;
|
||||
|
||||
// Test: (2 + 0*i) - (1 + 0*i) = (1 + 0*i)
|
||||
@@ -197,7 +197,7 @@ TEST_F(Fp2ArithmeticTest, Subtraction) {
|
||||
|
||||
// Test multiplication (on GPU)
|
||||
TEST_F(Fp2ArithmeticTest, Multiplication) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp2 a, b, result, expected, result_cpu;
|
||||
|
||||
// Test: (1 + 1*i) * (1 + 1*i) = (0 + 2*i)
|
||||
@@ -224,7 +224,7 @@ TEST_F(Fp2ArithmeticTest, Multiplication) {
|
||||
|
||||
// Test i * i = -1 (on GPU)
|
||||
TEST_F(Fp2ArithmeticTest, I_Squared) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp2 i_val, result, expected, result_cpu;
|
||||
|
||||
// i = 0 + 1*i
|
||||
@@ -250,7 +250,7 @@ TEST_F(Fp2ArithmeticTest, I_Squared) {
|
||||
|
||||
// Test negation (on GPU)
|
||||
TEST_F(Fp2ArithmeticTest, Negation) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp2 a, neg_a, result, neg_a_cpu, result_cpu;
|
||||
|
||||
a = test_utils_fp2::make_fp2_simple(5, 3);
|
||||
@@ -273,7 +273,7 @@ TEST_F(Fp2ArithmeticTest, Negation) {
|
||||
|
||||
// Test conjugation (on GPU)
|
||||
TEST_F(Fp2ArithmeticTest, Conjugation) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp2 a, conj, result, conj_cpu, result_cpu;
|
||||
|
||||
a = test_utils_fp2::make_fp2_simple(5, 3);
|
||||
@@ -304,7 +304,7 @@ TEST_F(Fp2ArithmeticTest, Conjugation) {
|
||||
|
||||
// Test squaring (on GPU)
|
||||
TEST_F(Fp2ArithmeticTest, Squaring) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp2 a, square, square_cpu;
|
||||
|
||||
// Test: (1 + 1*i)^2 = 2*i
|
||||
@@ -327,7 +327,7 @@ TEST_F(Fp2ArithmeticTest, Squaring) {
|
||||
|
||||
// Test zero and one (on GPU)
|
||||
TEST_F(Fp2ArithmeticTest, ZeroAndOne) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp2 zero_val, one_val;
|
||||
|
||||
fp2_zero(zero_val);
|
||||
@@ -349,7 +349,7 @@ TEST_F(Fp2ArithmeticTest, ZeroAndOne) {
|
||||
|
||||
// Test copy (on GPU)
|
||||
TEST_F(Fp2ArithmeticTest, Copy) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp2 a, b, b_cpu;
|
||||
|
||||
a = test_utils_fp2::make_fp2_simple(42, 123);
|
||||
@@ -370,7 +370,7 @@ TEST_F(Fp2ArithmeticTest, Copy) {
|
||||
|
||||
// Test conditional move (on GPU)
|
||||
TEST_F(Fp2ArithmeticTest, ConditionalMove) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp2 a, b, result, result_cpu;
|
||||
|
||||
a = test_utils_fp2::make_fp2_simple(10, 20);
|
||||
@@ -411,7 +411,7 @@ TEST_F(Fp2ArithmeticTest, ConditionalMove) {
|
||||
|
||||
// Test multiplication by zero (on GPU)
|
||||
TEST_F(Fp2ArithmeticTest, MultiplicationByZero) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp2 a, zero_val, result, result_cpu;
|
||||
|
||||
fp2_zero(zero_val);
|
||||
@@ -432,7 +432,7 @@ TEST_F(Fp2ArithmeticTest, MultiplicationByZero) {
|
||||
|
||||
// Test inversion (on GPU)
|
||||
TEST_F(Fp2ArithmeticTest, Inversion) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp2 a, a_inv, result, a_inv_cpu, result_cpu;
|
||||
|
||||
a = test_utils_fp2::make_fp2_simple(5, 3);
|
||||
@@ -456,7 +456,7 @@ TEST_F(Fp2ArithmeticTest, Inversion) {
|
||||
|
||||
// Test division (on GPU)
|
||||
TEST_F(Fp2ArithmeticTest, Division) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp2 a, b, quotient, result, quotient_cpu, result_cpu;
|
||||
|
||||
a = test_utils_fp2::make_fp2_simple(10, 6);
|
||||
@@ -482,7 +482,7 @@ TEST_F(Fp2ArithmeticTest, Division) {
|
||||
|
||||
// Test multiply by i (on GPU)
|
||||
TEST_F(Fp2ArithmeticTest, MultiplyByI) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp2 a, result, result_cpu;
|
||||
|
||||
// Test: (a + b*i) * i = -b + a*i
|
||||
@@ -509,7 +509,7 @@ TEST_F(Fp2ArithmeticTest, MultiplyByI) {
|
||||
|
||||
// Test Frobenius map (on GPU)
|
||||
TEST_F(Fp2ArithmeticTest, Frobenius) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
Fp2 a, frob, conj, frob_cpu, conj_cpu;
|
||||
|
||||
a = test_utils_fp2::make_fp2_simple(5, 3);
|
||||
@@ -541,7 +541,7 @@ TEST_F(Fp2ArithmeticTest, Frobenius) {
|
||||
|
||||
// Test addition associativity: (a + b) + c = a + (b + c) (on GPU)
|
||||
TEST_F(Fp2PropertyTest, AdditionAssociativity) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 100; i++) {
|
||||
Fp2 a = random_value();
|
||||
Fp2 b = random_value();
|
||||
@@ -567,7 +567,7 @@ TEST_F(Fp2PropertyTest, AdditionAssociativity) {
|
||||
|
||||
// Test multiplication associativity: (a * b) * c = a * (b * c) (on GPU)
|
||||
TEST_F(Fp2PropertyTest, MultiplicationAssociativity) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 50; i++) {
|
||||
Fp2 a = random_value();
|
||||
Fp2 b = random_value();
|
||||
@@ -593,7 +593,7 @@ TEST_F(Fp2PropertyTest, MultiplicationAssociativity) {
|
||||
|
||||
// Test distributivity: a * (b + c) = a*b + a*c (on GPU)
|
||||
TEST_F(Fp2PropertyTest, MultiplicationDistributivity) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 50; i++) {
|
||||
Fp2 a = random_value();
|
||||
Fp2 b = random_value();
|
||||
@@ -620,7 +620,7 @@ TEST_F(Fp2PropertyTest, MultiplicationDistributivity) {
|
||||
|
||||
// Test addition commutativity (on GPU)
|
||||
TEST_F(Fp2PropertyTest, AdditionCommutativity) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 100; i++) {
|
||||
Fp2 a = random_value();
|
||||
Fp2 b = random_value();
|
||||
@@ -638,7 +638,7 @@ TEST_F(Fp2PropertyTest, AdditionCommutativity) {
|
||||
|
||||
// Test multiplication commutativity (on GPU)
|
||||
TEST_F(Fp2PropertyTest, MultiplicationCommutativity) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 50; i++) {
|
||||
Fp2 a = random_value();
|
||||
Fp2 b = random_value();
|
||||
@@ -656,7 +656,7 @@ TEST_F(Fp2PropertyTest, MultiplicationCommutativity) {
|
||||
|
||||
// Test additive identity: a + 0 = a (on GPU)
|
||||
TEST_F(Fp2PropertyTest, AdditiveIdentity) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 100; i++) {
|
||||
Fp2 a = random_value();
|
||||
Fp2 result;
|
||||
@@ -672,7 +672,7 @@ TEST_F(Fp2PropertyTest, AdditiveIdentity) {
|
||||
|
||||
// Test multiplicative identity: a * 1 = a (on GPU)
|
||||
TEST_F(Fp2PropertyTest, MultiplicativeIdentity) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 100; i++) {
|
||||
Fp2 a = random_value();
|
||||
Fp2 result;
|
||||
@@ -688,7 +688,7 @@ TEST_F(Fp2PropertyTest, MultiplicativeIdentity) {
|
||||
|
||||
// Test additive inverse: a + (-a) = 0 (on GPU)
|
||||
TEST_F(Fp2PropertyTest, AdditiveInverse) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 100; i++) {
|
||||
Fp2 a = random_value();
|
||||
Fp2 neg_a, result;
|
||||
@@ -705,7 +705,7 @@ TEST_F(Fp2PropertyTest, AdditiveInverse) {
|
||||
|
||||
// Test multiplicative inverse: a * a^(-1) = 1 (on GPU)
|
||||
TEST_F(Fp2PropertyTest, MultiplicativeInverse) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 50; i++) {
|
||||
Fp2 a = random_value();
|
||||
// Skip zero
|
||||
@@ -726,7 +726,7 @@ TEST_F(Fp2PropertyTest, MultiplicativeInverse) {
|
||||
|
||||
// Test square vs multiply by self: a^2 = a * a (on GPU)
|
||||
TEST_F(Fp2PropertyTest, SquareVsMultiply) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
for (int i = 0; i < 50; i++) {
|
||||
Fp2 a = random_value();
|
||||
|
||||
@@ -747,7 +747,7 @@ TEST_F(Fp2PropertyTest, SquareVsMultiply) {
|
||||
|
||||
// Test CUDA kernel: array addition
|
||||
TEST_F(Fp2CudaKernelTest, CudaKernelArrayAdd) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
const int n = 1000;
|
||||
Fp2 *h_a = new Fp2[n];
|
||||
Fp2 *h_b = new Fp2[n];
|
||||
@@ -784,7 +784,7 @@ TEST_F(Fp2CudaKernelTest, CudaKernelArrayAdd) {
|
||||
|
||||
// Test CUDA kernel: array multiplication
|
||||
TEST_F(Fp2CudaKernelTest, CudaKernelArrayMul) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
const int n = 1000;
|
||||
Fp2 *h_a = new Fp2[n];
|
||||
Fp2 *h_b = new Fp2[n];
|
||||
@@ -825,7 +825,7 @@ TEST_F(Fp2CudaKernelTest, CudaKernelArrayMul) {
|
||||
|
||||
// Test is_on_curve_g2 with point at infinity
|
||||
TEST_F(Fp2ArithmeticTest, CurveG2PointAtInfinity) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
G2Affine point;
|
||||
g2_point_at_infinity(point);
|
||||
|
||||
@@ -835,7 +835,7 @@ TEST_F(Fp2ArithmeticTest, CurveG2PointAtInfinity) {
|
||||
|
||||
// Test is_on_curve_g2 with valid point construction
|
||||
TEST_F(Fp2ArithmeticTest, CurveG2ValidPointCheck) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
G2Affine point;
|
||||
point.infinity = false;
|
||||
|
||||
@@ -860,7 +860,7 @@ TEST_F(Fp2ArithmeticTest, CurveG2ValidPointCheck) {
|
||||
|
||||
// Test that field operations maintain curve validity for G2
|
||||
TEST_F(Fp2ArithmeticTest, CurveG2FieldOperationsConsistency) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
// Create a point (we'll test the consistency check works)
|
||||
G2Affine point;
|
||||
point.infinity = false;
|
||||
|
||||
@@ -24,8 +24,7 @@ static void test_point_msm_g1(cudaStream_t stream, uint32_t gpu_index,
|
||||
auto *d_scratch =
|
||||
static_cast<G1Projective *>(cuda_malloc_with_size_tracking_async(
|
||||
scratch_bytes, stream, gpu_index, size_tracker, true));
|
||||
point_msm_g1(stream, gpu_index, h_result, d_points, d_scalars, n, d_scratch,
|
||||
size_tracker, true);
|
||||
point_msm_g1(stream, gpu_index, h_result, d_points, d_scalars, n, d_scratch);
|
||||
cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);
|
||||
}
|
||||
|
||||
@@ -37,8 +36,7 @@ static void test_point_msm_g2(cudaStream_t stream, uint32_t gpu_index,
|
||||
auto *d_scratch =
|
||||
static_cast<G2Projective *>(cuda_malloc_with_size_tracking_async(
|
||||
scratch_bytes, stream, gpu_index, size_tracker, true));
|
||||
point_msm_g2(stream, gpu_index, h_result, d_points, d_scalars, n, d_scratch,
|
||||
size_tracker, true);
|
||||
point_msm_g2(stream, gpu_index, h_result, d_points, d_scalars, n, d_scratch);
|
||||
cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);
|
||||
}
|
||||
|
||||
|
||||
@@ -147,8 +147,6 @@ unsafe extern "C" {
|
||||
d_scalars: *const Scalar,
|
||||
n: u32,
|
||||
d_scratch: *mut G1ProjectivePoint,
|
||||
gpu_memory_allocated: bool,
|
||||
size_tracker: *mut u64,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -160,8 +158,6 @@ unsafe extern "C" {
|
||||
d_scalars: *const Scalar,
|
||||
n: u32,
|
||||
d_scratch: *mut G2ProjectivePoint,
|
||||
gpu_memory_allocated: bool,
|
||||
size_tracker: *mut u64,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
|
||||
@@ -11,6 +11,8 @@
|
||||
#include <stddef.h>
|
||||
#include <cstring>
|
||||
|
||||
#include "../../tfhe-cuda-backend/cuda/src/utils/helper_profile.cuh"
|
||||
|
||||
// C++ helper functions (not exported, used internally)
|
||||
// These can call template functions since they have C++ linkage
|
||||
static void convert_g1_points_to_montgomery(cudaStream_t stream, uint32_t gpu_index, G1Affine* d_points, uint32_t n) {
|
||||
@@ -64,8 +66,7 @@ bool g2_is_infinity_wrapper(const G2Affine* point) {
|
||||
}
|
||||
|
||||
// Unmanaged MSM wrapper for G1 (points/scalars/scratch on device, result on host)
|
||||
// Points MUST be in Montgomery form. Caller provides scratch buffer and
|
||||
// controls allocation tracking via gpu_memory_allocated.
|
||||
// Points MUST be in Montgomery form. Caller provides scratch buffer.
|
||||
// Zero internal allocations — this is a thin validation + dispatch layer.
|
||||
void g1_msm_unmanaged_wrapper_async(
|
||||
cudaStream_t stream,
|
||||
@@ -74,12 +75,9 @@ void g1_msm_unmanaged_wrapper_async(
|
||||
const G1Affine* d_points,
|
||||
const Scalar* d_scalars,
|
||||
uint32_t n,
|
||||
G1Projective* d_scratch,
|
||||
bool gpu_memory_allocated,
|
||||
uint64_t* size_tracker
|
||||
G1Projective* d_scratch
|
||||
) {
|
||||
PANIC_IF_FALSE(size_tracker != nullptr, "G1 MSM error: size_tracker is null");
|
||||
uint64_t& size_tracker_ref = *size_tracker;
|
||||
PUSH_RANGE("G1 MSM UNMANAGED");
|
||||
PANIC_IF_FALSE(n > 0, "G1 MSM error: n must be positive, got %u", n);
|
||||
PANIC_IF_FALSE(stream != nullptr, "G1 MSM error: stream is null");
|
||||
PANIC_IF_FALSE(h_result != nullptr, "G1 MSM error: h_result is null");
|
||||
@@ -91,13 +89,13 @@ void g1_msm_unmanaged_wrapper_async(
|
||||
cuda_get_number_of_gpus());
|
||||
|
||||
point_msm_g1_async(stream, gpu_index, h_result, d_points, d_scalars, n,
|
||||
d_scratch, size_tracker_ref, gpu_memory_allocated);
|
||||
d_scratch);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
POP_RANGE();
|
||||
}
|
||||
|
||||
// Unmanaged MSM wrapper for G2 (points/scalars/scratch on device, result on host)
|
||||
// Points MUST be in Montgomery form. Caller provides scratch buffer and
|
||||
// controls allocation tracking via gpu_memory_allocated.
|
||||
// Points MUST be in Montgomery form. Caller provides scratch buffer.
|
||||
// Zero internal allocations — this is a thin validation + dispatch layer.
|
||||
void g2_msm_unmanaged_wrapper_async(
|
||||
cudaStream_t stream,
|
||||
@@ -106,12 +104,9 @@ void g2_msm_unmanaged_wrapper_async(
|
||||
const G2Affine* d_points,
|
||||
const Scalar* d_scalars,
|
||||
uint32_t n,
|
||||
G2Projective* d_scratch,
|
||||
bool gpu_memory_allocated,
|
||||
uint64_t* size_tracker
|
||||
G2Projective* d_scratch
|
||||
) {
|
||||
PANIC_IF_FALSE(size_tracker != nullptr, "G2 MSM error: size_tracker is null");
|
||||
uint64_t& size_tracker_ref = *size_tracker;
|
||||
PUSH_RANGE("G2 MSM UNMANAGED");
|
||||
PANIC_IF_FALSE(n > 0, "G2 MSM error: n must be positive, got %u", n);
|
||||
PANIC_IF_FALSE(stream != nullptr, "G2 MSM error: stream is null");
|
||||
PANIC_IF_FALSE(h_result != nullptr, "G2 MSM error: h_result is null");
|
||||
@@ -123,8 +118,9 @@ void g2_msm_unmanaged_wrapper_async(
|
||||
cuda_get_number_of_gpus());
|
||||
|
||||
point_msm_g2_async(stream, gpu_index, h_result, d_points, d_scalars, n,
|
||||
d_scratch, size_tracker_ref, gpu_memory_allocated);
|
||||
d_scratch);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
POP_RANGE();
|
||||
}
|
||||
|
||||
// Scratch size query wrappers (needed for bindgen `.*_wrapper` allowlist)
|
||||
@@ -148,7 +144,8 @@ void g1_msm_managed_wrapper(
|
||||
bool points_in_montgomery,
|
||||
uint64_t* size_tracker
|
||||
) {
|
||||
uint64_t& size_tracker_ref = *size_tracker;
|
||||
PUSH_RANGE("G1 MSM MANAGED");
|
||||
uint64_t& size_tracker_local = *size_tracker;
|
||||
PANIC_IF_FALSE(n > 0, "G1 MSM error: n must be positive, got %u", n);
|
||||
PANIC_IF_FALSE(result != nullptr, "G1 MSM error: result is null");
|
||||
PANIC_IF_FALSE(stream != nullptr, "G1 MSM error: stream is null");
|
||||
@@ -165,8 +162,8 @@ void g1_msm_managed_wrapper(
|
||||
size_t scalars_bytes = safe_mul_sizeof<Scalar>(static_cast<size_t>(n));
|
||||
|
||||
// TODO: We should migrate to _unmanaged_ methods and have scratch/cleanup functions as tfhe-cuda-backend
|
||||
auto* d_points = static_cast<G1Affine*>(cuda_malloc_with_size_tracking_async(points_bytes, stream, gpu_index, size_tracker_ref, true));
|
||||
auto* d_scalars = static_cast<Scalar*>(cuda_malloc_with_size_tracking_async(scalars_bytes, stream, gpu_index, size_tracker_ref, true));
|
||||
auto* d_points = static_cast<G1Affine*>(cuda_malloc_with_size_tracking_async(points_bytes, stream, gpu_index, size_tracker_local, true));
|
||||
auto* d_scalars = static_cast<Scalar*>(cuda_malloc_with_size_tracking_async(scalars_bytes, stream, gpu_index, size_tracker_local, true));
|
||||
|
||||
// Always copy points to GPU first
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(d_points, points, points_bytes, stream, gpu_index, true);
|
||||
@@ -181,14 +178,14 @@ void g1_msm_managed_wrapper(
|
||||
// Allocate scratch buffer sized to match the pippenger internal partitioning
|
||||
size_t scratch_bytes = pippenger_scratch_size_g1(n, gpu_index);
|
||||
auto* d_scratch = static_cast<G1Projective*>(cuda_malloc_with_size_tracking_async(
|
||||
scratch_bytes, stream, gpu_index, size_tracker_ref, true));
|
||||
scratch_bytes, stream, gpu_index, size_tracker_local, true));
|
||||
|
||||
PANIC_IF_FALSE(d_points && d_scalars && d_scratch,
|
||||
"G1 MSM error: device memory allocation failed");
|
||||
|
||||
// Result written directly to host pointer -- no device round-trip needed
|
||||
point_msm_g1_async(stream, gpu_index, result, d_points, d_scalars, n,
|
||||
d_scratch, size_tracker_ref, true);
|
||||
d_scratch);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);
|
||||
@@ -197,6 +194,7 @@ void g1_msm_managed_wrapper(
|
||||
|
||||
// Sync for the async frees above.
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
POP_RANGE();
|
||||
}
|
||||
|
||||
|
||||
@@ -212,7 +210,8 @@ void g2_msm_managed_wrapper(
|
||||
bool points_in_montgomery,
|
||||
uint64_t* size_tracker
|
||||
) {
|
||||
uint64_t& size_tracker_ref = *size_tracker;
|
||||
PUSH_RANGE("G2 MSM MANAGED");
|
||||
uint64_t& size_tracker_local = *size_tracker;
|
||||
PANIC_IF_FALSE(n > 0, "G2 MSM error: n must be positive, got %u", n);
|
||||
PANIC_IF_FALSE(result != nullptr, "G2 MSM error: result is null");
|
||||
PANIC_IF_FALSE(stream != nullptr, "G2 MSM error: stream is null");
|
||||
@@ -229,8 +228,8 @@ void g2_msm_managed_wrapper(
|
||||
size_t scalars_bytes = safe_mul_sizeof<Scalar>(static_cast<size_t>(n));
|
||||
|
||||
// TODO: We should migrate to _unmanaged_ methods and have scratch/cleanup functions as tfhe-cuda-backend
|
||||
auto* d_points = static_cast<G2Affine*>(cuda_malloc_with_size_tracking_async(points_bytes, stream, gpu_index, size_tracker_ref, true));
|
||||
auto* d_scalars = static_cast<Scalar*>(cuda_malloc_with_size_tracking_async(scalars_bytes, stream, gpu_index, size_tracker_ref, true));
|
||||
auto* d_points = static_cast<G2Affine*>(cuda_malloc_with_size_tracking_async(points_bytes, stream, gpu_index, size_tracker_local, true));
|
||||
auto* d_scalars = static_cast<Scalar*>(cuda_malloc_with_size_tracking_async(scalars_bytes, stream, gpu_index, size_tracker_local, true));
|
||||
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(d_points, points, points_bytes, stream, gpu_index, true);
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(d_scalars, scalars, scalars_bytes, stream, gpu_index, true);
|
||||
@@ -243,14 +242,14 @@ void g2_msm_managed_wrapper(
|
||||
// Allocate scratch buffer sized to match the pippenger internal partitioning
|
||||
size_t scratch_bytes = pippenger_scratch_size_g2(n, gpu_index);
|
||||
auto* d_scratch = static_cast<G2Projective*>(cuda_malloc_with_size_tracking_async(
|
||||
scratch_bytes, stream, gpu_index, size_tracker_ref, true));
|
||||
scratch_bytes, stream, gpu_index, size_tracker_local, true));
|
||||
|
||||
PANIC_IF_FALSE(d_points && d_scalars && d_scratch,
|
||||
"G2 MSM error: device memory allocation failed");
|
||||
|
||||
// Result written directly to host pointer -- no device round-trip needed
|
||||
point_msm_g2_async(stream, gpu_index, result, d_points, d_scalars, n,
|
||||
d_scratch, size_tracker_ref, true);
|
||||
d_scratch);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
cuda_drop_with_size_tracking_async(d_scratch, stream, gpu_index, true);
|
||||
@@ -259,6 +258,7 @@ void g2_msm_managed_wrapper(
|
||||
|
||||
// Sync for the async frees above.
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
POP_RANGE();
|
||||
}
|
||||
|
||||
void g1_from_montgomery_wrapper(G1Affine* result, const G1Affine* point) {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user