mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-04-28 03:01:21 -04:00
Compare commits
73 Commits
dt/bench/t
...
as/benchma
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
054f90a3bc | ||
|
|
da2588abaa | ||
|
|
2389601219 | ||
|
|
32cf1969bf | ||
|
|
600a30131e | ||
|
|
96d230cf6f | ||
|
|
4790f8ba1c | ||
|
|
79a54df25b | ||
|
|
50d6be121a | ||
|
|
7cd966d8a7 | ||
|
|
6ca929051d | ||
|
|
871cc8f772 | ||
|
|
b938473788 | ||
|
|
74869f5e2f | ||
|
|
326dd6a5c7 | ||
|
|
1abc69751a | ||
|
|
3c2cb273d5 | ||
|
|
b18060e5c8 | ||
|
|
c8827a21a7 | ||
|
|
a7476d0aaa | ||
|
|
10d104e500 | ||
|
|
dbb1f151c8 | ||
|
|
9cb8ad9bff | ||
|
|
d970210ae4 | ||
|
|
5236c21733 | ||
|
|
7598725c7e | ||
|
|
f0cff6176d | ||
|
|
8bb38d4e70 | ||
|
|
35fe71cc07 | ||
|
|
62429da859 | ||
|
|
8a4b3c35f4 | ||
|
|
641fec028f | ||
|
|
8d8379409b | ||
|
|
d547e67f66 | ||
|
|
4cf03c063d | ||
|
|
9372c761dd | ||
|
|
d9dec879e7 | ||
|
|
6a0fb21fd0 | ||
|
|
95058c9b00 | ||
|
|
e19c5826c0 | ||
|
|
adf27ab700 | ||
|
|
32c6db381f | ||
|
|
d79801b340 | ||
|
|
0a4e4cf9e2 | ||
|
|
5d6b3146b1 | ||
|
|
e4ea44c571 | ||
|
|
de331f322a | ||
|
|
844c345e18 | ||
|
|
a9520f8930 | ||
|
|
98a9baf7a8 | ||
|
|
c621c1fc77 | ||
|
|
41fffb0306 | ||
|
|
258db8906a | ||
|
|
70f356d0c6 | ||
|
|
81797a6645 | ||
|
|
bb59dc64fc | ||
|
|
14f14526ee | ||
|
|
bcb9380058 | ||
|
|
25c9f46e2c | ||
|
|
7ef21b4561 | ||
|
|
f1a53609b5 | ||
|
|
f1aebcc4a0 | ||
|
|
2ee80918a0 | ||
|
|
98509e9965 | ||
|
|
9e2f8bc9cb | ||
|
|
72d94a2d94 | ||
|
|
2485d759c0 | ||
|
|
c40f1f8830 | ||
|
|
0c4155129b | ||
|
|
130432db01 | ||
|
|
a17d6ab095 | ||
|
|
6f3162f45b | ||
|
|
bbf34074f1 |
@@ -4,6 +4,9 @@ ignore = [
|
||||
"RUSTSEC-2024-0436",
|
||||
# Ignoring unmaintained 'bincode' crate. Getting rid of it would be too complex on the short term.
|
||||
"RUSTSEC-2025-0141",
|
||||
# Ignoring unsoundness in 'rand' with custom logger. Rand update is currently blocked by
|
||||
# arkworks and we do not use custom loggers.
|
||||
"RUSTSEC-2026-0097",
|
||||
]
|
||||
|
||||
[output]
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# Run backward compatibility tests
|
||||
name: aws_tfhe_backward_compat_tests
|
||||
# Run data related tests
|
||||
name: aws_data_tests
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
@@ -30,8 +30,8 @@ permissions:
|
||||
# zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning
|
||||
|
||||
jobs:
|
||||
backward-compat-tests:
|
||||
name: aws_tfhe_backward_compat_tests/backward-compat-tests (bpr)
|
||||
data-tests:
|
||||
name: aws_data_tests/data-tests (bpr)
|
||||
if: (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
github.event_name != 'push'
|
||||
runs-on: "runs-on=${{ github.run_id }}/runner=cpu-small"
|
||||
@@ -49,22 +49,24 @@ jobs:
|
||||
- name: Get LFS data sha
|
||||
id: hash-lfs-data
|
||||
run: |
|
||||
SHA=$(git lfs ls-files -l -I utils/tfhe-backward-compat-data | sha256sum | cut -d' ' -f1)
|
||||
SHA=$(git lfs ls-files -l -I utils/tfhe-backward-compat-data,tests/corrupted_inputs_deserialization | sha256sum | cut -d' ' -f1)
|
||||
echo "sha=${SHA}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Retrieve data from cache
|
||||
id: retrieve-data-cache
|
||||
uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
with:
|
||||
path: |
|
||||
utils/tfhe-backward-compat-data/**/*.cbor
|
||||
utils/tfhe-backward-compat-data/**/*.bcode
|
||||
tests/corrupted_inputs_deserialization/**/*.bcode
|
||||
key: ${{ steps.hash-lfs-data.outputs.sha }}
|
||||
|
||||
- name: Pull test data
|
||||
if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
make pull_backward_compat_data
|
||||
make pull_corrupted_inputs_data
|
||||
|
||||
# Pull token was stored by action/checkout to be used by lfs, we don't need it anymore
|
||||
- name: Remove git credentials
|
||||
@@ -80,14 +82,19 @@ jobs:
|
||||
run: |
|
||||
make test_backward_compatibility_ci
|
||||
|
||||
- name: Run corrupted inputs deserialization tests
|
||||
run: |
|
||||
make test_corrupted_inputs_ci
|
||||
|
||||
- name: Store data in cache
|
||||
if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
|
||||
continue-on-error: true
|
||||
uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
with:
|
||||
path: |
|
||||
utils/tfhe-backward-compat-data/**/*.cbor
|
||||
utils/tfhe-backward-compat-data/**/*.bcode
|
||||
tests/corrupted_inputs_deserialization/**/*.bcode
|
||||
key: ${{ steps.hash-lfs-data.outputs.sha }}
|
||||
|
||||
- name: Set pull-request URL
|
||||
19
.github/workflows/aws_tfhe_fast_tests.yml
vendored
19
.github/workflows/aws_tfhe_fast_tests.yml
vendored
@@ -16,7 +16,6 @@ env:
|
||||
PULL_REQUEST_MD_LINK: ""
|
||||
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
@@ -37,6 +36,7 @@ jobs:
|
||||
csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
|
||||
zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
|
||||
versionable_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.versionable_any_changed }}
|
||||
safe_serialize_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.safe_serialize_any_changed }}
|
||||
core_crypto_test: ${{ env.IS_PULL_REQUEST == 'false' ||
|
||||
steps.changed-files.outputs.core_crypto_any_changed ||
|
||||
steps.changed-files.outputs.dependencies_any_changed }}
|
||||
@@ -64,7 +64,7 @@ jobs:
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
persist-credentials: "false"
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Check for file changes
|
||||
@@ -79,6 +79,7 @@ jobs:
|
||||
- tfhe-zk-pok/**
|
||||
- utils/tfhe-versionable/**
|
||||
- utils/tfhe-versionable-derive/**
|
||||
- utils/tfhe-safe-serialize/**
|
||||
csprng:
|
||||
- tfhe-csprng/**
|
||||
zk_pok:
|
||||
@@ -86,6 +87,8 @@ jobs:
|
||||
versionable:
|
||||
- utils/tfhe-versionable/**
|
||||
- utils/tfhe-versionable-derive/**
|
||||
safe_serialize:
|
||||
- utils/tfhe-safe-serialize/**
|
||||
core_crypto:
|
||||
- tfhe/src/core_crypto/**
|
||||
boolean:
|
||||
@@ -122,6 +125,7 @@ jobs:
|
||||
steps.changed-files.outputs.csprng_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.versionable_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.safe_serialize_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.boolean_any_changed == 'true' ||
|
||||
steps.changed-files.outputs.shortint_any_changed == 'true' ||
|
||||
@@ -145,7 +149,7 @@ jobs:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
persist-credentials: "false"
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
@@ -170,6 +174,11 @@ jobs:
|
||||
run: |
|
||||
make test_versionable
|
||||
|
||||
- name: Run tfhe-safe-serialize tests
|
||||
if: needs.should-run.outputs.safe_serialize_test == 'true'
|
||||
run: |
|
||||
make test_safe_serialize
|
||||
|
||||
- name: Run core tests
|
||||
if: needs.should-run.outputs.core_crypto_test == 'true'
|
||||
run: |
|
||||
@@ -191,7 +200,7 @@ jobs:
|
||||
|
||||
- name: Node cache restoration
|
||||
id: node-cache
|
||||
uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
with:
|
||||
path: |
|
||||
~/.nvm
|
||||
@@ -204,7 +213,7 @@ jobs:
|
||||
make install_node
|
||||
|
||||
- name: Node cache save
|
||||
uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
if: steps.node-cache.outputs.cache-hit != 'true'
|
||||
with:
|
||||
path: |
|
||||
|
||||
4
.github/workflows/aws_tfhe_noise_checks.yml
vendored
4
.github/workflows/aws_tfhe_noise_checks.yml
vendored
@@ -34,7 +34,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -99,7 +99,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
30
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
30
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
@@ -14,12 +14,11 @@ env:
|
||||
PULL_REQUEST_MD_LINK: ""
|
||||
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
types: [labeled]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
@@ -32,16 +31,16 @@ jobs:
|
||||
if: github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved')
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: read # Needed to check for file change
|
||||
pull-requests: read # Needed to check for file change
|
||||
outputs:
|
||||
wasm_test: ${{ github.event_name == 'workflow_dispatch' ||
|
||||
steps.changed-files.outputs.wasm_any_changed }}
|
||||
steps.changed-files.outputs.wasm_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
persist-credentials: "false"
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Check for file changes
|
||||
@@ -63,6 +62,7 @@ jobs:
|
||||
- tfhe/js_on_wasm_tests/**
|
||||
- tfhe/web_wasm_parallel_tests/**
|
||||
- utils/tfhe-versionable/**
|
||||
- utils/tfhe-safe-serialize/**
|
||||
- .github/workflows/aws_tfhe_wasm_tests.yml
|
||||
|
||||
wasm-tests:
|
||||
@@ -78,7 +78,7 @@ jobs:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
persist-credentials: "false"
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
@@ -92,7 +92,7 @@ jobs:
|
||||
|
||||
- name: Node cache restoration
|
||||
id: node-cache
|
||||
uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
with:
|
||||
path: |
|
||||
~/.nvm
|
||||
@@ -105,7 +105,7 @@ jobs:
|
||||
make install_node
|
||||
|
||||
- name: Node cache save
|
||||
uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
if: steps.node-cache.outputs.cache-hit != 'true'
|
||||
with:
|
||||
path: |
|
||||
@@ -128,15 +128,21 @@ jobs:
|
||||
run: |
|
||||
make test_nodejs_wasm_api_ci
|
||||
|
||||
- name: Run parallel wasm tests
|
||||
run: |
|
||||
make test_web_js_api_parallel_chrome_ci
|
||||
|
||||
- name: Run wasm_par_mq tests
|
||||
run: |
|
||||
make test_wasm_par_mq_chrome_ci
|
||||
make test_wasm_par_mq_firefox_ci
|
||||
|
||||
- name: Run parallel wasm tests
|
||||
run: |
|
||||
make test_web_js_api_parallel_chrome_ci
|
||||
make test_web_js_api_parallel_firefox_ci
|
||||
|
||||
- name: Run cross origin wasm tests
|
||||
run: |
|
||||
make test_web_js_api_cross_origin_chrome_ci
|
||||
make test_web_js_api_cross_origin_firefox_ci
|
||||
|
||||
- name: Run x86_64/wasm zk compatibility tests
|
||||
run: |
|
||||
make test_zk_wasm_x86_compat_ci
|
||||
|
||||
@@ -5,8 +5,9 @@ name: backward_compat_pr_change_report
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- 'utils/tfhe-lints/snapshots/lint_enum_snapshots_*.json'
|
||||
|
||||
env:
|
||||
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
@@ -16,9 +17,35 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
name: backward_compat_pr_change_report/should-run
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: read # Needed to check for file change
|
||||
outputs:
|
||||
backward_report: ${{ steps.changed-files.outputs.backward_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
|
||||
with:
|
||||
files_yaml: |
|
||||
backward:
|
||||
- utils/tfhe-lints/snapshots/*.json
|
||||
|
||||
change-report:
|
||||
name: backward_compat_pr_change_report/change-report (bpr)
|
||||
runs-on: ubuntu-latest
|
||||
needs: should-run
|
||||
if:
|
||||
needs.should-run.outputs.backward_report == 'true'
|
||||
permissions:
|
||||
pull-requests: write # To send and modify message in the PR
|
||||
steps:
|
||||
@@ -53,8 +80,9 @@ jobs:
|
||||
fi
|
||||
|
||||
- name: Find existing comment
|
||||
if: steps.report.outputs.has_report == 'true'
|
||||
id: find-comment
|
||||
uses: peter-evans/find-comment@3eae4d37986fb5a8592848f6a574fdf654e61f9e # v3.1.0
|
||||
uses: peter-evans/find-comment@b30e6a3c0ed37e7c023ccd3f1db5c6c0b0c23aad # v4.0.0
|
||||
with:
|
||||
issue-number: ${{ github.event.pull_request.number }}
|
||||
body-includes: '**Backward-compat snapshot:'
|
||||
|
||||
7
.github/workflows/benchmark_cpu.yml
vendored
7
.github/workflows/benchmark_cpu.yml
vendored
@@ -14,11 +14,12 @@ on:
|
||||
- signed_integer
|
||||
- integer_compression
|
||||
- integer_zk
|
||||
- msm_zk
|
||||
- shortint
|
||||
- shortint_oprf
|
||||
- hlapi_unsigned
|
||||
- hlapi_signed
|
||||
- hlapi_erc20
|
||||
- hlapi_erc7984
|
||||
- hlapi_dex
|
||||
- hlapi_noise_squash
|
||||
- hlapi_kvstore
|
||||
@@ -92,8 +93,8 @@ jobs:
|
||||
|
||||
if inputs_command == "integer_zk":
|
||||
files_to_parse.append("pke_zk_crs_sizes.csv")
|
||||
elif inputs_command == "hlapi_erc20":
|
||||
files_to_parse.append("erc20_pbs_count.csv")
|
||||
elif inputs_command == "hlapi_erc7984":
|
||||
files_to_parse.append("erc7984_pbs_count.csv")
|
||||
elif inputs_command == "hlapi_dex":
|
||||
files_to_parse.extend(
|
||||
[
|
||||
|
||||
6
.github/workflows/benchmark_cpu_common.yml
vendored
6
.github/workflows/benchmark_cpu_common.yml
vendored
@@ -107,7 +107,7 @@ jobs:
|
||||
]:
|
||||
f.write(f"""{env_name}=["{'", "'.join(values_to_join)}"]\n""")
|
||||
|
||||
- name: Set martix arguments outputs
|
||||
- name: Set matrix arguments outputs
|
||||
id: set_matrix_args
|
||||
run: | # zizmor: ignore[template-injection] these env variable are safe
|
||||
{
|
||||
@@ -126,7 +126,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -261,7 +261,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
8
.github/workflows/benchmark_cpu_weekly.yml
vendored
8
.github/workflows/benchmark_cpu_weekly.yml
vendored
@@ -108,14 +108,14 @@ jobs:
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
|
||||
|
||||
run-benchmarks-hlapi-erc20:
|
||||
name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc20
|
||||
run-benchmarks-hlapi-erc7984:
|
||||
name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc7984
|
||||
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
|
||||
needs: prepare-inputs
|
||||
uses: ./.github/workflows/benchmark_cpu_common.yml
|
||||
with:
|
||||
command: hlapi_erc20
|
||||
additional_file_to_parse: erc20_pbs_count.csv
|
||||
command: hlapi_erc7984
|
||||
additional_file_to_parse: erc7984_pbs_count.csv
|
||||
secrets:
|
||||
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
|
||||
4
.github/workflows/benchmark_ct_key_sizes.yml
vendored
4
.github/workflows/benchmark_ct_key_sizes.yml
vendored
@@ -33,7 +33,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -137,7 +137,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
39
.github/workflows/benchmark_documentation.yml
vendored
39
.github/workflows/benchmark_documentation.yml
vendored
@@ -17,6 +17,10 @@ on:
|
||||
description: "Run GPU core-crypto benchmarks"
|
||||
type: boolean
|
||||
default: true
|
||||
run-gpu-zk-benchmarks:
|
||||
description: "Run GPU ZK benchmarks"
|
||||
type: boolean
|
||||
default: true
|
||||
run-hpu-benchmarks:
|
||||
description: "Run HPU benchmarks"
|
||||
type: boolean
|
||||
@@ -36,7 +40,7 @@ jobs:
|
||||
uses: ./.github/workflows/benchmark_cpu_common.yml
|
||||
if: inputs.run-cpu-benchmarks
|
||||
with:
|
||||
command: integer,hlapi_erc20
|
||||
command: integer,hlapi_erc7984
|
||||
op_flavor: fast_default
|
||||
bench_type: both
|
||||
precisions_set: documentation
|
||||
@@ -91,7 +95,7 @@ jobs:
|
||||
with:
|
||||
profile: multi-h100-sxm5
|
||||
hardware_name: n3-H100-SXM5x8
|
||||
command: integer_multi_bit,hlapi_erc20
|
||||
command: integer_multi_bit,hlapi_erc7984
|
||||
op_flavor: fast_default
|
||||
bench_type: both
|
||||
precisions_set: documentation
|
||||
@@ -110,7 +114,7 @@ jobs:
|
||||
uses: ./.github/workflows/benchmark_hpu_common.yml
|
||||
if: inputs.run-hpu-benchmarks
|
||||
with:
|
||||
command: integer,hlapi_erc20
|
||||
command: integer,hlapi_erc7984
|
||||
op_flavor: default
|
||||
bench_type: both
|
||||
precisions_set: documentation
|
||||
@@ -165,21 +169,42 @@ jobs:
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
|
||||
|
||||
run-benchmarks-gpu-zk-server:
|
||||
name: benchmark_documentation/run-benchmarks-gpu-zk-server
|
||||
uses: ./.github/workflows/benchmark_gpu_common.yml
|
||||
if: inputs.run-gpu-zk-benchmarks
|
||||
with:
|
||||
profile: multi-h100-sxm5
|
||||
hardware_name: n3-H100-SXM5x8
|
||||
command: integer_zk
|
||||
op_flavor: default
|
||||
bench_type: both
|
||||
secrets:
|
||||
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
|
||||
|
||||
generate-svgs-with-benchmarks-run:
|
||||
name: benchmark-documentation/generate-svgs-with-benchmarks-run
|
||||
if: ${{ always() &&
|
||||
(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks ||inputs.run-hpu-benchmarks) &&
|
||||
(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks || inputs.run-hpu-benchmarks) &&
|
||||
inputs.generate-svgs }}
|
||||
needs: [
|
||||
run-benchmarks-cpu-integer, run-benchmarks-gpu-integer, run-benchmarks-hpu-integer,
|
||||
run-benchmarks-cpu-zk-server, run-benchmarks-cpu-zk-client,
|
||||
run-benchmarks-cpu-core-crypto, run-benchmarks-gpu-core-crypto
|
||||
run-benchmarks-cpu-core-crypto, run-benchmarks-gpu-core-crypto,
|
||||
run-benchmarks-gpu-zk-server
|
||||
]
|
||||
uses: ./.github/workflows/generate_svgs.yml
|
||||
with:
|
||||
time_span_days: 5
|
||||
generate-cpu-svgs: ${{ inputs.run-cpu-benchmarks }}
|
||||
generate-gpu-svgs: ${{ inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks }}
|
||||
generate-gpu-svgs: ${{ inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks }}
|
||||
generate-hpu-svgs: ${{ inputs.run-hpu-benchmarks }}
|
||||
secrets:
|
||||
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
|
||||
@@ -188,7 +213,7 @@ jobs:
|
||||
|
||||
generate-svgs-without-benchmarks-run:
|
||||
name: benchmark-documentation/generate-svgs-without-benchmarks-run
|
||||
if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-hpu-benchmarks) &&
|
||||
if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-gpu-zk-benchmarks || inputs.run-hpu-benchmarks) &&
|
||||
inputs.generate-svgs }}
|
||||
uses: ./.github/workflows/generate_svgs.yml
|
||||
with:
|
||||
|
||||
10
.github/workflows/benchmark_gpu.yml
vendored
10
.github/workflows/benchmark_gpu.yml
vendored
@@ -31,10 +31,14 @@ on:
|
||||
- pbs128
|
||||
- ks
|
||||
- ks_pbs
|
||||
- tfhe_zk_pok
|
||||
- msm_zk
|
||||
- integer_zk
|
||||
- integer_zk_experimental
|
||||
- integer_aes
|
||||
- integer_aes256
|
||||
- hlapi_erc20
|
||||
- hlapi_erc7984
|
||||
- hlapi_erc7984_multi_group
|
||||
- hlapi_dex
|
||||
- hlapi_noise_squash
|
||||
op_flavor:
|
||||
@@ -120,8 +124,8 @@ jobs:
|
||||
|
||||
if inputs_command == "integer_zk":
|
||||
files_to_parse.append("pke_zk_crs_sizes.csv")
|
||||
elif inputs_command == "hlapi_erc20":
|
||||
files_to_parse.append("erc20_pbs_count.csv")
|
||||
elif inputs_command == "hlapi_erc7984":
|
||||
files_to_parse.append("erc7984_pbs_count.csv")
|
||||
elif inputs_command == "hlapi_dex":
|
||||
files_to_parse.extend(
|
||||
[
|
||||
|
||||
75
.github/workflows/benchmark_gpu_common.yml
vendored
75
.github/workflows/benchmark_gpu_common.yml
vendored
@@ -111,7 +111,7 @@ jobs:
|
||||
]:
|
||||
f.write(f"""{env_name}=["{'", "'.join(values_to_join)}"]\n""")
|
||||
|
||||
- name: Set martix arguments outputs
|
||||
- name: Set matrix arguments outputs
|
||||
id: set_matrix_args
|
||||
run: | # zizmor: ignore[template-injection] these env variable are safe
|
||||
{
|
||||
@@ -126,17 +126,11 @@ jobs:
|
||||
needs: prepare-matrix
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
|
||||
# If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
|
||||
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
|
||||
# otherwise we'll try to run the next job on a non-existing on-demand instance.
|
||||
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
|
||||
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
continue-on-error: true
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -145,25 +139,6 @@ jobs:
|
||||
backend: ${{ inputs.backend }}
|
||||
profile: ${{ inputs.profile }}
|
||||
|
||||
- name: Acknowledge remote instance failure
|
||||
if: steps.start-remote-instance.outcome == 'failure' &&
|
||||
inputs.profile != 'single-h100'
|
||||
run: |
|
||||
echo "Remote instance instance has failed to start (profile provided: '${INPUTS_PROFILE}')"
|
||||
echo "Permanent instance instance cannot be used as a substitute (profile needed: 'single-h100')"
|
||||
exit 1
|
||||
env:
|
||||
INPUTS_PROFILE: ${{ inputs.profile }}
|
||||
|
||||
# This will allow to fallback on permanent instances running on Hyperstack.
|
||||
- name: Use permanent remote instance
|
||||
id: use-permanent-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true' &&
|
||||
steps.start-remote-instance.outcome == 'failure' &&
|
||||
inputs.profile == 'single-h100'
|
||||
run: |
|
||||
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# Install dependencies only once since cuda-benchmarks uses a matrix strategy, thus running multiple times.
|
||||
install-dependencies:
|
||||
name: benchmark_gpu_common/install-dependencies
|
||||
@@ -184,7 +159,6 @@ jobs:
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Setup Hyperstack dependencies
|
||||
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
|
||||
uses: ./.github/actions/gpu_setup
|
||||
with:
|
||||
cuda-version: ${{ matrix.cuda }}
|
||||
@@ -263,8 +237,10 @@ jobs:
|
||||
BENCH_PARAMS_TYPE: ${{ matrix.params_type }}
|
||||
BENCH_COMMAND: ${{ matrix.command }}
|
||||
PRECISIONS_SET: ${{ inputs.precisions_set }}
|
||||
__TFHE_RS_BENCH_MULTI_PROC_GROUPS: 2
|
||||
|
||||
- name: Parse results
|
||||
if: ${{ inputs.command != 'hlapi_erc7984_multi_group' }}
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
|
||||
--database tfhe_rs \
|
||||
@@ -282,6 +258,39 @@ jobs:
|
||||
REF_NAME: ${{ github.ref_name }}
|
||||
BENCH_TYPE: ${{ matrix.bench_type }}
|
||||
|
||||
- name: Parse and merge erc7984_multi_group results
|
||||
if: ${{ inputs.command == 'hlapi_erc7984_multi_group' }}
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py tfhe-benchmark/target_p0/criterion "${RESULTS_FILENAME_P0}" \
|
||||
--database tfhe_rs \
|
||||
--hardware "${INPUTS_HARDWARE_NAME}" \
|
||||
--backend gpu \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--branch "${REF_NAME}" \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${BENCH_DATE}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--bench-type "${BENCH_TYPE}"
|
||||
python3 ./ci/benchmark_parser.py tfhe-benchmark/target_p1/criterion "${RESULTS_FILENAME_P1}" \
|
||||
--database tfhe_rs \
|
||||
--hardware "${INPUTS_HARDWARE_NAME}" \
|
||||
--backend gpu \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--branch "${REF_NAME}" \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${BENCH_DATE}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--bench-type "${BENCH_TYPE}"
|
||||
python3 ./ci/merge_multi_group_results.py --bench-type "${BENCH_TYPE}" --output "${RESULTS_FILENAME}" "${RESULTS_FILENAME_P0}" "${RESULTS_FILENAME_P1}"
|
||||
env:
|
||||
INPUTS_HARDWARE_NAME: ${{ inputs.hardware_name }}
|
||||
REF_NAME: ${{ github.ref_name }}
|
||||
BENCH_TYPE: ${{ matrix.bench_type }}
|
||||
RESULTS_FILENAME_P0: parsed_benchmark_results_p0_${{ github.sha }}.json
|
||||
RESULTS_FILENAME_P1: parsed_benchmark_results_p1_${{ github.sha }}.json
|
||||
|
||||
- name: Parse additional benchmarks results files
|
||||
if: ${{ inputs.additional_file_to_parse }}
|
||||
run: |
|
||||
@@ -333,13 +342,13 @@ jobs:
|
||||
|
||||
teardown-instance:
|
||||
name: benchmark_gpu_common/teardown-instance
|
||||
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
|
||||
if: ${{ always() && needs.setup-instance.result == 'success' }}
|
||||
needs: [ setup-instance, cuda-benchmarks, slack-notify ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
18
.github/workflows/benchmark_gpu_coprocessor.yml
vendored
18
.github/workflows/benchmark_gpu_coprocessor.yml
vendored
@@ -42,7 +42,7 @@ env:
|
||||
OPTIMIZATION_TARGET: "throughput"
|
||||
BATCH_SIZE: "5000"
|
||||
SCHEDULING_POLICY: "MAX_PARALLELISM"
|
||||
BENCHMARKS: "erc20"
|
||||
BENCHMARKS: "erc7984"
|
||||
BRANCH_NAME: ${{ github.ref_name }}
|
||||
COMMIT_SHA: ${{ github.sha }}
|
||||
SLAB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
@@ -77,7 +77,7 @@ jobs:
|
||||
if [[ ${IS_MANUAL_RUN} == true ]]; then
|
||||
PROFILE_RAW="${PROFILE_MANUAL_RUN}"
|
||||
else
|
||||
PROFILE_RAW="${PROFILE}"
|
||||
PROFILE_RAW="${PROFILE_SCHEDULED_RUN}"
|
||||
fi
|
||||
# shellcheck disable=SC2001
|
||||
PROFILE_VAL=$(echo "${PROFILE_RAW}" | sed 's|.*[[:space:]](\(.*\))|\1|')
|
||||
@@ -94,7 +94,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -204,7 +204,7 @@ jobs:
|
||||
uses: foundry-rs/foundry-toolchain@8789b3e21e6c11b2697f5eb56eddae542f746c10
|
||||
|
||||
- name: Cache cargo
|
||||
uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
|
||||
uses: actions/cache@668228422ae6a00e4ad889ee87cd7109ec5666a7 # v5.0.4
|
||||
with:
|
||||
path: |
|
||||
~/.cargo/registry
|
||||
@@ -214,14 +214,14 @@ jobs:
|
||||
restore-keys: ${{ runner.os }}-cargo-
|
||||
|
||||
- name: Login to GitHub Container Registry
|
||||
uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0
|
||||
uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Login to Chainguard Registry
|
||||
uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0
|
||||
uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0
|
||||
with:
|
||||
registry: cgr.dev
|
||||
username: ${{ secrets.CGR_USERNAME }}
|
||||
@@ -248,13 +248,13 @@ jobs:
|
||||
npm install && npm run deploy:emptyProxies && npx hardhat compile
|
||||
working-directory: fhevm/
|
||||
|
||||
- name: Profile erc20 no-cmux benchmark on GPU
|
||||
- name: Profile erc7984 no-cmux benchmark on GPU
|
||||
run: |
|
||||
BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" \
|
||||
FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" \
|
||||
BENCHMARK_TYPE="THROUGHPUT_200" \
|
||||
OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" \
|
||||
make -e "profile_erc20_gpu"
|
||||
make -e "profile_erc7984_gpu"
|
||||
working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
|
||||
|
||||
- name: Get nsys profile name
|
||||
@@ -333,7 +333,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
2
.github/workflows/benchmark_hpu.yml
vendored
2
.github/workflows/benchmark_hpu.yml
vendored
@@ -14,7 +14,7 @@ on:
|
||||
- integer
|
||||
- hlapi_unsigned
|
||||
- hlapi_signed
|
||||
- hlapi_erc20
|
||||
- hlapi_erc7984
|
||||
op_flavor:
|
||||
description: "Operations set to run"
|
||||
type: choice
|
||||
|
||||
4
.github/workflows/benchmark_hpu_common.yml
vendored
4
.github/workflows/benchmark_hpu_common.yml
vendored
@@ -95,7 +95,7 @@ jobs:
|
||||
]:
|
||||
f.write(f"""{env_name}=["{'", "'.join(values_to_join)}"]\n""")
|
||||
|
||||
- name: Set martix arguments outputs
|
||||
- name: Set matrix arguments outputs
|
||||
id: set_matrix_args
|
||||
run: | # zizmor: ignore[template-injection] these env variable are safe
|
||||
{
|
||||
@@ -121,7 +121,7 @@ jobs:
|
||||
steps:
|
||||
# Needed as long as hw_regmap repository is private
|
||||
- name: Configure SSH
|
||||
uses: webfactory/ssh-agent@a6f90b1f127823b31d4d4a8d96047790581349bd # v0.9.1
|
||||
uses: webfactory/ssh-agent@e83874834305fe9a4a2997156cb26c5de65a8555 # v0.10.0
|
||||
with:
|
||||
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
|
||||
|
||||
|
||||
@@ -143,7 +143,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -387,7 +387,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
21
.github/workflows/benchmark_summary.yml
vendored
21
.github/workflows/benchmark_summary.yml
vendored
@@ -114,6 +114,27 @@ jobs:
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
|
||||
|
||||
run-benchmarks-gpu-erc7984-multi-group:
|
||||
name: benchmark_documentation/run-benchmarks-gpu-erc7984-multi-group
|
||||
uses: ./.github/workflows/benchmark_gpu_common.yml
|
||||
if: inputs.run-gpu-benchmarks
|
||||
needs: parse-gpu-inputs
|
||||
with:
|
||||
profile: ${{ needs.parse-gpu-inputs.outputs.profile }}
|
||||
hardware_name: ${{ needs.parse-gpu-inputs.outputs.hardware_name }}
|
||||
command: hlapi_erc7984_multi_group
|
||||
bench_type: throughput
|
||||
params_type: multi_bit
|
||||
secrets:
|
||||
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
|
||||
|
||||
# TODO add make recipe for HPU benchmarks
|
||||
# run-benchmarks-hpu:
|
||||
# name: benchmark_documentation/run-benchmarks-hpu
|
||||
|
||||
4
.github/workflows/benchmark_tfhe_fft.yml
vendored
4
.github/workflows/benchmark_tfhe_fft.yml
vendored
@@ -40,7 +40,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -137,7 +137,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/benchmark_tfhe_ntt.yml
vendored
4
.github/workflows/benchmark_tfhe_ntt.yml
vendored
@@ -40,7 +40,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -137,7 +137,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -63,7 +63,7 @@ jobs:
|
||||
with open(env_file, "a") as f:
|
||||
f.write(f"""BROWSER=["{'", "'.join(split_browser)}"]\n""")
|
||||
|
||||
- name: Set martix arguments output
|
||||
- name: Set matrix arguments output
|
||||
id: set_matrix_arg
|
||||
run: | # zizmor: ignore[template-injection] this env variable is safe
|
||||
echo "browser=${{ toJSON(env.BROWSER) }}" >> "${GITHUB_OUTPUT}"
|
||||
@@ -77,7 +77,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -124,7 +124,7 @@ jobs:
|
||||
|
||||
- name: Node cache restoration
|
||||
id: node-cache
|
||||
uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
with:
|
||||
path: |
|
||||
~/.nvm
|
||||
@@ -137,7 +137,7 @@ jobs:
|
||||
make install_node
|
||||
|
||||
- name: Node cache save
|
||||
uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
if: steps.node-cache.outputs.cache-hit != 'true'
|
||||
with:
|
||||
path: |
|
||||
@@ -158,9 +158,9 @@ jobs:
|
||||
env:
|
||||
BROWSER: ${{ matrix.browser }}
|
||||
|
||||
- name: Run benchmarks (unsafe coop)
|
||||
- name: Run benchmarks (cross origin)
|
||||
run: |
|
||||
make bench_web_js_api_unsafe_coop_"${BROWSER}"_ci
|
||||
make bench_web_js_api_cross_origin_"${BROWSER}"_ci
|
||||
env:
|
||||
BROWSER: ${{ matrix.browser }}
|
||||
|
||||
@@ -218,7 +218,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
6
.github/workflows/cargo_build_common.yml
vendored
6
.github/workflows/cargo_build_common.yml
vendored
@@ -94,7 +94,7 @@ jobs:
|
||||
with open(env_file, "a") as f:
|
||||
f.write(f"""RUNNERS=["{'", "'.join(runners)}"]\n""")
|
||||
|
||||
- name: Set martix runners outputs
|
||||
- name: Set matrix runners outputs
|
||||
id: set_matrix_runners
|
||||
run: | # zizmor: ignore[template-injection] these env variable are safe
|
||||
echo "runners=${{ toJSON(env.RUNNERS) }}" >> "${GITHUB_OUTPUT}"
|
||||
@@ -138,7 +138,7 @@ jobs:
|
||||
- name: Node cache restoration
|
||||
if: inputs.run-pcc-cpu-batch == 'pcc_batch_2'
|
||||
id: node-cache
|
||||
uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
with:
|
||||
path: |
|
||||
~/.nvm
|
||||
@@ -151,7 +151,7 @@ jobs:
|
||||
make install_node
|
||||
|
||||
- name: Node cache save
|
||||
uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
if: inputs.run-pcc-cpu-batch == 'pcc_batch_2' && steps.node-cache.outputs.cache-hit != 'true'
|
||||
with:
|
||||
path: |
|
||||
|
||||
4
.github/workflows/cargo_test_ntt.yml
vendored
4
.github/workflows/cargo_test_ntt.yml
vendored
@@ -63,7 +63,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -146,7 +146,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/ci_lint.yml
vendored
4
.github/workflows/ci_lint.yml
vendored
@@ -43,14 +43,14 @@ jobs:
|
||||
echo "version=$(make zizmor_version)" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Check workflows security
|
||||
uses: zizmorcore/zizmor-action@0dce2577a4760a2749d8cfb7a84b7d5585ebcb7d # v0.5.0
|
||||
uses: zizmorcore/zizmor-action@71321a20a9ded102f6e9ce5718a2fcec2c4f70d8 # v0.5.2
|
||||
with:
|
||||
advanced-security: 'false' # Print results directly in logs
|
||||
persona: pedantic
|
||||
version: ${{ steps.get_zizmor.outputs.version }}
|
||||
|
||||
- name: Ensure SHA pinned actions
|
||||
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@70c4af2ed5282c51ba40566d026d6647852ffa3e # v5.0.1
|
||||
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@ca46236c6ce584ae24bc6283ba8dcf4b3ec8a066 # v5.0.4
|
||||
with:
|
||||
allowlist: |
|
||||
slsa-framework/slsa-github-generator
|
||||
|
||||
4
.github/workflows/code_coverage.yml
vendored
4
.github/workflows/code_coverage.yml
vendored
@@ -74,7 +74,7 @@ jobs:
|
||||
make test_shortint_cov
|
||||
|
||||
- name: Upload tfhe coverage to Codecov
|
||||
uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de
|
||||
uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
@@ -88,7 +88,7 @@ jobs:
|
||||
make test_integer_cov
|
||||
|
||||
- name: Upload tfhe coverage to Codecov
|
||||
uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de
|
||||
uses: codecov/codecov-action@57e3a136b779b570ffcdbf80b3bdc90e7fab3de2
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
|
||||
64
.github/workflows/generate_svgs.yml
vendored
64
.github/workflows/generate_svgs.yml
vendored
@@ -209,60 +209,98 @@ jobs:
|
||||
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
|
||||
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
|
||||
|
||||
gpu-zk-server-latency-table:
|
||||
name: generate_documentation_svgs/gpu-zk-server-latency-table
|
||||
uses: ./.github/workflows/generate_svg_common.yml
|
||||
if: inputs.generate-gpu-svgs
|
||||
with:
|
||||
backend: gpu
|
||||
hardware_name: n3-H100-SXM5x8
|
||||
layer: integer
|
||||
bench_subset: zk
|
||||
pbs_kind: multi_bit
|
||||
grouping_factor: 4
|
||||
bench_type: latency
|
||||
time_span_days: ${{ inputs.time_span_days }}
|
||||
output_filename: gpu-zk-benchmark-latency
|
||||
secrets:
|
||||
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
|
||||
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
|
||||
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
|
||||
|
||||
gpu-zk-server-throughput-table:
|
||||
name: generate_documentation_svgs/gpu-zk-server-throughput-table
|
||||
uses: ./.github/workflows/generate_svg_common.yml
|
||||
if: inputs.generate-gpu-svgs
|
||||
with:
|
||||
backend: gpu
|
||||
hardware_name: n3-H100-SXM5x8
|
||||
layer: integer
|
||||
bench_subset: zk
|
||||
pbs_kind: multi_bit
|
||||
grouping_factor: 4
|
||||
bench_type: throughput
|
||||
time_span_days: ${{ inputs.time_span_days }}
|
||||
output_filename: gpu-zk-benchmark-throughput
|
||||
secrets:
|
||||
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
|
||||
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
|
||||
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
|
||||
|
||||
# -----------------------------------------------------------
|
||||
# ERC20 benchmarks tables
|
||||
# ERC7984 benchmarks tables
|
||||
# -----------------------------------------------------------
|
||||
|
||||
cpu-erc20-latency-throughput-table:
|
||||
name: generate_documentation_svgs/cpu-erc20-latency-throughput-table
|
||||
cpu-erc7984-latency-throughput-table:
|
||||
name: generate_documentation_svgs/cpu-erc7984-latency-throughput-table
|
||||
uses: ./.github/workflows/generate_svg_common.yml
|
||||
if: inputs.generate-cpu-svgs
|
||||
with:
|
||||
backend: cpu
|
||||
hardware_name: hpc7a.96xlarge
|
||||
layer: hlapi
|
||||
bench_subset: erc20
|
||||
bench_subset: erc7984
|
||||
pbs_kind: classical
|
||||
bench_type: both
|
||||
time_span_days: ${{ inputs.time_span_days }}
|
||||
output_filename: cpu-hlapi-erc20-benchmark-latency-throughput
|
||||
output_filename: cpu-hlapi-erc7984-benchmark-latency-throughput
|
||||
secrets:
|
||||
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
|
||||
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
|
||||
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
|
||||
|
||||
gpu-erc20-latency-throughput-table:
|
||||
name: generate_documentation_svgs/gpu-erc20-latency-throughput-table
|
||||
gpu-erc7984-latency-throughput-table:
|
||||
name: generate_documentation_svgs/gpu-erc7984-latency-throughput-table
|
||||
uses: ./.github/workflows/generate_svg_common.yml
|
||||
if: inputs.generate-gpu-svgs
|
||||
with:
|
||||
backend: gpu
|
||||
hardware_name: n3-H100-SXM5x8
|
||||
layer: hlapi
|
||||
bench_subset: erc20
|
||||
bench_subset: erc7984
|
||||
pbs_kind: multi_bit
|
||||
grouping_factor: 4
|
||||
bench_type: both
|
||||
time_span_days: ${{ inputs.time_span_days }}
|
||||
output_filename: gpu-hlapi-erc20-benchmark-h100x8-sxm5-latency-throughput
|
||||
output_filename: gpu-hlapi-erc7984-benchmark-h100x8-sxm5-latency-throughput
|
||||
secrets:
|
||||
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
|
||||
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
|
||||
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
|
||||
|
||||
hpu-erc20-latency-throughput-table:
|
||||
name: generate_documentation_svgs/hpu-erc20-latency-throughput-table
|
||||
hpu-erc7984-latency-throughput-table:
|
||||
name: generate_documentation_svgs/hpu-erc7984-latency-throughput-table
|
||||
uses: ./.github/workflows/generate_svg_common.yml
|
||||
if: inputs.generate-hpu-svgs
|
||||
with:
|
||||
backend: hpu
|
||||
hardware_name: hpu_x1
|
||||
layer: hlapi
|
||||
bench_subset: erc20
|
||||
bench_subset: erc7984
|
||||
pbs_kind: classical
|
||||
bench_type: both
|
||||
time_span_days: ${{ inputs.time_span_days }}
|
||||
output_filename: hpu-hlapi-erc20-benchmark-hpux1-latency-throughput.svg
|
||||
output_filename: hpu-hlapi-erc7984-benchmark-hpux1-latency-throughput.svg
|
||||
secrets:
|
||||
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
|
||||
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
|
||||
|
||||
@@ -43,7 +43,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -149,7 +149,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
32
.github/workflows/gpu_core_h100_tests.yml
vendored
32
.github/workflows/gpu_core_h100_tests.yml
vendored
@@ -23,7 +23,7 @@ on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
types: [ labeled, opened, synchronize ]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
@@ -38,6 +38,7 @@ jobs:
|
||||
pull-requests: read # Needed to check for file change
|
||||
outputs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
@@ -62,29 +63,24 @@ jobs:
|
||||
- tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**/**.md'
|
||||
- '.github/workflows/gpu_core_h100_tests.yml'
|
||||
core_crypto:
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
|
||||
setup-instance:
|
||||
name: gpu_core_h100_tests/setup-instance
|
||||
needs: should-run
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
|
||||
# If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
|
||||
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
|
||||
# otherwise we'll try to run the next job on a non-existing on-demand instance.
|
||||
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
|
||||
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
steps:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
continue-on-error: true
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -93,13 +89,6 @@ jobs:
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
|
||||
# This will allow to fallback on permanent instances running on Hyperstack.
|
||||
- name: Use permanent remote instance
|
||||
id: use-permanent-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
|
||||
run: |
|
||||
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# This instance will be spawned especially for pull-request from forked repository
|
||||
- name: Start GitHub instance
|
||||
id: start-github-instance
|
||||
@@ -132,7 +121,6 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Setup Hyperstack dependencies
|
||||
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
|
||||
uses: ./.github/actions/gpu_setup
|
||||
with:
|
||||
cuda-version: ${{ matrix.cuda }}
|
||||
@@ -176,14 +164,14 @@ jobs:
|
||||
|
||||
teardown-instance:
|
||||
name: gpu_core_h100_tests/teardown-instance
|
||||
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
|
||||
if: ${{ always() && needs.setup-instance.result == 'success' }}
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/gpu_fast_tests.yml
vendored
4
.github/workflows/gpu_fast_tests.yml
vendored
@@ -77,7 +77,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -182,7 +182,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
26
.github/workflows/gpu_full_h100_tests.yml
vendored
26
.github/workflows/gpu_full_h100_tests.yml
vendored
@@ -25,17 +25,11 @@ jobs:
|
||||
name: gpu_full_h100_tests/setup-instance
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
|
||||
# If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
|
||||
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
|
||||
# otherwise we'll try to run the next job on a non-existing on-demand instance.
|
||||
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
|
||||
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
continue-on-error: true
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -44,13 +38,6 @@ jobs:
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
|
||||
# This will allow to fallback on permanent instances running on Hyperstack.
|
||||
- name: Use permanent remote instance
|
||||
id: use-permanent-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
|
||||
run: |
|
||||
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
|
||||
|
||||
cuda-tests-linux:
|
||||
name: gpu_full_h100_tests/cuda-tests-linux
|
||||
needs: [ setup-instance ]
|
||||
@@ -74,7 +61,6 @@ jobs:
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Setup Hyperstack dependencies
|
||||
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
|
||||
uses: ./.github/actions/gpu_setup
|
||||
with:
|
||||
cuda-version: ${{ matrix.cuda }}
|
||||
@@ -118,13 +104,13 @@ jobs:
|
||||
|
||||
teardown-instance:
|
||||
name: gpu_full_h100_tests/teardown-instance
|
||||
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
|
||||
if: ${{ always() && needs.setup-instance.result == 'success' }}
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -80,7 +80,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -186,7 +186,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
31
.github/workflows/gpu_hlapi_h100_tests.yml
vendored
31
.github/workflows/gpu_hlapi_h100_tests.yml
vendored
@@ -23,7 +23,7 @@ on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
types: [ labeled, opened, synchronize ]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
@@ -38,6 +38,7 @@ jobs:
|
||||
pull-requests: read # Needed to check for file change
|
||||
outputs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
@@ -65,27 +66,23 @@ jobs:
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**/**.md'
|
||||
- '.github/workflows/gpu_hlapi_h100_tests.yml'
|
||||
core_crypto:
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
|
||||
setup-instance:
|
||||
name: gpu_hlapi_h100_tests/setup-instance
|
||||
needs: should-run
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
|
||||
# If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
|
||||
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
|
||||
# otherwise we'll try to run the next job on a non-existing on-demand instance.
|
||||
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
|
||||
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
steps:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
continue-on-error: true
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -94,13 +91,6 @@ jobs:
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
|
||||
# This will allow to fallback on permanent instances running on Hyperstack.
|
||||
- name: Use permanent remote instance
|
||||
id: use-permanent-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
|
||||
run: |
|
||||
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# This instance will be spawned especially for pull-request from forked repository
|
||||
- name: Start GitHub instance
|
||||
id: start-github-instance
|
||||
@@ -133,7 +123,6 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Setup Hyperstack dependencies
|
||||
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
|
||||
uses: ./.github/actions/gpu_setup
|
||||
with:
|
||||
cuda-version: ${{ matrix.cuda }}
|
||||
@@ -184,14 +173,14 @@ jobs:
|
||||
|
||||
teardown-instance:
|
||||
name: gpu_hlapi_h100_tests/teardown-instance
|
||||
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
|
||||
if: ${{ always() && needs.setup-instance.result == 'success' }}
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
43
.github/workflows/gpu_integer_long_run_tests.yml
vendored
43
.github/workflows/gpu_integer_long_run_tests.yml
vendored
@@ -17,8 +17,8 @@ on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
# Nightly tests will be triggered each evening 8p.m.
|
||||
- cron: "0 20 * * *"
|
||||
# Weekly tests will be triggered every Monday at 8p.m.
|
||||
- cron: "0 20 * * 1"
|
||||
pull_request:
|
||||
|
||||
|
||||
@@ -28,17 +28,48 @@ permissions:
|
||||
# zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
name: gpu_integer_long_run_tests/should-run
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: read # Needed to check for file change
|
||||
outputs:
|
||||
is_needed_in_gpu_ci: ${{ env.IS_PR == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@22103cc46bda19c2b464ffe86db46df6922fd323 # v47.0.5
|
||||
with:
|
||||
files_yaml: |
|
||||
gpu:
|
||||
- tfhe/Cargo.toml
|
||||
- tfhe/build.rs
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
- tfhe/src/integer/gpu/**
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- '.github/workflows/gpu_integer_long_run_tests.yml'
|
||||
|
||||
setup-instance:
|
||||
name: gpu_integer_long_run_tests/setup-instance
|
||||
if: github.event_name != 'schedule' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
|
||||
needs: [should-run]
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
needs.should-run.outputs.is_needed_in_gpu_ci == 'true'
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -112,7 +143,7 @@ jobs:
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/gpu_memory_sanitizer.yml
vendored
4
.github/workflows/gpu_memory_sanitizer.yml
vendored
@@ -74,7 +74,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -166,7 +166,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -74,7 +74,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -166,7 +166,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
12
.github/workflows/gpu_pcc.yml
vendored
12
.github/workflows/gpu_pcc.yml
vendored
@@ -38,7 +38,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -131,6 +131,10 @@ jobs:
|
||||
env:
|
||||
GCC_VERSION: ${{ matrix.gcc }}
|
||||
|
||||
- name: Run semgrep and lint checks on CUDA code
|
||||
run: |
|
||||
make semgrep_and_lint_gpu_code
|
||||
|
||||
- name: Run fmt checks
|
||||
run: |
|
||||
make check_fmt_gpu
|
||||
@@ -139,10 +143,6 @@ jobs:
|
||||
run: |
|
||||
make pcc_gpu
|
||||
|
||||
- name: Run semgrep and lint checks on CUDA code
|
||||
run: |
|
||||
make semgrep_and_lint_gpu_code
|
||||
|
||||
- name: Run semver checks on tfhe-cuda-backend
|
||||
run: |
|
||||
make semver_check_cuda_backend
|
||||
@@ -176,7 +176,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -63,7 +63,6 @@ jobs:
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**/**.md'
|
||||
- '.github/workflows/gpu_signed_integer_classic_tests.yml'
|
||||
- scripts/integer-tests.sh
|
||||
|
||||
@@ -80,7 +79,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -169,7 +168,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -23,7 +23,7 @@ on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
types: [ labeled, opened, synchronize ]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
@@ -38,6 +38,7 @@ jobs:
|
||||
pull-requests: read # Needed to check for file change
|
||||
outputs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
@@ -63,30 +64,25 @@ jobs:
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**/**.md'
|
||||
- '.github/workflows/gpu_signed_integer_h100_tests.yml'
|
||||
- scripts/integer-tests.sh
|
||||
core_crypto:
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
|
||||
setup-instance:
|
||||
name: gpu_signed_integer_h100_tests/setup-instance
|
||||
needs: should-run
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
|
||||
# If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
|
||||
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
|
||||
# otherwise we'll try to run the next job on a non-existing on-demand instance.
|
||||
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
|
||||
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
steps:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
continue-on-error: true
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -95,13 +91,6 @@ jobs:
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
|
||||
# This will allow to fallback on permanent instances running on Hyperstack.
|
||||
- name: Use permanent remote instance
|
||||
id: use-permanent-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
|
||||
run: |
|
||||
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# This instance will be spawned especially for pull-request from forked repository
|
||||
- name: Start GitHub instance
|
||||
id: start-github-instance
|
||||
@@ -134,7 +123,6 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Setup Hyperstack dependencies
|
||||
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
|
||||
uses: ./.github/actions/gpu_setup
|
||||
with:
|
||||
cuda-version: ${{ matrix.cuda }}
|
||||
@@ -176,14 +164,14 @@ jobs:
|
||||
|
||||
teardown-instance:
|
||||
name: gpu_signed_integer_h100_tests/teardown-instance
|
||||
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
|
||||
if: ${{ always() && needs.setup-instance.result == 'success' }}
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -64,7 +64,6 @@ jobs:
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**/**.md'
|
||||
- '.github/workflows/gpu_signed_integer_tests.yml'
|
||||
- scripts/integer-tests.sh
|
||||
|
||||
@@ -81,7 +80,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -178,7 +177,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -63,7 +63,6 @@ jobs:
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**/**.md'
|
||||
- '.github/workflows/gpu_unsigned_integer_classic_tests.yml'
|
||||
- scripts/integer-tests.sh
|
||||
|
||||
@@ -80,7 +79,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -169,7 +168,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -23,7 +23,7 @@ on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
types: [ labeled, opened, synchronize ]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
@@ -38,6 +38,7 @@ jobs:
|
||||
pull-requests: read # Needed to check for file change
|
||||
outputs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
core_crypto_changed: ${{ steps.changed-files.outputs.core_crypto_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
|
||||
@@ -63,30 +64,25 @@ jobs:
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**/**.md'
|
||||
- '.github/workflows/gpu_unsigned_integer_h100_tests.yml'
|
||||
- scripts/integer-tests.sh
|
||||
core_crypto:
|
||||
- tfhe/src/core_crypto/gpu/**
|
||||
|
||||
setup-instance:
|
||||
name: gpu_unsigned_integer_h100_tests/setup-instance
|
||||
needs: should-run
|
||||
if: github.event_name == 'workflow_dispatch' ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true') ||
|
||||
(github.event.action != 'labeled' && needs.should-run.outputs.core_crypto_changed == 'true')
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
|
||||
# If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
|
||||
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
|
||||
# otherwise we'll try to run the next job on a non-existing on-demand instance.
|
||||
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
|
||||
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
steps:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
continue-on-error: true
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -95,13 +91,6 @@ jobs:
|
||||
backend: hyperstack
|
||||
profile: single-h100
|
||||
|
||||
# This will allow to fallback on permanent instances running on Hyperstack.
|
||||
- name: Use permanent remote instance
|
||||
id: use-permanent-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
|
||||
run: |
|
||||
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# This instance will be spawned especially for pull-request from forked repository
|
||||
- name: Start GitHub instance
|
||||
id: start-github-instance
|
||||
@@ -134,7 +123,6 @@ jobs:
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Setup Hyperstack dependencies
|
||||
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
|
||||
uses: ./.github/actions/gpu_setup
|
||||
with:
|
||||
cuda-version: ${{ matrix.cuda }}
|
||||
@@ -176,14 +164,14 @@ jobs:
|
||||
|
||||
teardown-instance:
|
||||
name: gpu_unsigned_integer_h100_tests/teardown-instance
|
||||
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
|
||||
if: ${{ always() && needs.setup-instance.result == 'success' }}
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
@@ -64,7 +64,6 @@ jobs:
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/high_level_api/**
|
||||
- tfhe/src/c_api/**
|
||||
- 'tfhe/docs/**/**.md'
|
||||
- '.github/workflows/gpu_unsigned_integer_tests.yml'
|
||||
- scripts/integer-tests.sh
|
||||
|
||||
@@ -81,7 +80,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -178,7 +177,7 @@ jobs:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
13
.github/workflows/gpu_zk_tests.yml
vendored
13
.github/workflows/gpu_zk_tests.yml
vendored
@@ -51,7 +51,13 @@ jobs:
|
||||
with:
|
||||
files_yaml: |
|
||||
gpu:
|
||||
- tfhe/Cargo.toml
|
||||
- tfhe/build.rs
|
||||
- backends/tfhe-cuda-backend/**
|
||||
- backends/zk-cuda-backend/**
|
||||
- tfhe/src/shortint/parameters/**
|
||||
- tfhe/src/zk/**
|
||||
- tfhe-zk-pok/**
|
||||
- '.github/workflows/gpu_zk_tests.yml'
|
||||
- ci/slab.toml
|
||||
|
||||
@@ -67,7 +73,7 @@ jobs:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -126,6 +132,9 @@ jobs:
|
||||
- name: Run zk-cuda-backend integration tests
|
||||
run: |
|
||||
make test_zk_cuda_backend
|
||||
make test_zk_pok_experimental_gpu
|
||||
make test_integer_zk_gpu
|
||||
make test_integer_zk_experimental_gpu
|
||||
|
||||
slack-notify:
|
||||
name: gpu_zk_tests/slack-notify
|
||||
@@ -158,7 +167,7 @@ jobs:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
|
||||
4
.github/workflows/make_release_common.yml
vendored
4
.github/workflows/make_release_common.yml
vendored
@@ -101,13 +101,13 @@ jobs:
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Download artifact
|
||||
uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
|
||||
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
|
||||
with:
|
||||
name: crate-${{ inputs.package-name }}
|
||||
path: target/package
|
||||
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@b7e9a28eded4986ec6b1fa40eeee8f8f165559ec # v1.0.3
|
||||
uses: rust-lang/crates-io-auth-action@bbd81622f20ce9e2dd9622e3218b975523e45bbe # v1.0.4
|
||||
id: auth
|
||||
|
||||
- name: Publish crate.io package
|
||||
|
||||
@@ -1,12 +1,36 @@
|
||||
name: make_release_cuda
|
||||
# Common workflow to make crate release for CUDA backend
|
||||
name: make_release_common_cuda
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
workflow_call:
|
||||
inputs:
|
||||
dry_run:
|
||||
description: "Dry-run"
|
||||
package-name:
|
||||
type: string
|
||||
required: true
|
||||
dry-run:
|
||||
type: boolean
|
||||
default: true
|
||||
secrets:
|
||||
REPO_CHECKOUT_TOKEN:
|
||||
required: true
|
||||
SLAB_ACTION_TOKEN:
|
||||
required: true
|
||||
SLAB_BASE_URL:
|
||||
required: true
|
||||
SLAB_URL:
|
||||
required: true
|
||||
JOB_SECRET:
|
||||
required: true
|
||||
SLACK_CHANNEL:
|
||||
required: true
|
||||
BOT_USERNAME:
|
||||
required: true
|
||||
SLACK_WEBHOOK:
|
||||
required: true
|
||||
ALLOWED_TEAM:
|
||||
required: true
|
||||
READ_ORG_TOKEN:
|
||||
required: true
|
||||
|
||||
env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
@@ -21,15 +45,15 @@ permissions: {}
|
||||
|
||||
jobs:
|
||||
verify-triggering-actor:
|
||||
name: make_release_cuda/verify-triggering-actor
|
||||
name: make_release_common_cuda/verify-triggering-actor
|
||||
if: startsWith(github.ref, 'refs/tags/')
|
||||
uses: ./.github/workflows/verify_triggering_actor.yml
|
||||
secrets:
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
ALLOWED_TEAM: ${{ secrets.ALLOWED_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
setup-instance:
|
||||
name: make_release_cuda/setup-instance
|
||||
name: make_release_common_cuda/setup-instance
|
||||
needs: verify-triggering-actor
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
@@ -37,7 +61,7 @@ jobs:
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -47,7 +71,7 @@ jobs:
|
||||
profile: gpu-build
|
||||
|
||||
package:
|
||||
name: make_release_cuda/package
|
||||
name: make_release_common_cuda/package
|
||||
needs: setup-instance
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
outputs:
|
||||
@@ -76,7 +100,6 @@ jobs:
|
||||
toolchain: stable
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
{
|
||||
@@ -89,7 +112,6 @@ jobs:
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
|
||||
@@ -101,12 +123,14 @@ jobs:
|
||||
GCC_VERSION: ${{ matrix.gcc }}
|
||||
|
||||
- name: Prepare package
|
||||
env:
|
||||
PACKAGE: ${{ inputs.package-name }}
|
||||
run: |
|
||||
cargo package -p tfhe-cuda-backend
|
||||
cargo package -p "${PACKAGE}"
|
||||
|
||||
- uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
|
||||
with:
|
||||
name: crate-tfhe-cuda-backend
|
||||
name: crate-${{ inputs.package-name }}
|
||||
path: target/package/*.crate
|
||||
|
||||
- name: generate hash
|
||||
@@ -114,8 +138,8 @@ jobs:
|
||||
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
provenance:
|
||||
name: make_release_cuda/provenance
|
||||
if: ${{ !inputs.dry_run }}
|
||||
name: make_release_common_cuda/provenance
|
||||
if: ${{ !inputs.dry-run }}
|
||||
needs: [package]
|
||||
# This action cannot be pinned to a specific commit (see https://github.com/slsa-framework/slsa-github-generator/blob/main/README.md#referencing-slsa-builders-and-generators)
|
||||
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0 # zizmor: ignore[unpinned-uses] as said above SLSA cannot be pinned by tag today
|
||||
@@ -128,7 +152,7 @@ jobs:
|
||||
base64-subjects: ${{ needs.package.outputs.hash }}
|
||||
|
||||
publish-cuda-release:
|
||||
name: make_release_cuda/publish-cuda-release
|
||||
name: make_release_common_cuda/publish-cuda-release
|
||||
needs: [setup-instance, package] # for comparing hashes
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
permissions:
|
||||
@@ -150,7 +174,6 @@ jobs:
|
||||
toolchain: stable
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
{
|
||||
@@ -163,7 +186,6 @@ jobs:
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
|
||||
@@ -175,24 +197,25 @@ jobs:
|
||||
GCC_VERSION: ${{ matrix.gcc }}
|
||||
|
||||
- name: Download artifact
|
||||
uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
|
||||
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
|
||||
with:
|
||||
name: crate-tfhe-cuda-backend
|
||||
name: crate-${{ inputs.package-name }}
|
||||
path: target/package
|
||||
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@b7e9a28eded4986ec6b1fa40eeee8f8f165559ec # v1.0.3
|
||||
uses: rust-lang/crates-io-auth-action@bbd81622f20ce9e2dd9622e3218b975523e45bbe # v1.0.4
|
||||
id: auth
|
||||
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
|
||||
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
|
||||
PACKAGE: ${{ inputs.package-name }}
|
||||
DRY-RUN: ${{ inputs.dry-run && '--dry-run' || '' }}
|
||||
run: |
|
||||
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
|
||||
# would fail. This is safe since DRY_RUN is handled in the env section above.
|
||||
# dry-run expansion cannot be double quoted when variable contains empty string otherwise cargo publish
|
||||
# would fail. This is safe since dry-run is handled in the env section above.
|
||||
# shellcheck disable=SC2086
|
||||
cargo publish -p tfhe-cuda-backend ${DRY_RUN}
|
||||
cargo publish -p "${PACKAGE}" ${DRY-RUN}
|
||||
|
||||
- name: Generate hash
|
||||
id: published_hash
|
||||
@@ -204,7 +227,7 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: failure
|
||||
SLACK_MESSAGE: "SLSA tfhe-cuda-backend crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_MESSAGE: "SLSA ${{ inputs.package-name }} crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
@@ -212,17 +235,17 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "tfhe-cuda-backend release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_MESSAGE: "${{ inputs.package-name }} release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-instance:
|
||||
name: make_release_cuda/teardown-instance
|
||||
name: make_release_common_cuda/teardown-instance
|
||||
if: ${{ always() && needs.setup-instance.result == 'success' }}
|
||||
needs: [setup-instance, publish-cuda-release]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@0a812986560d3f10dc65728b1ccb9ae4c48a8a16 # v1.5.1
|
||||
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
@@ -235,4 +258,4 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (publish-cuda-release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_MESSAGE: "Instance teardown (${{ inputs.package-name }} release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
21
.github/workflows/make_release_tfhe.yml
vendored
21
.github/workflows/make_release_tfhe.yml
vendored
@@ -16,6 +16,10 @@ on:
|
||||
description: "Push web js package"
|
||||
type: boolean
|
||||
default: true
|
||||
push_web_compat_package:
|
||||
description: "Push web compat (cross-origin) js package"
|
||||
type: boolean
|
||||
default: true
|
||||
push_node_package:
|
||||
description: "Push node js package"
|
||||
type: boolean
|
||||
@@ -99,6 +103,23 @@ jobs:
|
||||
tag: ${{ env.NPM_TAG }}
|
||||
provenance: true
|
||||
|
||||
- name: Build web compat (cross-origin) package
|
||||
if: ${{ inputs.push_web_compat_package }}
|
||||
run: |
|
||||
rm -rf tfhe/pkg
|
||||
|
||||
make build_web_js_api
|
||||
sed -i 's/"tfhe"/"tfhe-compat"/g' tfhe/pkg/package.json
|
||||
|
||||
- name: Publish web compat (cross-origin) package
|
||||
if: ${{ inputs.push_web_compat_package }}
|
||||
uses: JS-DevTools/npm-publish@0fd2f4369c5d6bcfcde6091a7c527d810b9b5c3f
|
||||
with:
|
||||
package: tfhe/pkg/package.json
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
tag: ${{ env.NPM_TAG }}
|
||||
provenance: true
|
||||
|
||||
- name: Build Node package
|
||||
if: ${{ inputs.push_node_package }}
|
||||
run: |
|
||||
|
||||
44
.github/workflows/make_release_tfhe_cuda.yml
vendored
Normal file
44
.github/workflows/make_release_tfhe_cuda.yml
vendored
Normal file
@@ -0,0 +1,44 @@
|
||||
# Publish new release of tfhe-rs CUDA backend on crates.io.
|
||||
name: make_release_tfhe_cuda
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
dry_run:
|
||||
description: "Dry-run"
|
||||
type: boolean
|
||||
default: true
|
||||
|
||||
env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
permissions: {}
|
||||
|
||||
# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
|
||||
|
||||
jobs:
|
||||
make-release:
|
||||
name: make_release_tfhe_cuda/make-release
|
||||
uses: ./.github/workflows/make_release_common_cuda.yml
|
||||
with:
|
||||
package-name: "tfhe-cuda-backend"
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
permissions:
|
||||
actions: read # Needed to detect the GitHub Actions environment
|
||||
id-token: write # Needed to create the provenance via GitHub OIDC
|
||||
contents: write # Needed to upload assets/artifacts
|
||||
secrets:
|
||||
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
32
.github/workflows/make_release_tfhe_safe_serialize.yml
vendored
Normal file
32
.github/workflows/make_release_tfhe_safe_serialize.yml
vendored
Normal file
@@ -0,0 +1,32 @@
|
||||
name: make_release_tfhe_safe_serialize
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
dry_run:
|
||||
description: "Dry-run"
|
||||
type: boolean
|
||||
default: true
|
||||
|
||||
permissions: {}
|
||||
|
||||
# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
|
||||
|
||||
jobs:
|
||||
make-release:
|
||||
name: make_release_tfhe_safe_serialize/make-release
|
||||
uses: ./.github/workflows/make_release_common.yml
|
||||
with:
|
||||
package-name: "tfhe-safe-serialize"
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
permissions:
|
||||
actions: read # Needed to detect the GitHub Actions environment
|
||||
id-token: write # Needed to create the provenance via GitHub OIDC
|
||||
contents: write # Needed to upload assets/artifacts
|
||||
secrets:
|
||||
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
44
.github/workflows/make_release_zk_cuda.yml
vendored
Normal file
44
.github/workflows/make_release_zk_cuda.yml
vendored
Normal file
@@ -0,0 +1,44 @@
|
||||
# Publish new release of CUDA Zero-Knowledge primitives on crates.io.
|
||||
name: make_release_zk_cuda
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
dry_run:
|
||||
description: "Dry-run"
|
||||
type: boolean
|
||||
default: true
|
||||
|
||||
env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
permissions: {}
|
||||
|
||||
# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
|
||||
|
||||
jobs:
|
||||
make-release:
|
||||
name: make_release_zk_cuda/make-release
|
||||
uses: ./.github/workflows/make_release_common_cuda.yml
|
||||
with:
|
||||
package-name: "zk-cuda-backend"
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
permissions:
|
||||
actions: read # Needed to detect the GitHub Actions environment
|
||||
id-token: write # Needed to create the provenance via GitHub OIDC
|
||||
contents: write # Needed to upload assets/artifacts
|
||||
secrets:
|
||||
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
|
||||
SLAB_URL: ${{ secrets.SLAB_URL }}
|
||||
JOB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
4
.github/workflows/parameters_check.yml
vendored
4
.github/workflows/parameters_check.yml
vendored
@@ -53,7 +53,7 @@ jobs:
|
||||
|
||||
- name: Restore Sagemath image from cache
|
||||
id: docker-cache
|
||||
uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/restore@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
with:
|
||||
path: /tmp/sagemath_image
|
||||
key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}
|
||||
@@ -76,7 +76,7 @@ jobs:
|
||||
- name: Store Sagemath image in cache
|
||||
if: steps.docker-cache.outputs.cache-hit != 'true'
|
||||
continue-on-error: true
|
||||
uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 #v5.0.3
|
||||
uses: actions/cache/save@668228422ae6a00e4ad889ee87cd7109ec5666a7 #v5.0.4
|
||||
with:
|
||||
path: /tmp/sagemath_image
|
||||
key: sagemath-image-${{ env.SAGEMATH_VERSION }}-${{ github.sha }}
|
||||
|
||||
47
.github/workflows/sync_on_push.yml
vendored
47
.github/workflows/sync_on_push.yml
vendored
@@ -24,6 +24,8 @@ jobs:
|
||||
SOURCE_REPO: "zama-ai/tfhe-rs"
|
||||
SOURCE_BRANCH: "main"
|
||||
DESTINATION_BRANCH: "main"
|
||||
SOURCE_TAGS: "refs/tags/*"
|
||||
DESTINATION_TAGS: "refs/tags/*"
|
||||
USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
TOKEN: ${{ secrets.SYNC_REPO_TOKEN }}
|
||||
DEST_REPO: ${{ secrets.SYNC_DEST_REPO }}
|
||||
@@ -33,49 +35,16 @@ jobs:
|
||||
git clone --quiet "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs --origin source && cd ./tfhe-rs
|
||||
git remote add destination "https://${USERNAME}:${TOKEN}@github.com/${DEST_REPO}.git"
|
||||
|
||||
# The LFS config disables pulling files by default, so remove it
|
||||
# TODO: see if we need to more precisely fetch LFS files or if git is smart
|
||||
rm .lfsconfig
|
||||
|
||||
echo ">>> Fetching all branches references down locally so subsequent commands can see them..."
|
||||
git fetch source '+refs/heads/*:refs/heads/*' --update-head-ok
|
||||
git fetch --all --tags --update-head-ok --quiet
|
||||
|
||||
echo ">>> Print out all branches"
|
||||
git --no-pager branch -a -vv
|
||||
echo ">>> Sync LFS items from source..."
|
||||
./scripts/lfs_sync.sh source destination "${SOURCE_BRANCH}"
|
||||
|
||||
echo ">>> Pull LFS items from source..."
|
||||
git lfs pull source "${SOURCE_BRANCH}"
|
||||
|
||||
echo ">>> Pushing git changes and LFS content..."
|
||||
echo ">>> Pushing git changes for ${SOURCE_BRANCH}..."
|
||||
git push destination "${SOURCE_BRANCH}:${DESTINATION_BRANCH}" -f
|
||||
|
||||
shred --remove .git/config
|
||||
|
||||
- name: git-sync-tags
|
||||
env:
|
||||
SOURCE_REPO: "zama-ai/tfhe-rs"
|
||||
SOURCE_BRANCH: "refs/tags/*"
|
||||
DESTINATION_BRANCH: "refs/tags/*"
|
||||
USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
TOKEN: ${{ secrets.SYNC_REPO_TOKEN }}
|
||||
DEST_REPO: ${{ secrets.SYNC_DEST_REPO }}
|
||||
run: |
|
||||
echo ">>> Cloning source repo..."
|
||||
git lfs install
|
||||
git clone --quiet "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs-tag --origin source && cd ./tfhe-rs-tag
|
||||
git remote add destination "https://${USERNAME}:${TOKEN}@github.com/${DEST_REPO}.git"
|
||||
|
||||
# The LFS config disables pulling files by default, so remove it
|
||||
# TODO: see if we need to more precisely fetch LFS files for new tags or if git is smart
|
||||
rm .lfsconfig
|
||||
|
||||
echo ">>> Fetching all branches references down locally so subsequent commands can see them..."
|
||||
git fetch source '+refs/heads/*:refs/heads/*' --update-head-ok
|
||||
|
||||
echo ">>> Print out all branches"
|
||||
git --no-pager branch -a -vv
|
||||
|
||||
echo ">>> Pushing git changes and LFS content..."
|
||||
git push destination "${SOURCE_BRANCH}:${DESTINATION_BRANCH}" -f
|
||||
echo ">>> Pushing git tags..."
|
||||
git push destination "${SOURCE_TAGS}:${DESTINATION_TAGS}" -f
|
||||
|
||||
shred --remove .git/config
|
||||
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -25,6 +25,7 @@ dieharder_run.log
|
||||
|
||||
# Cuda local build
|
||||
backends/tfhe-cuda-backend/cuda/cmake-build-debug/
|
||||
backends/tfhe-cuda-backend/cuda/build/
|
||||
|
||||
# WASM tests
|
||||
tfhe/web_wasm_parallel_tests/server.PID
|
||||
@@ -34,6 +35,9 @@ node_modules/
|
||||
package-lock.json
|
||||
utils/wasm-par-mq/examples/*/pkg/
|
||||
|
||||
# Commit lock files of backward data generation crates
|
||||
!utils/tfhe-backward-compat-data/crates/generate_*/Cargo.lock
|
||||
|
||||
# Python .env
|
||||
.env
|
||||
__pycache__
|
||||
|
||||
@@ -12,6 +12,7 @@ ignore:
|
||||
- utils/tfhe-lints/**/main.stderr
|
||||
- utils/tfhe-lints/**/*.json
|
||||
- utils/tfhe-backward-compat-data/**/*.ron # ron files are autogenerated
|
||||
- tests/corrupted_inputs_deserialization/data/proven_compact_list/**/metadata.txt
|
||||
|
||||
rules:
|
||||
# checks if file ends in a newline character
|
||||
|
||||
@@ -14,10 +14,12 @@ members = [
|
||||
"tfhe-fft",
|
||||
"tfhe-ntt",
|
||||
"tfhe-zk-pok",
|
||||
"utils/benchmark_spec",
|
||||
"utils/param_dedup",
|
||||
"utils/tfhe-backward-compat-checker",
|
||||
"utils/tfhe-backward-compat-data",
|
||||
"utils/tfhe-backward-compat-data/crates/add_new_version",
|
||||
"utils/tfhe-safe-serialize",
|
||||
"utils/tfhe-versionable",
|
||||
"utils/tfhe-versionable-derive",
|
||||
"utils/wasm-par-mq",
|
||||
@@ -43,6 +45,7 @@ rand = "0.8"
|
||||
rayon = "1.11"
|
||||
serde = { version = "1.0", default-features = false }
|
||||
wasm-bindgen = { version = "0.2.114" }
|
||||
wasm-bindgen-futures = { version = "0.4.56" }
|
||||
# js-sys (at this point in time) automatically enables the unsafe-eval feature which we do not want
|
||||
# this does not prevent other deps from enabling it, but it at least conveys our need to not have it
|
||||
# we still enable std, which was part of default before
|
||||
|
||||
2
LICENSE
2
LICENSE
@@ -1,6 +1,6 @@
|
||||
BSD 3-Clause Clear License
|
||||
|
||||
Copyright © 2025 ZAMA.
|
||||
Copyright © 2026 ZAMA.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
|
||||
309
Makefile
309
Makefile
@@ -26,6 +26,7 @@ BENCH_CUSTOM_COMMAND:=
|
||||
NODE_VERSION=24.12
|
||||
BACKWARD_COMPAT_DATA_DIR=utils/tfhe-backward-compat-data
|
||||
BACKWARD_COMPAT_DATA_GEN_VERSION:=$(TFHE_VERSION)
|
||||
CORRUPTED_INPUTS_TEST=tests/corrupted_inputs_deserialization
|
||||
TEST_VECTORS_DIR=apps/test-vectors
|
||||
CURRENT_TFHE_VERSION:=$(shell grep '^version[[:space:]]*=' tfhe/Cargo.toml | cut -d '=' -f 2 | xargs)
|
||||
WASM_PACK_VERSION="0.13.1"
|
||||
@@ -121,6 +122,12 @@ install_build_wasm32_target:
|
||||
( echo "Unable to install wasm32-unknown-unknown target toolchain, check your rustup installation. \
|
||||
Rustup can be downloaded at https://rustup.rs/" && exit 1 )
|
||||
|
||||
.PHONY: install_check_wasm32_target # Install the wasm32 toolchain used for checks
|
||||
install_check_wasm32_target:
|
||||
rustup target add wasm32-unknown-unknown --toolchain "$(RS_CHECK_TOOLCHAIN)" || \
|
||||
( echo "Unable to install wasm32-unknown-unknown target toolchain, check your rustup installation. \
|
||||
Rustup can be downloaded at https://rustup.rs/" && exit 1 )
|
||||
|
||||
.PHONY: install_cargo_nextest # Install cargo nextest used for shortint tests
|
||||
install_cargo_nextest:
|
||||
@cargo nextest --version > /dev/null 2>&1 || \
|
||||
@@ -305,8 +312,10 @@ semgrep_and_lint_gpu_code: semgrep_lint_setup_venv
|
||||
find "$(TFHECUDA_SRC)" -name '*.h' -o -name '*.cuh' -o -name '*.cu' \
|
||||
| grep -v '/cmake-build-debug/' \
|
||||
| grep -v '/build/' \
|
||||
| xargs venv/bin/semgrep --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
|
||||
| xargs venv/bin/semgrep --error --config "$(TFHECUDA_SRC)/.semgrep/release-ordering.yaml" --scan-unknown-extensions
|
||||
venv/bin/python3 "scripts/check_scratch_cleanup.py"
|
||||
@# Split the search string using shell string concatenation so the Makefile line doesn't match itself
|
||||
! git ls-files | xargs grep -n 'TODO: ADD COMM''ENT'
|
||||
|
||||
.PHONY: semver_check_cuda_backend # Run semver checks on tfhe-cuda-backend
|
||||
semver_check_cuda_backend:
|
||||
@@ -349,23 +358,23 @@ check_fmt_js: check_nvm_installed
|
||||
.PHONY: check_fmt_toml # Check TOML files format
|
||||
check_fmt_toml: install_taplo
|
||||
@RUST_LOG=warn taplo fmt --check || \
|
||||
echo "TOML files format check failed. Please run 'make fmt_toml'"
|
||||
{ echo "TOML files format check failed. Please run 'make fmt_toml'"; exit 1; }
|
||||
|
||||
.PHONY: check_typos # Check for typos in codebase
|
||||
check_typos: install_typos_checker
|
||||
@typos && echo "No typos found"
|
||||
@git ls-files ":!*.png" ":!*.cbor" ":!*.bcode" ":!*.ico" ":!*/twiddles.cu" | typos --file-list - && echo "No typos found"
|
||||
|
||||
.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
|
||||
clippy_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats,extended-types,zk-pok \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats,extended-types,zk-pok \
|
||||
--all-targets \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
|
||||
.PHONY: check_gpu # Run check on tfhe with "gpu" enabled
|
||||
check_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats \
|
||||
--all-targets \
|
||||
-p tfhe
|
||||
|
||||
@@ -379,7 +388,7 @@ clippy_hpu: install_rs_check_toolchain
|
||||
.PHONY: clippy_gpu_hpu # Run clippy lints on tfhe with "gpu" and "hpu" enabled
|
||||
clippy_gpu_hpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,hpu,pbs-stats,extended-types,zk-pok \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,gpu-experimental-zk,hpu,pbs-stats,extended-types,zk-pok \
|
||||
--all-targets \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
|
||||
@@ -472,7 +481,7 @@ clippy_rustdoc_gpu: install_rs_check_toolchain
|
||||
fi && \
|
||||
CARGO_TERM_QUIET=true CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
|
||||
--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu \
|
||||
--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu,gpu-experimental-zk \
|
||||
-p tfhe -- --nocapture
|
||||
|
||||
.PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
|
||||
@@ -483,11 +492,17 @@ clippy_c_api: install_rs_check_toolchain
|
||||
|
||||
.PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
|
||||
clippy_js_wasm_api: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,extended-types \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok,extended-types \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,extended-types \
|
||||
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok,extended-types,parallel-wasm-api \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok,extended-types,cross-origin-wasm-api \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_tasks # Run clippy lints on helper tasks crate.
|
||||
@@ -503,7 +518,7 @@ clippy_trivium: install_rs_check_toolchain
|
||||
.PHONY: clippy_ws_tests # Run clippy on the workspace level tests
|
||||
clippy_ws_tests: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --tests \
|
||||
-p tests --features=shortint,integer,zk-pok -- --no-deps -D warnings
|
||||
-p tests --features=shortint,integer,zk-pok,strings -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
|
||||
clippy_all_targets: install_rs_check_toolchain
|
||||
@@ -528,6 +543,15 @@ clippy_zk_pok: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
-p tfhe-zk-pok --features=experimental -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_zk_pok_wasm # Run clippy lints on tfhe-zk-pok for wasm32 target
|
||||
clippy_zk_pok_wasm: install_rs_check_toolchain install_check_wasm32_target
|
||||
RUSTFLAGS="$(WASM_RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--target wasm32-unknown-unknown \
|
||||
-p tfhe-zk-pok -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(WASM_RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--target wasm32-unknown-unknown \
|
||||
-p tfhe-zk-pok --features cross-origin-wasm -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_versionable # Run clippy lints on tfhe-versionable
|
||||
clippy_versionable: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
@@ -535,6 +559,11 @@ clippy_versionable: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
-p tfhe-versionable -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_safe_serialize # Run clippy lints on tfhe-safe-serialize
|
||||
clippy_safe_serialize: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
-p tfhe-safe-serialize -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_param_dedup # Run clippy lints on param_dedup tool
|
||||
clippy_param_dedup: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
@@ -565,10 +594,12 @@ clippy_test_vectors: install_rs_check_toolchain
|
||||
cd apps/test-vectors; RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
-p tfhe-test-vectors -- --no-deps -D warnings
|
||||
|
||||
# WARNING: This target is not directly run in CI. When adding a subtarget here,
|
||||
# MAKE SURE TO ALSO ADD IT TO A PCC BATCH BELOW
|
||||
.PHONY: clippy_all # Run all clippy targets
|
||||
clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
|
||||
clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_trivium \
|
||||
clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
|
||||
clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_zk_pok_wasm clippy_trivium \
|
||||
clippy_versionable clippy_safe_serialize clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
|
||||
clippy_test_vectors clippy_backward_compat_data clippy_wasm_par_mq
|
||||
|
||||
.PHONY: clippy_fast # Run main clippy targets
|
||||
@@ -665,7 +696,7 @@ build_c_api: install_rs_check_toolchain
|
||||
.PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
|
||||
build_c_api_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu \
|
||||
--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu,gpu-experimental-zk \
|
||||
-p tfhe
|
||||
|
||||
.PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
|
||||
@@ -674,11 +705,14 @@ build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
|
||||
--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,experimental-force_fft_algo_dif4 \
|
||||
-p tfhe
|
||||
|
||||
.PHONY: build_web_js_api # Build the js API targeting the web browser
|
||||
.PHONY: build_web_js_api # Build the js API targeting the web browser, in sequential or cross origin parallelism modes.
|
||||
build_web_js_api: install_wasm_pack
|
||||
cd tfhe && \
|
||||
RUSTFLAGS="$(WASM_RUSTFLAGS)" wasm-pack build --release --target=web \
|
||||
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok,extended-types
|
||||
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok,extended-types,cross-origin-wasm-api && \
|
||||
find pkg/snippets -type f -iname worker_helpers.js -exec sed -i 's|import("../../..")|import("../../../tfhe.js")|g' {} \;
|
||||
cp utils/wasm-par-mq/js/coordinator.js tfhe/pkg/
|
||||
jq '.files += ["snippets"]' tfhe/pkg/package.json > tmp_pkg.json && mv -f tmp_pkg.json tfhe/pkg/package.json
|
||||
|
||||
.PHONY: build_web_js_api_parallel # Build the js API targeting the web browser with parallelism support
|
||||
# parallel wasm requires specific build options, see https://github.com/rust-lang/rust/pull/147225
|
||||
@@ -764,7 +798,7 @@ test_zk_cuda_backend:
|
||||
|
||||
|
||||
.PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
|
||||
test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
|
||||
test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend test_zk_cuda_backend
|
||||
|
||||
.PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
|
||||
test_core_crypto_gpu:
|
||||
@@ -1200,12 +1234,31 @@ test_tfhe_csprng_big_endian: install_cargo_cross
|
||||
RUSTFLAGS="" cross test --profile $(CARGO_PROFILE) \
|
||||
-p tfhe-csprng --target=powerpc64-unknown-linux-gnu
|
||||
|
||||
|
||||
.PHONY: test_zk_pok # Run tfhe-zk-pok tests
|
||||
test_zk_pok:
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
|
||||
-p tfhe-zk-pok --features experimental
|
||||
|
||||
.PHONY: test_zk_pok_experimental_gpu # Run tfhe-zk-pok GPU-accelerated tests
|
||||
test_zk_pok_experimental_gpu:
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
|
||||
-p tfhe-zk-pok --features experimental,gpu-experimental -- gpu
|
||||
|
||||
.PHONY: test_integer_zk_gpu # Run tfhe-zk-pok tests
|
||||
test_integer_zk_gpu:
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
|
||||
--features=integer,zk-pok,gpu -p tfhe -- \
|
||||
integer::gpu::zk::
|
||||
|
||||
.PHONY: test_integer_zk_experimental_gpu # Run tfhe-zk-pok tests
|
||||
test_integer_zk_experimental_gpu:
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
|
||||
--features=integer,zk-pok,gpu,gpu-experimental-zk -p tfhe -- \
|
||||
integer::gpu::zk::
|
||||
|
||||
.PHONY: test_zk_cuda # Run all GPU MSM integration tests (CPU vs GPU comparison + integration test)
|
||||
test_zk_cuda: test_zk_cuda_backend test_zk_pok_experimental_gpu test_integer_zk_gpu test_integer_zk_experimental_gpu
|
||||
|
||||
.PHONY: test_zk_wasm_x86_compat_ci
|
||||
test_zk_wasm_x86_compat_ci: check_nvm_installed
|
||||
source ~/.nvm/nvm.sh && \
|
||||
@@ -1224,6 +1277,11 @@ test_versionable:
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
|
||||
--all-targets -p tfhe-versionable
|
||||
|
||||
.PHONY: test_safe_serialize # Run tests for tfhe-safe-serialize subcrate
|
||||
test_safe_serialize:
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
|
||||
--all-targets -p tfhe-safe-serialize
|
||||
|
||||
# The backward compat data folder holds historical binary data but also rust code to generate and load them.
|
||||
.PHONY: gen_backward_compat_data # Re-generate backward compatibility data
|
||||
gen_backward_compat_data:
|
||||
@@ -1237,11 +1295,19 @@ new_backward_compat_crate:
|
||||
.PHONY: test_backward_compatibility_ci
|
||||
test_backward_compatibility_ci:
|
||||
TFHE_BACKWARD_COMPAT_DATA_DIR="../$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
|
||||
--features=shortint,integer,zk-pok -p tests test_backward_compatibility -- --nocapture
|
||||
--features=shortint,integer,zk-pok,strings -p tests test_backward_compatibility -- --nocapture
|
||||
|
||||
.PHONY: test_backward_compatibility # Same as test_backward_compatibility_ci but tries to clone the data repo first if needed
|
||||
test_backward_compatibility: pull_backward_compat_data test_backward_compatibility_ci
|
||||
|
||||
.PHONY: test_corrupted_inputs_ci
|
||||
test_corrupted_inputs_ci:
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
|
||||
--features=integer,zk-pok,strings -p tests test_corrupted_inputs_deserialization -- --nocapture
|
||||
|
||||
.PHONY: test_corrupted_inputs # Same as test_corrupted_inputs_ci but pulls data first
|
||||
test_corrupted_inputs: pull_corrupted_inputs_data test_corrupted_inputs_ci
|
||||
|
||||
# Generate the test vectors and update the hash file
|
||||
.PHONY: gen_test_vectors
|
||||
gen_test_vectors:
|
||||
@@ -1350,6 +1416,19 @@ test_nodejs_wasm_api_ci: build_node_js_api
|
||||
|
||||
# This is an internal target, not meant to be called on its own.
|
||||
run_web_js_api_parallel: build_web_js_api_parallel setup_venv
|
||||
cd $(WEB_SERVER_DIR) && npm install && npm run build
|
||||
source venv/bin/activate && \
|
||||
python ci/webdriver.py \
|
||||
--browser-path $(browser_path) \
|
||||
--driver-path $(driver_path) \
|
||||
--browser-kind $(browser_kind) \
|
||||
--server-cmd $(server_cmd) \
|
||||
--server-workdir "$(WEB_SERVER_DIR)" \
|
||||
--id-pattern $(filter) \
|
||||
--id-exclude-pattern asyncMainThread
|
||||
|
||||
# This is an internal target, not meant to be called on its own.
|
||||
run_web_js_api_cross_origin: build_web_js_api setup_venv
|
||||
cd $(WEB_SERVER_DIR) && npm install && npm run build
|
||||
source venv/bin/activate && \
|
||||
python ci/webdriver.py \
|
||||
@@ -1392,6 +1471,38 @@ test_web_js_api_parallel_firefox_ci: setup_venv
|
||||
nvm use $(NODE_VERSION) && \
|
||||
$(MAKE) test_web_js_api_parallel_firefox
|
||||
|
||||
test_web_js_api_cross_origin_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
|
||||
test_web_js_api_cross_origin_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
|
||||
test_web_js_api_cross_origin_chrome: browser_kind = chrome
|
||||
test_web_js_api_cross_origin_chrome: server_cmd = "npm run server:cross-origin"
|
||||
test_web_js_api_cross_origin_chrome: filter = ZeroKnowledgeTest # Only run zk proof tests in cross-origin mode
|
||||
|
||||
.PHONY: test_web_js_api_cross_origin_chrome # Run tests for the web wasm api in cross-origin mode on Chrome
|
||||
test_web_js_api_cross_origin_chrome: run_web_js_api_cross_origin
|
||||
|
||||
.PHONY: test_web_js_api_cross_origin_chrome_ci # Run tests for the web wasm api in cross-origin mode on Chrome
|
||||
test_web_js_api_cross_origin_chrome_ci: setup_venv
|
||||
source ~/.nvm/nvm.sh && \
|
||||
nvm install $(NODE_VERSION) && \
|
||||
nvm use $(NODE_VERSION) && \
|
||||
$(MAKE) test_web_js_api_cross_origin_chrome
|
||||
|
||||
test_web_js_api_cross_origin_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
|
||||
test_web_js_api_cross_origin_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
|
||||
test_web_js_api_cross_origin_firefox: browser_kind = firefox
|
||||
test_web_js_api_cross_origin_firefox: server_cmd = "npm run server:cross-origin"
|
||||
test_web_js_api_cross_origin_firefox: filter = ZeroKnowledgeTest # Only run zk proof tests in cross-origin mode
|
||||
|
||||
.PHONY: test_web_js_api_cross_origin_firefox # Run tests for the web wasm api in cross-origin mode on Firefox
|
||||
test_web_js_api_cross_origin_firefox: run_web_js_api_cross_origin
|
||||
|
||||
.PHONY: test_web_js_api_cross_origin_firefox_ci # Run tests for the web wasm api in cross-origin mode on Firefox
|
||||
test_web_js_api_cross_origin_firefox_ci: setup_venv
|
||||
source ~/.nvm/nvm.sh && \
|
||||
nvm install $(NODE_VERSION) && \
|
||||
nvm use $(NODE_VERSION) && \
|
||||
$(MAKE) test_web_js_api_cross_origin_firefox
|
||||
|
||||
WASM_PAR_MQ_TEST_DIR=utils/wasm-par-mq/web_tests
|
||||
|
||||
.PHONY: build_wasm_par_mq_tests # Build the wasm-par-mq test WASM package
|
||||
@@ -1548,27 +1659,57 @@ bench_integer_rerand: install_rs_check_toolchain
|
||||
--bench integer-rerand \
|
||||
--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
|
||||
|
||||
.PHONY: bench_integer_rerand_gpu # Run benchmarks for integer rerand on GPU backend
|
||||
bench_integer_rerand_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-rerand \
|
||||
--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --profile release_lto_off --
|
||||
|
||||
.PHONY: bench_msm_zk
|
||||
bench_msm_zk: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench zk-msm \
|
||||
--features=zk-pok -p tfhe-benchmark --profile release --
|
||||
|
||||
# GPU benchmarks need --profile release for correct measurements
|
||||
.PHONY: bench_msm_zk_gpu
|
||||
bench_msm_zk_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench zk-msm \
|
||||
--features=gpu,gpu-experimental-zk,zk-pok -p tfhe-benchmark --profile release -- zk::cuda::msm
|
||||
|
||||
# GPU benchmarks need --profile release for correct measurements
|
||||
.PHONY: bench_integer_zk_gpu
|
||||
bench_integer_zk_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-zk-pke \
|
||||
--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release_lto_off --
|
||||
--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release --
|
||||
|
||||
# GPU benchmarks need --profile release for correct measurements
|
||||
.PHONY: bench_integer_zk_experimental_gpu
|
||||
bench_integer_zk_experimental_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-zk-pke \
|
||||
--features=integer,internal-keycache,gpu,gpu-experimental-zk,pbs-stats,zk-pok -p tfhe-benchmark --profile release --
|
||||
|
||||
.PHONY: bench_integer_aes_gpu # Run benchmarks for AES on GPU backend
|
||||
bench_integer_aes_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-aes \
|
||||
--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
|
||||
--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --
|
||||
|
||||
.PHONY: bench_integer_aes256_gpu # Run benchmarks for AES256 on GPU backend
|
||||
bench_integer_aes256_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-aes256 \
|
||||
--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
|
||||
--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --
|
||||
|
||||
.PHONY: bench_integer_trivium_gpu # Run benchmarks for trivium on GPU backend
|
||||
bench_integer_trivium_gpu: install_rs_check_toolchain
|
||||
@@ -1732,37 +1873,37 @@ bench_web_js_api_parallel_firefox_ci: setup_venv
|
||||
nvm use $(NODE_VERSION) && \
|
||||
$(MAKE) bench_web_js_api_parallel_firefox
|
||||
|
||||
bench_web_js_api_unsafe_coop_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
|
||||
bench_web_js_api_unsafe_coop_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
|
||||
bench_web_js_api_unsafe_coop_chrome: browser_kind = chrome
|
||||
bench_web_js_api_unsafe_coop_chrome: server_cmd = "npm run server:unsafe-coop"
|
||||
bench_web_js_api_unsafe_coop_chrome: filter = ZeroKnowledgeBench # Only bench zk with unsafe coop
|
||||
bench_web_js_api_cross_origin_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
|
||||
bench_web_js_api_cross_origin_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
|
||||
bench_web_js_api_cross_origin_chrome: browser_kind = chrome
|
||||
bench_web_js_api_cross_origin_chrome: server_cmd = "npm run server:cross-origin"
|
||||
bench_web_js_api_cross_origin_chrome: filter = ZeroKnowledgeBench # Only bench zk with cross-origin workers
|
||||
|
||||
.PHONY: bench_web_js_api_unsafe_coop_chrome # Run benchmarks for the web wasm api without cross-origin isolation
|
||||
bench_web_js_api_unsafe_coop_chrome: run_web_js_api_parallel
|
||||
.PHONY: bench_web_js_api_cross_origin_chrome # Run benchmarks for the web wasm api in cross-origin mode (without cross-origin isolation) on Chrome
|
||||
bench_web_js_api_cross_origin_chrome: run_web_js_api_cross_origin
|
||||
|
||||
.PHONY: bench_web_js_api_unsafe_coop_chrome_ci # Run benchmarks for the web wasm api without cross-origin isolation
|
||||
bench_web_js_api_unsafe_coop_chrome_ci: setup_venv
|
||||
.PHONY: bench_web_js_api_cross_origin_chrome_ci # Run benchmarks for the web wasm api in cross-origin mode (without cross-origin isolation) on Chrome
|
||||
bench_web_js_api_cross_origin_chrome_ci: setup_venv
|
||||
source ~/.nvm/nvm.sh && \
|
||||
nvm install $(NODE_VERSION) && \
|
||||
nvm use $(NODE_VERSION) && \
|
||||
$(MAKE) bench_web_js_api_unsafe_coop_chrome
|
||||
$(MAKE) bench_web_js_api_cross_origin_chrome
|
||||
|
||||
bench_web_js_api_unsafe_coop_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
|
||||
bench_web_js_api_unsafe_coop_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
|
||||
bench_web_js_api_unsafe_coop_firefox: browser_kind = firefox
|
||||
bench_web_js_api_unsafe_coop_firefox: server_cmd = "npm run server:unsafe-coop"
|
||||
bench_web_js_api_unsafe_coop_firefox: filter = ZeroKnowledgeBench # Only bench zk with unsafe coop
|
||||
bench_web_js_api_cross_origin_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
|
||||
bench_web_js_api_cross_origin_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
|
||||
bench_web_js_api_cross_origin_firefox: browser_kind = firefox
|
||||
bench_web_js_api_cross_origin_firefox: server_cmd = "npm run server:cross-origin"
|
||||
bench_web_js_api_cross_origin_firefox: filter = ZeroKnowledgeBench # Only bench zk with cross-origin workers
|
||||
|
||||
.PHONY: bench_web_js_api_unsafe_coop_firefox # Run benchmarks for the web wasm api without cross-origin isolation
|
||||
bench_web_js_api_unsafe_coop_firefox: run_web_js_api_parallel
|
||||
.PHONY: bench_web_js_api_cross_origin_firefox # Run benchmarks for the web wasm api in cross-origin mode (without cross-origin isolation) on Firefox
|
||||
bench_web_js_api_cross_origin_firefox: run_web_js_api_cross_origin
|
||||
|
||||
.PHONY: bench_web_js_api_unsafe_coop_firefox_ci # Run benchmarks for the web wasm api without cross-origin isolation
|
||||
bench_web_js_api_unsafe_coop_firefox_ci: setup_venv
|
||||
.PHONY: bench_web_js_api_cross_origin_firefox_ci # Run benchmarks for the web wasm api in cross-origin mode (without cross-origin isolation) on Firefox
|
||||
bench_web_js_api_cross_origin_firefox_ci: setup_venv
|
||||
source ~/.nvm/nvm.sh && \
|
||||
nvm install $(NODE_VERSION) && \
|
||||
nvm use $(NODE_VERSION) && \
|
||||
$(MAKE) bench_web_js_api_unsafe_coop_firefox
|
||||
$(MAKE) bench_web_js_api_cross_origin_firefox
|
||||
|
||||
.PHONY: bench_hlapi_unsigned # Run benchmarks for integer operations
|
||||
bench_hlapi_unsigned: install_rs_check_toolchain
|
||||
@@ -1795,27 +1936,61 @@ bench_hlapi_hpu: install_rs_check_toolchain
|
||||
--bench hlapi \
|
||||
--features=integer,internal-keycache,hpu,hpu-v80,pbs-stats -p tfhe-benchmark --
|
||||
|
||||
.PHONY: bench_hlapi_erc20 # Run benchmarks for ERC20 operations
|
||||
bench_hlapi_erc20: install_rs_check_toolchain
|
||||
.PHONY: bench_hlapi_erc7984 # Run benchmarks for ERC7984 operations
|
||||
bench_hlapi_erc7984: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc20 \
|
||||
--bench hlapi-erc7984 \
|
||||
--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
|
||||
|
||||
.PHONY: bench_hlapi_erc20_gpu # Run benchmarks for ERC20 operations on GPU
|
||||
bench_hlapi_erc20_gpu: install_rs_check_toolchain
|
||||
.PHONY: bench_hlapi_erc7984_gpu # Run benchmarks for ERC7984 operations on GPU
|
||||
bench_hlapi_erc7984_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc20 \
|
||||
--bench hlapi-erc7984 \
|
||||
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
|
||||
|
||||
.PHONY: bench_hlapi_erc20_gpu_classical # Run benchmarks for ERC20 operations on GPU with classical parameters
|
||||
bench_hlapi_erc20_gpu_classical: install_rs_check_toolchain
|
||||
.PHONY: bench_hlapi_erc7984_gpu_classical # Run benchmarks for ERC7984 operations on GPU with classical parameters
|
||||
bench_hlapi_erc7984_gpu_classical: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=classical \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc20 \
|
||||
--bench hlapi-erc7984 \
|
||||
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
|
||||
|
||||
.PHONY: bench_hlapi_erc7984_multi_group_gpu # Runs ERC7984 bench in two processes (half of gpus for each) and aggregates results
|
||||
bench_hlapi_erc7984_multi_group_gpu: install_rs_check_toolchain
|
||||
# This next line must be kept here: the code can not remove this file without a risk of concurrency issues
|
||||
# (we don't know which process starts first and which one deletes the files - file deletion may also not be atomic)
|
||||
rm -f /dev/shm/sem.tfhe_bench_*
|
||||
NUM_GROUPS=$${__TFHE_RS_BENCH_MULTI_PROC_GROUPS:-2}; \
|
||||
[ "$$NUM_GROUPS" -ge 2 ] || { echo "Error: __TFHE_RS_BENCH_MULTI_PROC_GROUPS must be at least 2, got $$NUM_GROUPS"; exit 1; }; \
|
||||
trap "echo 'User interrupted the benchmark, stopping all workers!'; rm -f /dev/shm/sem.tfhe_bench_*; kill 0" INT TERM; \
|
||||
for i in $$(seq 0 $$((NUM_GROUPS - 1))); do \
|
||||
GPU_LIST=$$(python3 ci/split_gpus.py $$i $$NUM_GROUPS) || exit 1; \
|
||||
echo "Starting benchmark group $$i with CUDA_VISIBLE_DEVICES=$$GPU_LIST"; \
|
||||
CUDA_VISIBLE_DEVICES=$$GPU_LIST CARGO_TARGET_DIR=target_p$$i RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_GPU_PROCESS_COUNT=$$NUM_GROUPS \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc7984 \
|
||||
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow' & \
|
||||
done; \
|
||||
wait
|
||||
|
||||
.PHONY: bench_hlapi_erc7984_multi_group_fake_multi_gpu # Runs ERC7984 bench in two processes in parallel on a single GPU (use to debug bench_hlapi_erc7984_multi_group_gpu)
|
||||
bench_hlapi_erc7984_multi_group_fake_multi_gpu: install_rs_check_toolchain
|
||||
# This next line must be kept here: the code can not remove this file without a risk of concurrency issues
|
||||
# (we don't know which process starts first and which one deletes the files - file deletion may also not be atomic)
|
||||
rm -f /dev/shm/sem.tfhe_bench_*
|
||||
trap "echo 'User interrupted the benchmark, stopping all workers!'; rm -f /dev/shm/sem.tfhe_bench_*; kill 0" INT TERM; \
|
||||
CARGO_TARGET_DIR=target_p0 RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=throughput __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_GPU_PROCESS_COUNT=2 \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc7984 \
|
||||
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow' & \
|
||||
CARGO_TARGET_DIR=target_p1 RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=throughput __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_GPU_PROCESS_COUNT=2 \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc7984 \
|
||||
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow' & \
|
||||
wait
|
||||
|
||||
.PHONY: bench_hlapi_dex # Run benchmarks for DEX operations
|
||||
bench_hlapi_dex: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
@@ -1837,13 +2012,13 @@ bench_hlapi_dex_gpu_classical: install_rs_check_toolchain
|
||||
--bench hlapi-dex \
|
||||
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
|
||||
|
||||
.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ERC20 operations on HPU
|
||||
bench_hlapi_erc20_hpu: install_rs_check_toolchain
|
||||
.PHONY: bench_hlapi_erc7984_hpu # Run benchmarks for ERC7984 operations on HPU
|
||||
bench_hlapi_erc7984_hpu: install_rs_check_toolchain
|
||||
source ./setup_hpu.sh --config $(HPU_CONFIG); \
|
||||
export V80_PCIE_DEV=${V80_PCIE_DEV}; \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc20 \
|
||||
--bench hlapi-erc7984 \
|
||||
--features=integer,internal-keycache,hpu,hpu-v80,pbs-stats -p tfhe-benchmark --
|
||||
|
||||
.PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
|
||||
@@ -1851,6 +2026,13 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --
|
||||
|
||||
.PHONY: bench_tfhe_zk_pok_gpu # Run benchmarks for the tfhe_zk_pok crate using GPU acceleration
|
||||
bench_tfhe_zk_pok_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--package tfhe-zk-pok \
|
||||
--features=gpu-experimental --profile release
|
||||
|
||||
.PHONY: bench_hlapi_noise_squash # Run benchmarks for noise squash operation
|
||||
bench_hlapi_noise_squash: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
|
||||
@@ -1892,10 +2074,10 @@ bench_summary: install_rs_check_toolchain
|
||||
--bench hlapi-noise-squash \
|
||||
--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::decomp_noise_squash_comp::'
|
||||
|
||||
# ERC20
|
||||
# ERC7984
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc20 \
|
||||
--bench hlapi-erc7984 \
|
||||
--features=integer,internal-keycache -p tfhe-benchmark -- '::transfer::overflow'
|
||||
|
||||
# DEX
|
||||
@@ -1937,11 +2119,16 @@ bench_summary_gpu: install_rs_check_toolchain
|
||||
--bench hlapi-noise-squash \
|
||||
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::decomp_noise_squash_comp::'
|
||||
|
||||
# ERC20
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
|
||||
# This make target only runs the latency benchmark. This is because
|
||||
# summary benchmarks must use the multi-process-multi-group throughput target
|
||||
# to measure throughput. That target must be followed by specific post-processing steps.
|
||||
# Thus that target is run in a separate step in benchmark_summary.yml.
|
||||
ifneq ($(filter latency both,$(BENCH_TYPE)),)
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=latency __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc20 \
|
||||
--bench hlapi-erc7984 \
|
||||
--features=integer,gpu,internal-keycache -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow'
|
||||
endif
|
||||
|
||||
# DEX
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
|
||||
@@ -2033,6 +2220,10 @@ write_params_to_file: install_rs_check_toolchain
|
||||
pull_backward_compat_data:
|
||||
./scripts/pull_lfs_data.sh $(BACKWARD_COMPAT_DATA_DIR)
|
||||
|
||||
.PHONY: pull_corrupted_inputs_data # Pull the data files needed for corrupted inputs deserialization tests
|
||||
pull_corrupted_inputs_data:
|
||||
./scripts/pull_lfs_data.sh $(CORRUPTED_INPUTS_TEST)
|
||||
|
||||
.PHONY: pull_hpu_files # Pull the hpu files
|
||||
pull_hpu_files:
|
||||
./scripts/pull_lfs_data.sh backends/tfhe-hpu-backend/
|
||||
@@ -2123,8 +2314,10 @@ pcc_batch_6:
|
||||
$(call run_recipe_with_details,clippy_tasks)
|
||||
$(call run_recipe_with_details,clippy_tfhe_csprng)
|
||||
$(call run_recipe_with_details,clippy_zk_pok)
|
||||
$(call run_recipe_with_details,clippy_zk_pok_wasm)
|
||||
$(call run_recipe_with_details,clippy_trivium)
|
||||
$(call run_recipe_with_details,clippy_versionable)
|
||||
$(call run_recipe_with_details,clippy_safe_serialize)
|
||||
$(call run_recipe_with_details,clippy_param_dedup)
|
||||
$(call run_recipe_with_details,docs)
|
||||
|
||||
|
||||
@@ -15,12 +15,3 @@ extend-ignore-identifiers-re = [
|
||||
"0x[0-9a-fA-F]+",
|
||||
"xrt_coreutil",
|
||||
]
|
||||
|
||||
[files]
|
||||
extend-exclude = [
|
||||
"backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu",
|
||||
"backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu",
|
||||
"backends/tfhe-hpu-backend/config_store/**/*.link_summary",
|
||||
"*.cbor",
|
||||
"*.bcode",
|
||||
]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
BSD 3-Clause Clear License
|
||||
|
||||
Copyright © 2025 ZAMA.
|
||||
Copyright © 2026 ZAMA.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
|
||||
@@ -62,3 +62,29 @@ rules:
|
||||
cuda_synchronize_stream(...);
|
||||
...
|
||||
}
|
||||
|
||||
- id: tfhe-cuda-unwrapped-cuda-runtime-call
|
||||
message: "CUDA runtime API call is not wrapped in `check_cuda_error(...)`."
|
||||
severity: WARNING
|
||||
languages: [c, cpp]
|
||||
options:
|
||||
generic_ellipsis_max_span: 500
|
||||
paths:
|
||||
include:
|
||||
- "*.cu"
|
||||
- "*.cuh"
|
||||
- "*.cpp"
|
||||
- "*.h"
|
||||
exclude:
|
||||
- backends/tfhe-cuda-backend/cuda/check_cuda.cu # contains cuda checking functions
|
||||
- backends/tfhe-cuda-backend/cuda/include/device.h # contains the check_cuda_error macro (and others)
|
||||
patterns:
|
||||
- pattern: $FUNC(...)
|
||||
- metavariable-regex:
|
||||
metavariable: $FUNC
|
||||
regex: "^cuda[A-Z][A-Za-z0-9]*$" # matches cudaMalloc/cudaMemcpy/... (not project helpers like cuda_set_device)
|
||||
- pattern-not-inside: check_cuda_error(...)
|
||||
- pattern-not-inside: |
|
||||
$FUNC(...);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
- pattern-not-inside: $FUNC(...) == $VAL
|
||||
|
||||
@@ -36,5 +36,19 @@ void cuda_glwe_sample_extract_128_async(
|
||||
void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
|
||||
uint32_t num_lwes_to_extract_per_glwe, uint32_t num_lwes_stored_per_glwe,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size);
|
||||
|
||||
void cuda_modulus_switch_multi_bit_64_async(void *stream, uint32_t gpu_index,
|
||||
void *lwe_array_out,
|
||||
void *lwe_array_in, uint32_t size,
|
||||
uint32_t log_modulus,
|
||||
uint32_t degree,
|
||||
uint32_t grouping_factor);
|
||||
|
||||
void cuda_modulus_switch_multi_bit_128_async(void *stream, uint32_t gpu_index,
|
||||
void *lwe_array_out,
|
||||
void *lwe_array_in, uint32_t size,
|
||||
uint32_t log_modulus,
|
||||
uint32_t degree,
|
||||
uint32_t grouping_factor);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -382,14 +382,17 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
|
||||
->use_sequential_algorithm_to_resolve_group_carries;
|
||||
|
||||
cuda_set_device(0);
|
||||
cudaEventCreateWithFlags(&create_indexes_done, cudaEventDisableTiming);
|
||||
check_cuda_error(
|
||||
cudaEventCreateWithFlags(&create_indexes_done, cudaEventDisableTiming));
|
||||
create_indexes_for_overflow_sub(streams.get_ith(0), num_blocks, group_size,
|
||||
use_seq, allocate_gpu_memory, size_tracker);
|
||||
cudaEventRecord(create_indexes_done, streams.stream(0));
|
||||
check_cuda_error(cudaEventRecord(create_indexes_done, streams.stream(0)));
|
||||
cuda_set_device(1);
|
||||
cudaStreamWaitEvent(streams.stream(1), create_indexes_done, 0);
|
||||
check_cuda_error(
|
||||
cudaStreamWaitEvent(streams.stream(1), create_indexes_done, 0));
|
||||
cuda_set_device(2);
|
||||
cudaStreamWaitEvent(streams.stream(2), create_indexes_done, 0);
|
||||
check_cuda_error(
|
||||
cudaStreamWaitEvent(streams.stream(2), create_indexes_done, 0));
|
||||
|
||||
scatter_indexes_for_overflowing_sub(
|
||||
streams.stream(1), streams.gpu_index(1),
|
||||
@@ -842,7 +845,7 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
|
||||
free(second_indexes_for_overflow_sub_gpu_2);
|
||||
free(scalars_for_overflow_sub_gpu_2);
|
||||
|
||||
cudaEventDestroy(create_indexes_done);
|
||||
check_cuda_error(cudaEventDestroy(create_indexes_done));
|
||||
|
||||
// release sub streams
|
||||
sub_streams_1.release();
|
||||
|
||||
@@ -153,7 +153,13 @@ void cuda_full_propagation_64_inplace_async(
|
||||
void cleanup_cuda_full_propagation_64_inplace(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_integer_mult_64_async(
|
||||
void cuda_integer_mult_inplace_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_inout,
|
||||
bool const is_bool_left, CudaRadixCiphertextFFI const *radix_lwe_right,
|
||||
bool const is_bool_right, void *const *bsks, void *const *ksks,
|
||||
int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks);
|
||||
|
||||
uint64_t scratch_cuda_integer_mult_inplace_64_async(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, bool const is_boolean_left,
|
||||
bool const is_boolean_right, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
@@ -162,17 +168,8 @@ uint64_t scratch_cuda_integer_mult_64_async(
|
||||
uint32_t num_blocks, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_mult_64_async(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_left,
|
||||
bool const is_bool_left,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_right,
|
||||
bool const is_bool_right, void *const *bsks,
|
||||
void *const *ksks, int8_t *mem_ptr,
|
||||
uint32_t polynomial_size, uint32_t num_blocks);
|
||||
|
||||
void cleanup_cuda_integer_mult_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
void cleanup_cuda_integer_mult_inplace_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void cuda_negate_ciphertext_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
@@ -273,7 +270,12 @@ void cleanup_cuda_integer_comparison_64(CudaStreamsFFI streams,
|
||||
void cleanup_cuda_integer_scalar_comparison_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_boolean_bitop_64_async(
|
||||
void cuda_boolean_bitop_inplace_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_inout,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks);
|
||||
|
||||
uint64_t scratch_cuda_boolean_bitop_inplace_64_async(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -283,15 +285,8 @@ uint64_t scratch_cuda_boolean_bitop_64_async(
|
||||
bool is_unchecked, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_boolean_bitop_64_async(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2,
|
||||
int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks);
|
||||
|
||||
void cleanup_cuda_boolean_bitop_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
void cleanup_cuda_boolean_bitop_inplace_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_boolean_bitnot_64_async(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
@@ -316,42 +311,40 @@ void cuda_bitnot_ciphertext_64(CudaStreamsFFI streams,
|
||||
uint32_t param_message_modulus,
|
||||
uint32_t param_carry_modulus);
|
||||
|
||||
uint64_t scratch_cuda_integer_bitop_64_async(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
uint64_t scratch_cuda_integer_scalar_bitop_64_async(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_scalar_bitop_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
|
||||
void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
|
||||
void cuda_integer_bitop_inplace_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_inout,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks);
|
||||
|
||||
void cuda_integer_bitop_64_async(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2,
|
||||
int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks);
|
||||
void cuda_integer_scalar_bitop_inplace_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_inout,
|
||||
void const *clear_blocks, void const *h_clear_blocks,
|
||||
uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks);
|
||||
|
||||
void cleanup_cuda_integer_bitop_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
uint64_t scratch_cuda_integer_bitop_inplace_64_async(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cleanup_cuda_integer_scalar_bitop_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
void cleanup_cuda_integer_bitop_inplace_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_integer_scalar_bitop_inplace_64_async(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cleanup_cuda_integer_scalar_bitop_inplace_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_cmux_64_async(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
|
||||
@@ -2,12 +2,17 @@
|
||||
|
||||
#include "integer.h"
|
||||
|
||||
enum RERAND_MODE {
|
||||
RERAND_WITH_KS = 0,
|
||||
RERAND_WITHOUT_KS = 1,
|
||||
};
|
||||
|
||||
extern "C" {
|
||||
uint64_t scratch_cuda_rerand_64_async(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, bool allocate_gpu_memory);
|
||||
uint32_t carry_modulus, bool allocate_gpu_memory, RERAND_MODE rerand_type);
|
||||
|
||||
void cuda_rerand_64_async(
|
||||
CudaStreamsFFI streams, void *lwe_array,
|
||||
|
||||
@@ -3,16 +3,18 @@
|
||||
#include "checked_arithmetic.h"
|
||||
#include "integer_utilities.h"
|
||||
#include "keyswitch/ks_enums.h"
|
||||
#include "rerand.h"
|
||||
#include "zk/expand.cuh"
|
||||
#include "zk/zk_utilities.h"
|
||||
|
||||
template <typename Torus> struct int_rerand_mem {
|
||||
int_radix_params params;
|
||||
|
||||
Torus *tmp_zero_lwes;
|
||||
Torus *tmp_ksed_zero_lwes;
|
||||
Torus *lwe_trivial_indexes;
|
||||
Torus *tmp_expanded_zero_lwes = nullptr;
|
||||
Torus *tmp_ksed_expanded_zero_lwes = nullptr;
|
||||
Torus *lwe_trivial_indexes = nullptr;
|
||||
uint32_t num_lwes;
|
||||
RERAND_MODE rerand_mode;
|
||||
|
||||
bool gpu_memory_allocated;
|
||||
|
||||
@@ -20,24 +22,20 @@ template <typename Torus> struct int_rerand_mem {
|
||||
ks_tmp_buf_vec; // not allocated, ReRand not using GEMM KS for now
|
||||
// kept empty to pass to the KS function indicating GEMM KS disabled
|
||||
|
||||
expand_job<Torus> *d_expand_jobs;
|
||||
expand_job<Torus> *h_expand_jobs;
|
||||
expand_job<Torus> *d_expand_jobs = nullptr;
|
||||
expand_job<Torus> *h_expand_jobs = nullptr;
|
||||
|
||||
int_rerand_mem(CudaStreams streams, int_radix_params params,
|
||||
const uint32_t num_lwes, const bool allocate_gpu_memory,
|
||||
uint64_t &size_tracker)
|
||||
: params(params), num_lwes(num_lwes),
|
||||
const uint32_t num_lwes, const RERAND_MODE rerand_mode,
|
||||
const bool allocate_gpu_memory, uint64_t &size_tracker)
|
||||
: params(params), num_lwes(num_lwes), rerand_mode(rerand_mode),
|
||||
gpu_memory_allocated(allocate_gpu_memory) {
|
||||
|
||||
tmp_zero_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
|
||||
safe_mul_sizeof<Torus>(num_lwes, params.big_lwe_dimension + 1),
|
||||
streams.stream(0), streams.gpu_index(0), size_tracker,
|
||||
allocate_gpu_memory);
|
||||
|
||||
tmp_ksed_zero_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
|
||||
safe_mul_sizeof<Torus>(num_lwes, params.small_lwe_dimension + 1),
|
||||
streams.stream(0), streams.gpu_index(0), size_tracker,
|
||||
allocate_gpu_memory);
|
||||
tmp_expanded_zero_lwes =
|
||||
static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
|
||||
safe_mul_sizeof<Torus>(num_lwes, params.big_lwe_dimension + 1),
|
||||
streams.stream(0), streams.gpu_index(0), size_tracker,
|
||||
allocate_gpu_memory));
|
||||
|
||||
d_expand_jobs =
|
||||
static_cast<expand_job<Torus> *>(cuda_malloc_with_size_tracking_async(
|
||||
@@ -46,47 +44,63 @@ template <typename Torus> struct int_rerand_mem {
|
||||
|
||||
h_expand_jobs = static_cast<expand_job<Torus> *>(
|
||||
malloc(safe_mul_sizeof<expand_job<Torus>>(num_lwes)));
|
||||
PANIC_IF_FALSE(h_expand_jobs != nullptr,
|
||||
"host allocation failed for h_expand_jobs");
|
||||
|
||||
auto h_lwe_trivial_indexes =
|
||||
static_cast<Torus *>(malloc(safe_mul_sizeof<Torus>(num_lwes)));
|
||||
for (auto i = 0; i < num_lwes; ++i) {
|
||||
h_lwe_trivial_indexes[i] = i;
|
||||
if (rerand_mode == RERAND_MODE::RERAND_WITH_KS) {
|
||||
tmp_ksed_expanded_zero_lwes =
|
||||
static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
|
||||
safe_mul_sizeof<Torus>(num_lwes, params.small_lwe_dimension + 1),
|
||||
streams.stream(0), streams.gpu_index(0), size_tracker,
|
||||
allocate_gpu_memory));
|
||||
|
||||
auto h_lwe_trivial_indexes =
|
||||
static_cast<Torus *>(malloc(safe_mul_sizeof<Torus>(num_lwes)));
|
||||
PANIC_IF_FALSE(h_lwe_trivial_indexes != nullptr,
|
||||
"host allocation failed for h_lwe_trivial_indexes");
|
||||
for (uint32_t i = 0; i < num_lwes; ++i) {
|
||||
h_lwe_trivial_indexes[i] = i;
|
||||
}
|
||||
lwe_trivial_indexes =
|
||||
static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
|
||||
safe_mul_sizeof<Torus>(num_lwes), streams.stream(0),
|
||||
streams.gpu_index(0), size_tracker, allocate_gpu_memory));
|
||||
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_trivial_indexes,
|
||||
safe_mul_sizeof<Torus>(num_lwes),
|
||||
streams.stream(0), streams.gpu_index(0));
|
||||
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
||||
free(h_lwe_trivial_indexes);
|
||||
} else {
|
||||
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
||||
}
|
||||
lwe_trivial_indexes = (Torus *)cuda_malloc_with_size_tracking_async(
|
||||
safe_mul_sizeof<Torus>(num_lwes), streams.stream(0),
|
||||
streams.gpu_index(0), size_tracker, allocate_gpu_memory);
|
||||
cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_trivial_indexes,
|
||||
safe_mul_sizeof<Torus>(num_lwes),
|
||||
streams.stream(0), streams.gpu_index(0));
|
||||
|
||||
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
||||
|
||||
free(h_lwe_trivial_indexes);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
cuda_drop_with_size_tracking_async(tmp_zero_lwes, streams.stream(0),
|
||||
streams.gpu_index(0),
|
||||
cuda_drop_with_size_tracking_async(tmp_expanded_zero_lwes,
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
gpu_memory_allocated);
|
||||
tmp_zero_lwes = nullptr;
|
||||
cuda_drop_with_size_tracking_async(tmp_ksed_zero_lwes, streams.stream(0),
|
||||
streams.gpu_index(0),
|
||||
gpu_memory_allocated);
|
||||
tmp_ksed_zero_lwes = nullptr;
|
||||
cuda_drop_with_size_tracking_async(lwe_trivial_indexes, streams.stream(0),
|
||||
streams.gpu_index(0),
|
||||
gpu_memory_allocated);
|
||||
lwe_trivial_indexes = nullptr;
|
||||
tmp_expanded_zero_lwes = nullptr;
|
||||
cuda_drop_with_size_tracking_async(d_expand_jobs, streams.stream(0),
|
||||
streams.gpu_index(0),
|
||||
gpu_memory_allocated);
|
||||
d_expand_jobs = nullptr;
|
||||
|
||||
for (auto i = 0; i < ks_tmp_buf_vec.size(); i++) {
|
||||
cleanup_cuda_keyswitch(streams.stream(i), streams.gpu_index(i),
|
||||
ks_tmp_buf_vec[i], gpu_memory_allocated);
|
||||
if (rerand_mode == RERAND_MODE::RERAND_WITH_KS) {
|
||||
cuda_drop_with_size_tracking_async(
|
||||
tmp_ksed_expanded_zero_lwes, streams.stream(0), streams.gpu_index(0),
|
||||
gpu_memory_allocated);
|
||||
tmp_ksed_expanded_zero_lwes = nullptr;
|
||||
cuda_drop_with_size_tracking_async(lwe_trivial_indexes, streams.stream(0),
|
||||
streams.gpu_index(0),
|
||||
gpu_memory_allocated);
|
||||
lwe_trivial_indexes = nullptr;
|
||||
|
||||
for (size_t i = 0; i < ks_tmp_buf_vec.size(); i++) {
|
||||
cleanup_cuda_keyswitch(streams.stream(i), streams.gpu_index(i),
|
||||
ks_tmp_buf_vec[i], gpu_memory_allocated);
|
||||
}
|
||||
ks_tmp_buf_vec.clear();
|
||||
}
|
||||
ks_tmp_buf_vec.clear();
|
||||
|
||||
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
||||
free(h_expand_jobs);
|
||||
|
||||
@@ -64,6 +64,12 @@ void cuda_add_lwe_ciphertext_vector_plaintext_64(
|
||||
void const *lwe_array_in, const uint64_t plaintext_in,
|
||||
const uint32_t input_lwe_dimension,
|
||||
const uint32_t input_lwe_ciphertext_count);
|
||||
void cuda_add_lwe_ciphertext_vector_inplace_32(
|
||||
void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_inout,
|
||||
CudaRadixCiphertextFFI const *input_2);
|
||||
void cuda_add_lwe_ciphertext_vector_inplace_64(
|
||||
void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_inout,
|
||||
CudaRadixCiphertextFFI const *input_2);
|
||||
}
|
||||
|
||||
#endif // CUDA_LINALG_H_
|
||||
|
||||
@@ -39,6 +39,28 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_64(void *stream,
|
||||
uint32_t gpu_index,
|
||||
int8_t **pbs_buffer);
|
||||
|
||||
// Noise-tests-namespaced wrappers for scratch/cleanup, so that callers
|
||||
// working with the noise-tests PBS variant use a consistent naming scheme.
|
||||
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer);
|
||||
|
||||
// Noise tests variant: 64-bit torus, polynomial_size=2048 only. Uses the
|
||||
// NOISE_TESTS keybundle mode for noise analysis purposes.
|
||||
void cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lut_vector_indexes, void const *lwe_array_in,
|
||||
void const *lwe_input_indexes, void const *bootstrapping_key,
|
||||
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
|
||||
uint32_t lut_stride);
|
||||
|
||||
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_async(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
@@ -56,6 +78,23 @@ void cuda_multi_bit_programmable_bootstrap_128_async(
|
||||
void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
|
||||
const uint32_t gpu_index,
|
||||
int8_t **buffer);
|
||||
|
||||
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_128(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer);
|
||||
|
||||
void cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lwe_array_in, void const *lwe_input_indexes,
|
||||
void const *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_many_lut, uint32_t lut_stride);
|
||||
}
|
||||
|
||||
#endif // CUDA_MULTI_BIT_H
|
||||
|
||||
@@ -105,11 +105,11 @@ template <typename Torus> struct zk_expand_mem {
|
||||
uint32_t num_lwes;
|
||||
uint32_t num_compact_lists;
|
||||
|
||||
int_radix_lut<Torus> *message_and_carry_extract_luts;
|
||||
int_radix_lut<Torus> *identity_lut;
|
||||
int_radix_lut<Torus> *message_and_carry_extract_luts = nullptr;
|
||||
int_radix_lut<Torus> *identity_lut = nullptr;
|
||||
|
||||
Torus *tmp_expanded_lwes;
|
||||
Torus *tmp_ksed_small_to_big_expanded_lwes;
|
||||
Torus *tmp_expanded_lwes = nullptr;
|
||||
Torus *tmp_ksed_small_to_big_expanded_lwes = nullptr;
|
||||
|
||||
bool gpu_memory_allocated;
|
||||
|
||||
@@ -148,66 +148,6 @@ template <typename Torus> struct zk_expand_mem {
|
||||
PANIC("GPU backend requires carry_modulus equal to message_modulus")
|
||||
}
|
||||
|
||||
// We create the identity LUT only if we are doing a SANITY_CHECK
|
||||
if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
|
||||
identity_lut =
|
||||
new int_radix_lut<Torus>(streams, computing_params, 1, 2 * num_lwes,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
auto identity_lut_f = [](Torus x) -> Torus { return x; };
|
||||
|
||||
identity_lut->generate_and_broadcast_lut(streams, {0}, {identity_lut_f},
|
||||
LUT_0_FOR_ALL_BLOCKS);
|
||||
}
|
||||
|
||||
auto message_extract_lut_f = [casting_params](Torus x) -> Torus {
|
||||
return x % casting_params.message_modulus;
|
||||
};
|
||||
auto carry_extract_lut_f = [casting_params](Torus x) -> Torus {
|
||||
return (x / casting_params.carry_modulus) %
|
||||
casting_params.message_modulus;
|
||||
};
|
||||
|
||||
// Booleans have to be sanitized
|
||||
auto sanitize_bool_f = [](Torus x) -> Torus { return x == 0 ? 0 : 1; };
|
||||
auto message_extract_and_sanitize_bool_lut_f =
|
||||
[message_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
|
||||
return sanitize_bool_f(message_extract_lut_f(x));
|
||||
};
|
||||
auto carry_extract_and_sanitize_bool_lut_f =
|
||||
[carry_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
|
||||
return sanitize_bool_f(carry_extract_lut_f(x));
|
||||
};
|
||||
|
||||
/** In case the casting key casts from BIG to SMALL key we run a single KS
|
||||
to expand using the casting key as ksk. Otherwise, in case the casting key
|
||||
casts from SMALL to BIG key, we first keyswitch from SMALL to BIG using
|
||||
the casting key as ksk, then we keyswitch from BIG to SMALL using the
|
||||
computing ksk, and lastly we apply the PBS. The output is always on the
|
||||
BIG key.
|
||||
**/
|
||||
auto params = casting_params;
|
||||
if (casting_key_type == SMALL_TO_BIG) {
|
||||
params = computing_params;
|
||||
}
|
||||
message_and_carry_extract_luts = new int_radix_lut<Torus>(
|
||||
streams, params, 4, 2 * num_lwes, allocate_gpu_memory, size_tracker);
|
||||
|
||||
// We are always packing two LWEs. We just need to be sure we have enough
|
||||
// space in the carry part to store a message of the same size as is in the
|
||||
// message part.
|
||||
if (params.carry_modulus < params.message_modulus)
|
||||
PANIC("Carry modulus must be at least as large as message modulus");
|
||||
auto num_packed_msgs = 2;
|
||||
|
||||
// Adjust indexes to permute the output and access the correct LUT
|
||||
auto h_indexes_in = static_cast<Torus *>(
|
||||
malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
|
||||
auto h_indexes_out = static_cast<Torus *>(
|
||||
malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
|
||||
auto h_lut_indexes = static_cast<Torus *>(
|
||||
malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
|
||||
|
||||
d_expand_jobs =
|
||||
static_cast<expand_job<Torus> *>(cuda_malloc_with_size_tracking_async(
|
||||
safe_mul_sizeof<expand_job<Torus>>(num_lwes), streams.stream(0),
|
||||
@@ -216,144 +156,202 @@ template <typename Torus> struct zk_expand_mem {
|
||||
h_expand_jobs = static_cast<expand_job<Torus> *>(
|
||||
malloc(safe_mul_sizeof<expand_job<Torus>>(num_lwes)));
|
||||
|
||||
/*
|
||||
* Each LWE contains encrypted data in both carry and message spaces
|
||||
* that needs to be extracted.
|
||||
*
|
||||
* The loop processes each compact list (k) and for each LWE within that
|
||||
* list:
|
||||
* 1. Sets input indexes to read each LWE twice (for carry and message
|
||||
* extraction)
|
||||
* 2. Creates output indexes to properly reorder the results
|
||||
* 3. Selects appropriate LUT index based on whether boolean sanitization is
|
||||
* needed
|
||||
*
|
||||
* We want the output to have always first the content of the message part
|
||||
* and then the content of the carry part of each LWE.
|
||||
*
|
||||
* i.e. msg_extract(LWE_0), carry_extract(LWE_0), msg_extract(LWE_1),
|
||||
* carry_extract(LWE_1), ...
|
||||
*
|
||||
* Aiming that behavior, with 4 LWEs we would have:
|
||||
*
|
||||
* // Each LWE is processed twice
|
||||
* h_indexes_in = {0, 1, 2, 3, 0, 1, 2, 3}
|
||||
*
|
||||
* // First 4 use message LUT, last 4 use carry LUT
|
||||
* h_lut_indexes = {0, 0, 0, 0, 1, 1, 1, 1}
|
||||
*
|
||||
* // Reorders output so message and carry for each LWE appear together
|
||||
* h_indexes_out = {0, 2, 4, 6, 1, 3, 5, 7}
|
||||
*
|
||||
* If an LWE contains a boolean value, its LUT index is shifted by
|
||||
* num_packed_msgs to use the sanitization LUT (which ensures output is
|
||||
* exactly 0 or 1).
|
||||
*/
|
||||
auto offset = 0;
|
||||
for (int k = 0; k < num_compact_lists; k++) {
|
||||
auto num_lwes_in_kth = this->num_lwes_per_compact_list[k];
|
||||
for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
|
||||
auto lwe_index = i + num_packed_msgs * offset;
|
||||
auto lwe_index_in_list = i % num_lwes_in_kth;
|
||||
PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
|
||||
"Cuda error: index %d is beyond the max value %d",
|
||||
lwe_index, num_packed_msgs * num_lwes);
|
||||
h_indexes_in[lwe_index] = lwe_index_in_list + offset;
|
||||
h_indexes_out[lwe_index] =
|
||||
num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
|
||||
PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
|
||||
"Cuda error: index %lu is beyond the max value %lu",
|
||||
(unsigned long)h_indexes_in[lwe_index],
|
||||
(unsigned long)(num_packed_msgs * num_lwes));
|
||||
PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
|
||||
"Cuda error: index %lu is beyond the max value %lu",
|
||||
(unsigned long)h_indexes_out[lwe_index],
|
||||
(unsigned long)(num_packed_msgs * num_lwes));
|
||||
// is_boolean_array tells us which input is a boolean and thus the
|
||||
// related output needs boolean sanitization. It naturally has
|
||||
// total_blocks entries, but h_indexes_out reaches
|
||||
// message_modulus * ceil(total_blocks/2) - 1. When total_blocks is odd,
|
||||
// the ceiling causes out-of-bounds access. Reading garbage "true" would
|
||||
// set h_lut_indexes to an invalid index pointing to uninitialized
|
||||
// memory instead of a real LUT. Rust pads is_boolean_array with FALSE
|
||||
// to match.
|
||||
PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
|
||||
"Cuda error: index %lu for is_boolean_array is out of "
|
||||
"bounds (len is %lu)",
|
||||
(unsigned long)h_indexes_out[lwe_index],
|
||||
(unsigned long)is_boolean_array_len);
|
||||
// NO_CASTING expands directly into the output buffer — no LUTs, no PBS,
|
||||
// no intermediate buffers needed.
|
||||
if (expand_kind != EXPAND_KIND::NO_CASTING) {
|
||||
/** In case the casting key casts from BIG to SMALL key we run a single KS
|
||||
to expand using the casting key as ksk. Otherwise, in case the casting key
|
||||
casts from SMALL to BIG key, we first keyswitch from SMALL to BIG using
|
||||
the casting key as ksk, then we keyswitch from BIG to SMALL using the
|
||||
computing ksk, and lastly we apply the PBS. The output is always on the
|
||||
BIG key.
|
||||
**/
|
||||
auto params = casting_params;
|
||||
if (casting_key_type == SMALL_TO_BIG) {
|
||||
params = computing_params;
|
||||
}
|
||||
offset += num_lwes_in_kth;
|
||||
}
|
||||
|
||||
message_and_carry_extract_luts->set_lwe_indexes(
|
||||
streams.stream(0), streams.gpu_index(0), h_indexes_in, h_indexes_out);
|
||||
// We always pack two LWEs (message and carry parts per LWE)
|
||||
auto num_packed_msgs = 2;
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
|
||||
// Adjust indexes to permute the output and access the correct LUT.
|
||||
//
|
||||
// The loop below fills h_indexes_in and h_indexes_out so that the output
|
||||
// is ordered as: msg_extract(LWE_0), carry_extract(LWE_0),
|
||||
// msg_extract(LWE_1), carry_extract(LWE_1), ...
|
||||
//
|
||||
// With 4 LWEs the arrays look like:
|
||||
// h_indexes_in = {0, 1, 2, 3, 0, 1, 2, 3} (each LWE read twice)
|
||||
// h_lut_indexes = {0, 0, 0, 0, 1, 1, 1, 1} (msg LUT then carry LUT)
|
||||
// h_indexes_out = {0, 2, 4, 6, 1, 3, 5, 7} (interleaved output)
|
||||
//
|
||||
// If an LWE contains a boolean its LUT index is shifted by
|
||||
// num_packed_msgs to use the sanitization LUT (output clamped to {0, 1}).
|
||||
auto h_indexes_in = static_cast<Torus *>(
|
||||
malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
|
||||
auto h_indexes_out = static_cast<Torus *>(
|
||||
malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
|
||||
|
||||
// Index generator for message/carry extraction LUTs
|
||||
auto index_gen = [num_compact_lists,
|
||||
num_lwes_per_compact_list =
|
||||
this->num_lwes_per_compact_list,
|
||||
num_packed_msgs, is_boolean_array,
|
||||
h_indexes_out](Torus *h_lut_indexes, uint32_t) {
|
||||
auto offset = 0;
|
||||
for (int k = 0; k < num_compact_lists; k++) {
|
||||
auto num_lwes_in_kth = num_lwes_per_compact_list[k];
|
||||
auto num_lwes_in_kth = this->num_lwes_per_compact_list[k];
|
||||
for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
|
||||
auto lwe_index = i + num_packed_msgs * offset;
|
||||
auto boolean_offset =
|
||||
is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
|
||||
h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
|
||||
auto lwe_index_in_list = i % num_lwes_in_kth;
|
||||
PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
|
||||
"Cuda error: index %d is beyond the max value %d",
|
||||
lwe_index, num_packed_msgs * num_lwes);
|
||||
h_indexes_in[lwe_index] = lwe_index_in_list + offset;
|
||||
h_indexes_out[lwe_index] =
|
||||
num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
|
||||
PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
|
||||
"Cuda error: index %lu is beyond the max value %lu",
|
||||
(unsigned long)h_indexes_in[lwe_index],
|
||||
(unsigned long)(num_packed_msgs * num_lwes));
|
||||
PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
|
||||
"Cuda error: index %lu is beyond the max value %lu",
|
||||
(unsigned long)h_indexes_out[lwe_index],
|
||||
(unsigned long)(num_packed_msgs * num_lwes));
|
||||
// is_boolean_array tells us which input is a boolean and thus the
|
||||
// related output needs boolean sanitization. It naturally has
|
||||
// total_blocks entries, but h_indexes_out reaches
|
||||
// message_modulus * ceil(total_blocks/2) - 1. When total_blocks is
|
||||
// odd, the ceiling causes out-of-bounds access. Reading garbage
|
||||
// "true" would set h_lut_indexes to an invalid index pointing to
|
||||
// uninitialized memory instead of a real LUT. Rust pads
|
||||
// is_boolean_array with FALSE to match.
|
||||
PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
|
||||
"Cuda error: index %lu for is_boolean_array is out of "
|
||||
"bounds (len is %lu)",
|
||||
(unsigned long)h_indexes_out[lwe_index],
|
||||
(unsigned long)is_boolean_array_len);
|
||||
}
|
||||
offset += num_lwes_in_kth;
|
||||
}
|
||||
};
|
||||
|
||||
message_and_carry_extract_luts->generate_and_broadcast_lut(
|
||||
active_streams, {0, 1, 2, 3},
|
||||
{message_extract_lut_f, carry_extract_lut_f,
|
||||
message_extract_and_sanitize_bool_lut_f,
|
||||
carry_extract_and_sanitize_bool_lut_f},
|
||||
index_gen, true, {}, h_lut_indexes);
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
|
||||
|
||||
message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
|
||||
active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
|
||||
// The expanded LWEs will always be on the casting key format
|
||||
tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
|
||||
safe_mul_sizeof<Torus>(num_lwes, casting_params.big_lwe_dimension + 1),
|
||||
streams.stream(0), streams.gpu_index(0), size_tracker,
|
||||
allocate_gpu_memory);
|
||||
// SANITY_CHECK uses identity_lut (skipping the full message/carry
|
||||
// extraction LUT and the SMALL_TO_BIG intermediate buffer).
|
||||
if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
|
||||
identity_lut =
|
||||
new int_radix_lut<Torus>(streams, casting_params, 1, 2 * num_lwes,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
tmp_ksed_small_to_big_expanded_lwes =
|
||||
(Torus *)cuda_malloc_with_size_tracking_async(
|
||||
safe_mul_sizeof<Torus>(num_lwes,
|
||||
casting_params.big_lwe_dimension + 1),
|
||||
streams.stream(0), streams.gpu_index(0), size_tracker,
|
||||
allocate_gpu_memory);
|
||||
auto identity_lut_f = [](Torus x) -> Torus { return x; };
|
||||
identity_lut->generate_and_broadcast_lut(streams, {0}, {identity_lut_f},
|
||||
LUT_0_FOR_ALL_BLOCKS);
|
||||
identity_lut->set_lwe_indexes(streams.stream(0), streams.gpu_index(0),
|
||||
h_indexes_in, h_indexes_out);
|
||||
identity_lut->allocate_lwe_vector_for_non_trivial_indexes(
|
||||
active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
|
||||
} else {
|
||||
// We are always packing two LWEs. We just need to be sure we have
|
||||
// enough space in the carry part to store a message of the same size
|
||||
// as is in the message part.
|
||||
if (params.carry_modulus < params.message_modulus)
|
||||
PANIC("Carry modulus must be at least as large as message modulus");
|
||||
|
||||
message_and_carry_extract_luts =
|
||||
new int_radix_lut<Torus>(streams, params, 4, 2 * num_lwes,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
message_and_carry_extract_luts->set_lwe_indexes(
|
||||
streams.stream(0), streams.gpu_index(0), h_indexes_in,
|
||||
h_indexes_out);
|
||||
|
||||
auto message_extract_lut_f = [casting_params](Torus x) -> Torus {
|
||||
return x % casting_params.message_modulus;
|
||||
};
|
||||
auto carry_extract_lut_f = [casting_params](Torus x) -> Torus {
|
||||
return (x / casting_params.carry_modulus) %
|
||||
casting_params.message_modulus;
|
||||
};
|
||||
auto sanitize_bool_f = [](Torus x) -> Torus { return x == 0 ? 0 : 1; };
|
||||
auto message_extract_and_sanitize_bool_lut_f =
|
||||
[message_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
|
||||
return sanitize_bool_f(message_extract_lut_f(x));
|
||||
};
|
||||
auto carry_extract_and_sanitize_bool_lut_f =
|
||||
[carry_extract_lut_f, sanitize_bool_f](Torus x) -> Torus {
|
||||
return sanitize_bool_f(carry_extract_lut_f(x));
|
||||
};
|
||||
|
||||
auto h_lut_indexes = static_cast<Torus *>(
|
||||
malloc(safe_mul_sizeof<Torus>(num_packed_msgs, num_lwes)));
|
||||
|
||||
auto index_gen = [num_compact_lists,
|
||||
num_lwes_per_compact_list =
|
||||
this->num_lwes_per_compact_list,
|
||||
num_packed_msgs, is_boolean_array,
|
||||
h_indexes_out](Torus *h_lut_indexes, uint32_t) {
|
||||
auto offset = 0;
|
||||
for (int k = 0; k < num_compact_lists; k++) {
|
||||
auto num_lwes_in_kth = num_lwes_per_compact_list[k];
|
||||
for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
|
||||
auto lwe_index = i + num_packed_msgs * offset;
|
||||
auto boolean_offset = is_boolean_array[h_indexes_out[lwe_index]]
|
||||
? num_packed_msgs
|
||||
: 0;
|
||||
h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
|
||||
}
|
||||
offset += num_lwes_in_kth;
|
||||
}
|
||||
};
|
||||
|
||||
message_and_carry_extract_luts->generate_and_broadcast_lut(
|
||||
active_streams, {0, 1, 2, 3},
|
||||
{message_extract_lut_f, carry_extract_lut_f,
|
||||
message_extract_and_sanitize_bool_lut_f,
|
||||
carry_extract_and_sanitize_bool_lut_f},
|
||||
index_gen, true, {}, h_lut_indexes);
|
||||
message_and_carry_extract_luts
|
||||
->allocate_lwe_vector_for_non_trivial_indexes(
|
||||
active_streams, 2 * num_lwes, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
free(h_lut_indexes);
|
||||
|
||||
// SANITY_CHECK panics on SMALL_TO_BIG, so this buffer is only needed
|
||||
// on the full casting path.
|
||||
tmp_ksed_small_to_big_expanded_lwes =
|
||||
(Torus *)cuda_malloc_with_size_tracking_async(
|
||||
safe_mul_sizeof<Torus>(num_lwes,
|
||||
casting_params.big_lwe_dimension + 1),
|
||||
streams.stream(0), streams.gpu_index(0), size_tracker,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
// The expanded LWEs will always be on the casting key format
|
||||
tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
|
||||
safe_mul_sizeof<Torus>(num_lwes,
|
||||
casting_params.big_lwe_dimension + 1),
|
||||
streams.stream(0), streams.gpu_index(0), size_tracker,
|
||||
allocate_gpu_memory);
|
||||
|
||||
free(h_indexes_in);
|
||||
free(h_indexes_out);
|
||||
}
|
||||
|
||||
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
||||
free(h_indexes_in);
|
||||
free(h_indexes_out);
|
||||
free(h_lut_indexes);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
message_and_carry_extract_luts->release(streams);
|
||||
delete message_and_carry_extract_luts;
|
||||
|
||||
if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
|
||||
identity_lut->release(streams);
|
||||
delete identity_lut;
|
||||
if (expand_kind != EXPAND_KIND::NO_CASTING) {
|
||||
if (expand_kind == EXPAND_KIND::SANITY_CHECK) {
|
||||
identity_lut->release(streams);
|
||||
delete identity_lut;
|
||||
} else {
|
||||
message_and_carry_extract_luts->release(streams);
|
||||
delete message_and_carry_extract_luts;
|
||||
cuda_drop_with_size_tracking_async(
|
||||
tmp_ksed_small_to_big_expanded_lwes, streams.stream(0),
|
||||
streams.gpu_index(0), gpu_memory_allocated);
|
||||
}
|
||||
cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams.stream(0),
|
||||
streams.gpu_index(0),
|
||||
gpu_memory_allocated);
|
||||
}
|
||||
|
||||
cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams.stream(0),
|
||||
streams.gpu_index(0),
|
||||
gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(tmp_ksed_small_to_big_expanded_lwes,
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(d_expand_jobs, streams.stream(0),
|
||||
streams.gpu_index(0),
|
||||
gpu_memory_allocated);
|
||||
|
||||
@@ -83,6 +83,8 @@ void cuda_modulus_switch_inplace_64_async(void *stream, uint32_t gpu_index,
|
||||
void cuda_modulus_switch_64_async(void *stream, uint32_t gpu_index,
|
||||
void *lwe_out, const void *lwe_in,
|
||||
uint32_t size, uint32_t log_modulus) {
|
||||
PANIC_IF_FALSE(lwe_out != lwe_in, "Output and input pointers must be "
|
||||
"different for out-of-place operations");
|
||||
host_modulus_switch<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_out),
|
||||
static_cast<const uint64_t *>(lwe_in), size,
|
||||
@@ -93,6 +95,8 @@ void cuda_centered_modulus_switch_64_async(void *stream, uint32_t gpu_index,
|
||||
void *lwe_out, const void *lwe_in,
|
||||
uint32_t lwe_dimension,
|
||||
uint32_t log_modulus) {
|
||||
PANIC_IF_FALSE(lwe_out != lwe_in, "Output and input pointers must be "
|
||||
"different for out-of-place operations");
|
||||
host_centered_modulus_switch_inplace<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_out), static_cast<const uint64_t *>(lwe_in),
|
||||
@@ -146,3 +150,31 @@ void cuda_glwe_sample_extract_128_async(
|
||||
"N's are powers of two in the interval [256..4096].")
|
||||
}
|
||||
}
|
||||
|
||||
// C entry point for the out-of-place multi-bit modulus switch on 64-bit
// torus elements.
//
// The type-erased arguments are cast to `cudaStream_t` / `uint64_t *` and
// forwarded unchanged to `host_modulus_switch_multi_bit<uint64_t>`.
//
// Parameters:
//   stream          - CUDA stream handle, passed as `void *`.
//   gpu_index       - device index forwarded to the host function.
//   lwe_array_out   - output buffer; must not alias `lwe_array_in`.
//   lwe_array_in    - input buffer.
//   size            - element count forwarded to the host function.
//   log_modulus     - forwarded unchanged.
//   degree          - polynomial size forwarded to the host function.
//   grouping_factor - multi-bit grouping factor, forwarded unchanged.
//
// Panics if `lwe_array_out == lwe_array_in`, matching the other out-of-place
// entry points of this file.
void cuda_modulus_switch_multi_bit_64_async(void *stream, uint32_t gpu_index,
                                            void *lwe_array_out,
                                            void *lwe_array_in, uint32_t size,
                                            uint32_t log_modulus,
                                            uint32_t degree,
                                            uint32_t grouping_factor) {
  // The operation reads and writes different positions of its buffers, so an
  // aliased call cannot be given in-place semantics.
  PANIC_IF_FALSE(lwe_array_out != lwe_array_in,
                 "Output and input pointers must be "
                 "different for out-of-place operations");

  host_modulus_switch_multi_bit<uint64_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_array_in), size, log_modulus, degree,
      grouping_factor);
}
|
||||
|
||||
// C entry point for the out-of-place multi-bit modulus switch on 128-bit
// torus elements.
//
// The type-erased arguments are cast to `cudaStream_t` / `__uint128_t *` and
// forwarded unchanged to `host_modulus_switch_multi_bit<__uint128_t>`.
//
// Parameters:
//   stream          - CUDA stream handle, passed as `void *`.
//   gpu_index       - device index forwarded to the host function.
//   lwe_array_out   - output buffer; must not alias `lwe_array_in`.
//   lwe_array_in    - input buffer.
//   size            - element count forwarded to the host function.
//   log_modulus     - forwarded unchanged.
//   degree          - polynomial size forwarded to the host function.
//   grouping_factor - multi-bit grouping factor, forwarded unchanged.
//
// Panics if `lwe_array_out == lwe_array_in`, matching the other out-of-place
// entry points of this file.
void cuda_modulus_switch_multi_bit_128_async(void *stream, uint32_t gpu_index,
                                             void *lwe_array_out,
                                             void *lwe_array_in, uint32_t size,
                                             uint32_t log_modulus,
                                             uint32_t degree,
                                             uint32_t grouping_factor) {
  // The operation reads and writes different positions of its buffers, so an
  // aliased call cannot be given in-place semantics.
  PANIC_IF_FALSE(lwe_array_out != lwe_array_in,
                 "Output and input pointers must be "
                 "different for out-of-place operations");

  host_modulus_switch_multi_bit<__uint128_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<__uint128_t *>(lwe_array_out),
      static_cast<__uint128_t *>(lwe_array_in), size, log_modulus, degree,
      grouping_factor);
}
|
||||
|
||||
@@ -217,6 +217,8 @@ void cuda_closest_representable_64_async(void *stream, uint32_t gpu_index,
|
||||
void const *input, void *output,
|
||||
uint32_t base_log,
|
||||
uint32_t level_count) {
|
||||
PANIC_IF_FALSE(output != input, "Output and input pointers must be different "
|
||||
"for out-of-place operations");
|
||||
host_cuda_closest_representable(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<const uint64_t *>(input),
|
||||
static_cast<uint64_t *>(output), base_log,
|
||||
|
||||
@@ -463,5 +463,48 @@ __global__ void __launch_bounds__(512)
|
||||
return;
|
||||
}
|
||||
}
|
||||
// Noise-test-only kernel. It follows the same modulus-switch logic that is
// embedded in the keybundle computation, exposed as a standalone __global__
// function so that it can be tested individually.
//
// Launch layout: 1D grid of 1D blocks, one thread per group; threads whose
// global index is >= `size` exit immediately.
//
// For group `tid` the kernel:
//   - hands `array_in + tid * grouping_factor` to calculates_monomial_degree,
//   - writes 2^grouping_factor results to
//     `array_out[tid * 2^grouping_factor ...]`.
// All monomials are computed, including the first one, which is never used.
//
// `log_modulus` is part of the signature but is not read by this kernel.
template <typename Torus, class params>
__global__ void
modulus_switch_multi_bit(Torus *array_out, const Torus *array_in, int size,
                         uint32_t log_modulus, uint32_t grouping_factor) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid >= size)
    return;

  int monomial_count = 1 << grouping_factor;
  int in_base = tid * grouping_factor;
  int out_base = tid * monomial_count;

  // Base pointers of the group owned by this thread.
  const Torus *group_in = array_in + in_base;
  Torus *group_out = array_out + out_base;

  for (int monomial = 0; monomial < monomial_count; ++monomial) {
    group_out[monomial] = calculates_monomial_degree<Torus, params>(
        group_in, monomial, grouping_factor);
  }
}
|
||||
// Host-side launcher for the modulus_switch_multi_bit kernel.
//
// This aims to be launched only from the noise tests, which is why only a
// specific set of parameters is supported: `degree` must be 2048, any other
// value panics.
//
// Parameters:
//   stream, gpu_index - stream and device the kernel is launched on.
//   array_out         - device output buffer written by the kernel.
//   array_in          - device input buffer read by the kernel.
//   size              - number of input elements; `size / grouping_factor`
//                       groups are processed (integer division, so a
//                       trailing partial group is not processed).
//   log_modulus       - forwarded to the kernel.
//   degree            - polynomial size selecting the kernel instantiation.
//   grouping_factor   - number of input elements per group; must be non-zero.
//
// The launch is asynchronous with respect to the host: only launch errors are
// checked here, no stream synchronization is performed.
template <typename Torus>
__host__ void host_modulus_switch_multi_bit(
    cudaStream_t stream, uint32_t gpu_index, Torus *array_out, Torus *array_in,
    int size, uint32_t log_modulus, uint32_t degree, uint32_t grouping_factor) {
  check_cuda_error(cudaSetDevice(gpu_index));

  // A zero grouping factor would divide by zero below; fail loudly instead.
  if (grouping_factor == 0) {
    PANIC("Cuda error: grouping_factor must be non-zero")
  }

  int multibit_size = size / grouping_factor;
  // Nothing to compute: skip the launch rather than submitting an empty grid
  // (NOTE(review): assumes getNumBlocksAndThreads would yield 0 blocks here).
  if (multibit_size <= 0)
    return;

  int num_threads = 0, num_blocks = 0;
  getNumBlocksAndThreads(multibit_size, 1024, num_blocks, num_threads);
  switch (degree) {
  case 2048:
    modulus_switch_multi_bit<Torus, Degree<2048>>
        <<<num_blocks, num_threads, 0, stream>>>(
            array_out, array_in, multibit_size, log_modulus, grouping_factor);
    break;
  default:
    PANIC("Cuda error: unsupported polynomial size. Supported "
          "N's are powers of two in the interval [2048].")
  }

  // Kernel launches do not return errors directly; surface launch failures.
  check_cuda_error(cudaGetLastError());
}
|
||||
|
||||
#endif // CNCRT_TORUS_H
|
||||
|
||||
@@ -326,6 +326,10 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
|
||||
uint32_t gpu_index) {
|
||||
if (size == 0)
|
||||
return;
|
||||
|
||||
GPU_ASSERT(src != nullptr, "Cuda error: null device ptr");
|
||||
GPU_ASSERT(dest != nullptr, "Cuda error: null device ptr");
|
||||
|
||||
cudaPointerAttributes attr_dest;
|
||||
check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
|
||||
PANIC_IF_FALSE(
|
||||
|
||||
@@ -1,6 +1,16 @@
|
||||
#include "integer/bitwise_ops.cuh"
|
||||
|
||||
uint64_t scratch_cuda_boolean_bitop_64_async(
|
||||
// In-place boolean bitwise operation on 64-bit radix ciphertexts:
// lwe_array_inout op= lwe_array_2.
// 'lwe_array_inout' is handed to host_boolean_bitop both as the output and as
// the first operand, so no aliasing check is needed here.
// 'mem_ptr' is reinterpreted as a boolean_bitop_buffer<uint64_t>; 'bsks' and
// 'ksks' are forwarded to the host function.
void cuda_boolean_bitop_inplace_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_inout,
    CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
    void *const *bsks, void *const *ksks) {
  auto *bitop_mem = (boolean_bitop_buffer<uint64_t> *)mem_ptr;
  auto **keyswitch_keys = (uint64_t **)(ksks);

  host_boolean_bitop<uint64_t>(CudaStreams(streams), lwe_array_inout,
                               lwe_array_inout, lwe_array_2, bitop_mem, bsks,
                               keyswitch_keys);
}
|
||||
|
||||
uint64_t scratch_cuda_boolean_bitop_inplace_64_async(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -20,20 +30,8 @@ uint64_t scratch_cuda_boolean_bitop_64_async(
|
||||
lwe_ciphertext_count, params, op_type, is_unchecked, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
// Out-of-place boolean bitwise operation on 64-bit radix ciphertexts:
// lwe_array_out = lwe_array_1 op lwe_array_2.
// 'mem_ptr' is reinterpreted as a boolean_bitop_buffer<uint64_t>; 'bsks' and
// 'ksks' are forwarded to host_boolean_bitop.
void cuda_boolean_bitop_64_async(CudaStreamsFFI streams,
                                 CudaRadixCiphertextFFI *lwe_array_out,
                                 CudaRadixCiphertextFFI const *lwe_array_1,
                                 CudaRadixCiphertextFFI const *lwe_array_2,
                                 int8_t *mem_ptr, void *const *bsks,
                                 void *const *ksks) {
  auto *bitop_mem = (boolean_bitop_buffer<uint64_t> *)mem_ptr;
  auto **keyswitch_keys = (uint64_t **)(ksks);

  host_boolean_bitop<uint64_t>(CudaStreams(streams), lwe_array_out,
                               lwe_array_1, lwe_array_2, bitop_mem, bsks,
                               keyswitch_keys);
}
|
||||
|
||||
void cleanup_cuda_boolean_bitop_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
void cleanup_cuda_boolean_bitop_inplace_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
|
||||
boolean_bitop_buffer<uint64_t> *mem_ptr =
|
||||
(boolean_bitop_buffer<uint64_t> *)(*mem_ptr_void);
|
||||
@@ -80,44 +78,6 @@ void cleanup_cuda_boolean_bitnot_64(CudaStreamsFFI streams,
|
||||
*mem_ptr_void = nullptr;
|
||||
}
|
||||
|
||||
// Scratch entry point for the 64-bit integer bitwise operation.
// Packs the individual cryptographic parameters into an int_radix_params and
// forwards to scratch_cuda_bitop, which fills '*mem_ptr' (reinterpreted as an
// int_bitop_buffer<uint64_t> *). Returns whatever scratch_cuda_bitop returns
// (presumably the size of the allocated GPU memory -- confirm).
uint64_t scratch_cuda_integer_bitop_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  auto **bitop_mem = (int_bitop_buffer<uint64_t> **)mem_ptr;

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);

  return scratch_cuda_bitop<uint64_t>(CudaStreams(streams), bitop_mem,
                                      lwe_ciphertext_count, radix_params,
                                      op_type, allocate_gpu_memory);
}
|
||||
|
||||
// Scratch entry point for the 64-bit integer scalar bitwise operation.
// Packs the individual cryptographic parameters into an int_radix_params and
// forwards to scratch_cuda_bitop, which fills '*mem_ptr' (reinterpreted as an
// int_bitop_buffer<uint64_t> *). Returns whatever scratch_cuda_bitop returns
// (presumably the size of the allocated GPU memory -- confirm).
uint64_t scratch_cuda_integer_scalar_bitop_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  auto **bitop_mem = (int_bitop_buffer<uint64_t> **)mem_ptr;

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);

  return scratch_cuda_bitop<uint64_t>(CudaStreams(streams), bitop_mem,
                                      lwe_ciphertext_count, radix_params,
                                      op_type, allocate_gpu_memory);
}
|
||||
|
||||
void cuda_bitnot_ciphertext_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *radix_ciphertext,
|
||||
uint32_t ct_message_modulus,
|
||||
@@ -129,20 +89,37 @@ void cuda_bitnot_ciphertext_64(CudaStreamsFFI streams,
|
||||
cuda_synchronize_stream(cuda_streams.stream(0), cuda_streams.gpu_index(0));
|
||||
}
|
||||
|
||||
void cuda_integer_bitop_64_async(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2,
|
||||
int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
|
||||
host_bitop<uint64_t>(CudaStreams(streams), lwe_array_out, lwe_array_1,
|
||||
// In-place integer bitwise operation on 64-bit radix ciphertexts:
// lwe_array_inout op= lwe_array_2.
// 'lwe_array_inout' is handed to host_bitop both as the output and as the
// first operand, so no aliasing check is needed here.
// 'mem_ptr' is reinterpreted as an int_bitop_buffer<uint64_t>; 'bsks' and
// 'ksks' are forwarded to the host function.
void cuda_integer_bitop_inplace_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_inout,
    CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
    void *const *bsks, void *const *ksks) {
  auto *bitop_mem = (int_bitop_buffer<uint64_t> *)mem_ptr;
  auto **keyswitch_keys = (uint64_t **)(ksks);

  host_bitop<uint64_t>(CudaStreams(streams), lwe_array_inout, lwe_array_inout,
                       lwe_array_2, bitop_mem, bsks, keyswitch_keys);
}
|
||||
|
||||
void cleanup_cuda_integer_bitop_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
// Scratch entry point for the in-place 64-bit integer bitwise operation.
// Packs the individual cryptographic parameters into an int_radix_params and
// forwards to scratch_cuda_bitop, which fills '*mem_ptr' (reinterpreted as an
// int_bitop_buffer<uint64_t> *). Returns whatever scratch_cuda_bitop returns
// (presumably the size of the allocated GPU memory -- confirm).
uint64_t scratch_cuda_integer_bitop_inplace_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  auto **bitop_mem = (int_bitop_buffer<uint64_t> **)mem_ptr;

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);

  return scratch_cuda_bitop<uint64_t>(CudaStreams(streams), bitop_mem,
                                      lwe_ciphertext_count, radix_params,
                                      op_type, allocate_gpu_memory);
}
|
||||
|
||||
void cleanup_cuda_integer_bitop_inplace_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
|
||||
int_bitop_buffer<uint64_t> *mem_ptr =
|
||||
(int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
|
||||
@@ -151,8 +128,27 @@ void cleanup_cuda_integer_bitop_64(CudaStreamsFFI streams,
|
||||
*mem_ptr_void = nullptr;
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_scalar_bitop_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
// Scratch entry point for the in-place 64-bit integer scalar bitwise
// operation.
// Packs the individual cryptographic parameters into an int_radix_params and
// forwards to scratch_cuda_bitop, which fills '*mem_ptr' (reinterpreted as an
// int_bitop_buffer<uint64_t> *). Returns whatever scratch_cuda_bitop returns
// (presumably the size of the allocated GPU memory -- confirm).
uint64_t scratch_cuda_integer_scalar_bitop_inplace_64_async(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  auto **bitop_mem = (int_bitop_buffer<uint64_t> **)mem_ptr;

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);

  return scratch_cuda_bitop<uint64_t>(CudaStreams(streams), bitop_mem,
                                      lwe_ciphertext_count, radix_params,
                                      op_type, allocate_gpu_memory);
}
|
||||
|
||||
void cleanup_cuda_integer_scalar_bitop_inplace_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
|
||||
int_bitop_buffer<uint64_t> *mem_ptr =
|
||||
(int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
void extend_radix_with_trivial_zero_blocks_msb_64(
|
||||
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
|
||||
CudaStreamsFFI streams) {
|
||||
PANIC_IF_FALSE(output != input, "Output and input pointers must be different "
|
||||
"for out-of-place operations");
|
||||
auto cuda_streams = CudaStreams(streams);
|
||||
host_extend_radix_with_trivial_zero_blocks_msb<uint64_t>(output, input,
|
||||
cuda_streams);
|
||||
@@ -12,6 +14,8 @@ void extend_radix_with_trivial_zero_blocks_msb_64(
|
||||
void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
|
||||
CudaRadixCiphertextFFI const *input,
|
||||
CudaStreamsFFI streams) {
|
||||
PANIC_IF_FALSE(output != input, "Output and input pointers must be different "
|
||||
"for out-of-place operations");
|
||||
|
||||
auto cuda_streams = CudaStreams(streams);
|
||||
host_trim_radix_blocks_lsb<uint64_t>(output, input, cuda_streams);
|
||||
@@ -21,6 +25,8 @@ void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
|
||||
void trim_radix_blocks_msb_64(CudaRadixCiphertextFFI *output,
|
||||
CudaRadixCiphertextFFI const *input,
|
||||
CudaStreamsFFI streams) {
|
||||
PANIC_IF_FALSE(output != input, "Output and input pointers must be different "
|
||||
"for out-of-place operations");
|
||||
|
||||
auto cuda_streams = CudaStreams(streams);
|
||||
host_trim_radix_blocks_msb<uint64_t>(output, input, cuda_streams);
|
||||
@@ -54,6 +60,8 @@ void cuda_cast_to_unsigned_64_async(CudaStreamsFFI streams,
|
||||
int8_t *mem_ptr, uint32_t target_num_blocks,
|
||||
bool input_is_signed, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
PANIC_IF_FALSE(output != input, "Output and input pointers must be different "
|
||||
"for out-of-place operations");
|
||||
|
||||
host_cast_to_unsigned<uint64_t>(
|
||||
CudaStreams(streams), output, input,
|
||||
@@ -97,6 +105,8 @@ void cuda_cast_to_signed_64_async(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI const *input,
|
||||
int8_t *mem, bool input_is_signed,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
PANIC_IF_FALSE(output != input, "Output and input pointers must be different "
|
||||
"for out-of-place operations");
|
||||
|
||||
host_cast_to_signed<uint64_t>(CudaStreams(streams), output, input,
|
||||
(int_cast_to_signed_buffer<uint64_t> *)mem,
|
||||
|
||||
@@ -30,6 +30,18 @@ void cuda_cmux_64_async(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI const *lwe_array_true,
|
||||
CudaRadixCiphertextFFI const *lwe_array_false,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks) {
|
||||
PANIC_IF_FALSE(
|
||||
lwe_array_out != lwe_condition,
|
||||
"Output and condition pointers must be different for out-of-place "
|
||||
"operations");
|
||||
PANIC_IF_FALSE(
|
||||
lwe_array_out != lwe_array_true,
|
||||
"Output and true-branch pointers must be different for out-of-place "
|
||||
"operations");
|
||||
PANIC_IF_FALSE(
|
||||
lwe_array_out != lwe_array_false,
|
||||
"Output and false-branch pointers must be different for out-of-place "
|
||||
"operations");
|
||||
PUSH_RANGE("cmux")
|
||||
host_cmux<uint64_t>(CudaStreams(streams), lwe_array_out, lwe_condition,
|
||||
lwe_array_true, lwe_array_false,
|
||||
|
||||
@@ -80,6 +80,12 @@ void cuda_integer_comparison_64_async(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2,
|
||||
int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
PANIC_IF_FALSE(lwe_array_out != lwe_array_1,
|
||||
"Output and first input pointers must be different for "
|
||||
"out-of-place operations");
|
||||
PANIC_IF_FALSE(lwe_array_out != lwe_array_2,
|
||||
"Output and second input pointers must be different for "
|
||||
"out-of-place operations");
|
||||
PUSH_RANGE("comparison")
|
||||
if (lwe_array_1->num_radix_blocks != lwe_array_2->num_radix_blocks)
|
||||
PANIC("Cuda error: input num radix blocks must be the same")
|
||||
@@ -167,6 +173,9 @@ void cuda_integer_are_all_comparisons_block_true_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
|
||||
PANIC_IF_FALSE(lwe_array_out != lwe_array_in,
|
||||
"Output and input pointers must be different for out-of-place "
|
||||
"operations");
|
||||
|
||||
int_comparison_buffer<uint64_t> *buffer =
|
||||
(int_comparison_buffer<uint64_t> *)mem_ptr;
|
||||
@@ -209,6 +218,9 @@ void cuda_integer_is_at_least_one_comparisons_block_true_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
|
||||
PANIC_IF_FALSE(lwe_array_out != lwe_array_in,
|
||||
"Output and input pointers must be different for out-of-place "
|
||||
"operations");
|
||||
|
||||
int_comparison_buffer<uint64_t> *buffer =
|
||||
(int_comparison_buffer<uint64_t> *)mem_ptr;
|
||||
|
||||
@@ -27,6 +27,21 @@ void cuda_integer_div_rem_64_async(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI const *divisor,
|
||||
bool is_signed, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
PANIC_IF_FALSE(quotient != numerator,
|
||||
"Quotient and numerator pointers must be different for "
|
||||
"out-of-place operations");
|
||||
PANIC_IF_FALSE(quotient != divisor,
|
||||
"Quotient and divisor pointers must be different for "
|
||||
"out-of-place operations");
|
||||
PANIC_IF_FALSE(remainder != numerator,
|
||||
"Remainder and numerator pointers must be different for "
|
||||
"out-of-place operations");
|
||||
PANIC_IF_FALSE(remainder != divisor,
|
||||
"Remainder and divisor pointers must be different for "
|
||||
"out-of-place operations");
|
||||
PANIC_IF_FALSE(quotient != remainder,
|
||||
"Quotient and remainder pointers must be different for "
|
||||
"out-of-place operations");
|
||||
PUSH_RANGE("div")
|
||||
auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;
|
||||
|
||||
|
||||
@@ -30,6 +30,9 @@ void cuda_integer_count_of_consecutive_bits_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
|
||||
CudaRadixCiphertextFFI const *input_ct, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
PANIC_IF_FALSE(output_ct != input_ct,
|
||||
"Output and input pointers must be different for out-of-place "
|
||||
"operations");
|
||||
|
||||
host_integer_count_of_consecutive_bits<uint64_t, uint64_t>(
|
||||
CudaStreams(streams), output_ct, input_ct,
|
||||
@@ -81,6 +84,9 @@ void cuda_integer_ilog2_64_async(
|
||||
CudaRadixCiphertextFFI const *trivial_ct_2,
|
||||
CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
PANIC_IF_FALSE(output_ct != input_ct,
|
||||
"Output and input pointers must be different for out-of-place "
|
||||
"operations");
|
||||
|
||||
host_integer_ilog2<uint64_t, uint64_t>(
|
||||
CudaStreams(streams), output_ct, input_ct, trivial_ct_neg_n, trivial_ct_2,
|
||||
|
||||
@@ -212,6 +212,9 @@ void cuda_apply_univariate_lut_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
|
||||
void *const *ksks, void *const *bsks) {
|
||||
PANIC_IF_FALSE(output_radix_lwe != input_radix_lwe,
|
||||
"Output and input pointers must be different for out-of-place "
|
||||
"operations");
|
||||
|
||||
host_apply_univariate_lut<uint64_t>(
|
||||
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
|
||||
@@ -243,6 +246,9 @@ void cuda_apply_many_univariate_lut_64_async(
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
|
||||
void *const *ksks, void *const *bsks, uint32_t num_many_lut,
|
||||
uint32_t lut_stride) {
|
||||
PANIC_IF_FALSE(output_radix_lwe != input_radix_lwe,
|
||||
"Output and input pointers must be different for out-of-place "
|
||||
"operations");
|
||||
|
||||
host_apply_many_univariate_lut<uint64_t>(
|
||||
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
|
||||
@@ -312,6 +318,9 @@ void cuda_apply_noise_squashing_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
|
||||
void *const *ksks, void *const *bsks) {
|
||||
PANIC_IF_FALSE(output_radix_lwe != input_radix_lwe,
|
||||
"Output and input pointers must be different for out-of-place "
|
||||
"operations");
|
||||
|
||||
PUSH_RANGE("apply noise squashing")
|
||||
integer_radix_apply_noise_squashing<uint64_t>(
|
||||
|
||||
@@ -61,11 +61,65 @@ void generate_ids_update_degrees(uint64_t *terms_degree, size_t *h_lwe_idx_in,
|
||||
|
||||
total_count = message_count + carry_count;
|
||||
}
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the integer radix multiplication in keyswitch->bootstrap order.
|
||||
*/
|
||||
uint64_t scratch_cuda_integer_mult_64_async(
|
||||
// In-place multiplication of 64-bit radix ciphertexts:
// radix_lwe_inout *= radix_lwe_right.
// 'radix_lwe_inout' is handed to host_integer_mult_radix both as the output
// and as the left operand, so no aliasing check is needed here.
// 'polynomial_size' selects the AmortizedDegree<N> instantiation; any value
// outside the powers of two in [256..16384] panics.
// 'mem_ptr' is reinterpreted as an int_mul_memory<uint64_t>; 'is_bool_left',
// 'is_bool_right', 'bsks', 'ksks' and 'num_blocks' are forwarded as-is.
void cuda_integer_mult_inplace_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_inout,
    bool const is_bool_left, CudaRadixCiphertextFFI const *radix_lwe_right,
    bool const is_bool_right, void *const *bsks, void *const *ksks,
    int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) {
  PUSH_RANGE("mul_inplace")

  // The pointer reinterpretations are identical for every polynomial size,
  // so they are done once up front.
  auto *mul_mem = (int_mul_memory<uint64_t> *)mem_ptr;
  auto **keyswitch_keys = (uint64_t **)(ksks);

  switch (polynomial_size) {
  case 256:
    host_integer_mult_radix<uint64_t, AmortizedDegree<256>>(
        CudaStreams(streams), radix_lwe_inout, radix_lwe_inout, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, keyswitch_keys, mul_mem,
        num_blocks);
    break;
  case 512:
    host_integer_mult_radix<uint64_t, AmortizedDegree<512>>(
        CudaStreams(streams), radix_lwe_inout, radix_lwe_inout, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, keyswitch_keys, mul_mem,
        num_blocks);
    break;
  case 1024:
    host_integer_mult_radix<uint64_t, AmortizedDegree<1024>>(
        CudaStreams(streams), radix_lwe_inout, radix_lwe_inout, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, keyswitch_keys, mul_mem,
        num_blocks);
    break;
  case 2048:
    host_integer_mult_radix<uint64_t, AmortizedDegree<2048>>(
        CudaStreams(streams), radix_lwe_inout, radix_lwe_inout, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, keyswitch_keys, mul_mem,
        num_blocks);
    break;
  case 4096:
    host_integer_mult_radix<uint64_t, AmortizedDegree<4096>>(
        CudaStreams(streams), radix_lwe_inout, radix_lwe_inout, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, keyswitch_keys, mul_mem,
        num_blocks);
    break;
  case 8192:
    host_integer_mult_radix<uint64_t, AmortizedDegree<8192>>(
        CudaStreams(streams), radix_lwe_inout, radix_lwe_inout, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, keyswitch_keys, mul_mem,
        num_blocks);
    break;
  case 16384:
    host_integer_mult_radix<uint64_t, AmortizedDegree<16384>>(
        CudaStreams(streams), radix_lwe_inout, radix_lwe_inout, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, keyswitch_keys, mul_mem,
        num_blocks);
    break;
  default:
    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
          "Supported N's are powers of two in the interval [256..16384].")
  }
  POP_RANGE()
}
|
||||
|
||||
uint64_t scratch_cuda_integer_mult_inplace_64_async(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, bool const is_boolean_left,
|
||||
bool const is_boolean_right, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
@@ -97,94 +151,8 @@ uint64_t scratch_cuda_integer_mult_64_async(
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Computes a multiplication between two 64 bit radix lwe ciphertexts
 * encrypting integer values. keyswitch -> bootstrap pattern is used, function
 * works for single pair of radix ciphertexts.
 *  - 'streams' the Cuda streams wrapper forwarded to the host implementation
 *  - 'radix_lwe_out' is 64 bit radix big lwe ciphertext, product of
 * multiplication
 *  - 'radix_lwe_left' left radix big lwe ciphertext
 *  - 'is_bool_left' forwarded as-is to the host implementation
 *  - 'radix_lwe_right' right radix big lwe ciphertext
 *  - 'is_bool_right' forwarded as-is to the host implementation
 *  - 'bsks' bootstrapping keys in fourier domain
 *  - 'ksks' keyswitching keys
 *  - 'mem_ptr' scratch memory, reinterpreted as an int_mul_memory<uint64_t>
 *  - 'polynomial_size' polynomial size; selects the AmortizedDegree<N>
 * instantiation and must be a power of two in [256..16384], otherwise the
 * function panics
 *  - 'num_blocks' is the number of big lwe ciphertext blocks inside radix
 * ciphertext
 */
void cuda_integer_mult_64_async(CudaStreamsFFI streams,
                                CudaRadixCiphertextFFI *radix_lwe_out,
                                CudaRadixCiphertextFFI const *radix_lwe_left,
                                bool const is_bool_left,
                                CudaRadixCiphertextFFI const *radix_lwe_right,
                                bool const is_bool_right, void *const *bsks,
                                void *const *ksks, int8_t *mem_ptr,
                                uint32_t polynomial_size, uint32_t num_blocks) {
  PUSH_RANGE("mul")

  // The pointer reinterpretations are identical for every polynomial size,
  // so they are done once up front.
  auto *mul_mem = (int_mul_memory<uint64_t> *)mem_ptr;
  auto **keyswitch_keys = (uint64_t **)(ksks);

  switch (polynomial_size) {
  case 256:
    host_integer_mult_radix<uint64_t, AmortizedDegree<256>>(
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, keyswitch_keys, mul_mem,
        num_blocks);
    break;
  case 512:
    host_integer_mult_radix<uint64_t, AmortizedDegree<512>>(
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, keyswitch_keys, mul_mem,
        num_blocks);
    break;
  case 1024:
    host_integer_mult_radix<uint64_t, AmortizedDegree<1024>>(
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, keyswitch_keys, mul_mem,
        num_blocks);
    break;
  case 2048:
    host_integer_mult_radix<uint64_t, AmortizedDegree<2048>>(
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, keyswitch_keys, mul_mem,
        num_blocks);
    break;
  case 4096:
    host_integer_mult_radix<uint64_t, AmortizedDegree<4096>>(
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, keyswitch_keys, mul_mem,
        num_blocks);
    break;
  case 8192:
    host_integer_mult_radix<uint64_t, AmortizedDegree<8192>>(
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, keyswitch_keys, mul_mem,
        num_blocks);
    break;
  case 16384:
    host_integer_mult_radix<uint64_t, AmortizedDegree<16384>>(
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, keyswitch_keys, mul_mem,
        num_blocks);
    break;
  default:
    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
          "Supported N's are powers of two in the interval [256..16384].")
  }
  POP_RANGE()
}
|
||||
|
||||
void cleanup_cuda_integer_mult_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
void cleanup_cuda_integer_mult_inplace_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
PUSH_RANGE("cleanup mul")
|
||||
int_mul_memory<uint64_t> *mem_ptr =
|
||||
(int_mul_memory<uint64_t> *)(*mem_ptr_void);
|
||||
@@ -221,6 +189,9 @@ void cuda_partial_sum_ciphertexts_vec_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
PANIC_IF_FALSE(radix_lwe_out != radix_lwe_vec,
|
||||
"Output and input pointers must be different for out-of-place "
|
||||
"operations");
|
||||
|
||||
auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;
|
||||
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
|
||||
|
||||
@@ -5,6 +5,9 @@ void cuda_negate_ciphertext_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in,
|
||||
uint32_t message_modulus, uint32_t carry_modulus,
|
||||
uint32_t num_radix_blocks) {
|
||||
PANIC_IF_FALSE(lwe_array_out != lwe_array_in,
|
||||
"Output and input pointers must be different for out-of-place "
|
||||
"operations");
|
||||
|
||||
auto cuda_streams = CudaStreams(streams);
|
||||
host_negation<uint64_t>(cuda_streams, lwe_array_out, lwe_array_in,
|
||||
|
||||
@@ -5,7 +5,7 @@ uint64_t scratch_cuda_rerand_64_async(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, bool allocate_gpu_memory) {
|
||||
uint32_t carry_modulus, bool allocate_gpu_memory, RERAND_MODE rerand_type) {
|
||||
PUSH_RANGE("scratch rerand")
|
||||
int_radix_params params(PBS_TYPE::CLASSICAL, 0, 0, big_lwe_dimension,
|
||||
small_lwe_dimension, ks_level, ks_base_log, 0, 0, 0,
|
||||
@@ -13,8 +13,9 @@ uint64_t scratch_cuda_rerand_64_async(
|
||||
PBS_MS_REDUCTION_T::NO_REDUCTION);
|
||||
|
||||
uint64_t ret = scratch_cuda_rerand<uint64_t>(
|
||||
CudaStreams(streams), (int_rerand_mem<uint64_t> **)mem_ptr,
|
||||
lwe_ciphertext_count, params, allocate_gpu_memory);
|
||||
CudaStreams(streams),
|
||||
reinterpret_cast<int_rerand_mem<uint64_t> **>(mem_ptr),
|
||||
lwe_ciphertext_count, params, allocate_gpu_memory, rerand_type);
|
||||
POP_RANGE()
|
||||
return ret;
|
||||
}
|
||||
@@ -28,7 +29,7 @@ void cuda_rerand_64_async(
|
||||
CudaStreamsFFI streams, void *lwe_array,
|
||||
const void *lwe_flattened_encryptions_of_zero_compact_array_in,
|
||||
int8_t *mem_ptr, void *const *ksk) {
|
||||
|
||||
PUSH_RANGE("rerand")
|
||||
auto rerand_buffer = reinterpret_cast<int_rerand_mem<uint64_t> *>(mem_ptr);
|
||||
|
||||
switch (rerand_buffer->params.big_lwe_dimension) {
|
||||
@@ -37,49 +38,49 @@ void cuda_rerand_64_async(
|
||||
streams, static_cast<uint64_t *>(lwe_array),
|
||||
static_cast<const uint64_t *>(
|
||||
lwe_flattened_encryptions_of_zero_compact_array_in),
|
||||
(uint64_t **)(ksk), rerand_buffer);
|
||||
reinterpret_cast<uint64_t *const *>(ksk), rerand_buffer);
|
||||
break;
|
||||
case 512:
|
||||
host_rerand_inplace<uint64_t, AmortizedDegree<512>>(
|
||||
streams, static_cast<uint64_t *>(lwe_array),
|
||||
static_cast<const uint64_t *>(
|
||||
lwe_flattened_encryptions_of_zero_compact_array_in),
|
||||
(uint64_t **)(ksk), rerand_buffer);
|
||||
reinterpret_cast<uint64_t *const *>(ksk), rerand_buffer);
|
||||
break;
|
||||
case 1024:
|
||||
host_rerand_inplace<uint64_t, AmortizedDegree<1024>>(
|
||||
streams, static_cast<uint64_t *>(lwe_array),
|
||||
static_cast<const uint64_t *>(
|
||||
lwe_flattened_encryptions_of_zero_compact_array_in),
|
||||
(uint64_t **)(ksk), rerand_buffer);
|
||||
reinterpret_cast<uint64_t *const *>(ksk), rerand_buffer);
|
||||
break;
|
||||
case 2048:
|
||||
host_rerand_inplace<uint64_t, AmortizedDegree<2048>>(
|
||||
streams, static_cast<uint64_t *>(lwe_array),
|
||||
static_cast<const uint64_t *>(
|
||||
lwe_flattened_encryptions_of_zero_compact_array_in),
|
||||
(uint64_t **)(ksk), rerand_buffer);
|
||||
reinterpret_cast<uint64_t *const *>(ksk), rerand_buffer);
|
||||
break;
|
||||
case 4096:
|
||||
host_rerand_inplace<uint64_t, AmortizedDegree<4096>>(
|
||||
streams, static_cast<uint64_t *>(lwe_array),
|
||||
static_cast<const uint64_t *>(
|
||||
lwe_flattened_encryptions_of_zero_compact_array_in),
|
||||
(uint64_t **)(ksk), rerand_buffer);
|
||||
reinterpret_cast<uint64_t *const *>(ksk), rerand_buffer);
|
||||
break;
|
||||
case 8192:
|
||||
host_rerand_inplace<uint64_t, AmortizedDegree<8192>>(
|
||||
streams, static_cast<uint64_t *>(lwe_array),
|
||||
static_cast<const uint64_t *>(
|
||||
lwe_flattened_encryptions_of_zero_compact_array_in),
|
||||
(uint64_t **)(ksk), rerand_buffer);
|
||||
reinterpret_cast<uint64_t *const *>(ksk), rerand_buffer);
|
||||
break;
|
||||
case 16384:
|
||||
host_rerand_inplace<uint64_t, AmortizedDegree<16384>>(
|
||||
streams, static_cast<uint64_t *>(lwe_array),
|
||||
static_cast<const uint64_t *>(
|
||||
lwe_flattened_encryptions_of_zero_compact_array_in),
|
||||
(uint64_t **)(ksk), rerand_buffer);
|
||||
reinterpret_cast<uint64_t *const *>(ksk), rerand_buffer);
|
||||
break;
|
||||
default:
|
||||
PANIC("CUDA error: lwe_dimension not supported."
|
||||
@@ -87,12 +88,12 @@ void cuda_rerand_64_async(
|
||||
" in the interval [256..16384].");
|
||||
break;
|
||||
}
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
void cleanup_cuda_rerand_64(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
|
||||
PUSH_RANGE("cleanup rerand")
|
||||
int_rerand_mem<uint64_t> *mem_ptr =
|
||||
(int_rerand_mem<uint64_t> *)(*mem_ptr_void);
|
||||
auto *mem_ptr = reinterpret_cast<int_rerand_mem<uint64_t> *>(*mem_ptr_void);
|
||||
mem_ptr->release(CudaStreams(streams));
|
||||
delete mem_ptr;
|
||||
*mem_ptr_void = nullptr;
|
||||
|
||||
@@ -14,26 +14,25 @@ void host_rerand_inplace(
|
||||
CudaStreams const streams, Torus *lwe_array,
|
||||
const Torus *lwe_flattened_encryptions_of_zero_compact_array_in,
|
||||
Torus *const *ksk, int_rerand_mem<Torus> *mem_ptr) {
|
||||
auto zero_lwes = mem_ptr->tmp_zero_lwes;
|
||||
auto rerand_mode = mem_ptr->rerand_mode;
|
||||
auto expanded_zero_lwes = mem_ptr->tmp_expanded_zero_lwes;
|
||||
auto num_lwes = mem_ptr->num_lwes;
|
||||
auto ksed_zero_lwes = mem_ptr->tmp_ksed_zero_lwes;
|
||||
auto lwe_trivial_indexes = mem_ptr->lwe_trivial_indexes;
|
||||
auto ksk_params = mem_ptr->params;
|
||||
auto output_dimension = ksk_params.small_lwe_dimension;
|
||||
auto input_dimension = ksk_params.big_lwe_dimension;
|
||||
auto ks_level = ksk_params.ks_level;
|
||||
auto ks_base_log = ksk_params.ks_base_log;
|
||||
auto message_modulus = ksk_params.message_modulus;
|
||||
auto carry_modulus = ksk_params.carry_modulus;
|
||||
|
||||
GPU_ASSERT(sizeof(Torus) == 8,
|
||||
"Cuda error: expand is only supported on 64 bits");
|
||||
auto rerand_params = mem_ptr->params;
|
||||
auto message_modulus = rerand_params.message_modulus;
|
||||
auto carry_modulus = rerand_params.carry_modulus;
|
||||
auto input_dimension = rerand_params.big_lwe_dimension;
|
||||
// Default to input dimension; overridden to small_lwe_dimension in the KS
|
||||
// path
|
||||
auto output_dimension = input_dimension;
|
||||
|
||||
static_assert(sizeof(Torus) == 8, "expand is only supported on 64 bits");
|
||||
|
||||
// Expand encryptions of zero
|
||||
// Wraps the input into a flattened_compact_lwe_lists type
|
||||
auto compact_lwe_lists = flattened_compact_lwe_lists<Torus>(
|
||||
const_cast<Torus *>(lwe_flattened_encryptions_of_zero_compact_array_in),
|
||||
&num_lwes, (uint32_t)1, input_dimension);
|
||||
&num_lwes, static_cast<uint32_t>(1), input_dimension);
|
||||
auto h_expand_jobs = mem_ptr->h_expand_jobs;
|
||||
auto d_expand_jobs = mem_ptr->d_expand_jobs;
|
||||
|
||||
@@ -53,20 +52,30 @@ void host_rerand_inplace(
|
||||
streams.stream(0), streams.gpu_index(0), true);
|
||||
|
||||
host_lwe_expand<Torus, params>(streams.stream(0), streams.gpu_index(0),
|
||||
zero_lwes, d_expand_jobs, num_lwes);
|
||||
expanded_zero_lwes, d_expand_jobs, num_lwes);
|
||||
|
||||
// Keyswitch
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams.get_ith(0), ksed_zero_lwes, lwe_trivial_indexes, zero_lwes,
|
||||
lwe_trivial_indexes, ksk, input_dimension, output_dimension, ks_base_log,
|
||||
ks_level, num_lwes, true, mem_ptr->ks_tmp_buf_vec);
|
||||
auto lwes_to_be_added = expanded_zero_lwes;
|
||||
if (rerand_mode == RERAND_MODE::RERAND_WITH_KS) {
|
||||
lwes_to_be_added = mem_ptr->tmp_ksed_expanded_zero_lwes;
|
||||
output_dimension = rerand_params.small_lwe_dimension;
|
||||
auto ks_level = rerand_params.ks_level;
|
||||
auto ks_base_log = rerand_params.ks_base_log;
|
||||
auto lwe_trivial_indexes = mem_ptr->lwe_trivial_indexes;
|
||||
|
||||
// Keyswitch
|
||||
execute_keyswitch_async<Torus>(streams.get_ith(0), lwes_to_be_added,
|
||||
lwe_trivial_indexes, expanded_zero_lwes,
|
||||
lwe_trivial_indexes, ksk, input_dimension,
|
||||
output_dimension, ks_base_log, ks_level,
|
||||
num_lwes, true, mem_ptr->ks_tmp_buf_vec);
|
||||
}
|
||||
|
||||
// Add ks output to ct
|
||||
// Check sizes
|
||||
CudaRadixCiphertextFFI lwes_ffi;
|
||||
into_radix_ciphertext(&lwes_ffi, lwe_array, num_lwes, output_dimension);
|
||||
CudaRadixCiphertextFFI ksed_zero_lwes_ffi;
|
||||
into_radix_ciphertext(&ksed_zero_lwes_ffi, ksed_zero_lwes, num_lwes,
|
||||
into_radix_ciphertext(&ksed_zero_lwes_ffi, lwes_to_be_added, num_lwes,
|
||||
output_dimension);
|
||||
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), &lwes_ffi,
|
||||
&lwes_ffi, &ksed_zero_lwes_ffi, num_lwes,
|
||||
@@ -81,10 +90,11 @@ __host__ uint64_t scratch_cuda_rerand(CudaStreams streams,
|
||||
int_rerand_mem<Torus> **mem_ptr,
|
||||
uint32_t num_lwes,
|
||||
int_radix_params params,
|
||||
bool allocate_gpu_memory) {
|
||||
bool allocate_gpu_memory,
|
||||
RERAND_MODE rerand_mode) {
|
||||
|
||||
uint64_t size_tracker = 0;
|
||||
*mem_ptr = new int_rerand_mem<Torus>(streams, params, num_lwes,
|
||||
*mem_ptr = new int_rerand_mem<Torus>(streams, params, num_lwes, rerand_mode,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
#include "integer/scalar_bitops.cuh"
|
||||
|
||||
void cuda_integer_scalar_bitop_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
|
||||
void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
|
||||
void cuda_integer_scalar_bitop_inplace_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_inout,
|
||||
void const *clear_blocks, void const *h_clear_blocks,
|
||||
uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
// In-place variant: lwe_array_inout op= scalar, no aliasing check needed
|
||||
host_scalar_bitop<uint64_t>(
|
||||
CudaStreams(streams), lwe_array_out, lwe_array_input,
|
||||
CudaStreams(streams), lwe_array_inout, lwe_array_inout,
|
||||
static_cast<const uint64_t *>(clear_blocks),
|
||||
static_cast<const uint64_t *>(h_clear_blocks), num_clear_blocks,
|
||||
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks));
|
||||
|
||||
@@ -36,6 +36,9 @@ void cuda_integer_scalar_comparison_64_async(
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, void const *scalar_blocks,
|
||||
void const *h_scalar_blocks, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks, uint32_t num_scalar_blocks) {
|
||||
PANIC_IF_FALSE(lwe_array_out != lwe_array_in,
|
||||
"Output and input pointers must be different for out-of-place "
|
||||
"operations");
|
||||
|
||||
// The output ciphertext might be a boolean block or a radix ciphertext
|
||||
// depending on the case (eq/gt vs max/min) so the amount of blocks to
|
||||
|
||||
@@ -118,6 +118,9 @@ void cuda_integer_unsigned_scalar_div_rem_radix_64_async(
|
||||
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
|
||||
void const *clear_blocks, void const *h_clear_blocks,
|
||||
uint32_t num_clear_blocks) {
|
||||
PANIC_IF_FALSE(quotient_ct != remainder_ct,
|
||||
"Quotient and remainder pointers must be different for "
|
||||
"out-of-place operations");
|
||||
|
||||
host_integer_unsigned_scalar_div_rem_radix<uint64_t>(
|
||||
CudaStreams(streams), quotient_ct, remainder_ct,
|
||||
@@ -168,6 +171,9 @@ void cuda_integer_signed_scalar_div_rem_radix_64_async(
|
||||
uint64_t const *divisor_has_at_least_one_set,
|
||||
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
|
||||
uint32_t numerator_bits) {
|
||||
PANIC_IF_FALSE(quotient_ct != remainder_ct,
|
||||
"Quotient and remainder pointers must be different for "
|
||||
"out-of-place operations");
|
||||
|
||||
host_integer_signed_scalar_div_rem_radix<uint64_t>(
|
||||
CudaStreams(streams), quotient_ct, remainder_ct,
|
||||
|
||||
@@ -25,6 +25,12 @@ void cuda_unchecked_all_eq_slices_64_async(
|
||||
CudaRadixCiphertextFFI const *lhs, CudaRadixCiphertextFFI const *rhs,
|
||||
uint32_t num_inputs, uint32_t num_blocks, int8_t *mem, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
PANIC_IF_FALSE(match_ct != lhs,
|
||||
"Output and first input pointers must be different for "
|
||||
"out-of-place operations");
|
||||
PANIC_IF_FALSE(match_ct != rhs,
|
||||
"Output and second input pointers must be different for "
|
||||
"out-of-place operations");
|
||||
|
||||
host_unchecked_all_eq_slices<uint64_t>(
|
||||
CudaStreams(streams), match_ct, lhs, rhs, num_inputs, num_blocks,
|
||||
@@ -68,6 +74,12 @@ void cuda_unchecked_contains_sub_slice_64_async(
|
||||
CudaRadixCiphertextFFI const *lhs, CudaRadixCiphertextFFI const *rhs,
|
||||
uint32_t num_rhs, uint32_t num_blocks, int8_t *mem, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
PANIC_IF_FALSE(match_ct != lhs,
|
||||
"Output and first input pointers must be different for "
|
||||
"out-of-place operations");
|
||||
PANIC_IF_FALSE(match_ct != rhs,
|
||||
"Output and second input pointers must be different for "
|
||||
"out-of-place operations");
|
||||
|
||||
host_unchecked_contains_sub_slice<uint64_t>(
|
||||
CudaStreams(streams), match_ct, lhs, rhs, num_rhs, num_blocks,
|
||||
|
||||
@@ -27,6 +27,15 @@ void cuda_unchecked_match_value_64_async(
|
||||
CudaRadixCiphertextFFI const *lwe_array_in_ct,
|
||||
const uint64_t *h_match_inputs, const uint64_t *h_match_outputs,
|
||||
int8_t *mem, void *const *bsks, void *const *ksks) {
|
||||
PANIC_IF_FALSE(lwe_array_out_result != lwe_array_in_ct,
|
||||
"Output result and input pointers must be different for "
|
||||
"out-of-place operations");
|
||||
PANIC_IF_FALSE(lwe_array_out_boolean != lwe_array_in_ct,
|
||||
"Output boolean and input pointers must be different for "
|
||||
"out-of-place operations");
|
||||
PANIC_IF_FALSE(lwe_array_out_result != lwe_array_out_boolean,
|
||||
"Result and boolean output pointers must be different for "
|
||||
"out-of-place operations");
|
||||
|
||||
host_unchecked_match_value<uint64_t>(
|
||||
CudaStreams(streams), lwe_array_out_result, lwe_array_out_boolean,
|
||||
@@ -75,6 +84,9 @@ void cuda_unchecked_match_value_or_64_async(
|
||||
const uint64_t *h_match_inputs, const uint64_t *h_match_outputs,
|
||||
const uint64_t *h_or_value, int8_t *mem, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
PANIC_IF_FALSE(lwe_array_out != lwe_array_in_ct,
|
||||
"Output and input pointers must be different for out-of-place "
|
||||
"operations");
|
||||
|
||||
host_unchecked_match_value_or<uint64_t>(
|
||||
CudaStreams(streams), lwe_array_out, lwe_array_in_ct, h_match_inputs,
|
||||
@@ -120,6 +132,12 @@ void cuda_unchecked_contains_64_async(CudaStreamsFFI streams,
|
||||
uint32_t num_inputs, uint32_t num_blocks,
|
||||
int8_t *mem, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
PANIC_IF_FALSE(output != inputs,
|
||||
"Output and first input pointers must be different for "
|
||||
"out-of-place operations");
|
||||
PANIC_IF_FALSE(output != value,
|
||||
"Output and second input pointers must be different for "
|
||||
"out-of-place operations");
|
||||
|
||||
host_unchecked_contains<uint64_t>(
|
||||
CudaStreams(streams), output, inputs, value, num_inputs, num_blocks,
|
||||
@@ -163,6 +181,8 @@ void cuda_unchecked_contains_clear_64_async(
|
||||
CudaRadixCiphertextFFI const *inputs, const uint64_t *h_clear_val,
|
||||
uint32_t num_inputs, uint32_t num_blocks, int8_t *mem, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
PANIC_IF_FALSE(output != inputs, "Output and input pointers must be "
|
||||
"different for out-of-place operations");
|
||||
|
||||
host_unchecked_contains_clear<uint64_t>(
|
||||
CudaStreams(streams), output, inputs, h_clear_val, num_inputs, num_blocks,
|
||||
@@ -206,6 +226,8 @@ void cuda_unchecked_is_in_clears_64_async(
|
||||
CudaRadixCiphertextFFI const *input, const uint64_t *h_cleartexts,
|
||||
uint32_t num_clears, uint32_t num_blocks, int8_t *mem, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
PANIC_IF_FALSE(output != input, "Output and input pointers must be different "
|
||||
"for out-of-place operations");
|
||||
|
||||
host_unchecked_is_in_clears<uint64_t>(
|
||||
CudaStreams(streams), output, input, h_cleartexts, num_clears, num_blocks,
|
||||
@@ -250,6 +272,13 @@ void cuda_unchecked_index_in_clears_64_async(
|
||||
const uint64_t *h_cleartexts, uint32_t num_clears, uint32_t num_blocks,
|
||||
uint32_t num_blocks_index, int8_t *mem, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
PANIC_IF_FALSE(index_ct != input, "Output and input pointers must be "
|
||||
"different for out-of-place operations");
|
||||
PANIC_IF_FALSE(match_ct != input, "Output and input pointers must be "
|
||||
"different for out-of-place operations");
|
||||
PANIC_IF_FALSE(index_ct != match_ct,
|
||||
"Index and match output pointers must be different for "
|
||||
"out-of-place operations");
|
||||
|
||||
host_unchecked_index_in_clears<uint64_t>(
|
||||
CudaStreams(streams), index_ct, match_ct, input, h_cleartexts, num_clears,
|
||||
@@ -295,6 +324,13 @@ void cuda_unchecked_first_index_in_clears_64_async(
|
||||
const uint64_t *h_unique_values, const uint64_t *h_unique_indices,
|
||||
uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
int8_t *mem, void *const *bsks, void *const *ksks) {
|
||||
PANIC_IF_FALSE(index_ct != input, "Output and input pointers must be "
|
||||
"different for out-of-place operations");
|
||||
PANIC_IF_FALSE(match_ct != input, "Output and input pointers must be "
|
||||
"different for out-of-place operations");
|
||||
PANIC_IF_FALSE(index_ct != match_ct,
|
||||
"Index and match output pointers must be different for "
|
||||
"out-of-place operations");
|
||||
|
||||
host_unchecked_first_index_in_clears<uint64_t>(
|
||||
CudaStreams(streams), index_ct, match_ct, input, h_unique_values,
|
||||
@@ -340,6 +376,13 @@ void cuda_unchecked_first_index_of_clear_64_async(
|
||||
const uint64_t *h_clear_val, uint32_t num_inputs, uint32_t num_blocks,
|
||||
uint32_t num_blocks_index, int8_t *mem, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
PANIC_IF_FALSE(index_ct != inputs, "Output and input pointers must be "
|
||||
"different for out-of-place operations");
|
||||
PANIC_IF_FALSE(match_ct != inputs, "Output and input pointers must be "
|
||||
"different for out-of-place operations");
|
||||
PANIC_IF_FALSE(index_ct != match_ct,
|
||||
"Index and match output pointers must be different for "
|
||||
"out-of-place operations");
|
||||
|
||||
host_unchecked_first_index_of_clear<uint64_t>(
|
||||
CudaStreams(streams), index_ct, match_ct, inputs, h_clear_val, num_inputs,
|
||||
@@ -385,6 +428,13 @@ void cuda_unchecked_first_index_of_64_async(
|
||||
CudaRadixCiphertextFFI const *value, uint32_t num_inputs,
|
||||
uint32_t num_blocks, uint32_t num_blocks_index, int8_t *mem,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
PANIC_IF_FALSE(index_ct != inputs, "Output and input pointers must be "
|
||||
"different for out-of-place operations");
|
||||
PANIC_IF_FALSE(match_ct != inputs, "Output and input pointers must be "
|
||||
"different for out-of-place operations");
|
||||
PANIC_IF_FALSE(index_ct != match_ct,
|
||||
"Index and match output pointers must be different for "
|
||||
"out-of-place operations");
|
||||
|
||||
host_unchecked_first_index_of<uint64_t>(
|
||||
CudaStreams(streams), index_ct, match_ct, inputs, value, num_inputs,
|
||||
@@ -431,6 +481,13 @@ void cuda_unchecked_index_of_64_async(CudaStreamsFFI streams,
|
||||
uint32_t num_inputs, uint32_t num_blocks,
|
||||
uint32_t num_blocks_index, int8_t *mem,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
PANIC_IF_FALSE(index_ct != inputs, "Output and input pointers must be "
|
||||
"different for out-of-place operations");
|
||||
PANIC_IF_FALSE(match_ct != inputs, "Output and input pointers must be "
|
||||
"different for out-of-place operations");
|
||||
PANIC_IF_FALSE(index_ct != match_ct,
|
||||
"Index and match output pointers must be different for "
|
||||
"out-of-place operations");
|
||||
|
||||
host_unchecked_index_of<uint64_t>(
|
||||
CudaStreams(streams), index_ct, match_ct, inputs, value, num_inputs,
|
||||
@@ -477,6 +534,13 @@ void cuda_unchecked_index_of_clear_64_async(
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_scalar_blocks,
|
||||
uint32_t num_blocks_index, int8_t *mem, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
PANIC_IF_FALSE(index_ct != inputs, "Output and input pointers must be "
|
||||
"different for out-of-place operations");
|
||||
PANIC_IF_FALSE(match_ct != inputs, "Output and input pointers must be "
|
||||
"different for out-of-place operations");
|
||||
PANIC_IF_FALSE(index_ct != match_ct,
|
||||
"Index and match output pointers must be different for "
|
||||
"out-of-place operations");
|
||||
|
||||
host_unchecked_index_of_clear<uint64_t>(
|
||||
CudaStreams(streams), index_ct, match_ct, inputs,
|
||||
|
||||
@@ -5,6 +5,12 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
|
||||
CudaRadixCiphertextFFI *output,
|
||||
CudaRadixCiphertextFFI const *input_1,
|
||||
CudaRadixCiphertextFFI const *input_2) {
|
||||
PANIC_IF_FALSE(output != input_1,
|
||||
"Output and first input pointers must be different for "
|
||||
"out-of-place operations");
|
||||
PANIC_IF_FALSE(output != input_2,
|
||||
"Output and second input pointers must be different for "
|
||||
"out-of-place operations");
|
||||
|
||||
if (output->num_radix_blocks != input_1->num_radix_blocks ||
|
||||
output->num_radix_blocks != input_2->num_radix_blocks)
|
||||
@@ -44,6 +50,12 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
|
||||
CudaRadixCiphertextFFI *output,
|
||||
CudaRadixCiphertextFFI const *input_1,
|
||||
CudaRadixCiphertextFFI const *input_2) {
|
||||
PANIC_IF_FALSE(output != input_1,
|
||||
"Output and first input pointers must be different for "
|
||||
"out-of-place operations");
|
||||
PANIC_IF_FALSE(output != input_2,
|
||||
"Output and second input pointers must be different for "
|
||||
"out-of-place operations");
|
||||
|
||||
if (output->num_radix_blocks != input_1->num_radix_blocks ||
|
||||
output->num_radix_blocks != input_2->num_radix_blocks)
|
||||
@@ -53,6 +65,30 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
|
||||
cuda_synchronize_stream(static_cast<cudaStream_t>(stream), gpu_index);
|
||||
}
|
||||
|
||||
void cuda_add_lwe_ciphertext_vector_inplace_32(
|
||||
void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_inout,
|
||||
CudaRadixCiphertextFFI const *input_2) {
|
||||
// In-place variant: lwe_array_inout += input_2, no aliasing check needed
|
||||
if (lwe_array_inout->num_radix_blocks != input_2->num_radix_blocks)
|
||||
PANIC("Cuda error: input and output num radix blocks must be the same")
|
||||
host_addition<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
lwe_array_inout, lwe_array_inout, input_2,
|
||||
lwe_array_inout->num_radix_blocks, 0, 0);
|
||||
cuda_synchronize_stream(static_cast<cudaStream_t>(stream), gpu_index);
|
||||
}
|
||||
|
||||
void cuda_add_lwe_ciphertext_vector_inplace_64(
|
||||
void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_inout,
|
||||
CudaRadixCiphertextFFI const *input_2) {
|
||||
// In-place variant: lwe_array_inout += input_2, no aliasing check needed
|
||||
if (lwe_array_inout->num_radix_blocks != input_2->num_radix_blocks)
|
||||
PANIC("Cuda error: input and output num radix blocks must be the same")
|
||||
host_addition<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
lwe_array_inout, lwe_array_inout, input_2,
|
||||
lwe_array_inout->num_radix_blocks, 0, 0);
|
||||
cuda_synchronize_stream(static_cast<cudaStream_t>(stream), gpu_index);
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform the addition of a u32 input LWE ciphertext vector with a u32
|
||||
* plaintext vector. See the equivalent operation on u64 data for more details.
|
||||
@@ -62,6 +98,9 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
|
||||
void const *lwe_array_in, void const *plaintext_array_in,
|
||||
const uint32_t input_lwe_dimension,
|
||||
const uint32_t input_lwe_ciphertext_count) {
|
||||
PANIC_IF_FALSE(lwe_array_out != lwe_array_in,
|
||||
"Output and input pointers must be different for out-of-place "
|
||||
"operations");
|
||||
|
||||
host_addition_plaintext<uint32_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
@@ -104,6 +143,9 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
|
||||
void const *lwe_array_in, void const *plaintext_array_in,
|
||||
const uint32_t input_lwe_dimension,
|
||||
const uint32_t input_lwe_ciphertext_count) {
|
||||
PANIC_IF_FALSE(lwe_array_out != lwe_array_in,
|
||||
"Output and input pointers must be different for out-of-place "
|
||||
"operations");
|
||||
|
||||
host_addition_plaintext<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
@@ -144,6 +186,9 @@ void cuda_add_lwe_ciphertext_vector_plaintext_64(
|
||||
void const *lwe_array_in, const uint64_t plaintext_in,
|
||||
const uint32_t input_lwe_dimension,
|
||||
const uint32_t input_lwe_ciphertext_count) {
|
||||
PANIC_IF_FALSE(lwe_array_out != lwe_array_in,
|
||||
"Output and input pointers must be different for out-of-place "
|
||||
"operations");
|
||||
|
||||
host_addition_plaintext_scalar<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
|
||||
@@ -10,6 +10,9 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
|
||||
void const *lwe_array_in, void const *cleartext_array_in,
|
||||
const uint32_t input_lwe_dimension,
|
||||
const uint32_t input_lwe_ciphertext_count) {
|
||||
PANIC_IF_FALSE(lwe_array_out != lwe_array_in,
|
||||
"Output and input pointers must be different for out-of-place "
|
||||
"operations");
|
||||
|
||||
host_cleartext_vec_multiplication<uint32_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
@@ -52,6 +55,9 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
|
||||
void const *lwe_array_in, void const *cleartext_array_in,
|
||||
const uint32_t input_lwe_dimension,
|
||||
const uint32_t input_lwe_ciphertext_count) {
|
||||
PANIC_IF_FALSE(lwe_array_out != lwe_array_in,
|
||||
"Output and input pointers must be different for out-of-place "
|
||||
"operations");
|
||||
|
||||
host_cleartext_vec_multiplication<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
@@ -80,6 +86,12 @@ void cuda_wrapping_polynomial_mul_one_to_many_64_async(
|
||||
void *stream, uint32_t gpu_index, void *result, void const *poly_lhs,
|
||||
int8_t *circulant, void const *poly_rhs, uint32_t polynomial_size,
|
||||
uint32_t n_rhs) {
|
||||
PANIC_IF_FALSE(result != poly_lhs,
|
||||
"Output and left input pointers must be different for "
|
||||
"out-of-place operations");
|
||||
PANIC_IF_FALSE(result != poly_rhs,
|
||||
"Output and right input pointers must be different for "
|
||||
"out-of-place operations");
|
||||
|
||||
host_wrapping_polynomial_mul_one_to_many<uint64_t, ulonglong4>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
@@ -92,6 +104,13 @@ void cuda_glwe_wrapping_polynomial_mul_one_to_many_64_async(
|
||||
void *stream, uint32_t gpu_index, void *result, void const *glwe_lhs,
|
||||
int8_t *circulant, void const *poly_rhs, uint32_t polynomial_size,
|
||||
uint32_t glwe_dimension, uint32_t n_rhs) {
|
||||
PANIC_IF_FALSE(result != glwe_lhs,
|
||||
"Output and left input pointers must be different for "
|
||||
"out-of-place operations");
|
||||
PANIC_IF_FALSE(result != poly_rhs,
|
||||
"Output and right input pointers must be different for "
|
||||
"out-of-place operations");
|
||||
|
||||
host_glwe_wrapping_polynomial_mul_one_to_many<uint64_t, ulonglong4>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(result), static_cast<uint64_t const *>(glwe_lhs),
|
||||
|
||||
@@ -8,6 +8,9 @@ void cuda_negate_lwe_ciphertext_vector_32(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_array_in, const uint32_t input_lwe_dimension,
|
||||
const uint32_t input_lwe_ciphertext_count) {
|
||||
PANIC_IF_FALSE(lwe_array_out != lwe_array_in,
|
||||
"Output and input pointers must be different for out-of-place "
|
||||
"operations");
|
||||
|
||||
host_negation<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint32_t *>(lwe_array_out),
|
||||
@@ -42,6 +45,9 @@ void cuda_negate_lwe_ciphertext_vector_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_array_in, const uint32_t input_lwe_dimension,
|
||||
const uint32_t input_lwe_ciphertext_count) {
|
||||
PANIC_IF_FALSE(lwe_array_out != lwe_array_in,
|
||||
"Output and input pointers must be different for out-of-place "
|
||||
"operations");
|
||||
|
||||
host_negation<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
|
||||
@@ -373,7 +373,8 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
|
||||
|
||||
// Get the number of streaming multiprocessors
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
check_cuda_error(
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
|
||||
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
|
||||
}
|
||||
|
||||
|
||||
@@ -420,6 +420,39 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
|
||||
}
|
||||
}
|
||||
|
||||
// Noise tests variant: identical to host_cg_multi_bit_programmable_bootstrap
|
||||
// but uses NOISE_TESTS keybundle mode.
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_cg_multi_bit_programmable_bootstrap_noise_tests(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, uint64_t const *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
auto lwe_chunk_size = buffer->lwe_chunk_size;
|
||||
|
||||
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
|
||||
lwe_offset += lwe_chunk_size) {
|
||||
|
||||
// Compute a keybundle with NOISE_TESTS mode instead of GENERIC
|
||||
execute_compute_keybundle_noise_tests<Torus, params>(
|
||||
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, level_count, lwe_offset);
|
||||
|
||||
execute_cg_external_product_loop<Torus, params>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
|
||||
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
|
||||
lut_stride);
|
||||
}
|
||||
}
|
||||
|
||||
// Verify if the grid size satisfies the cooperative group constraints
|
||||
template <typename Torus, class params>
|
||||
__host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
|
||||
@@ -484,7 +517,8 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
|
||||
|
||||
// Get the number of streaming multiprocessors
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
check_cuda_error(
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
|
||||
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
|
||||
}
|
||||
|
||||
|
||||
@@ -784,9 +784,9 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc_128(
|
||||
device_programmable_bootstrap_tbc_128<InputTorus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
full_sm)); // full_sm + minimum_sm_tbc));
|
||||
cudaFuncSetCacheConfig(
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_tbc_128<InputTorus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
cudaFuncCachePreferShared));
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_tbc_128<InputTorus, params, FULLSM>,
|
||||
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
|
||||
@@ -1271,7 +1271,8 @@ __host__ bool verify_cuda_programmable_bootstrap_128_cg_grid_size(
|
||||
|
||||
// Get the number of streaming multiprocessors
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
check_cuda_error(
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
|
||||
|
||||
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
|
||||
}
|
||||
|
||||
@@ -645,6 +645,103 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_64(void *stream,
|
||||
*buffer = nullptr;
|
||||
}
|
||||
|
||||
// Noise-tests-namespaced wrappers: delegate to the standard scratch/cleanup so
|
||||
// that callers using the noise-tests PBS variant have a consistent API.
|
||||
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
return scratch_cuda_multi_bit_programmable_bootstrap_64_async(
|
||||
stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer) {
|
||||
cleanup_cuda_multi_bit_programmable_bootstrap_64(stream, gpu_index,
|
||||
pbs_buffer);
|
||||
}
|
||||
|
||||
// Noise tests variant of the 64-bit multi-bit PBS, restricted to
|
||||
// polynomial_size=2048. The main difference is that the input
|
||||
// is assumed to be modulus switched before bootstrapping.
|
||||
void cuda_multi_bit_programmable_bootstrap_noise_tests_64_async(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lut_vector_indexes, void const *lwe_array_in,
|
||||
void const *lwe_input_indexes, void const *bootstrapping_key,
|
||||
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
|
||||
uint32_t lut_stride) {
|
||||
|
||||
PANIC_IF_FALSE(num_samples == 1,
|
||||
"Cuda error (multi-bit PBS): num_samples (%d) should be 1",
|
||||
num_samples);
|
||||
|
||||
PANIC_IF_FALSE(base_log <= 64,
|
||||
"Cuda error (multi-bit PBS): base log (%d) should be <= 64",
|
||||
base_log);
|
||||
PANIC_IF_FALSE(polynomial_size == 2048,
|
||||
"Cuda error (multi-bit PBS noise tests): only polynomial "
|
||||
"size 2048 is supported, got %d.",
|
||||
polynomial_size);
|
||||
|
||||
pbs_buffer<uint64_t, MULTI_BIT> *buffer =
|
||||
(pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
|
||||
|
||||
switch (buffer->pbs_variant) {
|
||||
case PBS_VARIANT::TBC:
|
||||
#if CUDA_ARCH >= 900
|
||||
{
|
||||
host_tbc_multi_bit_programmable_bootstrap_noise_tests<uint64_t,
|
||||
Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lut_vector_indexes),
|
||||
static_cast<const uint64_t *>(lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const uint64_t *>(bootstrapping_key), buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_many_lut, lut_stride);
|
||||
} break;
|
||||
#else
|
||||
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
|
||||
#endif
|
||||
case PBS_VARIANT::CG:
|
||||
host_cg_multi_bit_programmable_bootstrap_noise_tests<uint64_t,
|
||||
Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lut_vector_indexes),
|
||||
static_cast<const uint64_t *>(lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const uint64_t *>(bootstrapping_key), buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case PBS_VARIANT::DEFAULT:
|
||||
host_multi_bit_programmable_bootstrap_noise_tests<uint64_t, Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lut_vector_indexes),
|
||||
static_cast<const uint64_t *>(lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const uint64_t *>(bootstrapping_key), buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes divisors of the product of num_sms (streaming multiprocessors on the
|
||||
* GPU) and max_blocks_per_sm (maximum active blocks per SM to launch
|
||||
|
||||
@@ -25,7 +25,8 @@ get_start_ith_ggsw_offset(uint32_t polynomial_size, int glwe_dimension,
|
||||
level_count;
|
||||
}
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
template <typename Torus, class params, sharedMemDegree SMD,
|
||||
bool runs_noise_test = false>
|
||||
__global__ void device_multi_bit_programmable_bootstrap_keybundle(
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
|
||||
@@ -55,9 +56,6 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
|
||||
|
||||
if (lwe_iteration < (lwe_dimension / grouping_factor)) {
|
||||
|
||||
const Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
|
||||
|
||||
double2 *keybundle = keybundle_array +
|
||||
// select the input
|
||||
input_idx * keybundle_size_per_input;
|
||||
@@ -86,10 +84,40 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
|
||||
// Precalculate the monomial degrees and store them in shared memory
|
||||
uint32_t *monomial_degrees = (uint32_t *)selected_memory;
|
||||
if (threadIdx.x < (1 << grouping_factor)) {
|
||||
const Torus *lwe_array_group =
|
||||
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
|
||||
monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
|
||||
lwe_array_group, threadIdx.x, grouping_factor);
|
||||
if constexpr (runs_noise_test == true) {
|
||||
// For noise tests the input array contains the input lwe but also the
|
||||
// modswitched results. This allows to avoid changing the accumulation
|
||||
// kernel for the noise tests since the input body will stay in the same
|
||||
// position. The layout of the input array is the following:
|
||||
// | input lwe | modswitched inputs |
|
||||
// | lwe size | lwe_size*grouping_factor |
|
||||
|
||||
// This offset allows to jump directly to the modswitched inputs,
|
||||
// skipping the input lwe
|
||||
const Torus modswitched_offset = lwe_dimension + 1;
|
||||
|
||||
const Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[input_idx] *
|
||||
(lwe_dimension / grouping_factor) *
|
||||
(1 << grouping_factor) +
|
||||
modswitched_offset];
|
||||
|
||||
const Torus *lwe_array_group =
|
||||
block_lwe_array_in + rev_lwe_iteration * (1 << grouping_factor);
|
||||
monomial_degrees[threadIdx.x] = lwe_array_group[threadIdx.x];
|
||||
|
||||
} else {
|
||||
// In production we calculate the monomial degrees on the fly, since
|
||||
// they are not stored in the input array.
|
||||
const Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
|
||||
|
||||
const Torus *lwe_array_group =
|
||||
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
|
||||
monomial_degrees[threadIdx.x] =
|
||||
calculates_monomial_degree<Torus, params>(
|
||||
lwe_array_group, threadIdx.x, grouping_factor);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
@@ -145,7 +173,8 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
|
||||
// Then we can just calculate the offset needed to apply this coefficients, and
|
||||
// the operation transforms into a pointwise vector multiplication, avoiding to
|
||||
// perform extra instructions other than MADD
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
template <typename Torus, class params, sharedMemDegree SMD,
|
||||
bool runs_noise_test = false>
|
||||
__global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
|
||||
@@ -219,10 +248,40 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
|
||||
uint32_t *monomial_degrees = (uint32_t *)selected_memory;
|
||||
|
||||
if (threadIdx.x < (1 << grouping_factor)) {
|
||||
const Torus *lwe_array_group =
|
||||
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
|
||||
monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
|
||||
lwe_array_group, threadIdx.x, grouping_factor);
|
||||
if constexpr (runs_noise_test == true) {
|
||||
// For noise tests the input array contains the input lwe but also the
|
||||
// modswitched results. This allows to avoid changing the accumulation
|
||||
// kernel for the noise tests since the input body will stay in the same
|
||||
// position. The layout of the input array is the following:
|
||||
// | input lwe | modswitched inputs |
|
||||
// | lwe size | lwe_size*grouping_factor |
|
||||
|
||||
// This offset allows to jump directly to the modswitched inputs,
|
||||
// skipping the input lwe
|
||||
const Torus modswitched_offset = lwe_dimension + 1;
|
||||
|
||||
const Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[input_idx] *
|
||||
(lwe_dimension / grouping_factor) *
|
||||
(1 << grouping_factor) +
|
||||
modswitched_offset];
|
||||
|
||||
const Torus *lwe_array_group =
|
||||
block_lwe_array_in + rev_lwe_iteration * (1 << grouping_factor);
|
||||
monomial_degrees[threadIdx.x] = lwe_array_group[threadIdx.x];
|
||||
|
||||
} else {
|
||||
// In production we calculate the monomial degrees on the fly, since
|
||||
// they are not stored in the input array.
|
||||
const Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
|
||||
|
||||
const Torus *lwe_array_group =
|
||||
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
|
||||
monomial_degrees[threadIdx.x] =
|
||||
calculates_monomial_degree<Torus, params>(
|
||||
lwe_array_group, threadIdx.x, grouping_factor);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
@@ -662,6 +721,7 @@ enum class MultiBitKeybundleLaunchMode {
|
||||
AUTO,
|
||||
GENERIC,
|
||||
SPECIALIZED_2_2,
|
||||
NOISE_TESTS,
|
||||
};
|
||||
|
||||
template <typename Torus, class params>
|
||||
@@ -726,30 +786,65 @@ __host__ void execute_compute_keybundle_with_mode(
|
||||
bool use_specialized =
|
||||
launch_mode == MultiBitKeybundleLaunchMode::SPECIALIZED_2_2 ||
|
||||
(launch_mode == MultiBitKeybundleLaunchMode::AUTO &&
|
||||
can_use_specialized) ||
|
||||
(launch_mode == MultiBitKeybundleLaunchMode::NOISE_TESTS &&
|
||||
can_use_specialized);
|
||||
bool use_noise_test_template =
|
||||
launch_mode == MultiBitKeybundleLaunchMode::NOISE_TESTS;
|
||||
if (use_specialized) {
|
||||
dim3 thds_new_keybundle(512, 1, 1);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, 3 * full_sm_keybundle));
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
check_cuda_error(cudaGetLastError());
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM><<<grid_keybundle, thds_new_keybundle,
|
||||
3 * full_sm_keybundle, stream>>>(
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
|
||||
lwe_dimension, lwe_offset, chunk_size, keybundle_size_per_input);
|
||||
if (use_noise_test_template) {
|
||||
// Set up the noise-test variant of the specialized 2_2 kernel
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM, true>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
3 * full_sm_keybundle));
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM, true>,
|
||||
cudaFuncCachePreferShared));
|
||||
check_cuda_error(cudaGetLastError());
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM, true>
|
||||
<<<grid_keybundle, thds_new_keybundle, 3 * full_sm_keybundle,
|
||||
stream>>>(lwe_array_in, lwe_input_indexes, keybundle_fft,
|
||||
bootstrapping_key, lwe_dimension, lwe_offset,
|
||||
chunk_size, keybundle_size_per_input);
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
3 * full_sm_keybundle));
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
check_cuda_error(cudaGetLastError());
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM><<<grid_keybundle, thds_new_keybundle,
|
||||
3 * full_sm_keybundle, stream>>>(
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
|
||||
lwe_dimension, lwe_offset, chunk_size, keybundle_size_per_input);
|
||||
}
|
||||
} else {
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM>
|
||||
<<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
|
||||
level_count, lwe_offset, chunk_size, keybundle_size_per_input,
|
||||
d_mem, 0);
|
||||
if (use_noise_test_template) {
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM,
|
||||
true>
|
||||
<<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft,
|
||||
bootstrapping_key, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, grouping_factor, level_count, lwe_offset,
|
||||
chunk_size, keybundle_size_per_input, d_mem, 0);
|
||||
} else {
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM>
|
||||
<<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft,
|
||||
bootstrapping_key, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, grouping_factor, level_count, lwe_offset,
|
||||
chunk_size, keybundle_size_per_input, d_mem, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
check_cuda_error(cudaGetLastError());
|
||||
@@ -796,6 +891,20 @@ __host__ void execute_compute_keybundle_2_2_specialized(
|
||||
grouping_factor, level_count, lwe_offset,
|
||||
MultiBitKeybundleLaunchMode::SPECIALIZED_2_2);
|
||||
}
|
||||
// Noise-test entry point for the keybundle computation. Every argument is
// forwarded untouched to execute_compute_keybundle_with_mode; the only thing
// this wrapper decides is the launch mode, which is pinned to
// MultiBitKeybundleLaunchMode::NOISE_TESTS.
template <typename Torus, class params>
__host__ void execute_compute_keybundle_noise_tests(
    cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
  const auto noise_mode = MultiBitKeybundleLaunchMode::NOISE_TESTS;

  execute_compute_keybundle_with_mode<Torus, params>(
      stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
      buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
      grouping_factor, level_count, lwe_offset, noise_mode);
}
|
||||
|
||||
template <typename Torus, class params, bool is_first_iter>
|
||||
__host__ void execute_step_one(
|
||||
@@ -955,4 +1064,62 @@ __host__ void host_multi_bit_programmable_bootstrap(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Multi-bit PBS driver used by the noise tests (step-one / step-two variant).
//
// The lwe_dimension / grouping_factor groups are processed in chunks of
// buffer->lwe_chunk_size. For each chunk the keybundle is computed with
// MultiBitKeybundleLaunchMode::NOISE_TESTS, then every group of the chunk goes
// through execute_step_one followed by execute_step_two. The boolean template
// argument of each step flags the very first group (step one) and the very
// last group (step two) of the whole bootstrap.
template <typename Torus, class params>
__host__ void host_multi_bit_programmable_bootstrap_noise_tests(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_many_lut, uint32_t lut_stride) {

  const auto chunk_stride = buffer->lwe_chunk_size;
  // Total number of key groups to walk through
  const uint32_t num_groups = lwe_dimension / grouping_factor;

  for (uint32_t lwe_offset = 0; lwe_offset < num_groups;
       lwe_offset += chunk_stride) {

    // Keybundle for this chunk, built by the noise-test kernel variant
    execute_compute_keybundle_with_mode<Torus, params>(
        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
        grouping_factor, level_count, lwe_offset,
        MultiBitKeybundleLaunchMode::NOISE_TESTS);

    // The last chunk may hold fewer groups than chunk_stride
    const uint32_t groups_in_chunk =
        std::min((uint32_t)chunk_stride, num_groups - lwe_offset);

    for (uint32_t j = 0; j < groups_in_chunk; j++) {
      // Position of the current group within the whole bootstrap
      const uint32_t group = j + lwe_offset;

      if (group == 0) {
        execute_step_one<Torus, params, true>(
            stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
            lwe_input_indexes, buffer, num_samples, lwe_dimension,
            glwe_dimension, polynomial_size, base_log, level_count);
      } else {
        execute_step_one<Torus, params, false>(
            stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
            lwe_input_indexes, buffer, num_samples, lwe_dimension,
            glwe_dimension, polynomial_size, base_log, level_count);
      }

      if (group + 1 == num_groups) {
        execute_step_two<Torus, params, true>(
            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
            num_samples, glwe_dimension, polynomial_size, level_count, j,
            num_many_lut, lut_stride);
      } else {
        execute_step_two<Torus, params, false>(
            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
            num_samples, glwe_dimension, polynomial_size, level_count, j,
            num_many_lut, lut_stride);
      }
    }
  }
}
|
||||
#endif // MULTIBIT_PBS_H
|
||||
|
||||
@@ -293,6 +293,81 @@ void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
|
||||
*buffer = nullptr;
|
||||
}
|
||||
|
||||
// Scratch entry point exposed under the noise-tests name for the 128-bit
// multi-bit PBS. It does no work of its own: all arguments are forwarded to
// scratch_cuda_multi_bit_programmable_bootstrap_128_async and that function's
// return value is handed back to the caller.
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
  uint64_t const scratch_result =
      scratch_cuda_multi_bit_programmable_bootstrap_128_async(
          stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
  return scratch_result;
}
|
||||
|
||||
// Cleanup entry point exposed under the noise-tests name for the 128-bit
// multi-bit PBS: runs cleanup_cuda_multi_bit_programmable_bootstrap_128 on the
// buffer, then synchronizes the given stream before returning.
void cleanup_cuda_multi_bit_programmable_bootstrap_noise_tests_128(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer) {
  cleanup_cuda_multi_bit_programmable_bootstrap_128(stream, gpu_index,
                                                    pbs_buffer);

  auto cuda_stream = static_cast<cudaStream_t>(stream);
  cuda_synchronize_stream(cuda_stream, gpu_index);
}
|
||||
|
||||
// Type-erased entry point of the 128-bit multi-bit PBS used by noise tests.
//
// Preconditions enforced below: num_samples == 1, base_log <= 64 and
// polynomial_size == 2048 (the host functions are instantiated with
// Degree<2048> only). mem_ptr is reinterpreted as a
// pbs_buffer_128<uint64_t, MULTI_BIT>, and its pbs_variant selects between the
// CG and the DEFAULT host implementation; any other variant panics.
//
// Output ciphertexts, the LUT and the bootstrapping key are viewed as
// __uint128_t; inputs and both index arrays are viewed as uint64_t.
void cuda_multi_bit_programmable_bootstrap_noise_tests_128_async(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lwe_array_in, void const *lwe_input_indexes,
    void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_many_lut, uint32_t lut_stride) {

  PANIC_IF_FALSE(num_samples == 1,
                 "Cuda error (multi-bit PBS): num_samples (%d) should be 1",
                 num_samples);
  PANIC_IF_FALSE(base_log <= 64,
                 "Cuda error (multi-bit PBS): base log (%d) should be <= 64",
                 base_log);
  PANIC_IF_FALSE(polynomial_size == 2048,
                 "Cuda error (multi-bit PBS128 noise tests): only polynomial "
                 "size 2048 is supported, got %d.",
                 polynomial_size);

  auto *buffer =
      reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(mem_ptr);

  // Give every type-erased argument its concrete type once, up front
  auto cuda_stream = static_cast<cudaStream_t>(stream);
  auto *typed_out = static_cast<__uint128_t *>(lwe_array_out);
  auto *typed_out_indexes = static_cast<const uint64_t *>(lwe_output_indexes);
  auto *typed_lut = static_cast<const __uint128_t *>(lut_vector);
  auto *typed_in = static_cast<const uint64_t *>(lwe_array_in);
  auto *typed_in_indexes = static_cast<const uint64_t *>(lwe_input_indexes);
  auto *typed_bsk = static_cast<const __uint128_t *>(bootstrapping_key);

  switch (buffer->pbs_variant) {
  case PBS_VARIANT::CG:
    host_cg_multi_bit_programmable_bootstrap_noise_tests_128<uint64_t,
                                                             Degree<2048>>(
        cuda_stream, gpu_index, typed_out, typed_out_indexes, typed_lut,
        typed_in, typed_in_indexes, typed_bsk, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_many_lut, lut_stride);
    break;
  case PBS_VARIANT::DEFAULT:
    host_multi_bit_programmable_bootstrap_noise_tests_128<uint64_t,
                                                          Degree<2048>>(
        cuda_stream, gpu_index, typed_out, typed_out_indexes, typed_lut,
        typed_in, typed_in_indexes, typed_bsk, buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_many_lut, lut_stride);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
  }
}
|
||||
|
||||
/**
|
||||
* Computes divisors of the product of num_sms (streaming multiprocessors on the
|
||||
* GPU) and max_blocks_per_sm (maximum active blocks per SM to launch
|
||||
|
||||
@@ -18,7 +18,8 @@ uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle(
|
||||
(size_t)2); // accumulator
|
||||
}
|
||||
|
||||
template <typename InputTorus, class params, sharedMemDegree SMD>
|
||||
template <typename InputTorus, class params, sharedMemDegree SMD,
|
||||
bool runs_noise_test = false>
|
||||
__global__ void device_multi_bit_programmable_bootstrap_keybundle_128(
|
||||
const InputTorus *__restrict__ lwe_array_in,
|
||||
const InputTorus *__restrict__ lwe_input_indexes, double *keybundle_array,
|
||||
@@ -80,11 +81,35 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_128(
|
||||
// Precalculate the monomial degrees and store them in shared memory
|
||||
uint32_t *monomial_degrees = (uint32_t *)selected_memory;
|
||||
if (threadIdx.x < (1 << grouping_factor)) {
|
||||
auto lwe_array_group =
|
||||
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
|
||||
monomial_degrees[threadIdx.x] =
|
||||
calculates_monomial_degree<InputTorus, params>(
|
||||
lwe_array_group, threadIdx.x, grouping_factor);
|
||||
if constexpr (runs_noise_test == true) {
|
||||
// For noise tests the input array contains the input lwe but also the
|
||||
// modswitched results. This allows to avoid changing the accumulation
|
||||
// kernel for the noise tests since the input body will stay in the same
|
||||
// position. The layout of the input array is the following:
|
||||
// | input lwe | modswitched inputs |
|
||||
// | lwe size | lwe_size*grouping_factor |
|
||||
|
||||
// This offset allows to jump directly to the modswitched inputs,
|
||||
// skipping the input lwe
|
||||
const InputTorus modswitched_offset = lwe_dimension + 1;
|
||||
|
||||
const InputTorus *block_lwe_array_in_noise =
|
||||
&lwe_array_in[lwe_input_indexes[input_idx] *
|
||||
(lwe_dimension / grouping_factor) *
|
||||
(1 << grouping_factor) +
|
||||
modswitched_offset];
|
||||
|
||||
const InputTorus *lwe_array_group =
|
||||
block_lwe_array_in_noise +
|
||||
rev_lwe_iteration * (1 << grouping_factor);
|
||||
monomial_degrees[threadIdx.x] = lwe_array_group[threadIdx.x];
|
||||
} else {
|
||||
auto lwe_array_group =
|
||||
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
|
||||
monomial_degrees[threadIdx.x] =
|
||||
calculates_monomial_degree<InputTorus, params>(
|
||||
lwe_array_group, threadIdx.x, grouping_factor);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
@@ -588,6 +613,74 @@ __host__ void execute_compute_keybundle_128(
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
// Keybundle launch for the 128-bit noise tests: always instantiates
// device_multi_bit_programmable_bootstrap_keybundle_128 with
// runs_noise_test=true.
//
// Launch layout:
//   grid  = (num_samples * chunk_size, (glwe_dimension + 1)^2, level_count)
//   block = polynomial_size / params::opt threads
// When the device's maximum shared memory is smaller than the full keybundle
// requirement the NOSM instantiation is launched with no dynamic shared
// memory and the required size is passed as the last kernel argument;
// otherwise the FULLSM instantiation is launched with the full amount of
// dynamic shared memory and 0 as the last kernel argument.
template <typename InputTorus, class params>
__host__ void execute_compute_keybundle_noise_tests_128(
    cudaStream_t stream, uint32_t gpu_index, InputTorus const *lwe_array_in,
    InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
    pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t num_samples,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
  cuda_set_device(gpu_index);

  auto lwe_chunk_size = buffer->lwe_chunk_size;
  // Number of groups handled by this launch (the last chunk may be shorter)
  uint64_t chunk_size = std::min(
      lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);

  const uint32_t glwe_size = glwe_dimension + 1;

  uint64_t keybundle_size_per_input = lwe_chunk_size * level_count * glwe_size *
                                      glwe_size * (polynomial_size / 2) * 4;

  uint64_t full_sm_keybundle =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle<
          __uint128_t>(polynomial_size);
  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

  auto d_mem = buffer->d_mem_keybundle;
  auto keybundle_fft = buffer->keybundle_fft;

  dim3 grid_keybundle(num_samples * chunk_size, glwe_size * glwe_size,
                      level_count);
  dim3 thds(polynomial_size / params::opt, 1, 1);

  if (max_shared_memory < full_sm_keybundle) {
    // Not enough shared memory: kernel variant without dynamic shared memory
    auto keybundle_kernel =
        device_multi_bit_programmable_bootstrap_keybundle_128<InputTorus,
                                                              params, NOSM,
                                                              true>;
    check_cuda_error(cudaFuncSetAttribute(
        keybundle_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
    check_cuda_error(
        cudaFuncSetCacheConfig(keybundle_kernel, cudaFuncCachePreferShared));
    keybundle_kernel<<<grid_keybundle, thds, 0, stream>>>(
        lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
        lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
        level_count, lwe_offset, chunk_size, keybundle_size_per_input, d_mem,
        full_sm_keybundle);
  } else {
    // Everything fits: kernel variant working fully in shared memory
    auto keybundle_kernel =
        device_multi_bit_programmable_bootstrap_keybundle_128<InputTorus,
                                                              params, FULLSM,
                                                              true>;
    check_cuda_error(cudaFuncSetAttribute(
        keybundle_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
        full_sm_keybundle));
    check_cuda_error(
        cudaFuncSetCacheConfig(keybundle_kernel, cudaFuncCachePreferShared));
    keybundle_kernel<<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
        lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
        lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
        level_count, lwe_offset, chunk_size, keybundle_size_per_input, d_mem,
        0);
  }
  // Surface launch-configuration errors from whichever kernel was launched
  check_cuda_error(cudaGetLastError());
}
|
||||
|
||||
template <typename InputTorus, class params, bool is_first_iter>
|
||||
__host__ void execute_step_one_128(
|
||||
cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
|
||||
@@ -1119,46 +1212,47 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128(
|
||||
int max_active_blocks_per_sm;
|
||||
|
||||
if (max_shared_memory < partial_sm_cg_accumulate) {
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
|
||||
Torus, params, NOSM>,
|
||||
thds, 0);
|
||||
thds, 0));
|
||||
} else if (max_shared_memory < full_sm_cg_accumulate) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
|
||||
PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
|
||||
cudaFuncSetCacheConfig(
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
|
||||
PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
cudaFuncCachePreferShared));
|
||||
check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
|
||||
Torus, params, PARTIALSM>,
|
||||
thds, partial_sm_cg_accumulate);
|
||||
thds, partial_sm_cg_accumulate));
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
|
||||
FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
|
||||
cudaFuncSetCacheConfig(
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
|
||||
FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
cudaFuncCachePreferShared));
|
||||
check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
|
||||
Torus, params, FULLSM>,
|
||||
thds, full_sm_cg_accumulate);
|
||||
thds, full_sm_cg_accumulate));
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
// Get the number of streaming multiprocessors
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
check_cuda_error(
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
|
||||
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
|
||||
}
|
||||
|
||||
@@ -1199,4 +1293,96 @@ supports_cooperative_groups_on_multibit_programmable_bootstrap_128(
|
||||
}
|
||||
}
|
||||
|
||||
// Cooperative-groups flavour of the 128-bit multi-bit PBS for noise tests.
//
// The lwe_dimension / grouping_factor groups are consumed in chunks of
// buffer->lwe_chunk_size. Each chunk first gets its keybundle from
// execute_compute_keybundle_noise_tests_128 (the runs_noise_test=true kernel),
// then is accumulated by execute_cg_external_product_loop_128.
template <typename InputTorus, class params>
__host__ void host_cg_multi_bit_programmable_bootstrap_noise_tests_128(
    cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
    InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
    InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
    __uint128_t const *bootstrapping_key,
    pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_many_lut, uint32_t lut_stride) {

  const auto chunk_stride = buffer->lwe_chunk_size;
  const uint32_t num_groups = lwe_dimension / grouping_factor;

  uint32_t lwe_offset = 0;
  while (lwe_offset < num_groups) {
    // Keybundle of the current chunk, read from the precomputed modswitched
    // values by the noise-test kernel
    execute_compute_keybundle_noise_tests_128<InputTorus, params>(
        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
        grouping_factor, level_count, lwe_offset);

    // Accumulation over the chunk
    execute_cg_external_product_loop_128<InputTorus, params>(
        stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
        lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension,
        glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        lwe_offset, num_many_lut, lut_stride);

    lwe_offset += chunk_stride;
  }
}
|
||||
|
||||
// Step-one / step-two flavour of the 128-bit multi-bit PBS for noise tests.
//
// The lwe_dimension / grouping_factor groups are consumed in chunks of
// buffer->lwe_chunk_size. Each chunk first gets its keybundle from
// execute_compute_keybundle_noise_tests_128 (the runs_noise_test=true kernel);
// every group of the chunk then runs execute_step_one_128 followed by
// execute_step_two_128. The boolean template argument flags the first group
// overall for step one and the last group overall for step two.
template <typename InputTorus, class params>
__host__ void host_multi_bit_programmable_bootstrap_noise_tests_128(
    cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
    InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
    InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
    __uint128_t const *bootstrapping_key,
    pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_many_lut, uint32_t lut_stride) {

  const auto chunk_stride = buffer->lwe_chunk_size;
  // Total number of key groups to walk through
  const uint32_t num_groups = lwe_dimension / grouping_factor;

  for (uint32_t lwe_offset = 0; lwe_offset < num_groups;
       lwe_offset += chunk_stride) {

    // Keybundle of the current chunk, read from the precomputed modswitched
    // values by the noise-test kernel
    execute_compute_keybundle_noise_tests_128<InputTorus, params>(
        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
        grouping_factor, level_count, lwe_offset);

    // The last chunk may hold fewer groups than chunk_stride
    const uint64_t groups_in_chunk =
        std::min((uint32_t)chunk_stride, num_groups - lwe_offset);

    for (uint32_t j = 0; j < groups_in_chunk; j++) {
      // Position of the current group within the whole bootstrap
      const uint32_t group = j + lwe_offset;

      if (group == 0) {
        execute_step_one_128<InputTorus, params, true>(
            stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
            buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
            base_log, level_count);
      } else {
        execute_step_one_128<InputTorus, params, false>(
            stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
            buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
            base_log, level_count);
      }

      if (group + 1 == num_groups) {
        execute_step_two_128<InputTorus, params, true>(
            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
            num_samples, glwe_dimension, polynomial_size, level_count, j,
            num_many_lut, lut_stride);
      } else {
        execute_step_two_128<InputTorus, params, false>(
            stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
            num_samples, glwe_dimension, polynomial_size, level_count, j,
            num_many_lut, lut_stride);
      }
    }
  }
}
|
||||
|
||||
#endif // PROGRAMMABLE_BOOTSTRAP_MULTIBIT_128_CUH
|
||||
|
||||
@@ -739,7 +739,8 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
|
||||
|
||||
// Get the number of streaming multiprocessors
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
check_cuda_error(
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
|
||||
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
|
||||
}
|
||||
|
||||
|
||||
@@ -795,6 +795,40 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap_2_2_specialized(
|
||||
MultiBitTbcLaunchMode::SPECIALIZED_2_2);
|
||||
}
|
||||
|
||||
// Noise-tests flavour of the TBC multi-bit programmable bootstrap host routine.
//
// The range [0, lwe_dimension / grouping_factor) is walked in strides of
// buffer->lwe_chunk_size. For each stride:
//   1. the keybundle is produced by execute_compute_keybundle_noise_tests
//      (the noise-tests keybundle path), then
//   2. the accumulation is delegated to execute_tbc_external_product_loop,
//      launched with MultiBitTbcLaunchMode::AUTO exactly as in the regular
//      TBC path.
//
// All work is issued on `stream` after selecting device `gpu_index`.
template <typename Torus, class params>
__host__ void host_tbc_multi_bit_programmable_bootstrap_noise_tests(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_many_lut, uint32_t lut_stride) {
  cuda_set_device(gpu_index);

  // Stride of the outer loop, taken from the PBS scratch buffer.
  auto const chunk_stride = buffer->lwe_chunk_size;
  // Exclusive upper bound of the chunked loop.
  uint32_t const num_groups = lwe_dimension / grouping_factor;

  uint32_t lwe_offset = 0;
  while (lwe_offset < num_groups) {
    // Step 1: keybundle for the chunk starting at lwe_offset, computed through
    // the noise-tests keybundle entry point.
    execute_compute_keybundle_noise_tests<Torus, params>(
        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
        grouping_factor, level_count, lwe_offset);

    // Step 2: accumulate over the chunk; same call as the standard TBC path,
    // with the AUTO launch mode.
    execute_tbc_external_product_loop<Torus, params>(
        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
        grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
        lut_stride, MultiBitTbcLaunchMode::AUTO);

    lwe_offset += chunk_stride;
  }
}
|
||||
|
||||
template <typename Torus>
|
||||
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size, uint32_t max_shared_memory) {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user