Mirror of https://github.com/zama-ai/tfhe-rs.git, synced 2026-01-11 15:48:20 -05:00

Compare commits: 100 commits, br/hlapi...tm/array-f
| Author | SHA1 | Date |
|---|---|---|
|  | 92163c2646 |  |
|  | a5c876fdac |  |
|  | 2d8ea2de16 |  |
|  | 494e0e0601 |  |
|  | 8c838da209 |  |
|  | c13587b713 |  |
|  | 8dea5cf145 |  |
|  | 0d41b4f445 |  |
|  | 068cbc0f41 |  |
|  | f8947ddff3 |  |
|  | 1b98312e2c |  |
|  | d3dd010deb |  |
|  | 15762623d1 |  |
|  | c6865ab880 |  |
|  | e376df2fa4 |  |
|  | bd739c2d48 |  |
|  | 9960f5e8b6 |  |
|  | 776f08b534 |  |
|  | ac13eed3b1 |  |
|  | 17d3a492b6 |  |
|  | ba87f1ba5e |  |
|  | c70ad3374e |  |
|  | c7ec835e5f |  |
|  | 075b2259d3 |  |
|  | 23ebd42209 |  |
|  | bb1ff363d3 |  |
|  | 7bcd6b94da |  |
|  | 57cbab9fe1 |  |
|  | 97ce0f6ecf |  |
|  | b6c21ef1fe |  |
|  | e599608831 |  |
|  | f243491442 |  |
|  | b5248930a2 |  |
|  | 2d280d98d2 |  |
|  | 10b57f8a8e |  |
|  | 242df05eb2 |  |
|  | 899d4a7750 |  |
|  | 48dfeb21dc |  |
|  | a46ce3fb51 |  |
|  | 192777bde6 |  |
|  | 3aa198311c |  |
|  | 7034d4ceb4 |  |
|  | 799ae92f59 |  |
|  | 36e9371fdf |  |
|  | 8c88678ee8 |  |
|  | e1beea5ecb |  |
|  | 701411044b |  |
|  | 405fdec6b9 |  |
|  | b3355e2b2f |  |
|  | e4d856afdf |  |
|  | 22ddba7145 |  |
|  | d955696fe0 |  |
|  | eb0b9643bb |  |
|  | d68305e984 |  |
|  | 3d64316c66 |  |
|  | 4bba35e926 |  |
|  | 187159d9f9 |  |
|  | 0cf9f9f3bd |  |
|  | dcb6049441 |  |
|  | 7203cc3564 |  |
|  | b198c18498 |  |
|  | 916e6e6a61 |  |
|  | 9ac776185a |  |
|  | 28e44ca237 |  |
|  | 6432b98591 |  |
|  | 15cce9f641 |  |
|  | 5090e9152b |  |
|  | 24572edb1c |  |
|  | 303f67fe11 |  |
|  | 86a40bcea9 |  |
|  | 97c0290ff7 |  |
|  | 3ba6a72166 |  |
|  | dbd158c641 |  |
|  | 0a738c368a |  |
|  | 4325da72cf |  |
|  | e1620d4087 |  |
|  | 6805778cb8 |  |
|  | 802945fa52 |  |
|  | fff86fb3b4 |  |
|  | 950915a108 |  |
|  | 5e6562878a |  |
|  | d0743e9d3d |  |
|  | 981083360e |  |
|  | 848f9d165c |  |
|  | 2b57fc7bd8 |  |
|  | e3d90341cf |  |
|  | dd94d6f823 |  |
|  | 25362b2db2 |  |
|  | fe5542f39e |  |
|  | 42112c53c2 |  |
|  | bc2e595cf5 |  |
|  | 378b84946f |  |
|  | 8a4c5ba8ef |  |
|  | 940a9ba860 |  |
|  | c475dc058e |  |
|  | 215ded90c0 |  |
|  | 8a2d93aaa8 |  |
|  | 5a48483247 |  |
|  | 702989f796 |  |
|  | cb1e298ebe |  |
.gitattributes (vendored): 2 lines changed

@@ -1 +1,3 @@
*.hpu filter=lfs diff=lfs merge=lfs -text
*.bcode filter=lfs diff=lfs merge=lfs -text
*.cbor filter=lfs diff=lfs merge=lfs -text
@@ -71,45 +71,26 @@ jobs:
with:
toolchain: stable

- name: Use specific data branch
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
env:
PR_BRANCH: ${{ github.head_ref || github.ref_name }}
# Cache key is an aggregated hash of lfs files hashes
- name: Get LFS data sha
id: hash-lfs-data
run: |
echo "BACKWARD_COMPAT_DATA_BRANCH=${PR_BRANCH}" >> "${GITHUB_ENV}"

- name: Get backward compat branch
id: backward_compat_branch
run: |
BRANCH="$(make backward_compat_branch)"
echo "branch=${BRANCH}" >> "${GITHUB_OUTPUT}"

- name: Get backward compat branch head SHA
id: backward_compat_sha
run: |
SHA=$(git ls-remote "${REPO_URL}" refs/heads/"${BACKWARD_COMPAT_BRANCH}" | awk '{print $1}')
SHA=$(git lfs ls-files -l -I utils/tfhe-backward-compat-data | sha256sum | cut -d' ' -f1)
echo "sha=${SHA}" >> "${GITHUB_OUTPUT}"
env:
REPO_URL: "https://github.com/zama-ai/tfhe-backward-compat-data"
BACKWARD_COMPAT_BRANCH: ${{ steps.backward_compat_branch.outputs.branch }}

- name: Retrieve data from cache
id: retrieve-data-cache
uses: actions/cache/restore@5a3ec84eff668545956fd18022155c47e93e2684 #v4.2.3
with:
path: tests/tfhe-backward-compat-data
key: ${{ steps.backward_compat_branch.outputs.branch }}_${{ steps.backward_compat_sha.outputs.sha }}
path: |
utils/tfhe-backward-compat-data/**/*.cbor
utils/tfhe-backward-compat-data/**/*.bcode
key: ${{ steps.hash-lfs-data.outputs.sha }}

- name: Clone test data
- name: Pull test data
if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
repository: zama-ai/tfhe-backward-compat-data
path: tests/tfhe-backward-compat-data
lfs: 'true'
ref: ${{ steps.backward_compat_branch.outputs.branch }}
run: |
make pull_backward_compat_data

- name: Run backward compatibility tests
run: |

@@ -120,8 +101,10 @@ jobs:
continue-on-error: true
uses: actions/cache/save@5a3ec84eff668545956fd18022155c47e93e2684 #v4.2.3
with:
path: tests/tfhe-backward-compat-data
key: ${{ steps.backward_compat_branch.outputs.branch }}_${{ steps.backward_compat_sha.outputs.sha }}
path: |
utils/tfhe-backward-compat-data/**/*.cbor
utils/tfhe-backward-compat-data/**/*.bcode
key: ${{ steps.hash-lfs-data.outputs.sha }}

- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
.github/workflows/aws_tfhe_integer_tests.yml (vendored): 2 lines changed

@@ -103,7 +103,7 @@ jobs:
name: Unsigned integer tests
needs: setup-instance
concurrency:
group: ${{ github.workflow_ref }}
group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:

@@ -104,7 +104,7 @@ jobs:
name: Signed integer tests
needs: setup-instance
concurrency:
group: ${{ github.workflow_ref }}
group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
.github/workflows/benchmark_gpu.yml (vendored): 1 line changed

@@ -31,6 +31,7 @@ on:
- ks
- ks_pbs
- integer_zk
- hlapi_noise_squash
op_flavor:
description: "Operations set to run"
type: choice
.github/workflows/benchmark_gpu_weekly.yml (vendored): 76 lines changed

@@ -10,37 +10,16 @@ on:
permissions: {}

jobs:
run-benchmarks-1-h100:
name: Run integer benchmarks (1xH100)
run-benchmarks-8-h100-sxm5-integer:
name: Run integer benchmarks (8xH100-SXM5)
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
profile: single-h100
hardware_name: n3-H100x1
command: integer,integer_multi_bit
op_flavor: default
bench_type: latency
all_precisions: true
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

run-benchmarks-2-h100:
name: Run integer benchmarks (2xH100)
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
profile: 2-h100
hardware_name: n3-H100x2
profile: multi-h100-sxm5
hardware_name: n3-H100x8-SXM5
command: integer_multi_bit
op_flavor: default
bench_type: latency
bench_type: both
all_precisions: true
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}

@@ -52,16 +31,16 @@ jobs:
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

run-benchmarks-8-h100:
name: Run integer benchmarks (8xH100)
run-benchmarks-8-h100-sxm5-integer-compression:
name: Run integer compression benchmarks (8xH100-SXM5)
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
profile: multi-h100
hardware_name: n3-H100x8
command: integer_multi_bit
profile: multi-h100-sxm5
hardware_name: n3-H100x8-SXM5
command: integer_compression
op_flavor: default
bench_type: latency
bench_type: both
all_precisions: true
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}

@@ -73,16 +52,37 @@ jobs:
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

run-benchmarks-l40:
name: Run integer benchmarks (L40)
run-benchmarks-8-h100-sxm5-integer-zk:
name: Run integer zk benchmarks (8xH100-SXM5)
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
profile: l40
hardware_name: n3-L40x1
command: integer_multi_bit,integer_compression,pbs,ks
profile: multi-h100-sxm5
hardware_name: n3-H100x8-SXM5
command: integer_zk
op_flavor: default
bench_type: latency
bench_type: both
all_precisions: true
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

run-benchmarks-8-h100-sxm5-noise-squash:
name: Run integer zk benchmarks (8xH100-SXM5)
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
profile: multi-h100-sxm5
hardware_name: n3-H100x8-SXM5
command: hlapi_noise_squash
op_flavor: default
bench_type: both
all_precisions: true
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
.github/workflows/benchmark_hpu_integer.yml (vendored): 2 lines changed

@@ -33,6 +33,7 @@ jobs:
with:
fetch-depth: 0
persist-credentials: 'false'
lfs: true
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

- name: Get benchmark details

@@ -61,6 +62,7 @@ jobs:

- name: Run benchmarks
run: |
make pull_hpu_files
make bench_integer_hpu
make bench_hlapi_erc20_hpu
.github/workflows/benchmark_tfhe_fft.yml (vendored): 2 lines changed

@@ -48,7 +48,7 @@ jobs:
name: Execute FFT benchmarks in EC2
needs: setup-ec2
concurrency:
group: ${{ github.workflow_ref }}
group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: true
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
.github/workflows/benchmark_tfhe_ntt.yml (vendored): 2 lines changed

@@ -48,7 +48,7 @@ jobs:
name: Execute NTT benchmarks in EC2
needs: setup-ec2
concurrency:
group: ${{ github.workflow_ref }}
group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: true
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
.github/workflows/cargo_build.yml (vendored): 8 lines changed

@@ -49,6 +49,14 @@ jobs:
mv linelint-linux-amd64 /usr/local/bin/linelint
make check_newline

# This is needed for the ws tests clippy checks
- name: Use specific data branch
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
env:
PR_BRANCH: ${{ github.head_ref || github.ref_name }}
run: |
echo "BACKWARD_COMPAT_DATA_BRANCH=${PR_BRANCH}" >> "${GITHUB_ENV}"

- name: Run pcc checks
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
.github/workflows/cargo_test_fft.yml (vendored): 2 lines changed

@@ -13,7 +13,7 @@ env:
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: true

permissions:
.github/workflows/cargo_test_ntt.yml (vendored): 2 lines changed

@@ -13,7 +13,7 @@ env:
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: true

permissions:
.github/workflows/data_pr_close.yml (vendored): 63 lines changed

@@ -1,63 +0,0 @@
name: Close or Merge corresponding PR on the data repo

# When a PR with the data_PR tag is closed or merged, this will close the corresponding PR in the data repo.

env:
DATA_REPO: zama-ai/tfhe-backward-compat-data
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
PR_BRANCH: ${{ github.head_ref || github.ref_name }}
CLOSE_TYPE: ${{ github.event.pull_request.merged && 'merge' || 'close' }}

# only trigger on pull request closed events
on:
pull_request:
types: [ closed ]

permissions: {}

jobs:
auto_close_job:
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') && github.repository == 'zama-ai/tfhe-rs' }}
runs-on: ubuntu-latest
env:
GH_TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }} # Needed for gh CLI commands
steps:
- name: Fetch PR number
run: |
PR_NUMBER=$(gh pr view "${PR_BRANCH}" --repo "${DATA_REPO}" --json number | jq '.number')
echo "DATA_REPO_PR_NUMBER=${PR_NUMBER}" >> "${GITHUB_ENV}"

- name: Comment on the PR to indicate the reason of the close
run: |
gh pr comment "${PR_BRANCH}" \
--repo "${DATA_REPO}" \
--body "PR ${CLOSE_TYPE}d because the corresponding PR in main repo was ${CLOSE_TYPE}d: ${REPO}#${EVENT_NUMBER}"
env:
REPO: ${{ github.repository }}
EVENT_NUMBER: ${{ github.event.number }}

- name: Merge the Pull Request in the data repo
if: ${{ github.event.pull_request.merged }}
run: |
gh pr merge "${PR_BRANCH}" \
--repo "${DATA_REPO}" \
--rebase \
--delete-branch

- name: Close the Pull Request in the data repo
if: ${{ !github.event.pull_request.merged }}
run: |
gh pr close "${PR_BRANCH}" \
--repo "${DATA_REPO}" \
--delete-branch

- name: Slack Notification
if: ${{ always() && job.status == 'failure' }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Failed to auto-${{ env.CLOSE_TYPE }} PR on data repo: https://github.com/${{ env.DATA_REPO }}/pull/${{ env.DATA_REPO_PR_NUMBER }}"
@@ -137,7 +137,7 @@ jobs:
# No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
- name: Run multi-bit CUDA integer tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_integer_multi_bit_gpu_ci
BIG_TESTS_INSTANCE=TRUE NO_BIG_PARAMS_GPU=TRUE make test_integer_multi_bit_gpu_ci

- name: Run user docs tests
run: |

@@ -11,6 +11,7 @@ env:
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
IS_PR: ${{ github.event_name == 'pull_request' }}

on:
# Allows you to run this workflow manually from the Actions tab as an alternative.

@@ -18,6 +19,8 @@ on:
schedule:
# Nightly tests will be triggered each evening 8p.m.
- cron: "0 20 * * *"
pull_request:

permissions:
contents: read

@@ -78,7 +81,11 @@ jobs:

- name: Run tests
run: |
make test_integer_long_run_gpu
if [[ "${IS_PR}" == "true" ]]; then
make test_integer_short_run_gpu
else
make test_integer_long_run_gpu
fi

slack-notify:
name: Slack Notification

@@ -25,9 +25,6 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
schedule:
# Nightly tests @ 1AM after each work day
- cron: "0 1 * * MON-FRI"

permissions:
contents: read

@@ -25,9 +25,6 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
schedule:
# Nightly tests @ 1AM after each work day
- cron: "0 1 * * MON-FRI"

permissions:
contents: read
.github/workflows/hpu_hlapi_tests.yml (vendored): 2 lines changed

@@ -13,7 +13,7 @@ env:
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: true
.github/workflows/sync_on_push.yml (vendored): 2 lines changed

@@ -21,7 +21,7 @@ jobs:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: git-sync
uses: wei/git-sync@55c6b63b4f21607da0e9877ca9b4d11a29fc6d83
uses: valtech-sd/git-sync@e734cfe9485a92e720eac5af8a4555dde5fecf88
with:
source_repo: "zama-ai/tfhe-rs"
source_branch: "main"
.github/workflows/unverified_prs.yml (vendored, new file): 26 lines changed

@@ -0,0 +1,26 @@
name: 'Close unverified PRs'
on:
schedule:
- cron: '30 1 * * *'

permissions: {}

jobs:
stale:
runs-on: ubuntu-latest
permissions:
issues: read
pull-requests: write
steps:
- uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
with:
stale-pr-message: 'This PR is unverified and has been open for 2 days, it will now be closed. If you want to contribute please sign the CLA as indicated by the bot.'
days-before-stale: 2
days-before-close: 0
# We are not interested in suppressing issues so have a currently non existent label
# if we ever accept issues to become stale/closable this label will be the signal for that
only-issue-labels: can-be-auto-closed
# Only unverified PRs are an issue
exempt-pr-labels: cla-signed
# We don't want people commenting to keep an unverified PR
ignore-updates: true
@@ -10,6 +10,7 @@ ignore:
- keys
- coverage
- utils/tfhe-lints/ui/main.stderr
- utils/tfhe-backward-compat-data/**/*.ron # ron files are autogenerated

rules:
# checks if file ends in a newline character
@@ -170,6 +170,8 @@ On the contrary, these changes are *not* data breaking:
* Renaming a type (unless it implements the `Named` trait).
* Adding a variant to the end of an enum.

Historical data from previous TFHE-rs versions are stored inside `utils/tfhe-backward-compat-data`. They are used to check on every PR that backward compatibility has been preserved.

## Example: adding a field

Suppose you want to add an i32 field to a type named `MyType`. The original type is defined as:
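(The definition that followed in the upstream docs was not captured by this mirror. As a rough sketch of the pattern these docs describe, assuming the `tfhe-versionable` crate's `Versionize`/`VersionsDispatch` derives and `Upgrade` trait used by TFHE-rs, and a hypothetical pre-existing field `val`, the versioned change might look like:)

```rust
use std::convert::Infallible;
use tfhe_versionable::{Upgrade, Version, Versionize, VersionsDispatch};

// The previous definition is kept as a dedicated version type.
#[derive(Version)]
struct MyTypeV0 {
    val: u64, // hypothetical pre-existing field
}

// Old data is upgraded by filling the new field with a default value.
impl Upgrade<MyType> for MyTypeV0 {
    type Error = Infallible;

    fn upgrade(self) -> Result<MyType, Self::Error> {
        Ok(MyType {
            val: self.val,
            new_field: 0,
        })
    }
}

// The current definition, with the added i32 field.
#[derive(Versionize)]
#[versionize(MyTypeVersions)]
struct MyType {
    val: u64,
    new_field: i32,
}

// The dispatch enum lists every version of the type; serialized V0 data
// is deserialized as MyTypeV0 and upgraded into the current variant.
#[derive(VersionsDispatch)]
#[allow(unused)]
enum MyTypeVersions {
    V0(MyTypeV0),
    V1(MyType),
}
```

This sketch only illustrates the versioning pattern; the actual types and upgrade logic live in the repository's own docs and code.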
@@ -18,7 +18,7 @@ members = [
]

exclude = [
"tests/backward_compatibility_tests",
"utils/tfhe-backward-compat-data",
"utils/tfhe-lints",
"apps/trivium",
]
Makefile: 102 lines changed

@@ -22,10 +22,7 @@ BENCH_TYPE?=latency
BENCH_PARAM_TYPE?=classical
BENCH_PARAMS_SET?=default
NODE_VERSION=22.6
BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
BACKWARD_COMPAT_DATA_BRANCH?=$(shell ./scripts/backward_compat_data_version.py)
BACKWARD_COMPAT_DATA_PROJECT=tfhe-backward-compat-data
BACKWARD_COMPAT_DATA_DIR=$(BACKWARD_COMPAT_DATA_PROJECT)
BACKWARD_COMPAT_DATA_DIR=utils/tfhe-backward-compat-data
TFHE_SPEC:=tfhe
WASM_PACK_VERSION="0.13.1"
# We are kind of hacking the cut here, the version cannot contain a quote '"'

@@ -159,23 +156,23 @@ install_tarpaulin: install_rs_build_toolchain

.PHONY: install_cargo_dylint # Install custom tfhe-rs lints
install_cargo_dylint:
cargo install cargo-dylint dylint-link
cargo install --locked cargo-dylint dylint-link

.PHONY: install_typos_checker # Install typos checker
install_typos_checker: install_rs_build_toolchain
@typos --version > /dev/null 2>&1 || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install typos-cli || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked typos-cli || \
( echo "Unable to install typos-cli, unknown error." && exit 1 )

.PHONY: install_zizmor # Install zizmor workflow security checker
install_zizmor: install_rs_build_toolchain
@zizmor --version > /dev/null 2>&1 || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install zizmor --version ~1.9 || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked zizmor --version ~1.9 || \
( echo "Unable to install zizmor, unknown error." && exit 1 )

.PHONY: install_cargo_cross # Install custom tfhe-rs lints
.PHONY: install_cargo_cross # Install cross for big endian tests
install_cargo_cross: install_rs_build_toolchain
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install cross
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked cross

.PHONY: setup_venv # Setup Python virtualenv for wasm tests
setup_venv:

@@ -252,6 +249,9 @@ install_mlc: install_rs_build_toolchain
.PHONY: fmt # Format rust code
fmt: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR) fmt
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt

.PHONY: fmt_js # Format javascript code
fmt_js: check_nvm_installed

@@ -273,6 +273,9 @@ fmt_c_tests:
.PHONY: check_fmt # Check rust code format
check_fmt: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR) fmt --check
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt --check
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt --check

.PHONY: check_fmt_c_tests # Check C tests format
check_fmt_c_tests:

@@ -483,10 +486,22 @@ clippy_param_dedup: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p param_dedup -- --no-deps -D warnings

.PHONY: clippy_backward_compat_data # Run clippy lints on tfhe-backward-compat-data
clippy_backward_compat_data: install_rs_check_toolchain # the toolchain is selected with toolchain.toml
@# Some old crates are x86 specific, only run in that case
@if uname -a | grep -q x86; then \
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options \
-C $(BACKWARD_COMPAT_DATA_DIR) clippy --all-targets \
-- --no-deps -D warnings; \
else \
echo "Cannot run clippy for backward compat crate on non x86 platform for now."; \
fi

.PHONY: clippy_all # Run all clippy targets
clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_trivium \
clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup
clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
clippy_backward_compat_data

.PHONY: clippy_fast # Run main clippy targets
clippy_fast: clippy_rustdoc clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks \

@@ -502,6 +517,12 @@ clippy_hpu_backend: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-hpu-backend -- --no-deps -D warnings

.PHONY: clippy_hpu_mockup # Run clippy lints on tfhe-hpu-mockup
clippy_hpu_mockup: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--all-targets \
-p tfhe-hpu-mockup -- --no-deps -D warnings

.PHONY: check_rust_bindings_did_not_change # Check rust bindings are up to date for tfhe-cuda-backend
check_rust_bindings_did_not_change:
cargo build -p tfhe-cuda-backend && "$(MAKE)" fmt_gpu && \

@@ -514,6 +535,9 @@ check_rust_bindings_did_not_change:
tfhe_lints: install_cargo_dylint
RUSTFLAGS="$(RUSTFLAGS)" cargo dylint --all -p tfhe --no-deps -- \
--features=boolean,shortint,integer,strings,zk-pok
RUSTFLAGS="$(RUSTFLAGS)" cargo dylint --all -p tfhe-zk-pok --no-deps -- \
--features=experimental

.PHONY: build_core # Build core_crypto without experimental features
build_core: install_rs_build_toolchain install_rs_check_toolchain

@@ -655,6 +679,14 @@ test_integer_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::

.PHONY: test_integer_gpu_debug # Run the tests of the integer module with Debug flags for CUDA
test_integer_gpu_debug: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile release_lto_off \
--features=integer,gpu-debug -vv -p $(TFHE_SPEC) -- integer::gpu::server_key:: --test-threads=1 --nocapture
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile release_lto_off \
--features=integer,gpu-debug -p $(TFHE_SPEC) -- integer::gpu::server_key::

.PHONY: test_integer_long_run_gpu # Run the long run integer tests on the gpu backend
test_integer_long_run_gpu: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \

@@ -663,6 +695,12 @@ test_integer_long_run_gpu: install_rs_check_toolchain install_cargo_nextest
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
--tfhe-package "$(TFHE_SPEC)" --backend "gpu"

.PHONY: test_integer_short_run_gpu # Run the long run integer tests on the gpu backend
test_integer_short_run_gpu: install_rs_check_toolchain install_cargo_nextest
TFHE_RS_TEST_LONG_TESTS_MINIMAL=TRUE \
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::radix::tests_long_run::test_random_op_sequence integer::gpu::server_key::radix::tests_long_run::test_signed_random_op_sequence --test-threads=1 --nocapture

.PHONY: test_integer_compression
test_integer_compression: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \

@@ -1025,16 +1063,11 @@ test_tfhe_lints: install_cargo_dylint
# Here we use the "patch" functionality of Cargo to make sure the repo used for the data is the same as the one used for the code.
.PHONY: test_backward_compatibility_ci
test_backward_compatibility_ci: install_rs_build_toolchain
TFHE_BACKWARD_COMPAT_DATA_DIR="$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).path=\"tests/$(BACKWARD_COMPAT_DATA_DIR)\"" \
TFHE_BACKWARD_COMPAT_DATA_DIR="../$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=shortint,integer,zk-pok -p tests test_backward_compatibility -- --nocapture

.PHONY: test_backward_compatibility # Same as test_backward_compatibility_ci but tries to clone the data repo first if needed
test_backward_compatibility: tests/$(BACKWARD_COMPAT_DATA_DIR) test_backward_compatibility_ci

.PHONY: backward_compat_branch # Prints the required backward compatibility branch
backward_compat_branch:
@echo "$(BACKWARD_COMPAT_DATA_BRANCH)"
test_backward_compatibility: pull_backward_compat_data test_backward_compatibility_ci

.PHONY: doc # Build rust doc
doc: install_rs_check_toolchain

@@ -1079,6 +1112,10 @@ check_intra_md_links: install_mlc
check_md_links: install_mlc
mlc --match-file-extension tfhe/docs

.PHONY: check_doc_paths_use_dash # Check paths use "-" instead of "_" in docs for gitbook compatibility
check_doc_paths_use_dash:
python3 ./scripts/check_doc_paths_use_dash.py

.PHONY: check_parameter_export_ok # Checks exported "current" shortint parameter module is correct
check_parameter_export_ok:
python3 ./scripts/check_current_param_export.py

@@ -1436,6 +1473,20 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --

.PHONY: bench_hlapi_noise_squash # Run benchmarks for noise squash operation
bench_hlapi_noise_squash: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-noise-squash \
--features=integer,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --

.PHONY: bench_hlapi_noise_squash_gpu # Run benchmarks for noise squash operation on GPU
bench_hlapi_noise_squash_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-noise-squash \
--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --

#
# Utility tools
#

@@ -1493,11 +1544,13 @@ write_params_to_file: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run \
--example write_params_to_file --features=boolean,shortint,hpu,internal-keycache

.PHONY: clone_backward_compat_data # Clone the data repo needed for backward compatibility tests
clone_backward_compat_data:
./scripts/clone_backward_compat_data.sh $(BACKWARD_COMPAT_DATA_URL) $(BACKWARD_COMPAT_DATA_BRANCH) tests/$(BACKWARD_COMPAT_DATA_DIR)
.PHONY: pull_backward_compat_data # Pull the data files needed for backward compatibility tests
pull_backward_compat_data:
./scripts/pull_lfs_data.sh $(BACKWARD_COMPAT_DATA_DIR)

tests/$(BACKWARD_COMPAT_DATA_DIR): clone_backward_compat_data
.PHONY: pull_hpu_files # Pull the hpu files
pull_hpu_files:
./scripts/pull_lfs_data.sh backends/tfhe-hpu-backend/

#
# Real use case examples
#

@@ -1523,7 +1576,8 @@ sha256_bool: install_rs_check_toolchain

.PHONY: pcc # pcc stands for pre commit checks (except GPU)
pcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
check_md_docs_are_tested check_intra_md_links clippy_all check_compile_tests test_tfhe_lints \
check_md_docs_are_tested check_intra_md_links check_doc_paths_use_dash \
clippy_all check_compile_tests test_tfhe_lints \
tfhe_lints

.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation

@@ -1531,11 +1585,11 @@ pcc_gpu: check_rust_bindings_did_not_change clippy_rustdoc_gpu \
clippy_gpu clippy_cuda_backend clippy_bench_gpu check_compile_tests_benches_gpu

.PHONY: pcc_hpu # pcc stands for pre commit checks for HPU compilation
pcc_hpu: clippy_hpu clippy_hpu_backend test_integer_hpu_mockup_ci_fast
pcc_hpu: clippy_hpu clippy_hpu_backend clippy_hpu_mockup test_integer_hpu_mockup_ci_fast

.PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
fpcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
check_md_docs_are_tested clippy_fast check_compile_tests
check_md_docs_are_tested check_intra_md_links check_doc_paths_use_dash clippy_fast check_compile_tests

.PHONY: conformance # Automatically fix problems that can be fixed
conformance: fix_newline fmt fmt_js
@@ -18,6 +18,7 @@
<a href="https://github.com/zama-ai/tfhe-rs/releases"><img src="https://img.shields.io/github/v/release/zama-ai/tfhe-rs?style=flat-square"></a>
<a href="LICENSE"><img src="https://img.shields.io/badge/License-BSD--3--Clause--Clear-%23ffb243?style=flat-square"></a>
<a href="https://github.com/zama-ai/bounty-program"><img src="https://img.shields.io/badge/Contribute-Zama%20Bounty%20Program-%23ffd208?style=flat-square"></a>
<a href="https://slsa.dev"><img alt="SLSA 3" src="https://slsa.dev/images/gh-badge-level3.svg" /></a>
</p>

## About

@@ -148,7 +149,7 @@ To run this code, use the following command:
> Note that when running code that uses `TFHE-rs`, it is highly recommended
to run in release mode with cargo's `--release` flag to have the best performances possible.

*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick_start)*
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick-start)*

<p align="right">
<a href="#about" > ↑ Back to top </a>
@@ -1,6 +1,6 @@
[package]
name = "tfhe-cuda-backend"
version = "0.10.0"
version = "0.11.0"
edition = "2021"
authors = ["Zama team"]
license = "BSD-3-Clause-Clear"

@@ -19,3 +19,4 @@ bindgen = "0.71"
[features]
experimental-multi-arch = []
profile = []
debug = []
@@ -53,6 +53,11 @@ fn main() {
cmake_config.define("USE_NVTOOLS", "OFF");
}

if cfg!(feature = "debug") {
cmake_config.define("CMAKE_BUILD_TYPE", "DEBUG");
cmake_config.define("CMAKE_CXX_FLAGS", "-Wuninitialized -O0");
}

// Build the CMake project
let dest = cmake_config.build();
println!("cargo:rustc-link-search=native={}", dest.display());
@@ -52,6 +52,8 @@ endif()

if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
else()
message("Building CUDA backend in ${CMAKE_BUILD_TYPE}")
endif()

# Add OpenMP support
@@ -49,12 +49,13 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index);

void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
uint32_t gpu_index,
uint64_t *size_tracker,
uint64_t &size_tracker,
bool allocate_gpu_memory);

void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);

bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
uint64_t cuda_device_total_memory(uint32_t gpu_index);

void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
uint64_t size,
@@ -20,7 +20,7 @@ template <typename Torus> struct int_compression {
uint32_t gpu_count, int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
uint32_t storage_log_modulus, bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->compression_params = compression_params;
this->lwe_per_glwe = lwe_per_glwe;

@@ -38,7 +38,7 @@ template <typename Torus> struct int_compression {
lwe_per_glwe * glwe_accumulator_size * sizeof(Torus), streams[0],
gpu_indexes[0], size_tracker, allocate_gpu_memory);

*size_tracker += scratch_packing_keyswitch_lwe_list_to_glwe_64(
size_tracker += scratch_packing_keyswitch_lwe_list_to_glwe_64(
streams[0], gpu_indexes[0], &fp_ks_buffer,
compression_params.small_lwe_dimension,
compression_params.glwe_dimension, compression_params.polynomial_size,

@@ -76,7 +76,7 @@ template <typename Torus> struct int_decompression {
int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t body_count,
uint32_t storage_log_modulus, bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->encryption_params = encryption_params;
this->compression_params = compression_params;
@@ -48,6 +48,34 @@ typedef struct {
uint32_t lwe_dimension;
} CudaRadixCiphertextFFI;

typedef struct {
uint64_t const *chosen_multiplier_has_at_least_one_set;
uint64_t const *decomposed_chosen_multiplier;

uint32_t const num_scalars;
uint32_t const active_bits;
uint64_t const shift_pre;
uint32_t const shift_post;
uint32_t const ilog2_chosen_multiplier;
uint32_t const chosen_multiplier_num_bits;

bool const is_chosen_multiplier_zero;
bool const is_abs_chosen_multiplier_one;
bool const is_chosen_multiplier_negative;
bool const is_chosen_multiplier_pow2;
bool const chosen_multiplier_has_more_bits_than_numerator;
// if signed: test if chosen_multiplier >= 2^{num_bits - 1}
bool const is_chosen_multiplier_geq_two_pow_numerator;

uint32_t const ilog2_divisor;

bool const is_divisor_zero;
bool const is_abs_divisor_one;
bool const is_divisor_negative;
bool const is_divisor_pow2;
bool const divisor_has_more_bits_than_numerator;
} CudaScalarDivisorFFI;

uint64_t scratch_cuda_apply_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,

@@ -395,14 +423,14 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, bool allocate_ms_array);
bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
bool allocate_ms_array);

void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *radix_lwe_vec,
bool reduce_degrees_for_single_carry_propagation, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(

@@ -549,27 +577,6 @@ void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
void *const *streams,
uint32_t const *gpu_indexes);

uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool anticipated_buffer_drop,
bool allocate_gpu_memory, bool allocate_ms_array);

void cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, void *const *ksks,
uint64_t rhs, uint64_t const *decomposed_scalar,
uint64_t const *has_at_least_one_set,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_scalars);

void cleanup_cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);

uint64_t scratch_cuda_apply_noise_squashing_kb(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,

@@ -615,6 +622,26 @@ void cleanup_cuda_sub_and_propagate_single_carry(void *const *streams,
uint32_t gpu_count,
int8_t **mem_ptr_void);

uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
bool allocate_gpu_memory, bool allocate_ms_array);

void cuda_integer_unsigned_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi);

void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);

uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,

@@ -635,5 +662,74 @@ void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);

uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
bool allocate_gpu_memory, bool allocate_ms_array);

void cuda_integer_signed_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits);

void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);

uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint32_t const active_bits_divisor, bool allocate_gpu_memory,
bool allocate_ms_array);

void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
void const *clear_blocks, void const *h_clear_blocks,
uint32_t num_clear_blocks);

void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);

uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint32_t const active_bits_divisor, bool allocate_gpu_memory,
bool allocate_ms_array);

void cuda_integer_signed_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
uint32_t numerator_bits);

void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
} // extern C
#endif // CUDA_INTEGER_H
File diff suppressed because it is too large.
@@ -66,6 +66,9 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
uint32_t num_many_lut, uint32_t lut_stride);
|
||||
|
||||
template <typename Torus>
|
||||
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
|
||||
uint32_t polynomial_size);
|
||||
template <typename Torus>
|
||||
@@ -95,8 +98,12 @@ uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
|
||||
|
||||
template <typename Torus, class params>
|
||||
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
uint32_t polynomial_size);
|
||||
|
||||
uint32_t polynomial_size,
|
||||
uint64_t full_sm_keybundle);
|
||||
template <typename Torus, class params>
|
||||
uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
uint32_t polynomial_size,
|
||||
uint64_t full_sm_keybundle);
|
||||
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
int8_t *d_mem_keybundle = NULL;
|
||||
int8_t *d_mem_acc_step_one = NULL;
|
||||
@@ -115,7 +122,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
|
||||
PBS_VARIANT pbs_variant, bool allocate_gpu_memory,
|
||||
uint64_t *size_tracker) {
|
||||
uint64_t &size_tracker) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
@@ -281,4 +288,146 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
}
|
||||
};
|
||||
|
||||
template <typename InputTorus>
|
||||
struct pbs_buffer_128<InputTorus, PBS_TYPE::MULTI_BIT> {
|
||||
int8_t *d_mem_keybundle = NULL;
|
||||
int8_t *d_mem_acc_step_one = NULL;
|
||||
int8_t *d_mem_acc_step_two = NULL;
|
||||
int8_t *d_mem_acc_cg = NULL;
|
||||
int8_t *d_mem_acc_tbc = NULL;
|
||||
uint32_t lwe_chunk_size;
|
||||
double *keybundle_fft;
|
||||
__uint128_t *global_accumulator;
|
||||
double *global_join_buffer;
|
||||
|
||||
PBS_VARIANT pbs_variant;
|
||||
bool gpu_memory_allocated;
|
||||
|
||||
pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t lwe_chunk_size, PBS_VARIANT pbs_variant,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
this->pbs_variant = pbs_variant;
|
||||
this->lwe_chunk_size = lwe_chunk_size;
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
|
||||
// default
|
||||
uint64_t full_sm_keybundle =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle<
|
||||
__uint128_t>(polynomial_size);
|
||||
uint64_t full_sm_accumulate_step_one =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<
|
||||
__uint128_t>(polynomial_size);
|
||||
uint64_t full_sm_accumulate_step_two =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<
|
||||
__uint128_t>(polynomial_size);
|
||||
uint64_t partial_sm_accumulate_step_one =
|
||||
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
|
||||
__uint128_t>(polynomial_size);
|
||||
// cg
|
||||
uint64_t full_sm_cg_accumulate =
|
||||
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<__uint128_t>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm_cg_accumulate =
|
||||
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<
|
||||
__uint128_t>(polynomial_size);
|
||||
|
||||
auto num_blocks_keybundle = input_lwe_ciphertext_count * lwe_chunk_size *
|
||||
(glwe_dimension + 1) * (glwe_dimension + 1) *
|
||||
level_count;
|
||||
auto num_blocks_acc_step_one =
|
||||
        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
    auto num_blocks_acc_step_two =
        input_lwe_ciphertext_count * (glwe_dimension + 1);
    auto num_blocks_acc_cg =
        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;

    // Keybundle
    if (max_shared_memory < full_sm_keybundle)
      d_mem_keybundle = (int8_t *)cuda_malloc_with_size_tracking_async(
          num_blocks_keybundle * full_sm_keybundle, stream, gpu_index,
          size_tracker, allocate_gpu_memory);

    switch (pbs_variant) {
    case PBS_VARIANT::CG:
      // Accumulator CG
      if (max_shared_memory < partial_sm_cg_accumulate)
        d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
            num_blocks_acc_cg * full_sm_cg_accumulate, stream, gpu_index,
            size_tracker, allocate_gpu_memory);
      else if (max_shared_memory < full_sm_cg_accumulate)
        d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
            num_blocks_acc_cg * partial_sm_cg_accumulate, stream, gpu_index,
            size_tracker, allocate_gpu_memory);
      break;
    case PBS_VARIANT::DEFAULT:
      // Accumulator step one
      if (max_shared_memory < partial_sm_accumulate_step_one)
        d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
            num_blocks_acc_step_one * full_sm_accumulate_step_one, stream,
            gpu_index, size_tracker, allocate_gpu_memory);
      else if (max_shared_memory < full_sm_accumulate_step_one)
        d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
            num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream,
            gpu_index, size_tracker, allocate_gpu_memory);

      // Accumulator step two
      if (max_shared_memory < full_sm_accumulate_step_two)
        d_mem_acc_step_two = (int8_t *)cuda_malloc_with_size_tracking_async(
            num_blocks_acc_step_two * full_sm_accumulate_step_two, stream,
            gpu_index, size_tracker, allocate_gpu_memory);
      break;
    default:
      PANIC("Cuda error (PBS): unsupported implementation variant.")
    }

    keybundle_fft = (double *)cuda_malloc_with_size_tracking_async(
        num_blocks_keybundle * (polynomial_size / 2) * 4 * sizeof(double),
        stream, gpu_index, size_tracker, allocate_gpu_memory);
    global_accumulator = (__uint128_t *)cuda_malloc_with_size_tracking_async(
        input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size *
            sizeof(__uint128_t),
        stream, gpu_index, size_tracker, allocate_gpu_memory);
    global_join_buffer = (double *)cuda_malloc_with_size_tracking_async(
        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count *
            (polynomial_size / 2) * 4 * sizeof(double),
        stream, gpu_index, size_tracker, allocate_gpu_memory);
  }

  void release(cudaStream_t stream, uint32_t gpu_index) {

    if (d_mem_keybundle)
      cuda_drop_with_size_tracking_async(d_mem_keybundle, stream, gpu_index,
                                         gpu_memory_allocated);
    switch (pbs_variant) {
    case DEFAULT:
      if (d_mem_acc_step_one)
        cuda_drop_with_size_tracking_async(d_mem_acc_step_one, stream,
                                           gpu_index, gpu_memory_allocated);
      if (d_mem_acc_step_two)
        cuda_drop_with_size_tracking_async(d_mem_acc_step_two, stream,
                                           gpu_index, gpu_memory_allocated);
      break;
    case CG:
      if (d_mem_acc_cg)
        cuda_drop_with_size_tracking_async(d_mem_acc_cg, stream, gpu_index,
                                           gpu_memory_allocated);
      break;
    default:
      PANIC("Cuda error (PBS): unsupported implementation variant.")
    }

    cuda_drop_with_size_tracking_async(keybundle_fft, stream, gpu_index,
                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(global_join_buffer, stream, gpu_index,
                                       gpu_memory_allocated);
  }
};

#endif // CUDA_MULTI_BIT_UTILITIES_H

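The allocation branches above all follow one pattern: if a kernel's full working set fits in shared memory, no global scratch is needed; if only the partial working set fits, the complement is allocated in global memory; otherwise the full working set goes to global memory. A minimal sketch of that selection logic, with hypothetical helpers `device_alloc` and `select_scratch` that are not part of the backend's API:

#include <cstdint>
#include <cstdlib>

// Hypothetical stand-in for the tracked CUDA allocation used above.
static int8_t *device_alloc(uint64_t bytes, uint64_t &size_tracker) {
  size_tracker += bytes; // always counted, even when only measuring
  return static_cast<int8_t *>(std::malloc(bytes));
}

// Full fit -> no global scratch; partial fit -> allocate the complement;
// no fit -> the whole per-block working set lives in global memory.
int8_t *select_scratch(uint64_t max_shared_memory, uint64_t full_sm,
                       uint64_t partial_sm, uint64_t num_blocks,
                       uint64_t &size_tracker) {
  if (max_shared_memory < partial_sm)
    return device_alloc(num_blocks * full_sm, size_tracker);
  if (max_shared_memory < full_sm)
    return device_alloc(num_blocks * partial_sm, size_tracker);
  return nullptr; // everything fits in shared memory
}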
@@ -90,7 +90,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
             uint32_t glwe_dimension, uint32_t polynomial_size,
             uint32_t level_count, uint32_t input_lwe_ciphertext_count,
             PBS_VARIANT pbs_variant, bool allocate_gpu_memory,
             bool allocate_ms_array, uint64_t *size_tracker) {
             bool allocate_ms_array, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    cuda_set_device(gpu_index);
    this->uses_noise_reduction = allocate_ms_array;
@@ -240,7 +240,10 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
  }
};

template <typename InputTorus, PBS_TYPE pbs_type> struct pbs_buffer_128 {
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer_128;

template <typename InputTorus>
struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
  int8_t *d_mem;

  __uint128_t *global_accumulator;
@@ -257,7 +260,7 @@ template <typename InputTorus, PBS_TYPE pbs_type> struct pbs_buffer_128 {
                uint32_t polynomial_size, uint32_t level_count,
                uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
                bool allocate_gpu_memory, bool allocate_ms_array,
                uint64_t *size_tracker) {
                uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    cuda_set_device(gpu_index);
    this->pbs_variant = pbs_variant;

@@ -15,6 +15,11 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size, uint32_t grouping_factor);

void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
    void *stream, uint32_t gpu_index, void *dest, void const *src,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size, uint32_t grouping_factor);

uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -33,6 +38,25 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
                                                   uint32_t gpu_index,
                                                   int8_t **pbs_buffer);

uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *bootstrapping_key,
    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
    uint32_t lut_stride);

void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
                                                       const uint32_t gpu_index,
                                                       int8_t **buffer);
}

#endif // CUDA_MULTI_BIT_H

@@ -27,7 +27,7 @@ template <typename Torus> struct zk_expand_mem {
      int_radix_params casting_params, KS_TYPE casting_key_type,
      const uint32_t *num_lwes_per_compact_list,
      const bool *is_boolean_array, uint32_t num_compact_lists,
      bool allocate_gpu_memory, uint64_t *size_tracker)
      bool allocate_gpu_memory, uint64_t &size_tracker)
      : computing_params(computing_params), casting_params(casting_params),
        num_compact_lists(num_compact_lists),
        casting_key_type(casting_key_type) {

@@ -1,5 +1,6 @@
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
add_library(tfhe_cuda_backend STATIC ${SOURCES} pbs/programmable_bootstrap_multibit_128.cuh
                                     pbs/programmable_bootstrap_multibit_128.cu)
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
target_include_directories(tfhe_cuda_backend PRIVATE .)

@@ -38,6 +38,16 @@ __device__ Torus *get_ith_block(Torus *ksk, int i, int level,
// Each thread in x is used to calculate one output.
// Threads in y are used to parallelize the lwe_dimension_in loop.
// Shared memory is used to store intermediate results of the reduction.
// Note: To reduce register pressure we have slightly changed the algorithm;
// the idea consists in calculating the negated value of the output. So, instead
// of accumulating subtractions using -=, we accumulate additions using += in
// the local_lwe_out. This seems to work better because it profits from madd
// ops and saves some regs. For this to work, we need to negate the input
// lwe_array_in[lwe_dimension_in], and negate back the output at the end to get
// the correct results. Additionally, we split the calculation of the ksk offset
// in two parts: a constant part is calculated before the loop, and a variable
// part is calculated inside the loop. This seems to help with the register
// pressure as well.
template <typename Torus>
__global__ void
keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
@@ -60,7 +70,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
      lwe_array_in, lwe_input_indexes[blockIdx.x], lwe_dimension_in + 1);

  if (tid == lwe_dimension_out && threadIdx.y == 0) {
    local_lwe_out = block_lwe_array_in[lwe_dimension_in];
    local_lwe_out = -block_lwe_array_in[lwe_dimension_in];
  }
  const Torus mask_mod_b = (1ll << base_log) - 1ll;

@@ -73,12 +83,12 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
  for (int i = start_i; i < end_i; i++) {
    Torus state =
        init_decomposer_state(block_lwe_array_in[i], base_log, level_count);

    uint32_t offset = i * level_count * (lwe_dimension_out + 1);
    for (int j = 0; j < level_count; j++) {
      auto ksk_block =
          get_ith_block(ksk, i, j, lwe_dimension_out, level_count);

      Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
      local_lwe_out -= (Torus)ksk_block[tid] * decomposed;
      local_lwe_out +=
          (Torus)ksk[tid + j * (lwe_dimension_out + 1) + offset] * decomposed;
    }
  }

@@ -93,7 +103,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
          lwe_acc_out[shmem_index + offset * blockDim.x];
    }
    if (threadIdx.y == 0)
      block_lwe_array_out[tid] = lwe_acc_out[shmem_index];
      block_lwe_array_out[tid] = -lwe_acc_out[shmem_index];
  }
}

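The identity behind the negate-accumulate rewrite in this kernel is just b - sum(a_i * k_i) = -((-b) + sum(a_i * k_i)). A minimal scalar sketch of that equivalence (plain C++ with illustrative names, not the kernel's real signature; wrapping uint64_t arithmetic models the torus):

#include <cstdint>
#include <vector>

// Instead of accumulating subtractions with -=, accumulate the negated
// output with += (which maps to multiply-add on GPU) and negate once at
// the end. The result equals body - sum(decomposed[i] * ksk[i]) mod 2^64.
uint64_t negate_accumulate(uint64_t body,
                           const std::vector<uint64_t> &decomposed,
                           const std::vector<uint64_t> &ksk) {
  uint64_t neg_out = -body; // negate the input body once
  for (size_t i = 0; i < decomposed.size(); ++i)
    neg_out += decomposed[i] * ksk[i]; // additions fuse into madd
  return -neg_out; // negate back to recover the true output
}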
@@ -176,10 +186,10 @@ __host__ uint64_t scratch_packing_keyswitch_lwe_list_to_glwe(
          ? glwe_accumulator_size
          : lwe_dimension * 2;

  uint64_t size_tracker;
  uint64_t size_tracker = 0;
  uint64_t buffer_size = 2 * num_lwes * memory_unit * sizeof(Torus);
  *fp_ks_buffer = (int8_t *)cuda_malloc_with_size_tracking_async(
      buffer_size, stream, gpu_index, &size_tracker, allocate_gpu_memory);
      buffer_size, stream, gpu_index, size_tracker, allocate_gpu_memory);
  return size_tracker;
}

@@ -66,6 +66,13 @@ __device__ inline void typecast_torus_to_double<uint64_t>(uint64_t x,
  r = __ll2double_rn(x);
}

template <>
__device__ inline void typecast_torus_to_double<__uint128_t>(__uint128_t x,
                                                             double &r) {
  // We truncate x
  r = __ll2double_rn(static_cast<uint64_t>(x));
}

template <typename T>
__device__ inline T init_decomposer_state(T input, uint32_t base_log,
                                          uint32_t level_count) {

@@ -74,10 +74,9 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
/// asynchronously.
void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
                                           uint32_t gpu_index,
                                           uint64_t *size_tracker,
                                           uint64_t &size_tracker,
                                           bool allocate_gpu_memory) {
  if (size_tracker != nullptr)
    *size_tracker += size;
  size_tracker += size;
  void *ptr = nullptr;
  if (!allocate_gpu_memory)
    return ptr;
@@ -106,8 +105,9 @@ void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
/// asynchronously.
void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
                        uint32_t gpu_index) {
  return cuda_malloc_with_size_tracking_async(size, stream, gpu_index, nullptr,
                                              true);
  uint64_t size_tracker = 0;
  return cuda_malloc_with_size_tracking_async(size, stream, gpu_index,
                                              size_tracker, true);
}
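The switch from a nullable pointer to a reference makes the dry-run pattern explicit: every tracked allocation unconditionally adds its size to the counter, and when allocate_gpu_memory is false nothing is actually allocated, so one scratch call reports the exact GPU footprint. A host-only sketch of the two-phase use (the plan_buffers helper and its sizes are hypothetical, not part of the backend):

#include <cstdint>
#include <cstdio>

// Pass 1 (allocate = false) only accumulates sizes; pass 2 really allocates.
void plan_buffers(uint64_t &size_tracker, bool allocate) {
  const uint64_t sizes[] = {1u << 20, 1u << 18, 1u << 22};
  for (uint64_t s : sizes) {
    size_tracker += s; // counted in both phases, mirroring the code above
    if (allocate) {
      /* the real cudaMallocAsync-based call would go here */
    }
  }
}

int main() {
  uint64_t required = 0;
  plan_buffers(required, /*allocate=*/false); // dry run: just measure
  std::printf("scratch needs %llu bytes\n", (unsigned long long)required);
  // the caller can now verify the device has enough free memory, then:
  uint64_t tracked = 0;
  plan_buffers(tracked, /*allocate=*/true); // real allocation pass
}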
/// Check that allocation is valid
@@ -122,6 +122,13 @@ bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
  }
}

uint64_t cuda_device_total_memory(uint32_t gpu_index) {
  cuda_set_device(gpu_index);
  size_t total_mem = 0, free_mem = 0;
  check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
  return total_mem;
}

/// Returns
/// false if Cooperative Groups is not supported.
/// true otherwise

@@ -234,6 +234,29 @@ __device__ void convert_u128_to_f128_as_torus(
  }
}

// params is expected to be full degree not half degree
// same as convert_u128_to_f128_as_torus() but expects input to be on registers
template <class params>
__device__ void convert_u128_on_regs_to_f128_as_torus(
    double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
    const __uint128_t *in_re_on_regs, const __uint128_t *in_im_on_regs) {

  const double normalization = pow(2., -128.);
  Index tid = threadIdx.x;
  // #pragma unroll
  for (Index i = 0; i < params::opt / 2; i++) {
    auto out_re = u128_to_signed_to_f128(in_re_on_regs[i]);
    auto out_im = u128_to_signed_to_f128(in_im_on_regs[i]);

    out_re_hi[tid] = out_re.hi * normalization;
    out_re_lo[tid] = out_re.lo * normalization;
    out_im_hi[tid] = out_im.hi * normalization;
    out_im_lo[tid] = out_im.lo * normalization;

    tid += params::degree / params::opt;
  }
}

template <class params>
__device__ void
convert_f128_to_u128_as_torus(__uint128_t *out_re, __uint128_t *out_im,
@@ -272,7 +295,7 @@ batch_convert_u128_to_f128_as_integer(double *out_re_hi, double *out_re_lo,
}

// params is expected to be full degree not half degree
// converts standqard input into complex<128> represented by 4 double
// converts standard input into complex<128> represented by 4 double
// with following pattern: [re_hi_0, re_hi_1, ... re_hi_n, re_lo_0, re_lo_1,
// ... re_lo_n, im_hi_0, im_hi_1, ..., im_hi_n, im_lo_0, im_lo_1, ..., im_lo_n]
template <class params>
@@ -291,7 +314,7 @@ batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
}

// params is expected to be full degree not half degree
// converts standqard input into complex<128> represented by 4 double
// converts standard input into complex<128> represented by 4 double
// with following pattern: [re_hi_0, re_lo_0, im_hi_0, im_lo_0, re_hi_1,
// re_lo_1, im_hi_1, im_lo_1,
// ...,re_hi_n, re_lo_n, im_hi_n, im_lo_n, ]

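The f128 layout above is a double-double: a 128-bit torus element becomes a pair hi + lo, where hi holds roughly the top 53 bits and lo the residual, both scaled by 2^-128 so the pair represents a real torus point in [-0.5, 0.5). A host-side sketch of that split, assuming a compiler with __int128 support (u128_to_f128 is an illustrative stand-in for the device helper, and the rounding edge case at the type bounds is ignored):

#include <cstdio>

// Split a 128-bit torus element into (hi, lo) doubles scaled by 2^-128.
// Interpreting the input as signed makes values above 2^127 wrap to the
// negative half of the torus, as the device conversion does.
static void u128_to_f128(unsigned __int128 x, double &hi, double &lo) {
  const double normalization = 0x1p-128; // 2^-128
  __int128 s = (__int128)x;              // signed reinterpretation
  double h = (double)s;                  // nearest double: top ~53 bits
  double l = (double)(s - (__int128)h);  // exact residual: next ~53 bits
  hi = h * normalization;
  lo = l * normalization;
}

int main() {
  double hi, lo;
  u128_to_f128((unsigned __int128)1 << 126, hi, lo); // torus value 0.25
  std::printf("%.17g + %.17g\n", hi, lo);            // prints 0.25 + 0
}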
@@ -26,7 +26,7 @@ __host__ uint64_t scratch_cuda_integer_abs_kb(
  if (is_signed) {
    *mem_ptr = new int_abs_buffer<Torus>(streams, gpu_indexes, gpu_count,
                                         params, num_blocks,
                                         allocate_gpu_memory, &size_tracker);
                                         allocate_gpu_memory, size_tracker);
  }
  return size_tracker;
}

@@ -61,7 +61,7 @@ __host__ uint64_t scratch_cuda_integer_radix_bitop_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_bitop_buffer<Torus>(streams, gpu_indexes, gpu_count, op,
                                         params, num_radix_blocks,
                                         allocate_gpu_memory, &size_tracker);
                                         allocate_gpu_memory, size_tracker);
  return size_tracker;
}


@@ -44,7 +44,7 @@ __host__ uint64_t scratch_extend_radix_with_sign_msb(

  *mem_ptr = new int_extend_radix_with_sign_msb_buffer<Torus>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
      num_additional_blocks, allocate_gpu_memory, &size_tracker);
      num_additional_blocks, allocate_gpu_memory, size_tracker);

  return size_tracker;
}

@@ -100,7 +100,7 @@ __host__ uint64_t scratch_cuda_integer_radix_cmux_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_cmux_buffer<Torus>(
      streams, gpu_indexes, gpu_count, predicate_lut_f, params,
      num_radix_blocks, allocate_gpu_memory, &size_tracker);
      num_radix_blocks, allocate_gpu_memory, size_tracker);
  return size_tracker;
}
#endif

@@ -684,7 +684,7 @@ __host__ uint64_t scratch_cuda_integer_radix_comparison_check_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_comparison_buffer<Torus>(
      streams, gpu_indexes, gpu_count, op, params, num_radix_blocks, is_signed,
      allocate_gpu_memory, &size_tracker);
      allocate_gpu_memory, size_tracker);
  return size_tracker;
}


@@ -370,7 +370,7 @@ __host__ uint64_t scratch_cuda_compress_integer_radix_ciphertext(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_compression<Torus>(
      streams, gpu_indexes, gpu_count, compression_params, num_radix_blocks,
      lwe_per_glwe, storage_log_modulus, allocate_gpu_memory, &size_tracker);
      lwe_per_glwe, storage_log_modulus, allocate_gpu_memory, size_tracker);
  return size_tracker;
}

@@ -386,7 +386,7 @@ __host__ uint64_t scratch_cuda_integer_decompress_radix_ciphertext(
  *mem_ptr = new int_decompression<Torus>(
      streams, gpu_indexes, gpu_count, encryption_params, compression_params,
      num_radix_blocks, body_count, storage_log_modulus, allocate_gpu_memory,
      &size_tracker);
      size_tracker);
  return size_tracker;
}
#endif

@@ -28,7 +28,7 @@ __host__ uint64_t scratch_cuda_integer_div_rem_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_div_rem_memory<Torus>(streams, gpu_indexes, gpu_count,
                                           params, is_signed, num_blocks,
                                           allocate_gpu_memory, &size_tracker);
                                           allocate_gpu_memory, size_tracker);
  return size_tracker;
}


@@ -398,7 +398,7 @@ uint64_t scratch_cuda_apply_noise_squashing_mem(
  *mem_ptr = new int_noise_squashing_lut<uint64_t>(
      (cudaStream_t *)streams, gpu_indexes, gpu_count, params, glwe_dimension,
      polynomial_size, num_radix_blocks, original_num_blocks,
      allocate_gpu_memory, &size_tracker);
      allocate_gpu_memory, size_tracker);
  return size_tracker;
}


@@ -1472,7 +1472,7 @@ uint64_t scratch_cuda_full_propagation(cudaStream_t const *streams,
  uint64_t size_tracker = 0;
  *mem_ptr =
      new int_fullprop_buffer<Torus>(streams, gpu_indexes, gpu_count, params,
                                     allocate_gpu_memory, &size_tracker);
                                     allocate_gpu_memory, size_tracker);
  return size_tracker;
}

@@ -1707,7 +1707,7 @@ uint64_t scratch_cuda_apply_univariate_lut_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
                                      1, num_radix_blocks, allocate_gpu_memory,
                                      &size_tracker);
                                      size_tracker);
  // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
  // 0
  cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1743,7 +1743,7 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
                                      1, num_radix_blocks, num_many_lut,
                                      allocate_gpu_memory, &size_tracker);
                                      allocate_gpu_memory, size_tracker);
  // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
  // 0
  cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1779,7 +1779,7 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
                                      1, num_radix_blocks, allocate_gpu_memory,
                                      &size_tracker);
                                      size_tracker);
  // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
  // 0
  cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1817,7 +1817,7 @@ uint64_t scratch_cuda_propagate_single_carry_kb_inplace(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_sc_prop_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag,
      uses_carry, allocate_gpu_memory, &size_tracker);
      uses_carry, allocate_gpu_memory, size_tracker);
  return size_tracker;
}
// This function performs the three steps of Thomas' new carry propagation
@@ -1935,15 +1935,26 @@ void host_add_and_propagate_single_carry(
  PUSH_RANGE("add & propagate sc")
  if (lhs_array->num_radix_blocks != rhs_array->num_radix_blocks)
    PANIC("Cuda error: input and output num radix blocks must be the same")
  if (lhs_array->lwe_dimension != rhs_array->lwe_dimension ||
      lhs_array->lwe_dimension != input_carries->lwe_dimension ||
      lhs_array->lwe_dimension != carry_out->lwe_dimension)
    PANIC("Cuda error: input and output lwe dimension must be the same")

  // Check input carries if used
  if (uses_carry == 1) {
    if (input_carries == nullptr)
      PANIC("Cuda error: if uses_carry is enabled, input_carries cannot be a "
            "null pointer");
    if (lhs_array->lwe_dimension != input_carries->lwe_dimension)
      PANIC(
          "Cuda error: input and input_carries lwe dimension must be the same");
  }

  // Allow nullptr for carry_out if FLAG_NONE is requested
  if ((requested_flag == outputFlag::FLAG_OVERFLOW ||
       requested_flag == outputFlag::FLAG_CARRY) &&
      carry_out == nullptr)
    PANIC("Cuda error: when requesting FLAG_CARRY, carry_out must be a valid "
          "pointer")
       requested_flag == outputFlag::FLAG_CARRY)) {
    if (carry_out == nullptr)
      PANIC("Cuda error: when requesting FLAG_CARRY or FLAG_OVERFLOW, "
            "carry_out must be a valid pointer")
    if (lhs_array->lwe_dimension != carry_out->lwe_dimension)
      PANIC("Cuda error: input and carry_out lwe dimension must be the same")
  }

  auto num_radix_blocks = lhs_array->num_radix_blocks;
  auto params = mem->params;
@@ -2047,6 +2058,7 @@ void host_add_and_propagate_single_carry(
    copy_radix_ciphertext_slice_async<Torus>(
        streams[0], gpu_indexes[0], lhs_array, 0, num_radix_blocks,
        mem->output_flag, 0, num_radix_blocks);

    copy_radix_ciphertext_slice_async<Torus>(
        streams[0], gpu_indexes[0], carry_out, 0, 1, mem->output_flag,
        num_radix_blocks, num_radix_blocks + 1);
@@ -2068,7 +2080,7 @@ uint64_t scratch_cuda_integer_overflowing_sub(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_borrow_prop_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
      compute_overflow, allocate_gpu_memory, &size_tracker);
      compute_overflow, allocate_gpu_memory, size_tracker);
  return size_tracker;
}


@@ -210,7 +210,8 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, bool allocate_ms_array) {
    bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
    bool allocate_ms_array) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -220,79 +221,26 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
  return scratch_cuda_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
      max_num_radix_in_vec, params, allocate_gpu_memory);
      max_num_radix_in_vec, reduce_degrees_for_single_carry_propagation, params,
      allocate_gpu_memory);
}

void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *radix_lwe_out,
    CudaRadixCiphertextFFI *radix_lwe_vec,
    bool reduce_degrees_for_single_carry_propagation, int8_t *mem_ptr,
    void *const *bsks, void *const *ksks,
    CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
    void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;
  if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
    PANIC("Cuda error: input vector length should be a multiple of the "
          "output's number of radix blocks")
  switch (mem->params.polynomial_size) {
  case 512:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
        radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
        (uint64_t **)(ksks), ms_noise_reduction_key, mem,
        radix_lwe_out->num_radix_blocks,
        radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
    break;
  case 1024:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
                                                AmortizedDegree<1024>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
        radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
        (uint64_t **)(ksks), ms_noise_reduction_key, mem,
        radix_lwe_out->num_radix_blocks,
        radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
    break;
  case 2048:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
                                                AmortizedDegree<2048>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
        radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
        (uint64_t **)(ksks), ms_noise_reduction_key, mem,
        radix_lwe_out->num_radix_blocks,
        radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
    break;
  case 4096:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
                                                AmortizedDegree<4096>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
        radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
        (uint64_t **)(ksks), ms_noise_reduction_key, mem,
        radix_lwe_out->num_radix_blocks,
        radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
    break;
  case 8192:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
                                                AmortizedDegree<8192>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
        radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
        (uint64_t **)(ksks), ms_noise_reduction_key, mem,
        radix_lwe_out->num_radix_blocks,
        radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
    break;
  case 16384:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
                                                AmortizedDegree<16384>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
        radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
        (uint64_t **)(ksks), ms_noise_reduction_key, mem,
        radix_lwe_out->num_radix_blocks,
        radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
    break;
  default:
    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
          "Supported N's are powers of two in the interval [256..16384].")
  }
  host_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
      radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
      radix_lwe_out->num_radix_blocks,
      radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
}

void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(

@@ -20,7 +20,6 @@
#include <fstream>
#include <iostream>
#include <omp.h>
#include <queue>
#include <sstream>
#include <string>
#include <vector>
@@ -273,133 +272,27 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
  }
}

struct radix_columns {
  std::vector<size_t> columns_counter;
  size_t num_blocks;
  size_t num_radix_in_vec;
  size_t chunk_size;
  radix_columns(const uint64_t *const input_degrees, size_t num_blocks,
                size_t num_radix_in_vec, size_t chunk_size,
                bool &needs_processing)
      : num_blocks(num_blocks), num_radix_in_vec(num_radix_in_vec),
        chunk_size(chunk_size) {
    needs_processing = false;
    columns_counter.resize(num_blocks, 0);
    for (size_t i = 0; i < num_radix_in_vec; ++i) {
      for (size_t j = 0; j < num_blocks; ++j) {
        if (input_degrees[i * num_blocks + j])
          columns_counter[j] += 1;
      }
    }

    for (size_t i = 0; i < num_blocks; ++i) {
      if (columns_counter[i] > chunk_size) {
        needs_processing = true;
        break;
      }
    }
  }

  void next_accumulation(size_t &total_ciphertexts, size_t &message_ciphertexts,
                         bool &needs_processing) {
    message_ciphertexts = 0;
    total_ciphertexts = 0;
    needs_processing = false;
    for (int i = num_blocks - 1; i > 0; --i) {
      size_t cur_count = columns_counter[i];
      size_t prev_count = columns_counter[i - 1];
      size_t new_count = 0;

      // accumulated blocks from the current column
      new_count += cur_count / chunk_size;
      // all accumulated message blocks need a pbs
      message_ciphertexts += new_count;
      // carry blocks from the previous column
      new_count += prev_count / chunk_size;
      // both carry and message blocks need a pbs
      total_ciphertexts += new_count;
      // now add the remaining non-accumulated blocks that do not require a pbs
      new_count += cur_count % chunk_size;

      columns_counter[i] = new_count;

      if (new_count > chunk_size)
        needs_processing = true;
    }

    // now do it for the 0th block
    size_t new_count = columns_counter[0] / chunk_size;
    message_ciphertexts += new_count;
    total_ciphertexts += new_count;
    new_count += columns_counter[0] % chunk_size;
    columns_counter[0] = new_count;

    if (new_count > chunk_size) {
      needs_processing = true;
    }
  }
};
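A concrete trace makes the counters easier to follow: with chunk_size = 3 and five radix values whose blocks are all nonzero, every column starts at count 5, so needs_processing is true; one round of next_accumulation turns an inner column's 5 into 5/3 (accumulated) + 5/3 (carries from the left neighbor) + 5%3 (leftover) = 4, a second round brings it to 3, and the loop stops. A host-only driver under the assumption that the struct above is in scope:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const size_t num_blocks = 4, num_radix_in_vec = 5, chunk_size = 3;
  // 5 radix integers of 4 blocks, every block with a nonzero degree.
  std::vector<uint64_t> degrees(num_blocks * num_radix_in_vec, 1);
  bool needs_processing = false;
  radix_columns columns(degrees.data(), num_blocks, num_radix_in_vec,
                        chunk_size, needs_processing);
  size_t rounds = 0;
  while (needs_processing) { // terminates after two rounds for this input
    size_t total = 0, messages = 0;
    columns.next_accumulation(total, messages, needs_processing);
    std::printf("round %zu: %zu PBS total, %zu on message blocks\n", ++rounds,
                total, messages);
  }
}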

inline void calculate_final_degrees(uint64_t *const out_degrees,
                                    const uint64_t *const input_degrees,
                                    size_t num_blocks, size_t num_radix_in_vec,
                                    size_t chunk_size,
                                    uint64_t message_modulus) {

  auto get_degree = [message_modulus](uint64_t degree) -> uint64_t {
    return std::min(message_modulus - 1, degree);
  };
  std::vector<std::queue<uint64_t>> columns(num_blocks);
  for (size_t i = 0; i < num_radix_in_vec; ++i) {
    for (size_t j = 0; j < num_blocks; ++j) {
      if (input_degrees[i * num_blocks + j])
        columns[j].push(input_degrees[i * num_blocks + j]);
    }
  }

  for (size_t i = 0; i < num_blocks; ++i) {
    auto &col = columns[i];
    while (col.size() > 1) {
      uint32_t cur_degree = 0;
      size_t mn = std::min(chunk_size, col.size());
      for (int j = 0; j < mn; ++j) {
        cur_degree += col.front();
        col.pop();
      }
      const uint64_t new_degree = get_degree(cur_degree);
      col.push(new_degree);
      if ((i + 1) < num_blocks) {
        columns[i + 1].push(new_degree);
      }
    }
  }

  for (int i = 0; i < num_blocks; i++) {
    out_degrees[i] = (columns[i].empty()) ? 0 : columns[i].front();
  }
}

template <typename Torus>
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
    int_radix_params params, bool allocate_gpu_memory) {
    bool reduce_degrees_for_single_carry_propagation, int_radix_params params,
    bool allocate_gpu_memory) {

  uint64_t size_tracker = 0;
  *mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, num_blocks_in_radix,
      max_num_radix_in_vec, allocate_gpu_memory, &size_tracker);
      max_num_radix_in_vec, reduce_degrees_for_single_carry_propagation,
      allocate_gpu_memory, size_tracker);
  return size_tracker;
}

template <typename Torus, class params>
template <typename Torus>
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
    CudaRadixCiphertextFFI *terms,
    bool reduce_degrees_for_single_carry_propagation, void *const *bsks,
    uint64_t *const *ksks,
    CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
    uint32_t num_radix_blocks, uint32_t num_radix_in_vec) {
@@ -424,10 +317,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
  auto d_columns_counter = mem_ptr->d_columns_counter;
  auto d_new_columns = mem_ptr->d_new_columns;
  auto d_new_columns_counter = mem_ptr->d_new_columns_counter;
  auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
  auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;

  auto luts_message_carry = mem_ptr->luts_message_carry;

  auto glwe_dimension = mem_ptr->params.glwe_dimension;
  auto polynomial_size = mem_ptr->params.polynomial_size;
@@ -442,8 +331,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
  uint32_t num_many_lut = 1;
  uint32_t lut_stride = 0;

  if (terms->num_radix_blocks == 0)
  if (terms->num_radix_blocks == 0) {
    return;
  }
  if (num_radix_in_vec == 1) {
    copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
                                             radix_lwe_out, 0, num_radix_blocks,
@@ -460,10 +350,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
    return;
  }

  if (mem_ptr->mem_reuse) {
    mem_ptr->setup_lookup_tables(streams, gpu_indexes, gpu_count);
  }

  if (current_blocks != terms) {
    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
                                       current_blocks, terms);
@@ -481,11 +367,17 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
  bool needs_processing = false;
  radix_columns current_columns(current_blocks->degrees, num_radix_blocks,
                                num_radix_in_vec, chunk_size, needs_processing);
  int number_of_threads = min(256, params::degree);
  int number_of_threads = std::min(256, (int)mem_ptr->params.polynomial_size);
  int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
  const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);

  mem_ptr->setup_lookup_tables(streams, gpu_indexes, gpu_count,
                               num_radix_in_vec, current_blocks->degrees);

  while (needs_processing) {
    auto luts_message_carry = mem_ptr->luts_message_carry;
    auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
    auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
    calculate_chunks<Torus>
        <<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
            (Torus *)(current_blocks->ptr), d_columns, d_columns_counter,
@@ -496,8 +388,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
        d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0), d_columns,
        d_columns_counter, chunk_size);

    size_t total_ciphertexts;
    size_t total_messages;
    uint32_t total_ciphertexts;
    uint32_t total_messages;
    current_columns.next_accumulation(total_ciphertexts, total_messages,
                                      needs_processing);

@@ -549,9 +441,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
      luts_message_carry->using_trivial_lwe_indexes = false;

      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, active_gpu_count, current_blocks,
          current_blocks, bsks, ksks, ms_noise_reduction_key,
          luts_message_carry, total_ciphertexts);
          streams, gpu_indexes, gpu_count, current_blocks, current_blocks, bsks,
          ksks, ms_noise_reduction_key, luts_message_carry, total_ciphertexts);
    }
    cuda_set_device(gpu_indexes[0]);
    std::swap(d_columns, d_new_columns);
@@ -563,15 +454,18 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
          (Torus *)(radix_lwe_out->ptr), (Torus *)(current_blocks->ptr),
          d_columns, d_columns_counter, chunk_size, big_lwe_size);

  if (reduce_degrees_for_single_carry_propagation) {
  if (mem_ptr->reduce_degrees_for_single_carry_propagation) {
    auto luts_message_carry = mem_ptr->luts_message_carry;
    auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
    auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
    prepare_final_pbs_indexes<Torus>
        <<<1, 2 * num_radix_blocks, 0, streams[0]>>>(
            d_pbs_indexes_in, d_pbs_indexes_out,
            luts_message_carry->get_lut_indexes(0, 0), num_radix_blocks);

    cuda_memset_async(
        (Torus *)(current_blocks->ptr) + big_lwe_size * num_radix_blocks, 0,
        big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
    set_zero_radix_ciphertext_slice_async<Torus>(
        streams[0], gpu_indexes[0], current_blocks, num_radix_blocks,
        num_radix_blocks + 1);

    auto active_gpu_count =
        get_active_gpu_count(2 * num_radix_blocks, gpu_count);
@@ -772,10 +666,10 @@ __host__ void host_integer_mult_radix_kb(
    size_t b_id = i % num_blocks;
    terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
  }
  host_integer_partial_sum_ciphertexts_vec_kb<Torus, params>(
      streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, true,
      bsks, ksks, ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem,
      num_blocks, 2 * num_blocks);
  host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
      streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, bsks,
      ksks, ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
      2 * num_blocks);

  auto scp_mem_ptr = mem_ptr->sc_prop_mem;
  uint32_t requested_flag = outputFlag::FLAG_NONE;
@@ -796,7 +690,7 @@ __host__ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_mul_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, is_boolean_left,
      is_boolean_right, num_radix_blocks, allocate_gpu_memory, &size_tracker);
      is_boolean_right, num_radix_blocks, allocate_gpu_memory, size_tracker);
  return size_tracker;
}


@@ -121,7 +121,7 @@ __host__ uint64_t scratch_cuda_integer_overflowing_sub_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_overflowing_sub_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory,
      allocate_ms_array, &size_tracker);
      allocate_ms_array, size_tracker);
  POP_RANGE()
  return size_tracker;
}

@@ -13,7 +13,7 @@ void create_zero_radix_ciphertext_async(cudaStream_t const stream,
                                        CudaRadixCiphertextFFI *radix,
                                        const uint32_t num_radix_blocks,
                                        const uint32_t lwe_dimension,
                                        uint64_t *size_tracker,
                                        uint64_t &size_tracker,
                                        bool allocate_gpu_memory) {
  PUSH_RANGE("create zero radix ct");
  radix->lwe_dimension = lwe_dimension;

@@ -277,8 +277,9 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    auto overflowed = x_0 < x_1;
    return (Torus)(invert_flags.second ^ overflowed);
  };
  uint64_t size = 0;
  int_radix_lut<Torus> *one_block_lut = new int_radix_lut<Torus>(
      streams, gpu_indexes, gpu_count, params, 1, 1, true, nullptr);
      streams, gpu_indexes, gpu_count, params, 1, 1, true, size);

  generate_device_accumulator<Torus>(
      streams[0], gpu_indexes[0], one_block_lut->get_lut(0, 0),
@@ -578,8 +579,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
           is_x_less_than_y_given_input_borrow<Torus>(x_0, x_1, 0,
                                                      message_modulus);
  };
  uint64_t size = 0;
  int_radix_lut<Torus> *one_block_lut = new int_radix_lut<Torus>(
      streams, gpu_indexes, gpu_count, params, 1, 1, true, nullptr);
      streams, gpu_indexes, gpu_count, params, 1, 1, true, size);

  generate_device_accumulator<Torus>(
      streams[0], gpu_indexes[0], one_block_lut->get_lut(0, 0),

backends/tfhe-cuda-backend/cuda/src/integer/scalar_div.cu (new file)
@@ -0,0 +1,202 @@
#include "scalar_div.cuh"

uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
    bool allocate_gpu_memory, bool allocate_ms_array) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
                          ks_level, ks_base_log, pbs_level, pbs_base_log,
                          grouping_factor, message_modulus, carry_modulus,
                          allocate_ms_array);

  return scratch_integer_unsigned_scalar_div_radix<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
      (int_unsigned_scalar_div_mem<uint64_t> **)mem_ptr, num_blocks,
      scalar_divisor_ffi, allocate_gpu_memory);
}

void cuda_integer_unsigned_scalar_div_radix_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
    void *const *ksks,
    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
    const CudaScalarDivisorFFI *scalar_divisor_ffi) {

  host_integer_unsigned_scalar_div_radix<uint64_t>(
      (cudaStream_t *)streams, gpu_indexes, gpu_count, numerator_ct,
      (int_unsigned_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
      ms_noise_reduction_key, scalar_divisor_ffi);
}

void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void) {

  int_unsigned_scalar_div_mem<uint64_t> *mem_ptr =
      (int_unsigned_scalar_div_mem<uint64_t> *)(*mem_ptr_void);

  mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);

  delete mem_ptr;
  *mem_ptr_void = nullptr;
}

uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
    bool allocate_gpu_memory, bool allocate_ms_array) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
                          ks_level, ks_base_log, pbs_level, pbs_base_log,
                          grouping_factor, message_modulus, carry_modulus,
                          allocate_ms_array);

  return scratch_integer_signed_scalar_div_radix_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
      (int_signed_scalar_div_mem<uint64_t> **)mem_ptr, num_blocks,
      scalar_divisor_ffi, allocate_gpu_memory);
}

void cuda_integer_signed_scalar_div_radix_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
    void *const *ksks,
    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
    const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits) {

  host_integer_signed_scalar_div_radix_kb<uint64_t>(
      (cudaStream_t *)streams, gpu_indexes, gpu_count, numerator_ct,
      (int_signed_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
      ms_noise_reduction_key, scalar_divisor_ffi, numerator_bits);
}

void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void) {

  int_signed_scalar_div_mem<uint64_t> *mem_ptr =
      (int_signed_scalar_div_mem<uint64_t> *)(*mem_ptr_void);

  mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);

  delete mem_ptr;
  *mem_ptr_void = nullptr;
}

uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint32_t const active_bits_divisor, bool allocate_gpu_memory,
    bool allocate_ms_array) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
                          ks_level, ks_base_log, pbs_level, pbs_base_log,
                          grouping_factor, message_modulus, carry_modulus,
                          allocate_ms_array);

  return scratch_integer_unsigned_scalar_div_rem_radix<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
      (int_unsigned_scalar_div_rem_buffer<uint64_t> **)mem_ptr, num_blocks,
      scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
}

void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
    const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint64_t const *divisor_has_at_least_one_set,
    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
    void const *clear_blocks, void const *h_clear_blocks,
    uint32_t num_clear_blocks) {

  host_integer_unsigned_scalar_div_rem_radix<uint64_t>(
      (cudaStream_t *)streams, gpu_indexes, gpu_count, quotient_ct,
      remainder_ct, (int_unsigned_scalar_div_rem_buffer<uint64_t> *)mem_ptr,
      bsks, (uint64_t **)ksks, ms_noise_reduction_key, scalar_divisor_ffi,
      divisor_has_at_least_one_set, decomposed_divisor, num_scalars_divisor,
      (uint64_t *)clear_blocks, (uint64_t *)h_clear_blocks, num_clear_blocks);
}

void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void) {

  int_unsigned_scalar_div_rem_buffer<uint64_t> *mem_ptr =
      (int_unsigned_scalar_div_rem_buffer<uint64_t> *)(*mem_ptr_void);

  mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);

  delete mem_ptr;
  *mem_ptr_void = nullptr;
}

uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint32_t const active_bits_divisor, bool allocate_gpu_memory,
    bool allocate_ms_array) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
                          ks_level, ks_base_log, pbs_level, pbs_base_log,
                          grouping_factor, message_modulus, carry_modulus,
                          allocate_ms_array);

  return scratch_integer_signed_scalar_div_rem_radix<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
      (int_signed_scalar_div_rem_buffer<uint64_t> **)mem_ptr, num_blocks,
      scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
}

void cuda_integer_signed_scalar_div_rem_radix_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint64_t const *divisor_has_at_least_one_set,
    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
    uint32_t numerator_bits) {

  host_integer_signed_scalar_div_rem_radix<uint64_t>(
      (cudaStream_t *)streams, gpu_indexes, gpu_count, quotient_ct,
      remainder_ct, (int_signed_scalar_div_rem_buffer<uint64_t> *)mem_ptr, bsks,
      (uint64_t **)ksks, ms_noise_reduction_key, scalar_divisor_ffi,
      divisor_has_at_least_one_set, decomposed_divisor, num_scalars_divisor,
      numerator_bits);
}

void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void) {

  int_signed_scalar_div_rem_buffer<uint64_t> *mem_ptr =
      (int_signed_scalar_div_rem_buffer<uint64_t> *)(*mem_ptr_void);

  mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);

  delete mem_ptr;
  *mem_ptr_void = nullptr;
}
backends/tfhe-cuda-backend/cuda/src/integer/scalar_div.cuh (new file)
@@ -0,0 +1,415 @@
#ifndef SCALAR_DIV_CUH
#define SCALAR_DIV_CUH

#include "integer/integer_utilities.h"
#include "integer/scalar_bitops.cuh"
#include "integer/scalar_mul.cuh"
#include "integer/scalar_shifts.cuh"
#include "integer/subtraction.cuh"

template <typename Torus>
__host__ uint64_t scratch_integer_unsigned_scalar_div_radix(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, const int_radix_params params,
    int_unsigned_scalar_div_mem<Torus> **mem_ptr, uint32_t num_radix_blocks,
    const CudaScalarDivisorFFI *scalar_divisor_ffi,
    const bool allocate_gpu_memory) {

  uint64_t size_tracker = 0;

  *mem_ptr = new int_unsigned_scalar_div_mem<Torus>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
      scalar_divisor_ffi, allocate_gpu_memory, size_tracker);

  return size_tracker;
}

template <typename Torus>
__host__ void host_integer_unsigned_scalar_div_radix(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *numerator_ct,
    int_unsigned_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    const CudaScalarDivisorFFI *scalar_divisor_ffi) {

  if (scalar_divisor_ffi->is_abs_divisor_one) {
    return;
  }

  if (scalar_divisor_ffi->is_divisor_pow2) {
    host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
        streams, gpu_indexes, gpu_count, numerator_ct,
        scalar_divisor_ffi->ilog2_divisor, mem_ptr->logical_scalar_shift_mem,
        bsks, ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
    return;
  }

  if (scalar_divisor_ffi->divisor_has_more_bits_than_numerator) {
    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
                                       mem_ptr->tmp_ffi);
    return;
  }

  if (scalar_divisor_ffi->is_chosen_multiplier_geq_two_pow_numerator) {

    if (scalar_divisor_ffi->shift_pre != (uint64_t)0) {
      PANIC("shift_pre should be == 0");
    }

    if (scalar_divisor_ffi->shift_post == (uint32_t)0) {
      PANIC("shift_post should be > 0");
    }

    CudaRadixCiphertextFFI *numerator_cpy = mem_ptr->tmp_ffi;

    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
                                       numerator_cpy, numerator_ct);

    host_integer_radix_scalar_mul_high_kb<Torus>(
        streams, gpu_indexes, gpu_count, numerator_cpy,
        mem_ptr->scalar_mul_high_mem, ksks, ms_noise_reduction_key, bsks,
        scalar_divisor_ffi);

    host_sub_and_propagate_single_carry<Torus>(
        streams, gpu_indexes, gpu_count, numerator_ct, numerator_cpy, nullptr,
        nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
        ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);

    host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
        streams, gpu_indexes, gpu_count, numerator_ct, (uint32_t)1,
        mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
        numerator_ct->num_radix_blocks);

    host_add_and_propagate_single_carry<Torus>(
        streams, gpu_indexes, gpu_count, numerator_ct, numerator_cpy, nullptr,
        nullptr, mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key,
        FLAG_NONE, (uint32_t)0);

    host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
        streams, gpu_indexes, gpu_count, numerator_ct,
        scalar_divisor_ffi->shift_post - (uint32_t)1,
        mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
        numerator_ct->num_radix_blocks);

    return;
  }

  host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
      streams, gpu_indexes, gpu_count, numerator_ct,
      scalar_divisor_ffi->shift_pre, mem_ptr->logical_scalar_shift_mem, bsks,
      ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);

  host_integer_radix_scalar_mul_high_kb<Torus>(
      streams, gpu_indexes, gpu_count, numerator_ct,
      mem_ptr->scalar_mul_high_mem, ksks, ms_noise_reduction_key, bsks,
      scalar_divisor_ffi);

  host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
      streams, gpu_indexes, gpu_count, numerator_ct,
      scalar_divisor_ffi->shift_post, mem_ptr->logical_scalar_shift_mem, bsks,
      ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
}
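On clear integers, the branch structure above is the classic division-by-invariant-multiplication scheme: a power-of-two divisor is a plain shift; when the chosen multiplier m fits below 2^N the quotient is mulhi(n, m) >> shift_post; and when m is too large it is folded in with the subtract, halve, add fixup that the ciphertext code performs with homomorphic sub, shift and add. A u64 sketch under those assumptions (mulhi via __uint128_t; names are illustrative, not the FFI's):

#include <cstdint>

// High 64 bits of a 64x64 multiplication, the clear analogue of
// host_integer_radix_scalar_mul_high_kb.
static uint64_t mulhi(uint64_t a, uint64_t b) {
  return (uint64_t)(((unsigned __int128)a * b) >> 64);
}

// Clear-integer model of the three quotient paths above.
uint64_t div_by_invariant(uint64_t n, uint64_t m /* chosen multiplier */,
                          uint32_t shift_pre, uint32_t shift_post,
                          bool multiplier_geq_two_pow_64, bool divisor_pow2,
                          uint32_t ilog2_divisor) {
  if (divisor_pow2)
    return n >> ilog2_divisor; // the pure logical-shift branch
  if (multiplier_geq_two_pow_64) {
    // The multiplier does not fit on 64 bits: with t = mulhi(n, m) where m
    // holds its low 64 bits, q = (((n - t) >> 1) + t) >> (shift_post - 1),
    // exactly the sub / shift-by-1 / add / shift sequence above.
    uint64_t t = mulhi(n, m);
    return (((n - t) >> 1) + t) >> (shift_post - 1);
  }
  return mulhi(n >> shift_pre, m) >> shift_post; // shift_pre, mulhi, shift_post
}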
|
||||
|
||||
template <typename Torus>
__host__ uint64_t scratch_integer_signed_scalar_div_radix_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_radix_params params,
    int_signed_scalar_div_mem<Torus> **mem_ptr, uint32_t num_radix_blocks,
    const CudaScalarDivisorFFI *scalar_divisor_ffi,
    const bool allocate_gpu_memory) {

  uint64_t size_tracker = 0;

  *mem_ptr = new int_signed_scalar_div_mem<Torus>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
      scalar_divisor_ffi, allocate_gpu_memory, size_tracker);

  return size_tracker;
}

template <typename Torus>
__host__ void host_integer_signed_scalar_div_radix_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *numerator_ct,
    int_signed_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits) {

  if (scalar_divisor_ffi->is_abs_divisor_one) {
    if (scalar_divisor_ffi->is_divisor_negative) {
      CudaRadixCiphertextFFI *tmp = mem_ptr->tmp_ffi;

      host_integer_radix_negation<Torus>(
          streams, gpu_indexes, gpu_count, tmp, numerator_ct,
          mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus,
          numerator_ct->num_radix_blocks);

      copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
                                         numerator_ct, tmp);
    }
    return;
  }

  if (scalar_divisor_ffi->chosen_multiplier_has_more_bits_than_numerator) {
    set_zero_radix_ciphertext_slice_async<Torus>(
        streams[0], gpu_indexes[0], numerator_ct, 0,
        numerator_ct->num_radix_blocks);
    return;
  }

  CudaRadixCiphertextFFI *tmp = mem_ptr->tmp_ffi;

  if (scalar_divisor_ffi->is_divisor_pow2) {
    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
                                       numerator_ct);

    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
        streams, gpu_indexes, gpu_count, tmp,
        scalar_divisor_ffi->chosen_multiplier_num_bits - 1,
        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
        ms_noise_reduction_key);

    host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
        streams, gpu_indexes, gpu_count, tmp,
        numerator_bits - scalar_divisor_ffi->chosen_multiplier_num_bits,
        mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
        tmp->num_radix_blocks);

    host_add_and_propagate_single_carry<Torus>(
        streams, gpu_indexes, gpu_count, tmp, numerator_ct, nullptr, nullptr,
        mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
        (uint32_t)0);

    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
        streams, gpu_indexes, gpu_count, tmp,
        scalar_divisor_ffi->chosen_multiplier_num_bits,
        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
        ms_noise_reduction_key);

  } else if (!scalar_divisor_ffi->is_chosen_multiplier_geq_two_pow_numerator) {
    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
                                       numerator_ct);

    host_integer_radix_signed_scalar_mul_high_kb<Torus>(
        streams, gpu_indexes, gpu_count, tmp, mem_ptr->scalar_mul_high_mem,
        ksks, scalar_divisor_ffi, ms_noise_reduction_key, bsks);

    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
        streams, gpu_indexes, gpu_count, tmp, scalar_divisor_ffi->shift_post,
        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
        ms_noise_reduction_key);

    CudaRadixCiphertextFFI *xsign = mem_ptr->xsign_ffi;
    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], xsign,
                                       numerator_ct);

    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
        streams, gpu_indexes, gpu_count, xsign, numerator_bits - 1,
        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
        ms_noise_reduction_key);

    host_sub_and_propagate_single_carry<Torus>(
        streams, gpu_indexes, gpu_count, tmp, xsign, nullptr, nullptr,
        mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
        FLAG_NONE, (uint32_t)0);

  } else {

    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
                                       numerator_ct);

    host_integer_radix_signed_scalar_mul_high_kb<Torus>(
        streams, gpu_indexes, gpu_count, tmp, mem_ptr->scalar_mul_high_mem,
        ksks, scalar_divisor_ffi, ms_noise_reduction_key, bsks);

    host_add_and_propagate_single_carry<Torus>(
        streams, gpu_indexes, gpu_count, tmp, numerator_ct, nullptr, nullptr,
        mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
        (uint32_t)0);

    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
        streams, gpu_indexes, gpu_count, tmp, scalar_divisor_ffi->shift_post,
        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
        ms_noise_reduction_key);

    CudaRadixCiphertextFFI *xsign = mem_ptr->xsign_ffi;
    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], xsign,
                                       numerator_ct);

    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
        streams, gpu_indexes, gpu_count, xsign, numerator_bits - 1,
        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
        ms_noise_reduction_key);

    host_sub_and_propagate_single_carry<Torus>(
        streams, gpu_indexes, gpu_count, tmp, xsign, nullptr, nullptr,
        mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
        FLAG_NONE, (uint32_t)0);
  }

  if (scalar_divisor_ffi->is_divisor_negative) {
    host_integer_radix_negation<Torus>(
        streams, gpu_indexes, gpu_count, numerator_ct, tmp,
        mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus,
        numerator_ct->num_radix_blocks);
  } else {
    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
                                       tmp);
  }
}
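On clear data, the branch structure above matches the textbook signed variant of the same trick: take a signed multiply-high, add the numerator back when the chosen multiplier does not fit below 2^numerator_bits, arithmetic-shift by shift_post, and subtract the numerator's sign word so the quotient rounds toward zero. A sketch under those assumptions (mulhi_s64 is a hypothetical helper, not part of the diff):

#include <cstdint>

// High 64 bits of a signed 64x64 -> 128-bit product.
static int64_t mulhi_s64(int64_t a, int64_t b) {
  return (int64_t)(((__int128)a * b) >> 64);
}

int64_t signed_div_by_const(int64_t n, int64_t m, uint32_t shift_post,
                            bool multiplier_geq_two_pow) {
  int64_t t = mulhi_s64(n, m);
  if (multiplier_geq_two_pow)
    t += n; // the host_add_and_propagate_single_carry step above
  t >>= shift_post;        // arithmetic shift, as in the circuit
  int64_t xsign = n >> 63; // 0 or -1: the (numerator_bits - 1) shift
  return t - xsign;        // subtracting -1 rounds negative results toward zero
}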
template <typename Torus>
__host__ uint64_t scratch_integer_unsigned_scalar_div_rem_radix(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, const int_radix_params params,
    int_unsigned_scalar_div_rem_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint32_t const active_bits_divisor, const bool allocate_gpu_memory) {

  uint64_t size_tracker = 0;
  *mem_ptr = new int_unsigned_scalar_div_rem_buffer<Torus>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
      scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory,
      size_tracker);
  return size_tracker;
}

template <typename Torus>
__host__ void host_integer_unsigned_scalar_div_rem_radix(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *quotient_ct,
    CudaRadixCiphertextFFI *remainder_ct,
    int_unsigned_scalar_div_rem_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint64_t const *divisor_has_at_least_one_set,
    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
    Torus const *clear_blocks, Torus const *h_clear_blocks,
    uint32_t num_clear_blocks) {

  auto numerator_ct = mem_ptr->numerator_ct;
  copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
                                     quotient_ct);

  host_integer_unsigned_scalar_div_radix(
      streams, gpu_indexes, gpu_count, quotient_ct, mem_ptr->unsigned_div_mem,
      bsks, ksks, ms_noise_reduction_key, scalar_divisor_ffi);

  if (scalar_divisor_ffi->is_divisor_pow2) {

    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
                                       numerator_ct);
    host_integer_radix_scalar_bitop_kb(
        streams, gpu_indexes, gpu_count, remainder_ct, remainder_ct,
        clear_blocks, h_clear_blocks, num_clear_blocks, mem_ptr->bitop_mem,
        bsks, ksks, ms_noise_reduction_key);

  } else {
    if (!scalar_divisor_ffi->is_divisor_zero) {
      copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
                                         remainder_ct, quotient_ct);

      if (!scalar_divisor_ffi->is_abs_divisor_one &&
          remainder_ct->num_radix_blocks != 0) {

        host_integer_scalar_mul_radix<Torus>(
            streams, gpu_indexes, gpu_count, remainder_ct, decomposed_divisor,
            divisor_has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks, ksks,
            ms_noise_reduction_key, mem_ptr->params.message_modulus,
            num_scalars_divisor);
      }
    }

    host_sub_and_propagate_single_carry(
        streams, gpu_indexes, gpu_count, numerator_ct, remainder_ct, nullptr,
        nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
        ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);

    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
                                       numerator_ct);
  }
}
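The remainder path above rests on two clear-data facts: for a power-of-two divisor the remainder is just the low bits (the scalar bitop applies that mask), and otherwise r = n - q * d once the quotient is known. A minimal sketch, assuming an unsigned 64-bit domain and a non-zero divisor:

#include <cstdint>

// q is the quotient computed as in host_integer_unsigned_scalar_div_radix;
// d is assumed non-zero.
uint64_t unsigned_rem_by_const(uint64_t n, uint64_t d, uint64_t q) {
  if ((d & (d - 1)) == 0) // power of two: a bitand replaces mul + sub
    return n & (d - 1);
  return n - q * d; // multiply the quotient back and subtract
}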
template <typename Torus>
__host__ uint64_t scratch_integer_signed_scalar_div_rem_radix(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, const int_radix_params params,
    int_signed_scalar_div_rem_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint32_t const active_bits_divisor, const bool allocate_gpu_memory) {

  uint64_t size_tracker = 0;

  *mem_ptr = new int_signed_scalar_div_rem_buffer<Torus>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
      scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory,
      size_tracker);

  return size_tracker;
}

template <typename Torus>
__host__ void host_integer_signed_scalar_div_rem_radix(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *quotient_ct,
    CudaRadixCiphertextFFI *remainder_ct,
    int_signed_scalar_div_rem_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint64_t const *divisor_has_at_least_one_set,
    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
    uint32_t numerator_bits) {

  auto numerator_ct = mem_ptr->numerator_ct;
  copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
                                     quotient_ct);

  host_integer_signed_scalar_div_radix_kb(
      streams, gpu_indexes, gpu_count, quotient_ct, mem_ptr->signed_div_mem,
      bsks, ksks, ms_noise_reduction_key, scalar_divisor_ffi, numerator_bits);

  host_propagate_single_carry<Torus>(
      streams, gpu_indexes, gpu_count, quotient_ct, nullptr, nullptr,
      mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
      (uint32_t)0);

  if (!scalar_divisor_ffi->is_divisor_negative &&
      scalar_divisor_ffi->is_divisor_pow2) {
    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
                                       quotient_ct);

    host_integer_radix_logical_scalar_shift_kb_inplace(
        streams, gpu_indexes, gpu_count, remainder_ct,
        scalar_divisor_ffi->ilog2_divisor, mem_ptr->logical_scalar_shift_mem,
        bsks, ksks, ms_noise_reduction_key, remainder_ct->num_radix_blocks);

  } else if (!scalar_divisor_ffi->is_divisor_zero) {
    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
                                       quotient_ct);

    bool is_divisor_one = scalar_divisor_ffi->is_abs_divisor_one &&
                          !scalar_divisor_ffi->is_divisor_negative;

    if (!is_divisor_one && remainder_ct->num_radix_blocks != 0) {
      host_integer_scalar_mul_radix<Torus>(
          streams, gpu_indexes, gpu_count, remainder_ct, decomposed_divisor,
          divisor_has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks, ksks,
          ms_noise_reduction_key, mem_ptr->params.message_modulus,
          num_scalars_divisor);
    }
  }

  host_sub_and_propagate_single_carry(
      streams, gpu_indexes, gpu_count, numerator_ct, remainder_ct, nullptr,
      nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
      ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);

  copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
                                     numerator_ct);
}

#endif
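As with the unsigned case, the signed variant relies on the identity r = n - q * d for a truncated (round-toward-zero) quotient; for a positive power-of-two divisor the multiply-back collapses into the logical left shift by ilog2(d) seen above. A clear-data sketch, assuming d != 0 and no INT64_MIN / -1 overflow:

#include <cstdint>

void signed_div_rem(int64_t n, int64_t d, int64_t &q, int64_t &r) {
  q = n / d;     // C++ '/' truncates toward zero, like the circuit's quotient
  r = n - q * d; // the remainder takes the sign of the numerator
}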
@@ -21,27 +21,6 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
      num_scalar_bits, allocate_gpu_memory);
}

-uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool anticipated_buffer_drop,
-    bool allocate_gpu_memory, bool allocate_ms_array) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          allocate_ms_array);
-
-  return scratch_cuda_integer_radix_scalar_mul_high_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      (int_scalar_mul_high<uint64_t> **)mem_ptr, num_blocks, params,
-      num_scalar_bits, anticipated_buffer_drop, allocate_gpu_memory);
-}
-
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lwe_array, uint64_t const *decomposed_scalar,
@@ -50,73 +29,11 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_scalars) {

-  switch (polynomial_size) {
-  case 512:
-    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<512>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
-        decomposed_scalar, has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
-        (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
-        num_scalars);
-    break;
-  case 1024:
-    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<1024>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
-        decomposed_scalar, has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
-        (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
-        num_scalars);
-    break;
-  case 2048:
-    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<2048>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
-        decomposed_scalar, has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
-        (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
-        num_scalars);
-    break;
-  case 4096:
-    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<4096>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
-        decomposed_scalar, has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
-        (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
-        num_scalars);
-    break;
-  case 8192:
-    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<8192>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
-        decomposed_scalar, has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
-        (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
-        num_scalars);
-    break;
-  case 16384:
-    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<16384>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
-        decomposed_scalar, has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
-        (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
-        num_scalars);
-    break;
-  default:
-    PANIC("Cuda error (scalar multiplication): unsupported polynomial size. "
-          "Only N = 512, 1024, 2048, 4096, 8192, 16384 are supported.")
-  }
-}
-
-void cuda_integer_radix_scalar_mul_high_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, void *const *ksks,
-    uint64_t rhs, uint64_t const *decomposed_scalar,
-    uint64_t const *has_at_least_one_set,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
-    void *const *bsks, uint32_t num_scalars) {
-
-  host_integer_radix_scalar_mul_high_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, ct,
-      (int_scalar_mul_high<uint64_t> *)mem_ptr, (uint64_t **)ksks, rhs,
-      decomposed_scalar, has_at_least_one_set, ms_noise_reduction_key, bsks,
+  host_integer_scalar_mul_radix<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
+      decomposed_scalar, has_at_least_one_set,
+      reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+      (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
      num_scalars);
}

@@ -130,13 +47,3 @@ void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,

  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}

-void cleanup_cuda_integer_radix_scalar_mul_high_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void) {
-
-  int_scalar_mul_high<uint64_t> *mem_ptr =
-      (int_scalar_mul_high<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
-}

@@ -38,11 +38,11 @@ __host__ uint64_t scratch_cuda_integer_radix_scalar_mul_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_scalar_mul_buffer<T>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      num_scalar_bits, allocate_gpu_memory, true, &size_tracker);
+      num_scalar_bits, allocate_gpu_memory, true, size_tracker);
  return size_tracker;
}

-template <typename T, class params>
+template <typename T>
__host__ void host_integer_scalar_mul_radix(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
@@ -80,7 +80,7 @@ __host__ void host_integer_scalar_mul_radix(
    }
  }
  size_t j = 0;
-  for (size_t i = 0; i < min(num_scalars, num_ciphertext_bits); i++) {
+  for (size_t i = 0; i < std::min(num_scalars, num_ciphertext_bits); i++) {
    if (decomposed_scalar[i] == 1) {
      // Perform a block shift
      CudaRadixCiphertextFFI preshifted_radix_ct;
@@ -116,9 +116,9 @@ __host__ void host_integer_scalar_mul_radix(
    set_zero_radix_ciphertext_slice_async<T>(streams[0], gpu_indexes[0],
                                             lwe_array, 0, num_radix_blocks);
  } else {
-    host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
-        streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, true,
-        bsks, ksks, ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem,
+    host_integer_partial_sum_ciphertexts_vec_kb<T>(
+        streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, bsks,
+        ksks, ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem,
        num_radix_blocks, j);

    auto scp_mem_ptr = mem->sc_prop_mem;
@@ -169,34 +169,15 @@ __host__ void host_integer_small_scalar_mul_radix(
  }
}

-template <typename Torus>
-__host__ uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_scalar_mul_high<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params,
-    uint32_t num_scalar_bits, bool anticipated_buffer_drop,
-    bool allocate_gpu_memory) {
-
-  uint64_t size_tracker = 0;
-
-  *mem_ptr = new int_scalar_mul_high<Torus>(
-      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      allocate_gpu_memory, LEFT_SHIFT, num_scalar_bits, anticipated_buffer_drop,
-      &size_tracker);
-
-  return size_tracker;
-}
-
template <typename Torus>
__host__ void host_integer_radix_scalar_mul_high_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
-    int_scalar_mul_high<Torus> *mem_ptr, Torus *const *ksks, uint64_t rhs,
-    uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
+    int_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
-    void *const *bsks, uint32_t num_scalars) {
+    void *const *bsks, const CudaScalarDivisorFFI *scalar_divisor_ffi) {

-  if (rhs == (uint64_t)0) {
+  if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
    set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], ct,
                                                 0, ct->num_radix_blocks);
    return;
@@ -207,66 +188,71 @@ __host__ void host_integer_radix_scalar_mul_high_kb(
  host_extend_radix_with_trivial_zero_blocks_msb<Torus>(tmp_ffi, ct, streams,
                                                        gpu_indexes);

-  if (rhs != (uint64_t)1 || tmp_ffi->num_radix_blocks != 0) {
-    if ((rhs & (rhs - 1)) == 0) {
-
-      uint32_t shift = std::log2(rhs);
+  if (scalar_divisor_ffi->active_bits != (uint32_t)0 &&
+      !scalar_divisor_ffi->is_abs_chosen_multiplier_one &&
+      tmp_ffi->num_radix_blocks != 0) {

+    if (scalar_divisor_ffi->is_chosen_multiplier_pow2) {
      host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-          streams, gpu_indexes, gpu_count, tmp_ffi, shift,
+          streams, gpu_indexes, gpu_count, tmp_ffi,
+          scalar_divisor_ffi->ilog2_chosen_multiplier,
          mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
          ms_noise_reduction_key, tmp_ffi->num_radix_blocks);

    } else {

-      switch (mem_ptr->params.polynomial_size) {
-      case 512:
-        host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<512>>(
-            streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
-            has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
-            (uint64_t **)ksks, ms_noise_reduction_key,
-            mem_ptr->params.message_modulus, num_scalars);
-        break;
-      case 1024:
-        host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<1024>>(
-            streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
-            has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
-            (uint64_t **)ksks, ms_noise_reduction_key,
-            mem_ptr->params.message_modulus, num_scalars);
-        break;
-      case 2048:
-        host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<2048>>(
-            streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
-            has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
-            (uint64_t **)ksks, ms_noise_reduction_key,
-            mem_ptr->params.message_modulus, num_scalars);
-        break;
-      case 4096:
-        host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<4096>>(
-            streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
-            has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
-            (uint64_t **)ksks, ms_noise_reduction_key,
-            mem_ptr->params.message_modulus, num_scalars);
-        break;
-      case 8192:
-        host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<8192>>(
-            streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
-            has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
-            (uint64_t **)ksks, ms_noise_reduction_key,
-            mem_ptr->params.message_modulus, num_scalars);
-        break;
-      case 16384:
-        host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<16384>>(
-            streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
-            has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
-            (uint64_t **)ksks, ms_noise_reduction_key,
-            mem_ptr->params.message_modulus, num_scalars);
-        break;
-      default:
-        PANIC(
-            "Cuda error (scalar multiplication): unsupported polynomial size. "
-            "Only N = 512, 1024, 2048, 4096, 8192, 16384 are supported.")
-      }
+      host_integer_scalar_mul_radix<Torus>(
+          streams, gpu_indexes, gpu_count, tmp_ffi,
+          scalar_divisor_ffi->decomposed_chosen_multiplier,
+          scalar_divisor_ffi->chosen_multiplier_has_at_least_one_set,
+          mem_ptr->scalar_mul_mem, bsks, (uint64_t **)ksks,
+          ms_noise_reduction_key, mem_ptr->params.message_modulus,
+          scalar_divisor_ffi->num_scalars);
    }
  }

  host_trim_radix_blocks_lsb<Torus>(ct, tmp_ffi, streams, gpu_indexes);
}
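Both mul-high helpers compute the upper half of a double-width product: the radix ciphertext is first widened (with trivial zeros here, with sign blocks in the signed variant below), multiplied by the chosen multiplier, and the low blocks are then trimmed. A clear 64-bit sketch of the same operation, including the power-of-two shortcut the ilog2_chosen_multiplier branch exploits; this is an assumption-level illustration, not the library API:

#include <cstdint>

uint64_t scalar_mul_high(uint64_t x, uint64_t multiplier) {
  if (multiplier != 0 && (multiplier & (multiplier - 1)) == 0) {
    // Power of two 2^k: the high half of x * 2^k is x >> (64 - k).
    uint32_t k = 63 - (uint32_t)__builtin_clzll(multiplier);
    return k == 0 ? 0 : x >> (64 - k);
  }
  return (uint64_t)(((__uint128_t)x * multiplier) >> 64); // general case
}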
template <typename Torus>
__host__ void host_integer_radix_signed_scalar_mul_high_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
    int_signed_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
    const CudaScalarDivisorFFI *scalar_divisor_ffi,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    void *const *bsks) {

  if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
    set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], ct,
                                                 0, ct->num_radix_blocks);
    return;
  }

  CudaRadixCiphertextFFI *tmp_ffi = mem_ptr->tmp;

  host_extend_radix_with_sign_msb<Torus>(
      streams, gpu_indexes, gpu_count, tmp_ffi, ct, mem_ptr->extend_radix_mem,
      ct->num_radix_blocks, bsks, (uint64_t **)ksks, ms_noise_reduction_key);

  if (scalar_divisor_ffi->active_bits != (uint32_t)0 &&
      !scalar_divisor_ffi->is_abs_chosen_multiplier_one &&
      tmp_ffi->num_radix_blocks != 0) {

    if (scalar_divisor_ffi->is_chosen_multiplier_pow2) {
      host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
          streams, gpu_indexes, gpu_count, tmp_ffi,
          scalar_divisor_ffi->ilog2_chosen_multiplier,
          mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
          ms_noise_reduction_key, tmp_ffi->num_radix_blocks);
    } else {
      host_integer_scalar_mul_radix<Torus>(
          streams, gpu_indexes, gpu_count, tmp_ffi,
          scalar_divisor_ffi->decomposed_chosen_multiplier,
          scalar_divisor_ffi->chosen_multiplier_has_at_least_one_set,
          mem_ptr->scalar_mul_mem, bsks, (uint64_t **)ksks,
          ms_noise_reduction_key, mem_ptr->params.message_modulus,
          scalar_divisor_ffi->num_scalars);
    }
  }

@@ -21,7 +21,7 @@ __host__ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
      streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
-      allocate_gpu_memory, &size_tracker);
+      allocate_gpu_memory, size_tracker);
  return size_tracker;
}

@@ -21,7 +21,7 @@ __host__ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
      streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
-      allocate_gpu_memory, &size_tracker);
+      allocate_gpu_memory, size_tracker);
  return size_tracker;
}

@@ -133,7 +133,7 @@ __host__ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_arithmetic_scalar_shift_buffer<Torus>(
      streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
-      allocate_gpu_memory, &size_tracker);
+      allocate_gpu_memory, size_tracker);
  return size_tracker;
}

@@ -21,7 +21,7 @@ __host__ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_shift_and_rotate_buffer<Torus>(
      streams, gpu_indexes, gpu_count, shift_type, is_signed, params,
-      num_radix_blocks, allocate_gpu_memory, &size_tracker);
+      num_radix_blocks, allocate_gpu_memory, size_tracker);
  return size_tracker;
}

@@ -23,7 +23,7 @@ uint64_t scratch_cuda_sub_and_propagate_single_carry(

  *mem_ptr = new int_sub_and_propagate<Torus>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag,
-      allocate_gpu_memory, &size_tracker);
+      allocate_gpu_memory, size_tracker);

  return size_tracker;
}

@@ -35,6 +35,20 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
      static_cast<cudaStream_t>(stream), gpu_index);
}

+void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
+    void *stream, uint32_t gpu_index, void *dest, void const *src,
+    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
+    uint32_t polynomial_size, uint32_t grouping_factor) {
+  uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
+                               level_count * (1 << grouping_factor) /
+                               grouping_factor;
+  size_t buffer_size =
+      total_polynomials * polynomial_size * sizeof(__uint128_t);
+
+  cuda_memcpy_async_to_gpu((__uint128_t *)dest, (__uint128_t *)src, buffer_size,
+                           static_cast<cudaStream_t>(stream), gpu_index);
+}
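For a sense of scale, a standalone worked example of the buffer-size formula above, with illustrative parameter values that are assumptions, not values taken from this diff:

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical multi-bit parameters, for illustration only.
  uint64_t input_lwe_dim = 928, glwe_dim = 1, level_count = 1;
  uint64_t polynomial_size = 2048, grouping_factor = 4;

  uint64_t total_polynomials = input_lwe_dim * (glwe_dim + 1) *
                               (glwe_dim + 1) * level_count *
                               (1ull << grouping_factor) / grouping_factor;
  uint64_t buffer_size =
      total_polynomials * polynomial_size * 16; // sizeof(__uint128_t)

  printf("%llu polynomials, %llu bytes\n",
         (unsigned long long)total_polynomials,
         (unsigned long long)buffer_size);
  return 0;
}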
// We need these lines so the compiler knows how to specialize these functions
template __device__ const uint64_t *
get_ith_mask_kth_block(const uint64_t *ptr, int i, int k, int level,
@@ -80,6 +94,14 @@ template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
                                                    int glwe_dimension,
                                                    uint32_t level_count);

+template __device__ const __uint128_t *
+get_multi_bit_ith_lwe_gth_group_kth_block(const __uint128_t *ptr, int g, int i,
+                                          int k, int level,
+                                          uint32_t grouping_factor,
+                                          uint32_t polynomial_size,
+                                          uint32_t glwe_dimension,
+                                          uint32_t level_count);
+
template __device__ const uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
    const uint64_t *ptr, int g, int i, int k, int level,
    uint32_t grouping_factor, uint32_t polynomial_size, uint32_t glwe_dimension,

@@ -83,6 +83,62 @@ mul_ggsw_glwe_in_fourier_domain(double2 *fft, double2 *join_buffer,
  __syncthreads();
}

+/** Perform the matrix multiplication between the GGSW and the GLWE,
+ * each block operating on a single level for mask and body.
+ * Both operands should be at fourier domain
+ *
+ * This function assumes:
+ * - Thread blocks at dimension z relates to the decomposition level.
+ * - Thread blocks at dimension y relates to the glwe dimension.
+ * - polynomial_size / params::opt threads are available per block
+ */
+template <typename G, class params>
+__device__ void mul_ggsw_glwe_in_fourier_domain_128(
+    double *fft, double *join_buffer,
+    const double *__restrict__ bootstrapping_key, int iteration, G &group,
+    bool support_dsm = false) {
+  const uint32_t polynomial_size = params::degree;
+  const uint32_t glwe_dimension = gridDim.y - 1;
+  const uint32_t level_count = gridDim.z;
+
+  // The first product is used to initialize level_join_buffer
+  auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
+
+  // Continues multiplying fft by every polynomial in that particular bsk level
+  // Each y-block accumulates in a different polynomial at each iteration
+  auto bsk_slice = get_ith_mask_kth_block_128(
+      bootstrapping_key, iteration, blockIdx.y, blockIdx.z, polynomial_size,
+      glwe_dimension, level_count);
+  for (int j = 0; j < glwe_dimension + 1; j++) {
+    int idx = (j + this_block_rank) % (glwe_dimension + 1);
+
+    auto bsk_poly = bsk_slice + idx * polynomial_size / 2 * 4;
+    auto buffer_slice = get_join_buffer_element_128<G>(
+        blockIdx.z, idx, group, join_buffer, polynomial_size, glwe_dimension,
+        support_dsm);
+
+    polynomial_product_accumulate_in_fourier_domain_128<params>(
+        buffer_slice, fft, bsk_poly, j == 0);
+    group.sync();
+  }
+
+  // -----------------------------------------------------------------
+  // All blocks are synchronized here; after this sync, level_join_buffer has
+  // the values needed from every other block
+
+  // accumulate rest of the products into fft buffer
+  for (int l = 0; l < level_count; l++) {
+    auto cur_src_acc = get_join_buffer_element_128<G>(
+        l, blockIdx.y, group, join_buffer, polynomial_size, glwe_dimension,
+        support_dsm);
+
+    polynomial_accumulate_in_fourier_domain_128<params>(fft, cur_src_acc,
+                                                        l == 0);
+  }
+
+  __syncthreads();
+}
+
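Stripped of the grid/block mapping, the kernel above is a fourier-domain multiply-accumulate: each decomposition level contributes a pointwise complex product with a bootstrapping-key polynomial, and the per-level partial sums are then reduced into the fft buffer. A host-side sketch of that dataflow (one GLWE row only; a simplification for orientation, not the kernel itself):

#include <complex>
#include <vector>

using Poly = std::vector<std::complex<double>>; // one polynomial, fourier form

void external_product_accumulate(Poly &fft,
                                 const std::vector<Poly> &bsk_level_polys,
                                 const std::vector<Poly> &decomposed) {
  const size_t level_count = bsk_level_polys.size();
  const size_t n = fft.size();
  Poly acc(n, {0.0, 0.0});
  for (size_t l = 0; l < level_count; l++) // one pass per decomposition level
    for (size_t i = 0; i < n; i++)         // pointwise product-accumulate
      acc[i] += decomposed[l][i] * bsk_level_polys[l][i];
  fft = acc; // the kernel accumulates back into its fft buffer
}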
template <typename Torus>
void execute_pbs_async(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -223,7 +279,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
                         uint32_t level_count, uint32_t grouping_factor,
                         uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
                         bool allocate_gpu_memory, bool allocate_ms_array,
-                         uint64_t *size_tracker) {
+                         uint64_t &size_tracker) {
  switch (sizeof(Torus)) {
  case sizeof(uint32_t):
    // 32 bits
@@ -231,7 +287,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
    case MULTI_BIT:
      PANIC("Error: 32-bit multibit PBS is not supported.\n")
    case CLASSICAL:
-      *size_tracker = scratch_cuda_programmable_bootstrap_32(
+      size_tracker = scratch_cuda_programmable_bootstrap_32(
          stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, level_count, input_lwe_ciphertext_count,
          allocate_gpu_memory, allocate_ms_array);
@@ -246,12 +302,12 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
    case MULTI_BIT:
      if (grouping_factor == 0)
        PANIC("Multi-bit PBS error: grouping factor should be > 0.")
-      *size_tracker = scratch_cuda_multi_bit_programmable_bootstrap_64(
+      size_tracker = scratch_cuda_multi_bit_programmable_bootstrap_64(
          stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
      break;
    case CLASSICAL:
-      *size_tracker = scratch_cuda_programmable_bootstrap_64(
+      size_tracker = scratch_cuda_programmable_bootstrap_64(
          stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, level_count, input_lwe_ciphertext_count,
          allocate_gpu_memory, allocate_ms_array);
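A recurring change across these hunks is the size-tracker out-parameter moving from uint64_t * to uint64_t &. A minimal sketch of the pattern with hypothetical names: callees add to the running byte count without null checks, and callers no longer take addresses, whether or not memory is actually allocated:

#include <cstdint>

void track_alloc(uint64_t bytes, uint64_t &size_tracker, bool allocate) {
  size_tracker += bytes; // counted even in dry-run mode (allocate == false)
  if (allocate) {
    // a real cudaMallocAsync(...) would happen here
  }
}

uint64_t scratch_example(bool allocate_gpu_memory) {
  uint64_t size_tracker = 0;
  track_alloc(1024, size_tracker, allocate_gpu_memory);
  return size_tracker; // the caller learns the footprint either way
}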
@@ -8,9 +8,9 @@ execute_scratch_pbs_128(void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
                        uint32_t polynomial_size, uint32_t level_count,
                        uint32_t input_lwe_ciphertext_count,
                        bool allocate_gpu_memory, bool allocate_ms_array,
-                        uint64_t *size_tracker_on_gpu) {
+                        uint64_t &size_tracker_on_gpu) {
  // The squash noise function receives as input 64-bit integers
-  *size_tracker_on_gpu = scratch_cuda_programmable_bootstrap_128_vector_64(
+  size_tracker_on_gpu = scratch_cuda_programmable_bootstrap_128_vector_64(
      stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
      polynomial_size, level_count, input_lwe_ciphertext_count,
      allocate_gpu_memory, allocate_ms_array);

@@ -286,7 +286,7 @@ __host__ uint64_t scratch_programmable_bootstrap_amortized(
      glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
      max_shared_memory);
  *pbs_buffer = (int8_t *)cuda_malloc_with_size_tracking_async(
-      buffer_size, stream, gpu_index, &size_tracker, allocate_gpu_memory);
+      buffer_size, stream, gpu_index, size_tracker, allocate_gpu_memory);
  check_cuda_error(cudaGetLastError());
  return size_tracker;
}

@@ -225,7 +225,7 @@ __host__ uint64_t scratch_programmable_bootstrap_cg(
  *buffer = new pbs_buffer<Torus, CLASSICAL>(
      stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
      level_count, input_lwe_ciphertext_count, PBS_VARIANT::CG,
-      allocate_gpu_memory, allocate_ms_array, &size_tracker);
+      allocate_gpu_memory, allocate_ms_array, size_tracker);
  return size_tracker;
}

@@ -280,13 +280,14 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap(
    check_cuda_error(cudaGetLastError());
  }

-  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
-      gpu_index, input_lwe_ciphertext_count, polynomial_size);
+  auto lwe_chunk_size =
+      get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
+                                        polynomial_size, full_sm_keybundle);
  uint64_t size_tracker = 0;
  *buffer = new pbs_buffer<Torus, MULTI_BIT>(
      stream, gpu_index, glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::CG,
-      allocate_gpu_memory, &size_tracker);
+      allocate_gpu_memory, size_tracker);
  return size_tracker;
}

@@ -386,7 +386,7 @@ __host__ uint64_t scratch_programmable_bootstrap(
  *buffer = new pbs_buffer<Torus, CLASSICAL>(
      stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
      level_count, input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT,
-      allocate_gpu_memory, allocate_ms_array, &size_tracker);
+      allocate_gpu_memory, allocate_ms_array, size_tracker);
  return size_tracker;
}
@@ -18,62 +18,6 @@
#include "programmable_bootstrap.cuh"
#include "types/complex/operations.cuh"

-/** Perform the matrix multiplication between the GGSW and the GLWE,
- * each block operating on a single level for mask and body.
- * Both operands should be at fourier domain
- *
- * This function assumes:
- * - Thread blocks at dimension z relates to the decomposition level.
- * - Thread blocks at dimension y relates to the glwe dimension.
- * - polynomial_size / params::opt threads are available per block
- */
-template <typename G, class params>
-__device__ void mul_ggsw_glwe_in_fourier_domain_128(
-    double *fft, double *join_buffer,
-    const double *__restrict__ bootstrapping_key, int iteration, G &group,
-    bool support_dsm = false) {
-  const uint32_t polynomial_size = params::degree;
-  const uint32_t glwe_dimension = gridDim.y - 1;
-  const uint32_t level_count = gridDim.z;
-
-  // The first product is used to initialize level_join_buffer
-  auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
-
-  // Continues multiplying fft by every polynomial in that particular bsk level
-  // Each y-block accumulates in a different polynomial at each iteration
-  auto bsk_slice = get_ith_mask_kth_block_128(
-      bootstrapping_key, iteration, blockIdx.y, blockIdx.z, polynomial_size,
-      glwe_dimension, level_count);
-  for (int j = 0; j < glwe_dimension + 1; j++) {
-    int idx = (j + this_block_rank) % (glwe_dimension + 1);
-
-    auto bsk_poly = bsk_slice + idx * polynomial_size / 2 * 4;
-    auto buffer_slice = get_join_buffer_element_128<G>(
-        blockIdx.z, idx, group, join_buffer, polynomial_size, glwe_dimension,
-        support_dsm);
-
-    polynomial_product_accumulate_in_fourier_domain_128<params>(
-        buffer_slice, fft, bsk_poly, j == 0);
-    group.sync();
-  }
-
-  // -----------------------------------------------------------------
-  // All blocks are synchronized here; after this sync, level_join_buffer has
-  // the values needed from every other block
-
-  // accumulate rest of the products into fft buffer
-  for (int l = 0; l < level_count; l++) {
-    auto cur_src_acc = get_join_buffer_element_128<G>(
-        l, blockIdx.y, group, join_buffer, polynomial_size, glwe_dimension,
-        support_dsm);
-
-    polynomial_accumulate_in_fourier_domain_128<params>(fft, cur_src_acc,
-                                                        l == 0);
-  }
-
-  __syncthreads();
-}
-
template <typename InputTorus, class params, sharedMemDegree SMD,
          bool first_iter>
__global__ void __launch_bounds__(params::degree / params::opt)
@@ -174,9 +118,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                                            accumulator);
  gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);

-  // We are using the same memory space for accumulator_fft and
-  // accumulator_rotated, so we need to synchronize here to make sure they
-  // don't modify the same memory space at the same time
  // Switch to the FFT space
  auto acc_fft_re_hi = accumulator_fft + 0 * params::degree / 2;
  auto acc_fft_re_lo = accumulator_fft + 1 * params::degree / 2;
@@ -455,6 +396,7 @@ __host__ uint64_t scratch_programmable_bootstrap_cg_128(
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
    bool allocate_gpu_memory, bool allocate_ms_array) {

+  cuda_set_device(gpu_index);
  uint64_t full_sm =
      get_buffer_size_full_sm_programmable_bootstrap_cg<__uint128_t>(
          polynomial_size);
@@ -484,7 +426,7 @@ __host__ uint64_t scratch_programmable_bootstrap_cg_128(
  *buffer = new pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>(
      stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
      level_count, input_lwe_ciphertext_count, PBS_VARIANT::CG,
-      allocate_gpu_memory, allocate_ms_array, &size_tracker);
+      allocate_gpu_memory, allocate_ms_array, size_tracker);
  return size_tracker;
}

@@ -591,7 +533,7 @@ __host__ uint64_t scratch_programmable_bootstrap_128(
  *buffer = new pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>(
      stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
      level_count, input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT,
-      allocate_gpu_memory, allocate_ms_array, &size_tracker);
+      allocate_gpu_memory, allocate_ms_array, size_tracker);
  return size_tracker;
}
@@ -455,11 +455,8 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
 */
template <typename Torus, class params>
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
-                            uint32_t polynomial_size) {
-
-  uint64_t full_sm_keybundle =
-      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
-          polynomial_size);
+                            uint32_t polynomial_size,
+                            uint64_t full_sm_keybundle) {

  int max_blocks_per_sm;
  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

@@ -521,13 +521,14 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
    check_cuda_error(cudaGetLastError());
  }

-  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
-      gpu_index, input_lwe_ciphertext_count, polynomial_size);
+  auto lwe_chunk_size =
+      get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
+                                        polynomial_size, full_sm_keybundle);
  uint64_t size_tracker = 0;
  *buffer = new pbs_buffer<Torus, MULTI_BIT>(
      stream, gpu_index, glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::DEFAULT,
-      allocate_gpu_memory, &size_tracker);
+      allocate_gpu_memory, size_tracker);
  return size_tracker;
}
@@ -0,0 +1,361 @@
#include "programmable_bootstrap_cg_multibit.cuh"
#include "programmable_bootstrap_multibit_128.cuh"

template <typename InputTorus>
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128(
    void *stream, uint32_t gpu_index,
    pbs_buffer_128<InputTorus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
                                                        AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 512:
    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
                                                        AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 1024:
    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
                                                        AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 2048:
    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
                                                        AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 4096:
    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
                                                        AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
          "N's are powers of two"
          " in the interval [256..4096].")
  }
}

template <typename InputTorus>
uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap_128(
    void *stream, uint32_t gpu_index,
    pbs_buffer_128<InputTorus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
    return scratch_cg_multi_bit_programmable_bootstrap_128<
        InputTorus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 512:
    return scratch_cg_multi_bit_programmable_bootstrap_128<
        InputTorus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 1024:
    return scratch_cg_multi_bit_programmable_bootstrap_128<
        InputTorus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 2048:
    return scratch_cg_multi_bit_programmable_bootstrap_128<
        InputTorus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 4096:
    return scratch_cg_multi_bit_programmable_bootstrap_128<
        InputTorus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
          "N's are powers of two"
          " in the interval [256..4096].")
  }
}

uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  bool supports_cg =
      supports_cooperative_groups_on_multibit_programmable_bootstrap<
          __uint128_t>(glwe_dimension, polynomial_size, level_count,
                       input_lwe_ciphertext_count,
                       cuda_get_max_shared_memory(gpu_index));

  if (supports_cg)
    return scratch_cuda_cg_multi_bit_programmable_bootstrap_128<uint64_t>(
        stream, gpu_index,
        reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> **>(buffer),
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory);
  else
    return scratch_cuda_multi_bit_programmable_bootstrap_128<uint64_t>(
        stream, gpu_index,
        reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> **>(buffer),
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory);
}
template <typename InputTorus>
|
||||
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
|
||||
InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
|
||||
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
|
||||
pbs_buffer_128<InputTorus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 512:
|
||||
host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 1024:
|
||||
host_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 2048:
|
||||
host_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 4096:
|
||||
host_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
" in the interval [256..4096].")
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputTorus>
|
||||
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
|
||||
InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
|
||||
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
|
||||
pbs_buffer_128<InputTorus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 512:
|
||||
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_many_lut, lut_stride);
    break;
  case 1024:
    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
                                                 AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_many_lut, lut_stride);
    break;
  case 2048:
    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
                                                 AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_many_lut, lut_stride);
    break;
  case 4096:
    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
                                                 AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_many_lut, lut_stride);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
          "N's are powers of two in the interval [256..4096].")
  }
}

void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *bootstrapping_key,
    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
    uint32_t lut_stride) {

  if (base_log > 64)
    PANIC("Cuda error (multi-bit PBS): base log should be <= 64")

  auto *buffer =
      reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(mem_ptr);
  switch (buffer->pbs_variant) {
  case PBS_VARIANT::CG:
    cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128<
        uint64_t>(stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
                  static_cast<const uint64_t *>(lwe_output_indexes),
                  static_cast<const __uint128_t *>(lut_vector),
                  static_cast<const uint64_t *>(lut_vector_indexes),
                  static_cast<const uint64_t *>(lwe_array_in),
                  static_cast<const uint64_t *>(lwe_input_indexes),
                  static_cast<const __uint128_t *>(bootstrapping_key), buffer,
                  lwe_dimension, glwe_dimension, polynomial_size,
                  grouping_factor, base_log, level_count, num_samples,
                  num_many_lut, lut_stride);
    break;
  case PBS_VARIANT::DEFAULT:
    cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128<uint64_t>(
        stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_output_indexes),
        static_cast<const __uint128_t *>(lut_vector),
        static_cast<const uint64_t *>(lut_vector_indexes),
        static_cast<const uint64_t *>(lwe_array_in),
        static_cast<const uint64_t *>(lwe_input_indexes),
        static_cast<const __uint128_t *>(bootstrapping_key), buffer,
        lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
        base_log, level_count, num_samples, num_many_lut, lut_stride);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
  }
}

void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
                                                       const uint32_t gpu_index,
                                                       int8_t **buffer) {
  const auto x =
      reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(*buffer);
  x->release(static_cast<cudaStream_t>(stream), gpu_index);
}

/**
 * Computes divisors of the product of num_sms (streaming multiprocessors on the
 * GPU) and max_blocks_per_sm (maximum active blocks per SM to launch
 * device_multi_bit_programmable_bootstrap_keybundle) smaller than its square
 * root, based on max_num_pbs. If log2(max_num_pbs) <= 13, selects the first
 * suitable divisor. If greater, calculates an offset as
 * max(1, log2(max_num_pbs) - 13) for additional logic.
 *
 * The value 13 was empirically determined based on memory requirements for
 * benchmarking on an RTX 4090 GPU, balancing performance and resource use.
 */
template <typename Torus, class params>
uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
                                uint32_t polynomial_size,
                                uint64_t full_sm_keybundle) {

  int max_blocks_per_sm;
  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  cuda_set_device(gpu_index);
  if (max_shared_memory < full_sm_keybundle)
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_blocks_per_sm,
        device_multi_bit_programmable_bootstrap_keybundle_128<Torus, params,
                                                              NOSM>,
        polynomial_size / params::opt, full_sm_keybundle);
  else
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_blocks_per_sm,
        device_multi_bit_programmable_bootstrap_keybundle_128<Torus, params,
                                                              FULLSM>,
        polynomial_size / params::opt, 0);

  int num_sms = 0;
  check_cuda_error(cudaDeviceGetAttribute(
      &num_sms, cudaDevAttrMultiProcessorCount, gpu_index));

  int x = num_sms * max_blocks_per_sm;
  int count = 0;

  int divisor = 1;
  int ith_divisor = 0;

#if CUDA_ARCH < 900
  // We pick a smaller divisor on GPUs other than H100, so 256-bit integer
  // multiplication can run
  int log2_max_num_pbs = log2_int(max_num_pbs);
  if (log2_max_num_pbs > 13)
    ith_divisor = log2_max_num_pbs - 11;
#endif

  for (int i = sqrt(x); i >= 1; i--) {
    if (x % i == 0) {
      if (count == ith_divisor) {
        divisor = i;
        break;
      } else {
        count++;
      }
    }
  }

  return divisor;
}
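The divisor search in `get_lwe_chunk_size_128` walks down from `sqrt(num_sms * max_blocks_per_sm)` and returns the `ith_divisor`-th divisor it finds. A standalone sketch of just that search, in illustrative Rust rather than the backend's CUDA:

```rust
/// Sketch of the divisor search in `get_lwe_chunk_size_128`: walk divisors of
/// `x = num_sms * max_blocks_per_sm` downward from sqrt(x) and return the
/// `ith`-th one found (0-based), falling back to 1 as the CUDA code does.
fn ith_divisor_below_sqrt(x: u32, ith: u32) -> u32 {
    let mut count = 0;
    for i in (1..=(x as f64).sqrt() as u32).rev() {
        if x % i == 0 {
            if count == ith {
                return i;
            }
            count += 1;
        }
    }
    1
}

fn main() {
    // e.g. x = 264; its divisors at or below sqrt(264) ~ 16.2 are 12, 11, 8, ...
    assert_eq!(ith_divisor_below_sqrt(264, 0), 12);
    assert_eq!(ith_divisor_below_sqrt(264, 2), 8);
}
```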
File diff suppressed because it is too large.
@@ -253,7 +253,7 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc(
  *buffer = new pbs_buffer<Torus, CLASSICAL>(
      stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
      level_count, input_lwe_ciphertext_count, PBS_VARIANT::TBC,
      allocate_gpu_memory, allocate_ms_array, &size_tracker);
      allocate_gpu_memory, allocate_ms_array, size_tracker);
  return size_tracker;
}

@@ -283,13 +283,14 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
    check_cuda_error(cudaGetLastError());
  }

  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
      gpu_index, input_lwe_ciphertext_count, polynomial_size);
  auto lwe_chunk_size =
      get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
                                        polynomial_size, full_sm_keybundle);
  uint64_t size_tracker = 0;
  *buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
      stream, gpu_index, glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::TBC,
      allocate_gpu_memory, &size_tracker);
      allocate_gpu_memory, size_tracker);
  return size_tracker;
}
@@ -5,15 +5,15 @@
#include <stdio.h>
#include <type_traits>

template <typename T> inline __device__ const char *get_format();
template <typename T> __device__ inline const char *get_format();

template <> inline __device__ const char *get_format<int>() { return "%d, "; }
template <> __device__ inline const char *get_format<int>() { return "%d, "; }

template <> inline __device__ const char *get_format<unsigned int>() {
template <> __device__ inline const char *get_format<unsigned int>() {
  return "%u, ";
}

template <> inline __device__ const char *get_format<uint64_t>() {
template <> __device__ inline const char *get_format<uint64_t>() {
  return "%lu, ";
}

@@ -23,6 +23,15 @@ template <typename T> __global__ void print_debug_kernel(const T *src, int N) {
  }
}

template <>
__global__ inline void print_debug_kernel(const __uint128_t *src, int N) {
  for (int i = 0; i < N; i++) {
    uint64_t low = static_cast<uint64_t>(src[i]);
    uint64_t high = static_cast<uint64_t>(src[i] >> 64);
    printf("(%llu, %llu), ", high, low);
  }
}

template <>
__global__ inline void print_debug_kernel(const double2 *src, int N) {
  for (int i = 0; i < N; i++) {
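The new `__uint128_t` specialization prints each 128-bit torus element as a (high, low) pair of 64-bit words, since `printf` has no native 128-bit conversion. The same split, as a host-side Rust sketch (names are illustrative):

```rust
/// Split a u128 into (high, low) u64 halves, mirroring what the new
/// CUDA debug kernel prints for each 128-bit torus element.
fn split_u128(x: u128) -> (u64, u64) {
    ((x >> 64) as u64, x as u64)
}

fn main() {
    let x: u128 = (7u128 << 64) | 42;
    let (high, low) = split_u128(x);
    println!("({high}, {low}), "); // prints "(7, 42), "
}
```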
@@ -9,7 +9,7 @@ void multi_gpu_alloc_array_async(cudaStream_t const *streams,
                                 uint32_t const *gpu_indexes,
                                 uint32_t gpu_count, std::vector<Torus *> &dest,
                                 uint32_t elements_per_gpu,
                                 uint64_t *size_tracker_on_gpu_0,
                                 uint64_t &size_tracker_on_gpu_0,
                                 bool allocate_gpu_memory) {

  dest.resize(gpu_count);
@@ -17,10 +17,10 @@ void multi_gpu_alloc_array_async(cudaStream_t const *streams,
    uint64_t size_tracker_on_gpu_i = 0;
    Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
        elements_per_gpu * sizeof(Torus), streams[i], gpu_indexes[i],
        &size_tracker_on_gpu_i, allocate_gpu_memory);
        size_tracker_on_gpu_i, allocate_gpu_memory);
    dest[i] = d_array;
    if (i == 0 && size_tracker_on_gpu_0 != nullptr) {
      *size_tracker_on_gpu_0 = size_tracker_on_gpu_i;
    if (i == 0) {
      size_tracker_on_gpu_0 += size_tracker_on_gpu_i;
    }
  }
}
@@ -46,7 +46,7 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
                               uint32_t const *gpu_indexes, uint32_t gpu_count,
                               std::vector<Torus *> &dest, uint32_t num_inputs,
                               uint32_t lwe_size,
                               uint64_t *size_tracker_on_gpu_0,
                               uint64_t &size_tracker_on_gpu_0,
                               bool allocate_gpu_memory) {
  dest.resize(gpu_count);
  for (uint i = 0; i < gpu_count; i++) {
@@ -54,10 +54,10 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
    Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
        inputs_on_gpu * lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
        &size_tracker_on_gpu_i, allocate_gpu_memory);
        size_tracker_on_gpu_i, allocate_gpu_memory);
    dest[i] = d_array;
    if (i == 0 && size_tracker_on_gpu_0 != nullptr) {
      *size_tracker_on_gpu_0 = size_tracker_on_gpu_i;
    if (i == 0) {
      size_tracker_on_gpu_0 += size_tracker_on_gpu_i;
    }
  }
}
@@ -65,7 +65,7 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
template void multi_gpu_alloc_lwe_async<__uint128_t>(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, std::vector<__uint128_t *> &dest, uint32_t num_inputs,
    uint32_t lwe_size, uint64_t *size_tracker_on_gpu_0,
    uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0,
    bool allocate_gpu_memory);

/// Allocates the input/output vector for all devices
@@ -75,7 +75,7 @@ template <typename Torus>
void multi_gpu_alloc_lwe_many_lut_output_async(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, std::vector<Torus *> &dest, uint32_t num_inputs,
    uint32_t num_many_lut, uint32_t lwe_size, uint64_t *size_tracker_on_gpu_0,
    uint32_t num_many_lut, uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0,
    bool allocate_gpu_memory) {
  dest.resize(gpu_count);
  for (uint i = 0; i < gpu_count; i++) {
@@ -83,10 +83,10 @@ void multi_gpu_alloc_lwe_many_lut_output_async(
    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
    Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
        num_many_lut * inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
        gpu_indexes[i], &size_tracker, allocate_gpu_memory);
        gpu_indexes[i], size_tracker, allocate_gpu_memory);
    dest[i] = d_array;
    if (i == 0 && size_tracker_on_gpu_0 != nullptr) {
      *size_tracker_on_gpu_0 = size_tracker;
    if (i == 0) {
      size_tracker_on_gpu_0 += size_tracker;
    }
  }
}

@@ -94,7 +94,7 @@ __host__ uint64_t scratch_cuda_expand_without_verification(
  *mem_ptr = new zk_expand_mem<Torus>(
      streams, gpu_indexes, gpu_count, computing_params, casting_params,
      casting_key_type, num_lwes_per_compact_list, is_boolean_array,
      num_compact_lists, allocate_gpu_memory, &size_tracker);
      num_compact_lists, allocate_gpu_memory, size_tracker);
  return size_tracker;
}
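These hunks all make the same change: the size tracker goes from an optional out-pointer (`uint64_t *`, guarded by `nullptr` checks and overwritten on each call) to a mandatory reference (`uint64_t &`) that accumulates. A rough Rust analogue of the before/after, with illustrative names:

```rust
/// Old style: optional out-pointer, written only for GPU 0.
fn alloc_with_tracker_ptr(bytes: u64, gpu: usize, tracker: Option<&mut u64>) {
    if gpu == 0 {
        if let Some(t) = tracker {
            *t = bytes; // overwrite: earlier contributions are lost
        }
    }
}

/// New style: mandatory reference, accumulated into.
fn alloc_with_tracker_ref(bytes: u64, gpu: usize, tracker: &mut u64) {
    if gpu == 0 {
        *tracker += bytes; // accumulate across successive allocations
    }
}

fn main() {
    let mut total = 0u64;
    alloc_with_tracker_ptr(512, 1, None); // old style: easy to silently skip
    alloc_with_tracker_ref(1024, 0, &mut total);
    alloc_with_tracker_ref(2048, 0, &mut total);
    assert_eq!(total, 3072);
}
```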
@@ -238,6 +238,85 @@ const _: () = {
    ["Offset of field: CudaRadixCiphertextFFI::lwe_dimension"]
        [::std::mem::offset_of!(CudaRadixCiphertextFFI, lwe_dimension) - 32usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaScalarDivisorFFI {
    pub chosen_multiplier_has_at_least_one_set: *const u64,
    pub decomposed_chosen_multiplier: *const u64,
    pub num_scalars: u32,
    pub active_bits: u32,
    pub shift_pre: u64,
    pub shift_post: u32,
    pub ilog2_chosen_multiplier: u32,
    pub chosen_multiplier_num_bits: u32,
    pub is_chosen_multiplier_zero: bool,
    pub is_abs_chosen_multiplier_one: bool,
    pub is_chosen_multiplier_negative: bool,
    pub is_chosen_multiplier_pow2: bool,
    pub chosen_multiplier_has_more_bits_than_numerator: bool,
    pub is_chosen_multiplier_geq_two_pow_numerator: bool,
    pub ilog2_divisor: u32,
    pub is_divisor_zero: bool,
    pub is_abs_divisor_one: bool,
    pub is_divisor_negative: bool,
    pub is_divisor_pow2: bool,
    pub divisor_has_more_bits_than_numerator: bool,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
    ["Size of CudaScalarDivisorFFI"][::std::mem::size_of::<CudaScalarDivisorFFI>() - 64usize];
    ["Alignment of CudaScalarDivisorFFI"][::std::mem::align_of::<CudaScalarDivisorFFI>() - 8usize];
    ["Offset of field: CudaScalarDivisorFFI::chosen_multiplier_has_at_least_one_set"][::std::mem::offset_of!(
        CudaScalarDivisorFFI,
        chosen_multiplier_has_at_least_one_set
    ) - 0usize];
    ["Offset of field: CudaScalarDivisorFFI::decomposed_chosen_multiplier"]
        [::std::mem::offset_of!(CudaScalarDivisorFFI, decomposed_chosen_multiplier) - 8usize];
    ["Offset of field: CudaScalarDivisorFFI::num_scalars"]
        [::std::mem::offset_of!(CudaScalarDivisorFFI, num_scalars) - 16usize];
    ["Offset of field: CudaScalarDivisorFFI::active_bits"]
        [::std::mem::offset_of!(CudaScalarDivisorFFI, active_bits) - 20usize];
    ["Offset of field: CudaScalarDivisorFFI::shift_pre"]
        [::std::mem::offset_of!(CudaScalarDivisorFFI, shift_pre) - 24usize];
    ["Offset of field: CudaScalarDivisorFFI::shift_post"]
        [::std::mem::offset_of!(CudaScalarDivisorFFI, shift_post) - 32usize];
    ["Offset of field: CudaScalarDivisorFFI::ilog2_chosen_multiplier"]
        [::std::mem::offset_of!(CudaScalarDivisorFFI, ilog2_chosen_multiplier) - 36usize];
    ["Offset of field: CudaScalarDivisorFFI::chosen_multiplier_num_bits"]
        [::std::mem::offset_of!(CudaScalarDivisorFFI, chosen_multiplier_num_bits) - 40usize];
    ["Offset of field: CudaScalarDivisorFFI::is_chosen_multiplier_zero"]
        [::std::mem::offset_of!(CudaScalarDivisorFFI, is_chosen_multiplier_zero) - 44usize];
    ["Offset of field: CudaScalarDivisorFFI::is_abs_chosen_multiplier_one"]
        [::std::mem::offset_of!(CudaScalarDivisorFFI, is_abs_chosen_multiplier_one) - 45usize];
    ["Offset of field: CudaScalarDivisorFFI::is_chosen_multiplier_negative"]
        [::std::mem::offset_of!(CudaScalarDivisorFFI, is_chosen_multiplier_negative) - 46usize];
    ["Offset of field: CudaScalarDivisorFFI::is_chosen_multiplier_pow2"]
        [::std::mem::offset_of!(CudaScalarDivisorFFI, is_chosen_multiplier_pow2) - 47usize];
    ["Offset of field: CudaScalarDivisorFFI::chosen_multiplier_has_more_bits_than_numerator"][::std::mem::offset_of!(
        CudaScalarDivisorFFI,
        chosen_multiplier_has_more_bits_than_numerator
    )
        - 48usize];
    ["Offset of field: CudaScalarDivisorFFI::is_chosen_multiplier_geq_two_pow_numerator"][::std::mem::offset_of!(
        CudaScalarDivisorFFI,
        is_chosen_multiplier_geq_two_pow_numerator
    )
        - 49usize];
    ["Offset of field: CudaScalarDivisorFFI::ilog2_divisor"]
        [::std::mem::offset_of!(CudaScalarDivisorFFI, ilog2_divisor) - 52usize];
    ["Offset of field: CudaScalarDivisorFFI::is_divisor_zero"]
        [::std::mem::offset_of!(CudaScalarDivisorFFI, is_divisor_zero) - 56usize];
    ["Offset of field: CudaScalarDivisorFFI::is_abs_divisor_one"]
        [::std::mem::offset_of!(CudaScalarDivisorFFI, is_abs_divisor_one) - 57usize];
    ["Offset of field: CudaScalarDivisorFFI::is_divisor_negative"]
        [::std::mem::offset_of!(CudaScalarDivisorFFI, is_divisor_negative) - 58usize];
    ["Offset of field: CudaScalarDivisorFFI::is_divisor_pow2"]
        [::std::mem::offset_of!(CudaScalarDivisorFFI, is_divisor_pow2) - 59usize];
    ["Offset of field: CudaScalarDivisorFFI::divisor_has_more_bits_than_numerator"][::std::mem::offset_of!(
        CudaScalarDivisorFFI,
        divisor_has_more_bits_than_numerator
    ) - 60usize];
};
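The bracketed constants above are bindgen's compile-time layout checks: indexing a one-element array with `actual - expected` only evaluates in a `const` context when the two agree, so any size, alignment, or offset drift fails the build. A minimal standalone sketch of the idiom, using a hypothetical `Demo` struct:

```rust
// Minimal sketch of bindgen's layout-check idiom. `Demo` is a hypothetical
// struct, not part of the generated bindings. A non-zero index would be an
// out-of-bounds array access in a const context, i.e. a compile error.
#[repr(C)]
struct Demo {
    a: u64,
    b: u32,
}

const _: () = {
    ["Size of Demo"][::std::mem::size_of::<Demo>() - 16usize];
    ["Alignment of Demo"][::std::mem::align_of::<Demo>() - 8usize];
    ["Offset of field: Demo::b"][::std::mem::offset_of!(Demo, b) - 8usize];
};

fn main() {
    println!("layout checks passed at compile time");
}
```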
unsafe extern "C" {
    pub fn scratch_cuda_apply_univariate_lut_kb_64(
        streams: *const *mut ffi::c_void,
@@ -1007,6 +1086,7 @@ unsafe extern "C" {
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        reduce_degrees_for_single_carry_propagation: bool,
        allocate_gpu_memory: bool,
        allocate_ms_array: bool,
    ) -> u64;
@@ -1018,7 +1098,6 @@ unsafe extern "C" {
        gpu_count: u32,
        radix_lwe_out: *mut CudaRadixCiphertextFFI,
        radix_lwe_vec: *mut CudaRadixCiphertextFFI,
        reduce_degrees_for_single_carry_propagation: bool,
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
@@ -1334,54 +1413,6 @@ unsafe extern "C" {
        gpu_indexes: *const u32,
    );
}
unsafe extern "C" {
    pub fn scratch_cuda_integer_radix_scalar_mul_high_kb_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        mem_ptr: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        lwe_dimension: u32,
        ks_level: u32,
        ks_base_log: u32,
        pbs_level: u32,
        pbs_base_log: u32,
        grouping_factor: u32,
        num_blocks: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        num_scalar_bits: u32,
        anticipated_buffer_drop: bool,
        allocate_gpu_memory: bool,
        allocate_ms_array: bool,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_integer_radix_scalar_mul_high_kb_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        ct: *mut CudaRadixCiphertextFFI,
        mem_ptr: *mut i8,
        ksks: *const *mut ffi::c_void,
        rhs: u64,
        decomposed_scalar: *const u64,
        has_at_least_one_set: *const u64,
        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        bsks: *const *mut ffi::c_void,
        num_scalars: u32,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_integer_radix_scalar_mul_high_kb_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        mem_ptr_void: *mut *mut i8,
    );
}
unsafe extern "C" {
    pub fn scratch_cuda_apply_noise_squashing_kb(
        streams: *const *mut ffi::c_void,
@@ -1477,6 +1508,50 @@ unsafe extern "C" {
        mem_ptr_void: *mut *mut i8,
    );
}
unsafe extern "C" {
    pub fn scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        mem_ptr: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        lwe_dimension: u32,
        ks_level: u32,
        ks_base_log: u32,
        pbs_level: u32,
        pbs_base_log: u32,
        grouping_factor: u32,
        num_blocks: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        scalar_divisor_ffi: *const CudaScalarDivisorFFI,
        allocate_gpu_memory: bool,
        allocate_ms_array: bool,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_integer_unsigned_scalar_div_radix_kb_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        numerator_ct: *mut CudaRadixCiphertextFFI,
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        scalar_divisor_ffi: *const CudaScalarDivisorFFI,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        mem_ptr_void: *mut *mut i8,
    );
}
unsafe extern "C" {
    pub fn scratch_cuda_extend_radix_with_sign_msb_64(
        streams: *const *mut ffi::c_void,
@@ -1522,6 +1597,153 @@ unsafe extern "C" {
        mem_ptr_void: *mut *mut i8,
    );
}
unsafe extern "C" {
    pub fn scratch_cuda_integer_signed_scalar_div_radix_kb_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        mem_ptr: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        lwe_dimension: u32,
        ks_level: u32,
        ks_base_log: u32,
        pbs_level: u32,
        pbs_base_log: u32,
        grouping_factor: u32,
        num_blocks: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        scalar_divisor_ffi: *const CudaScalarDivisorFFI,
        allocate_gpu_memory: bool,
        allocate_ms_array: bool,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_integer_signed_scalar_div_radix_kb_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        numerator_ct: *mut CudaRadixCiphertextFFI,
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        scalar_divisor_ffi: *const CudaScalarDivisorFFI,
        numerator_bits: u32,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_integer_signed_scalar_div_radix_kb_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        mem_ptr_void: *mut *mut i8,
    );
}
unsafe extern "C" {
    pub fn scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        mem_ptr: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        lwe_dimension: u32,
        ks_level: u32,
        ks_base_log: u32,
        pbs_level: u32,
        pbs_base_log: u32,
        grouping_factor: u32,
        num_blocks: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        scalar_divisor_ffi: *const CudaScalarDivisorFFI,
        active_bits_divisor: u32,
        allocate_gpu_memory: bool,
        allocate_ms_array: bool,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        quotient_ct: *mut CudaRadixCiphertextFFI,
        remainder_ct: *mut CudaRadixCiphertextFFI,
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        scalar_divisor_ffi: *const CudaScalarDivisorFFI,
        divisor_has_at_least_one_set: *const u64,
        decomposed_divisor: *const u64,
        num_scalars_divisor: u32,
        clear_blocks: *const ffi::c_void,
        h_clear_blocks: *const ffi::c_void,
        num_clear_blocks: u32,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        mem_ptr_void: *mut *mut i8,
    );
}
unsafe extern "C" {
    pub fn scratch_integer_signed_scalar_div_rem_radix_kb_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        mem_ptr: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        lwe_dimension: u32,
        ks_level: u32,
        ks_base_log: u32,
        pbs_level: u32,
        pbs_base_log: u32,
        grouping_factor: u32,
        num_blocks: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        scalar_divisor_ffi: *const CudaScalarDivisorFFI,
        active_bits_divisor: u32,
        allocate_gpu_memory: bool,
        allocate_ms_array: bool,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_integer_signed_scalar_div_rem_radix_kb_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        quotient_ct: *mut CudaRadixCiphertextFFI,
        remainder_ct: *mut CudaRadixCiphertextFFI,
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        scalar_divisor_ffi: *const CudaScalarDivisorFFI,
        divisor_has_at_least_one_set: *const u64,
        decomposed_divisor: *const u64,
        num_scalars_divisor: u32,
        numerator_bits: u32,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
        streams: *const *mut ffi::c_void,
        gpu_indexes: *const u32,
        gpu_count: u32,
        mem_ptr_void: *mut *mut i8,
    );
}
pub const KS_TYPE_BIG_TO_SMALL: KS_TYPE = 0;
pub const KS_TYPE_SMALL_TO_BIG: KS_TYPE = 1;
pub type KS_TYPE = ffi::c_uint;
@@ -2116,6 +2338,19 @@ unsafe extern "C" {
        grouping_factor: u32,
    );
}
unsafe extern "C" {
    pub fn cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
        stream: *mut ffi::c_void,
        gpu_index: u32,
        dest: *mut ffi::c_void,
        src: *const ffi::c_void,
        input_lwe_dim: u32,
        glwe_dim: u32,
        level_count: u32,
        polynomial_size: u32,
        grouping_factor: u32,
    );
}
unsafe extern "C" {
    pub fn scratch_cuda_multi_bit_programmable_bootstrap_64(
        stream: *mut ffi::c_void,
@@ -2158,3 +2393,45 @@ unsafe extern "C" {
        pbs_buffer: *mut *mut i8,
    );
}
unsafe extern "C" {
    pub fn scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
        stream: *mut ffi::c_void,
        gpu_index: u32,
        buffer: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        level_count: u32,
        input_lwe_ciphertext_count: u32,
        allocate_gpu_memory: bool,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        stream: *mut ffi::c_void,
        gpu_index: u32,
        lwe_array_out: *mut ffi::c_void,
        lwe_output_indexes: *const ffi::c_void,
        lut_vector: *const ffi::c_void,
        lut_vector_indexes: *const ffi::c_void,
        lwe_array_in: *const ffi::c_void,
        lwe_input_indexes: *const ffi::c_void,
        bootstrapping_key: *const ffi::c_void,
        mem_ptr: *mut i8,
        lwe_dimension: u32,
        glwe_dimension: u32,
        polynomial_size: u32,
        grouping_factor: u32,
        base_log: u32,
        level_count: u32,
        num_samples: u32,
        num_many_lut: u32,
        lut_stride: u32,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_multi_bit_programmable_bootstrap_128(
        stream: *mut ffi::c_void,
        gpu_index: u32,
        buffer: *mut *mut i8,
    );
}
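These bindings follow the scratch/execute/cleanup triple that recurs throughout this file: a `scratch_*` call allocates scratch space and returns the tracked byte count, the main `cuda_*` entry point consumes it, and a `cleanup_*` call releases it. A self-contained sketch of guarding that lifecycle with RAII (dummy stand-ins only, not the real FFI surface):

```rust
/// Dummy stand-ins for a scratch/run/cleanup triple (illustration only).
fn scratch(buffer: &mut Vec<u8>, n: usize) -> u64 {
    buffer.resize(n, 0);
    n as u64
}
fn run(buffer: &mut [u8]) {
    buffer.fill(1);
}
fn cleanup(buffer: &mut Vec<u8>) {
    buffer.clear();
}

/// RAII guard: cleanup can no longer be forgotten or run twice.
struct ScratchGuard {
    buffer: Vec<u8>,
}

impl ScratchGuard {
    fn new(n: usize) -> Self {
        let mut buffer = Vec::new();
        let bytes = scratch(&mut buffer, n);
        println!("scratch allocated {bytes} bytes");
        Self { buffer }
    }
    fn run(&mut self) {
        run(&mut self.buffer);
    }
}

impl Drop for ScratchGuard {
    fn drop(&mut self) {
        cleanup(&mut self.buffer);
    }
}

fn main() {
    let mut guard = ScratchGuard::new(64);
    guard.run(); // cleanup happens automatically when `guard` drops
}
```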
@@ -23,6 +23,7 @@ extern "C" {

    pub fn cuda_malloc_async(size: u64, stream: *mut c_void, gpu_index: u32) -> *mut c_void;
    pub fn cuda_check_valid_malloc(size: u64, gpu_index: u32) -> bool;
    pub fn cuda_device_total_memory(gpu_index: u32) -> u64;

    pub fn cuda_memcpy_with_size_tracking_async_to_gpu(
        dest: *mut c_void,

@@ -1,6 +1,6 @@
[package]
name = "tfhe-hpu-backend"
version = "0.1.0"
version = "0.2.0"
edition = "2021"
license = "BSD-3-Clause-Clear"
description = "HPU implementation on FPGA of TFHE-rs primitives."

@@ -201,9 +201,9 @@ There are some example applications already available in `tfhe/examples/hpu`:
In order to run those applications on hardware, user must build from the project root (i.e `tfhe-rs-internal`) with `hpu-v80` features:

> NB: Running examples required to have correctly pulled the `.hpu` files. Those files, due to their size, are backed by git-lfs and disabled by default.
> In order to retrieve them, use the following command:
> In order to retrieve them, run the following command from **TFHE-rs** root folder:
> ```bash
> git lfs pull --include="*" --exclude=""
> make pull_hpu_files
> ```

``` bash
@@ -217,7 +217,7 @@ source setup_hpu.sh --config v80
> NB: Error that occurred when ".hpu" files weren't correctly fetch could be a bit enigmatic: `memory allocation of ... bytes failed`
> If you encountered this issue, you should run the following command:
> ```bash
> git lfs pull --include="*" --exclude=""
> make pull_hpu_files
> ```

@@ -49,7 +49,8 @@ offset= 0x10
owner="Parameter"
read_access="Read"
write_access="None"
default={Param="VERSION"}
field.major={size_b=4, default={Param="VERSION_MAJOR"}, description="RTL major version"}
field.minor={size_b=4, default={Param="VERSION_MINOR"}, description="RTL minor version"}

[section.info.register.ntt_architecture]
description="NTT architecture"
@@ -254,3 +255,15 @@ description="BPIP configuration"
read_access="Read"
write_access="Write"
default={Cst=0xffffffff}

# =====================================================================================================================
[section.keyswitch]
offset= 0x3000
description="Keyswitch Configuration"

[section.keyswitch.register.config]
description="(1) Use use modulus switching mean compensation. (default), (0) Don't use modulus switching mean compensation."
owner="User"
read_access="Read"
write_access="Write"
field.mod_switch_mean_comp = { size_b=1, offset_b=0 , default={Cst=1}, description="Controls whether to use modulus switch mean compensation, aka. Mayeul's Trick."}

@@ -49,7 +49,8 @@ offset= 0x10
owner="Parameter"
read_access="Read"
write_access="None"
default={Param="VERSION"}
field.major={size_b=4, default={Param="VERSION_MAJOR"}, description="RTL major version"}
field.minor={size_b=4, default={Param="VERSION_MINOR"}, description="RTL minor version"}

[section.info.register.ntt_architecture]
description="NTT architecture"
@@ -254,3 +255,15 @@ description="BPIP configuration"
read_access="Read"
write_access="Write"
default={Cst=0xffffffff}

# =====================================================================================================================
[section.keyswitch]
offset= 0x3000
description="Keyswitch Configuration"

[section.keyswitch.register.config]
description="(1) Use use modulus switching mean compensation. (default), (0) Don't use modulus switching mean compensation."
owner="User"
read_access="Read"
write_access="Write"
field.mod_switch_mean_comp = { size_b=1, offset_b=0 , default={Cst=1}, description="Controls whether to use modulus switch mean compensation, aka. Mayeul's Trick."}

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cb9ebedd0987130c4f6e1ef09f279d92f083815c1383da4b257198a33ab4881e
size 80293531
oid sha256:0a0798a1170982be0ce714bbf0d4cdfbe3c069e328e8847053c20d7b9b347ef6
size 83225193

@@ -2,8 +2,9 @@ pub(crate) mod traits;

pub mod parameters;
pub use parameters::{
    HpuIscParameters, HpuKeyswitchParameters, HpuNoiseDistributionInput, HpuNttCoreArch,
    HpuNttParameters, HpuPBSParameters, HpuParameters, HpuPcParameters, HpuRegfileParameters,
    HpuIscParameters, HpuKeyswitchParameters, HpuModulusSwitchType, HpuNoiseDistributionInput,
    HpuNttCoreArch, HpuNttParameters, HpuPBSParameters, HpuParameters, HpuPcParameters,
    HpuRegfileParameters,
};

pub mod glwe_ciphertext;
@@ -8,6 +8,12 @@ pub enum HpuNoiseDistributionInput {
    TUniformBound(u32),
}

#[derive(Clone, Copy, Debug, PartialEq, serde::Serialize, serde::Deserialize)]
pub enum HpuModulusSwitchType {
    Standard,
    CenteredMeanNoiseReduction,
}

/// Parameters related to Tfhe scheme computation
/// Couldn't rely on ClassicPBSParameters to prevent dependency loop
#[derive(Clone, Copy, Debug, PartialEq, serde::Serialize, serde::Deserialize)]
@@ -24,7 +30,10 @@ pub struct HpuPBSParameters {
    pub message_width: usize,
    pub carry_width: usize,
    pub ciphertext_width: usize,
    pub log2_p_fail: f64,
    pub modulus_switch_type: HpuModulusSwitchType,
}

// Manual implementation of Eq trait
// Indeed, we can handle strict comparison of f64
impl std::cmp::Eq for HpuPBSParameters {}
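The new `HpuModulusSwitchType` round-trips through serde, and shows up as a plain string in the HPU TOML configs further down (`modulus_switch_type= "Standard"`). A minimal sketch of the string-to-enum mapping; the real crate derives it via serde, this `FromStr` version just shows the idea:

```rust
use std::str::FromStr;

#[derive(Clone, Copy, Debug, PartialEq)]
enum HpuModulusSwitchType {
    Standard,
    CenteredMeanNoiseReduction,
}

impl FromStr for HpuModulusSwitchType {
    type Err = String;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "Standard" => Ok(Self::Standard),
            "CenteredMeanNoiseReduction" => Ok(Self::CenteredMeanNoiseReduction),
            other => Err(format!("unknown modulus switch type: {other}")),
        }
    }
}

fn main() {
    // Same spelling as in the TOML configs below.
    let t: HpuModulusSwitchType = "CenteredMeanNoiseReduction".parse().unwrap();
    assert_eq!(t, HpuModulusSwitchType::CenteredMeanNoiseReduction);
}
```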
@@ -482,6 +482,7 @@ pub fn iop_propagate_msb_to_lsb_blockv(
    // (op_nb_bool**k)*proc_nb
    //assert_eq!(g_a.len(),props.blk_w());
    let grp_nb = g_a.len().div_ceil(proc_nb);
    let mut level_nb = 0;
    let mut stride_size: usize = 1; // in group unit
    while stride_size < grp_nb {
        for chk in g_a.chunks_mut(op_nb_bool * stride_size * proc_nb) {
@@ -499,31 +500,69 @@ pub fn iop_propagate_msb_to_lsb_blockv(
        }

        stride_size *= op_nb_bool;
        level_nb += 1;
    }

    // This code was written for a limited size, due the following
    // leveled additions.
    assert!(level_nb < op_nb_bool);

    // Third step
    // Apply
    g_a.chunks_mut(proc_nb).rev().fold(None, |acc, chk| {
        if let Some(x) = acc {
    let mut neigh_a: Vec<metavar::MetaVarCell> = Vec::new();
    for _i in 1..level_nb {
        neigh_a.push(prog.new_cst(0));
    }

    let mut neigh = prog.new_cst(0);
    let mut prev = None;
    g_a.chunks_mut(proc_nb)
        .enumerate()
        .rev()
        .for_each(|(chk_idx, chk)| {
            let keep_v0 = chk[0].clone();

            let all_neigh = if let Some(x) = &prev {
                &neigh + x
            } else {
                neigh.clone()
            };

            for (idx, v) in chk.iter_mut().enumerate() {
                if idx == 0 {
                    // [0] is already complete.
                    // Need to inverse it for 0 if needed
                    if inverse_output.unwrap_or(false) {
                        *v = v.pbs(&pbs_is_null, false);
                    }
                    // [0] is already complete with prev.
                    // do not need to add prev
                    *v = &*v + &neigh;
                } else {
                    *v = &*v + x;
                    if inverse_output.unwrap_or(false) {
                        *v = v.pbs(&pbs_is_null, false);
                    } else {
                        *v = v.pbs(&pbs_not_null, false);
                    }
                    *v = &*v + &all_neigh;
                }
                // Need to inverse it for 0 if needed
                if inverse_output.unwrap_or(false) {
                    *v = v.pbs(&pbs_is_null, false);
                } else {
                    *v = v.pbs(&pbs_not_null, false);
                }
            }
        }
        Some(&chk[0])
    });

            // For next chunk
            prev = Some(keep_v0.clone());

            // Update neighbors for next iteration
            let mut do_update_neigh = false;
            for i in 1..(level_nb as u32) {
                if (chk_idx % op_nb_bool.pow(i)) == 0 {
                    // Update the corresponding neigh value
                    neigh_a[(i - 1) as usize] = keep_v0.clone();
                    do_update_neigh = true;
                }
            }
            if do_update_neigh {
                neigh = neigh_a[0].clone();
                for n in neigh_a.iter().skip(1) {
                    neigh = &neigh + n;
                }
            }
        });

    if inverse_direction.unwrap_or(false) {
        g_a.reverse();
@@ -340,6 +340,7 @@ pub fn iop_mulx(
    let pbs_carry = pbs_by_name!("CarryInMsg");
    let pbs_mul_lsb = pbs_by_name!("MultCarryMsgLsb");
    let pbs_mul_msb = pbs_by_name!("MultCarryMsgMsb");
    let max_carry = (props.max_msg() * props.max_msg()) >> props.msg_w;

    let mut mul_map: HashMap<usize, Vec<VarCellDeg>> = HashMap::new();
    itertools::iproduct!(0..blk_w, 0..blk_w).for_each(|(i, j)| {
@@ -349,11 +350,11 @@ pub fn iop_mulx(
        mul_map
            .entry(i + j)
            .or_default()
            .push(VarCellDeg::new(props.max_msg(), lsb));
            .push(VarCellDeg::new(max_carry, lsb));
        mul_map
            .entry(i + j + 1)
            .or_default()
            .push(VarCellDeg::new(props.max_msg(), msb));
            .push(VarCellDeg::new(max_carry, msb));
    });

    for (blk, dst) in dst.iter_mut().enumerate() {
@@ -370,6 +371,8 @@ pub fn iop_mulx(
        };

        while to_sum.len() > 1 {
            let prev_len = to_sum.len();

            to_sum = to_sum
                .deg_chunks(&max_deg)
                // Leveled Sum
@@ -405,7 +408,7 @@ pub fn iop_mulx(
            // This will be very unlikely, but if it ever happened it would have hanged
            // the whole loop. Also, the output needs to be bootstrapped,
            // anyway.
            to_sum.0.iter().all(|x| x.deg.nu > 1).then(|| {
            (to_sum.0.iter().all(|x| x.deg.nu > 1) || prev_len == to_sum.len()).then(|| {
                let max = to_sum.max_mut().unwrap();
                *max = bootstrap(max);
            });
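The condition added in the last hunk (`|| prev_len == to_sum.len()`) forces a bootstrap whenever a summation pass makes no progress, so the `while to_sum.len() > 1` loop cannot hang. A small self-contained sketch of the same progress-guard shape (here the guard simply stops the loop; the firmware instead bootstraps the largest term so that reduction can continue):

```rust
/// Illustrative progress guard for a reduce-until-one loop: if a pass
/// leaves the length unchanged (prev_len == items.len()), stop instead
/// of spinning forever.
fn reduce_with_guard(mut items: Vec<u32>) -> u32 {
    while items.len() > 1 {
        let prev_len = items.len();
        // Pairwise sum; a trailing chunk of one passes through unchanged.
        let next: Vec<u32> = items.chunks(2).map(|c| c.iter().sum()).collect();
        items = next;
        if items.len() == prev_len {
            break; // no progress possible in this sketch
        }
    }
    items.into_iter().sum()
}

fn main() {
    assert_eq!(reduce_with_guard(vec![1, 2, 3, 4, 5]), 15);
}
```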
@@ -109,7 +109,7 @@ impl VecVarCellDeg {
        mut self,
        max_deg: &VarDeg,
    ) -> <Vec<Vec<VarCellDeg>> as IntoIterator>::IntoIter {
        trace!(target: "ilp:deg_chunks", "len: {:?}, {:?}", self.len(), self.0);
        trace!(target: "llt:deg_chunks", "len: {:?}, {:?}", self.len(), self.0);

        let mut res: Vec<Vec<VarCellDeg>> = Vec::new();
        let mut acc: VarDeg = VarDeg::default();
@@ -130,8 +130,8 @@ impl VecVarCellDeg {
                acc = VarDeg::default();
                chunk = Vec::new();
            }
            trace!(target: "ilp:deg_chunks:loop", "len: {:?}, {:?}, chunk: {:?},
                acc: {:?}", self.len(), self.0, chunk, acc);
            trace!(target: "llt:deg_chunks:loop", "len: {:?}, {:?}, chunk: {:?}, acc: {:?}",
                self.len(), self.0, chunk, acc);
        }

        // Any remaining chunk is appended
@@ -139,6 +139,8 @@ impl VecVarCellDeg {
            res.push(chunk);
        }

        trace!(target: "llt:deg_chunks:ret", "res: {:?}", res);

        res.into_iter()
    }
@@ -39,7 +39,7 @@ impl Event {
    }
}

/// Event are stored in a BinaryHeap and we want to pop the smallest one firs
/// Event are stored in a BinaryHeap and we want to pop the smallest one first
/// Thuse Ord trait is implemented in a "reverse".
impl Ord for Event {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
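The corrected comment describes a standard Rust idiom: `BinaryHeap` is a max-heap, so popping the smallest event first requires `Ord` to compare in reverse. A minimal illustration (this `Event` is a stand-in keyed by a timestamp, not the HPU backend's type):

```rust
use std::cmp::Ordering;
use std::collections::BinaryHeap;

/// Illustrative event keyed by a timestamp.
#[derive(PartialEq, Eq)]
struct Event {
    at: u64,
}

// BinaryHeap is a max-heap, so reverse the comparison to pop the
// smallest timestamp first (i.e. turn it into a min-heap).
impl Ord for Event {
    fn cmp(&self, other: &Self) -> Ordering {
        other.at.cmp(&self.at)
    }
}

impl PartialOrd for Event {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let mut heap = BinaryHeap::new();
    heap.extend([Event { at: 30 }, Event { at: 10 }, Event { at: 20 }]);
    assert_eq!(heap.pop().unwrap().at, 10); // smallest pops first
}
```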
@@ -1224,7 +1224,7 @@ impl std::ops::ShlAssign<&VarCell> for VarCell {

// I was expecting more events to be waited for...
bitflags! {
    #[derive(Clone)]
    #[derive(Clone, Debug)]
    struct WaitEvents: u8 {
        const RdUnlock = 0x1;
    }
@@ -1253,6 +1253,7 @@ struct Arch {
// could be re-used in other contexts outside our HPU firmware generation
impl Arch {
    // interface
    #[instrument(level = "trace", skip(self, op))]
    pub fn try_dispatch(&mut self, op: BinaryHeap<OperationCell>) -> BinaryHeap<OperationCell> {
        // Postpone scheduling high latency operations until there's no other
        // option to keep everything going. This is very heuristic, so this
@@ -1338,8 +1339,14 @@ impl Arch {
            .max()
    }

    #[instrument(level = "trace", skip(self))]
    pub fn done(&mut self) -> Option<OperationCell> {
        assert!(!self.events.is_empty());
        if self.events.is_empty() {
            // It can happen that for lack of registers, the PE cannot be
            // filled. In that case, try a forced flush
            self.probe_for_exec(Some(PeFlush::Force));
            assert!(!self.events.is_empty());
        }

        let waiting_for = self.waiting_for.clone();
        let mut waiting = (true, None);
@@ -1350,6 +1357,7 @@ impl Arch {
        trace!("rd_pdg: {:?}", self.rd_pdg);
        trace!("queued: {:?}", self.queued);
        trace!("wr_pdg: {:?}", self.wr_pdg);
        trace!("waiting: {:?}", self.waiting_for);
        trace!("---------------------------------------");

        let event = {
@@ -1576,7 +1584,6 @@ impl Rtl {

        if let Some(op) = arch.done() {
            trace!("Removing {:?}", &op);
            // Done is consumed here
            let new = op.remove();
            trace!("new ready op {:?}", &new);
            todo.extend(new.into_iter());
@@ -286,6 +286,8 @@ pub const CONCRETE_BOOLEAN: HpuPBSParameters = HpuPBSParameters {
    message_width: 1,
    carry_width: 0,
    ciphertext_width: 32,
    log2_p_fail: -64.0,
    modulus_switch_type: parameters::HpuModulusSwitchType::Standard,
};

pub const MSG2_CARRY2: HpuPBSParameters = HpuPBSParameters {
@@ -301,6 +303,8 @@ pub const MSG2_CARRY2: HpuPBSParameters = HpuPBSParameters {
    message_width: 2,
    carry_width: 2,
    ciphertext_width: u64::BITS as usize,
    log2_p_fail: -64.0,
    modulus_switch_type: parameters::HpuModulusSwitchType::Standard,
};

pub const MSG2_CARRY2_64B: HpuPBSParameters = HpuPBSParameters {
@@ -316,6 +320,8 @@ pub const MSG2_CARRY2_64B: HpuPBSParameters = HpuPBSParameters {
    message_width: 2,
    carry_width: 2,
    ciphertext_width: u64::BITS as usize,
    log2_p_fail: -64.0,
    modulus_switch_type: parameters::HpuModulusSwitchType::Standard,
};

pub const MSG2_CARRY2_44B: HpuPBSParameters = HpuPBSParameters {
@@ -333,6 +339,8 @@ pub const MSG2_CARRY2_44B: HpuPBSParameters = HpuPBSParameters {
    message_width: 2,
    carry_width: 2,
    ciphertext_width: 44,
    log2_p_fail: -64.0,
    modulus_switch_type: parameters::HpuModulusSwitchType::Standard,
};

pub const MSG2_CARRY2_64B_FAKE: HpuPBSParameters = HpuPBSParameters {
@@ -350,6 +358,8 @@ pub const MSG2_CARRY2_64B_FAKE: HpuPBSParameters = HpuPBSParameters {
    message_width: 2,
    carry_width: 2,
    ciphertext_width: 64,
    log2_p_fail: -64.0,
    modulus_switch_type: parameters::HpuModulusSwitchType::Standard,
};

pub const MSG2_CARRY2_GAUSSIAN: HpuPBSParameters = HpuPBSParameters {
@@ -367,6 +377,8 @@ pub const MSG2_CARRY2_GAUSSIAN: HpuPBSParameters = HpuPBSParameters {
    message_width: 2,
    carry_width: 2,
    ciphertext_width: 64,
    log2_p_fail: -64.0,
    modulus_switch_type: parameters::HpuModulusSwitchType::Standard,
};

pub const MSG2_CARRY2_TUNIFORM: HpuPBSParameters = HpuPBSParameters {
@@ -384,6 +396,8 @@ pub const MSG2_CARRY2_TUNIFORM: HpuPBSParameters = HpuPBSParameters {
    message_width: 2,
    carry_width: 2,
    ciphertext_width: 64,
    log2_p_fail: -64.0,
    modulus_switch_type: parameters::HpuModulusSwitchType::Standard,
};

pub const MSG2_CARRY2_PFAIL64_132B_GAUSSIAN_1F72DBA: HpuPBSParameters = HpuPBSParameters {
@@ -399,6 +413,8 @@ pub const MSG2_CARRY2_PFAIL64_132B_GAUSSIAN_1F72DBA: HpuPBSParameters = HpuPBSPa
    message_width: 2,
    carry_width: 2,
    ciphertext_width: 64,
    log2_p_fail: -64.0,
    modulus_switch_type: parameters::HpuModulusSwitchType::Standard,
};

pub const MSG2_CARRY2_PFAIL64_132B_TUNIFORM_7E47D8C: HpuPBSParameters = HpuPBSParameters {
@@ -414,6 +430,25 @@ pub const MSG2_CARRY2_PFAIL64_132B_TUNIFORM_7E47D8C: HpuPBSParameters = HpuPBSPa
    message_width: 2,
    carry_width: 2,
    ciphertext_width: 64,
    log2_p_fail: -64.0,
    modulus_switch_type: parameters::HpuModulusSwitchType::Standard,
};

pub const MSG2_CARRY2_PFAIL128_132B_TUNIFORM_144A47: HpuPBSParameters = HpuPBSParameters {
    lwe_dimension: 879,
    glwe_dimension: 1,
    polynomial_size: 2048,
    lwe_noise_distribution: HpuNoiseDistributionInput::TUniformBound(3),
    glwe_noise_distribution: HpuNoiseDistributionInput::TUniformBound(17),
    pbs_base_log: 23,
    pbs_level: 1,
    ks_base_log: 2,
    ks_level: 8,
    message_width: 2,
    carry_width: 2,
    ciphertext_width: 64,
    log2_p_fail: -128.0,
    modulus_switch_type: parameters::HpuModulusSwitchType::CenteredMeanNoiseReduction,
};

impl FromRtl for HpuPBSParameters {
@@ -456,6 +491,7 @@ impl FromRtl for HpuPBSParameters {
            11 => MSG2_CARRY2_TUNIFORM,
            12 => MSG2_CARRY2_PFAIL64_132B_GAUSSIAN_1F72DBA,
            13 => MSG2_CARRY2_PFAIL64_132B_TUNIFORM_7E47D8C,
            14 => MSG2_CARRY2_PFAIL128_132B_TUNIFORM_144A47,
            _ => panic!("Unknown TfheAppName encoding"),
        }
    }
@@ -12,5 +12,5 @@
    "n3-H100x4": 6.08,
    "n3-H100x2": 3.04,
    "n3-L40x1": 0.80,
    "n3-H100x8-SXM5": 24
    "n3-H100x8-SXM5": 19.2
}

@@ -11,6 +11,8 @@
message_width= 2
carry_width= 2
ciphertext_width= 44
log2_p_fail=-64
modulus_switch_type= "Standard"

[ntt_params]
core_arch="WmmUnfoldPcg"

@@ -11,6 +11,8 @@
message_width=2
carry_width=2
ciphertext_width=44
log2_p_fail=-64
modulus_switch_type= "Standard"

[ntt_params]
core_arch="WmmCompactPcg"

@@ -11,6 +11,8 @@
message_width= 2
carry_width= 2
ciphertext_width= 64
log2_p_fail=-64
modulus_switch_type= "Standard"

[ntt_params]
core_arch= {GF64=[5,5]}

@@ -11,6 +11,8 @@
message_width=2
carry_width=2
ciphertext_width=64
log2_p_fail=-64
modulus_switch_type= "Standard"

[ntt_params]
core_arch= {GF64=[5,5]}
Some files were not shown because too many files have changed in this diff.