Compare commits

..

51 Commits

Author SHA1 Message Date
Guillermo Oyarzun
35b3430404 chore(gpu): fix some pke bugs 2025-06-19 12:22:39 +02:00
Guillermo Oyarzun
a56a8107ab chore(gpu): reduce the pool size 2025-06-17 13:20:58 +02:00
Guillermo Oyarzun
b322708cbd chore(gpu): add drift to pke gpu tests 2025-06-16 17:36:11 +02:00
Guillermo Oyarzun
25da5ef721 chore(gpu): add pke gpu tests 2025-06-16 15:17:54 +02:00
Guillermo Oyarzun
43b95aaf7c chore(gpu): fix input_br variance formula for multi-bit 2025-06-16 12:27:27 +02:00
Guillermo Oyarzun
cfc19d3bd2 chore(gpu): correct ms formula in pbs128 2025-06-16 12:27:27 +02:00
Guillermo Oyarzun
51427fc9ae chore(gpu): add multiplication and extra logs to pbs128 test 2025-06-16 12:27:26 +02:00
Guillermo Oyarzun
a45baaa3d0 chore(gpu): fixes after rebase 2025-06-16 12:27:26 +02:00
Guillermo Oyarzun
a24adf528b chore(gpu): use the m128 params 2025-06-16 12:27:26 +02:00
Guillermo Oyarzun
3ddf9bdba6 chore(gpu): fix pbs128 after rebasing 2025-06-16 12:27:25 +02:00
Guillermo Oyarzun
de4902fb9f chore(gpu): add noise squashing params without using it 2025-06-16 12:27:25 +02:00
Guillermo Oyarzun
b89bca0d13 chore(gpu): fix pbs128 with old params 2025-06-16 12:27:25 +02:00
Guillermo Oyarzun
f0dc0e18ab chore(gpu): add pbs128 2025-06-16 12:27:24 +02:00
Guillermo Oyarzun
98ca66581e chore(gpu): fix errors after rebasing 2025-06-16 12:27:24 +02:00
Guillermo Oyarzun
c94ccc3a23 feat(gpu): fix bugs and add extra logs 2025-06-16 12:27:24 +02:00
Guillermo Oyarzun
7e6573a1d2 feat(gpu): fix compression tests after rebase 2025-06-16 12:27:23 +02:00
Guillermo Oyarzun
94ff21b089 feat(gpu): add noise checks with multi-bit pbs 2025-06-16 12:27:23 +02:00
Guillermo Oyarzun
71420f0d92 feat(gpu): add noise checks 2025-06-16 12:27:23 +02:00
Arthur Meyre
d81bd4ebd6 wip: long pfail runs 2025-06-16 12:27:22 +02:00
Nicolas Sarlin
83d1d6a46c wip: fix packing ks noise formula 2025-06-16 12:27:22 +02:00
Nicolas Sarlin
ac693f97e1 wip: remove check that makes compression pfail test fail 2025-06-16 12:27:22 +02:00
Nicolas Sarlin
d06656cfb4 wip: remove check that makes compression pfail test fail 2025-06-16 12:27:21 +02:00
Nicolas Sarlin
472ea682ae wip: update noise formulas 2025-06-16 12:27:21 +02:00
Nicolas Sarlin
258524f5e2 chore(zk): add noise tests for zkv1 2025-06-16 12:27:21 +02:00
Nicolas Sarlin
6b98865515 fix: new param naming 2025-06-16 12:27:20 +02:00
Arthur Meyre
e47731b1ee fix(shortint): fix sample count in pbs128 pfail 2025-06-16 12:27:20 +02:00
Arthur Meyre
d1b9bc676d test(shortint): add variance check after KS in classic PBS AP 2025-06-16 12:27:19 +02:00
Arthur Meyre
405952e323 test(shortint): add normality check after KS in classic AP 2025-06-16 12:27:19 +02:00
Arthur Meyre
7556a8e05f fix(test): fix slighlty wrong log message 2025-06-16 12:27:19 +02:00
Arthur Meyre
ebda1426e4 fix(test): fix test parameters for PBS 128
Disable the bound check as our computation disagrees with the RO
2025-06-16 12:27:18 +02:00
Arthur Meyre
da91075b26 test(shortint): add pfail estimation after ms in compression 2025-06-16 12:27:18 +02:00
Arthur Meyre
f95eb2cf2c wip: re-exported pbs 128 with symbolic mantissa
- add pbs 128 params with a mantissa setting
- noise with this settings is in line with RO prediction
2025-06-16 12:27:17 +02:00
Arthur Meyre
45da14c7dd wip: add pbs 128 tests (noise and pfail) 2025-06-16 12:27:16 +02:00
Arthur Meyre
75e03ae800 feat(core): add pbs 128 noise formulas 2025-06-16 12:27:16 +02:00
Arthur Meyre
9976cbe1f2 test(shortint): add pfail measurement for full compression + AP 2025-06-16 12:27:15 +02:00
Arthur Meyre
a9006486e8 test(shortint): slightly change decryption logic to also get the padding bit out 2025-06-16 12:27:14 +02:00
Arthur Meyre
7f8778f178 test(shortint): add pfail check for first part of compression 2025-06-16 12:27:14 +02:00
Arthur Meyre
370e4ae2e6 test(shortint): add first part of the compression AP 2025-06-16 12:27:13 +02:00
Arthur Meyre
b6db9d8ba0 chore: rename helper function to match test usage 2025-06-16 12:27:12 +02:00
Arthur Meyre
54b139b1b5 feat(core): add noise formula for packing keyswitch 2025-06-16 12:27:12 +02:00
Arthur Meyre
d3ccf08f2c test(shortint): add pfail test for PKE -> KS -> (DP -> KS Compute) -> MS 2025-06-16 12:27:11 +02:00
Arthur Meyre
6f1492766f refactor(test): also accept PKE -> KS to big -> DP -> KS -> MS 2025-06-16 12:27:10 +02:00
Arthur Meyre
03fa607209 test(shortint): add noise check for PKE -> KS to small key + MS 2025-06-16 12:27:09 +02:00
Arthur Meyre
29ab6c0709 wip: hotfix for borrow mut error, this needs a design pass/reflection 2025-06-16 12:27:09 +02:00
Arthur Meyre
1c0b428cd3 feat: add noise formulas for TUniform 132 bits of security 2025-06-16 12:27:08 +02:00
Arthur Meyre
dab2d39749 test(hl): add noise check for CompactPublicKey encryption w/ TUniform param 2025-06-16 12:27:07 +02:00
Arthur Meyre
ea81ef5d15 test(shortint): add pfail test for the classic AP 2025-06-16 12:27:06 +02:00
Arthur Meyre
57a7a5a084 refactor: prepare code factorization for noise and pfail 2025-06-16 12:27:06 +02:00
Arthur Meyre
9010ded3d5 test: add shortint atomic pattern noise measurement 2025-06-16 12:27:05 +02:00
Arthur Meyre
1ebd2848ad test(hl): test encryption noise of FheUint ciphertext in HL API 2025-06-16 12:27:04 +02:00
Nicolas Sarlin
6a1a024e6d chore(zk)!: store inside the pke params the supported zk scheme
BREAKING_CHANGE:
- Zk for compact PKE now requires dedicated encryption parameters
2025-06-16 12:27:04 +02:00
607 changed files with 20315 additions and 27114 deletions

2
.gitattributes vendored
View File

@@ -1,3 +1 @@
*.hpu filter=lfs diff=lfs merge=lfs -text
*.bcode filter=lfs diff=lfs merge=lfs -text
*.cbor filter=lfs diff=lfs merge=lfs -text

View File

@@ -71,26 +71,45 @@ jobs:
with:
toolchain: stable
# Cache key is an aggregated hash of lfs files hashes
- name: Get LFS data sha
id: hash-lfs-data
- name: Use specific data branch
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
env:
PR_BRANCH: ${{ github.head_ref || github.ref_name }}
run: |
SHA=$(git lfs ls-files -l -I utils/tfhe-backward-compat-data | sha256sum | cut -d' ' -f1)
echo "BACKWARD_COMPAT_DATA_BRANCH=${PR_BRANCH}" >> "${GITHUB_ENV}"
- name: Get backward compat branch
id: backward_compat_branch
run: |
BRANCH="$(make backward_compat_branch)"
echo "branch=${BRANCH}" >> "${GITHUB_OUTPUT}"
- name: Get backward compat branch head SHA
id: backward_compat_sha
run: |
SHA=$(git ls-remote "${REPO_URL}" refs/heads/"${BACKWARD_COMPAT_BRANCH}" | awk '{print $1}')
echo "sha=${SHA}" >> "${GITHUB_OUTPUT}"
env:
REPO_URL: "https://github.com/zama-ai/tfhe-backward-compat-data"
BACKWARD_COMPAT_BRANCH: ${{ steps.backward_compat_branch.outputs.branch }}
- name: Retrieve data from cache
id: retrieve-data-cache
uses: actions/cache/restore@5a3ec84eff668545956fd18022155c47e93e2684 #v4.2.3
with:
path: |
utils/tfhe-backward-compat-data/**/*.cbor
utils/tfhe-backward-compat-data/**/*.bcode
key: ${{ steps.hash-lfs-data.outputs.sha }}
path: tests/tfhe-backward-compat-data
key: ${{ steps.backward_compat_branch.outputs.branch }}_${{ steps.backward_compat_sha.outputs.sha }}
- name: Pull test data
- name: Clone test data
if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
run: |
make pull_backward_compat_data
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
repository: zama-ai/tfhe-backward-compat-data
path: tests/tfhe-backward-compat-data
lfs: 'true'
ref: ${{ steps.backward_compat_branch.outputs.branch }}
- name: Run backward compatibility tests
run: |
@@ -101,10 +120,8 @@ jobs:
continue-on-error: true
uses: actions/cache/save@5a3ec84eff668545956fd18022155c47e93e2684 #v4.2.3
with:
path: |
utils/tfhe-backward-compat-data/**/*.cbor
utils/tfhe-backward-compat-data/**/*.bcode
key: ${{ steps.hash-lfs-data.outputs.sha }}
path: tests/tfhe-backward-compat-data
key: ${{ steps.backward_compat_branch.outputs.branch }}_${{ steps.backward_compat_sha.outputs.sha }}
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}

View File

@@ -103,7 +103,7 @@ jobs:
name: Unsigned integer tests
needs: setup-instance
concurrency:
group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
group: ${{ github.workflow_ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:

View File

@@ -104,7 +104,7 @@ jobs:
name: Signed integer tests
needs: setup-instance
concurrency:
group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
group: ${{ github.workflow_ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:

View File

@@ -31,7 +31,6 @@ on:
- ks
- ks_pbs
- integer_zk
- hlapi_noise_squash
op_flavor:
description: "Operations set to run"
type: choice

View File

@@ -10,16 +10,37 @@ on:
permissions: {}
jobs:
run-benchmarks-8-h100-sxm5-integer:
name: Run integer benchmarks (8xH100-SXM5)
run-benchmarks-1-h100:
name: Run integer benchmarks (1xH100)
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
profile: multi-h100-sxm5
hardware_name: n3-H100x8-SXM5
profile: single-h100
hardware_name: n3-H100x1
command: integer,integer_multi_bit
op_flavor: default
bench_type: latency
all_precisions: true
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-2-h100:
name: Run integer benchmarks (2xH100)
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
profile: 2-h100
hardware_name: n3-H100x2
command: integer_multi_bit
op_flavor: default
bench_type: both
bench_type: latency
all_precisions: true
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
@@ -31,16 +52,16 @@ jobs:
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-8-h100-sxm5-integer-compression:
name: Run integer compression benchmarks (8xH100-SXM5)
run-benchmarks-8-h100:
name: Run integer benchmarks (8xH100)
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
profile: multi-h100-sxm5
hardware_name: n3-H100x8-SXM5
command: integer_compression
profile: multi-h100
hardware_name: n3-H100x8
command: integer_multi_bit
op_flavor: default
bench_type: both
bench_type: latency
all_precisions: true
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
@@ -52,37 +73,16 @@ jobs:
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-8-h100-sxm5-integer-zk:
name: Run integer zk benchmarks (8xH100-SXM5)
run-benchmarks-l40:
name: Run integer benchmarks (L40)
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
profile: multi-h100-sxm5
hardware_name: n3-H100x8-SXM5
command: integer_zk
profile: l40
hardware_name: n3-L40x1
command: integer_multi_bit,integer_compression,pbs,ks
op_flavor: default
bench_type: both
all_precisions: true
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-8-h100-sxm5-noise-squash:
name: Run integer zk benchmarks (8xH100-SXM5)
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
profile: multi-h100-sxm5
hardware_name: n3-H100x8-SXM5
command: hlapi_noise_squash
op_flavor: default
bench_type: both
bench_type: latency
all_precisions: true
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}

View File

@@ -33,7 +33,6 @@ jobs:
with:
fetch-depth: 0
persist-credentials: 'false'
lfs: true
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Get benchmark details
@@ -62,7 +61,6 @@ jobs:
- name: Run benchmarks
run: |
make pull_hpu_files
make bench_integer_hpu
make bench_hlapi_erc20_hpu

View File

@@ -48,7 +48,7 @@ jobs:
name: Execute FFT benchmarks in EC2
needs: setup-ec2
concurrency:
group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
group: ${{ github.workflow_ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:

View File

@@ -48,7 +48,7 @@ jobs:
name: Execute NTT benchmarks in EC2
needs: setup-ec2
concurrency:
group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
group: ${{ github.workflow_ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:

View File

@@ -49,14 +49,6 @@ jobs:
mv linelint-linux-amd64 /usr/local/bin/linelint
make check_newline
# This is needed for the ws tests clippy checks
- name: Use specific data branch
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
env:
PR_BRANCH: ${{ github.head_ref || github.ref_name }}
run: |
echo "BACKWARD_COMPAT_DATA_BRANCH=${PR_BRANCH}" >> "${GITHUB_ENV}"
- name: Run pcc checks
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |

View File

@@ -13,7 +13,7 @@ env:
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
group: ${{ github.workflow }}-${{ github.head_ref }}
cancel-in-progress: true
permissions:

View File

@@ -13,7 +13,7 @@ env:
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
group: ${{ github.workflow }}-${{ github.head_ref }}
cancel-in-progress: true
permissions:

View File

@@ -3,15 +3,14 @@ name: Check commit and PR compliance
on:
pull_request:
permissions: {}
permissions:
contents: read
pull-requests: read # Permission needed to scan commits in a pull-request
jobs:
check-commit-pr:
name: Check commit and PR
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: write # Permission needed to scan commits in a pull-request and write issue comment
steps:
- name: Check first line
uses: gsactions/commit-message-checker@16fa2d5de096ae0d35626443bcd24f1e756cafee

63
.github/workflows/data_pr_close.yml vendored Normal file
View File

@@ -0,0 +1,63 @@
name: Close or Merge corresponding PR on the data repo
# When a PR with the data_PR tag is closed or merged, this will close the corresponding PR in the data repo.
env:
DATA_REPO: zama-ai/tfhe-backward-compat-data
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
PR_BRANCH: ${{ github.head_ref || github.ref_name }}
CLOSE_TYPE: ${{ github.event.pull_request.merged && 'merge' || 'close' }}
# only trigger on pull request closed events
on:
pull_request:
types: [ closed ]
permissions: {}
jobs:
auto_close_job:
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') && github.repository == 'zama-ai/tfhe-rs' }}
runs-on: ubuntu-latest
env:
GH_TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }} # Needed for gh CLI commands
steps:
- name: Fetch PR number
run: |
PR_NUMBER=$(gh pr view "${PR_BRANCH}" --repo "${DATA_REPO}" --json number | jq '.number')
echo "DATA_REPO_PR_NUMBER=${PR_NUMBER}" >> "${GITHUB_ENV}"
- name: Comment on the PR to indicate the reason of the close
run: |
gh pr comment "${PR_BRANCH}" \
--repo "${DATA_REPO}" \
--body "PR ${CLOSE_TYPE}d because the corresponding PR in main repo was ${CLOSE_TYPE}d: ${REPO}#${EVENT_NUMBER}"
env:
REPO: ${{ github.repository }}
EVENT_NUMBER: ${{ github.event.number }}
- name: Merge the Pull Request in the data repo
if: ${{ github.event.pull_request.merged }}
run: |
gh pr merge "${PR_BRANCH}" \
--repo "${DATA_REPO}" \
--rebase \
--delete-branch
- name: Close the Pull Request in the data repo
if: ${{ !github.event.pull_request.merged }}
run: |
gh pr close "${PR_BRANCH}" \
--repo "${DATA_REPO}" \
--delete-branch
- name: Slack Notification
if: ${{ always() && job.status == 'failure' }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Failed to auto-${{ env.CLOSE_TYPE }} PR on data repo: https://github.com/${{ env.DATA_REPO }}/pull/${{ env.DATA_REPO_PR_NUMBER }}"

View File

@@ -137,7 +137,7 @@ jobs:
# No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
- name: Run multi-bit CUDA integer tests
run: |
BIG_TESTS_INSTANCE=TRUE NO_BIG_PARAMS_GPU=TRUE make test_integer_multi_bit_gpu_ci
BIG_TESTS_INSTANCE=TRUE make test_integer_multi_bit_gpu_ci
- name: Run user docs tests
run: |

View File

@@ -11,7 +11,6 @@ env:
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
IS_PR: ${{ github.event_name == 'pull_request' }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -19,8 +18,6 @@ on:
schedule:
# Nightly tests will be triggered each evening 8p.m.
- cron: "0 20 * * *"
pull_request:
permissions:
contents: read
@@ -81,11 +78,7 @@ jobs:
- name: Run tests
run: |
if [[ "${IS_PR}" == "true" ]]; then
make test_integer_short_run_gpu
else
make test_integer_long_run_gpu
fi
make test_integer_long_run_gpu
slack-notify:
name: Slack Notification

View File

@@ -25,6 +25,9 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
schedule:
# Nightly tests @ 1AM after each work day
- cron: "0 1 * * MON-FRI"
permissions:
contents: read

View File

@@ -25,6 +25,9 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
schedule:
# Nightly tests @ 1AM after each work day
- cron: "0 1 * * MON-FRI"
permissions:
contents: read

View File

@@ -13,7 +13,7 @@ env:
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
group: ${{ github.workflow }}-${{ github.head_ref }}
cancel-in-progress: true

View File

@@ -93,9 +93,6 @@ jobs:
echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
env:
GCC_VERSION: ${{ matrix.gcc }}
- name: Prepare package
run: |
cargo package -p tfhe-cuda-backend

View File

@@ -21,7 +21,7 @@ jobs:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: git-sync
uses: valtech-sd/git-sync@e734cfe9485a92e720eac5af8a4555dde5fecf88
uses: wei/git-sync@55c6b63b4f21607da0e9877ca9b4d11a29fc6d83
with:
source_repo: "zama-ai/tfhe-rs"
source_branch: "main"

View File

@@ -1,26 +0,0 @@
name: 'Close unverified PRs'
on:
schedule:
- cron: '30 1 * * *'
permissions: {}
jobs:
stale:
runs-on: ubuntu-latest
permissions:
issues: read
pull-requests: write
steps:
- uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
with:
stale-pr-message: 'This PR is unverified and has been open for 2 days, it will now be closed. If you want to contribute please sign the CLA as indicated by the bot.'
days-before-stale: 2
days-before-close: 0
# We are not interested in suppressing issues so have a currently non existent label
# if we ever accept issues to become stale/closable this label will be the signal for that
only-issue-labels: can-be-auto-closed
# Only unverified PRs are an issue
exempt-pr-labels: cla-signed
# We don't want people commenting to keep an unverified PR
ignore-updates: true

View File

@@ -10,7 +10,6 @@ ignore:
- keys
- coverage
- utils/tfhe-lints/ui/main.stderr
- utils/tfhe-backward-compat-data/**/*.ron # ron files are autogenerated
rules:
# checks if file ends in a newline character

View File

@@ -1,28 +1,18 @@
# Specifying a path without code owners means that path won't have owners and is akin to a negation
# i.e. the `core_crypto` dir is owned and needs owner approval/review, but not the `gpu` sub dir
# See https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners#example-of-a-codeowners-file
/backends/tfhe-cuda-backend/ @agnesLeroy
/backends/tfhe-hpu-backend/ @zama-ai/hardware
/tfhe/examples/hpu @zama-ai/hardware
/tfhe/src/core_crypto/ @IceTDrinker
/tfhe/src/core_crypto/gpu @agnesLeroy
/tfhe/src/core_crypto/hpu @zama-ai/hardware
/tfhe/src/core_crypto/gpu
/tfhe/src/shortint/ @mayeul-zama
/tfhe/src/integer/ @tmontaigu
/tfhe/src/integer/gpu @agnesLeroy
/tfhe/src/integer/hpu @zama-ai/hardware
/tfhe/src/integer/gpu
/tfhe/src/high_level_api/ @tmontaigu
/Makefile @IceTDrinker @soonum
/mockups/tfhe-hpu-mockup @zama-ai/hardware
/.github/ @soonum
/CODEOWNERS @IceTDrinker

View File

@@ -170,8 +170,6 @@ On the contrary, these changes are *not* data breaking:
* Renaming a type (unless it implements the `Named` trait).
* Adding a variant to the end of an enum.
Historical data from previous TFHE-rs versions are stored inside `utils/tfhe-backward-compat-data`. They are used to check on every PR that backward compatibility has been preserved.
## Example: adding a field
Suppose you want to add an i32 field to a type named `MyType`. The original type is defined as:

View File

@@ -18,7 +18,7 @@ members = [
]
exclude = [
"utils/tfhe-backward-compat-data",
"tests/backward_compatibility_tests",
"utils/tfhe-lints",
"apps/trivium",
]

102
Makefile
View File

@@ -22,7 +22,10 @@ BENCH_TYPE?=latency
BENCH_PARAM_TYPE?=classical
BENCH_PARAMS_SET?=default
NODE_VERSION=22.6
BACKWARD_COMPAT_DATA_DIR=utils/tfhe-backward-compat-data
BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
BACKWARD_COMPAT_DATA_BRANCH?=$(shell ./scripts/backward_compat_data_version.py)
BACKWARD_COMPAT_DATA_PROJECT=tfhe-backward-compat-data
BACKWARD_COMPAT_DATA_DIR=$(BACKWARD_COMPAT_DATA_PROJECT)
TFHE_SPEC:=tfhe
WASM_PACK_VERSION="0.13.1"
# We are kind of hacking the cut here, the version cannot contain a quote '"'
@@ -156,23 +159,23 @@ install_tarpaulin: install_rs_build_toolchain
.PHONY: install_cargo_dylint # Install custom tfhe-rs lints
install_cargo_dylint:
cargo install --locked cargo-dylint dylint-link
cargo install cargo-dylint dylint-link
.PHONY: install_typos_checker # Install typos checker
install_typos_checker: install_rs_build_toolchain
@typos --version > /dev/null 2>&1 || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked typos-cli || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install typos-cli || \
( echo "Unable to install typos-cli, unknown error." && exit 1 )
.PHONY: install_zizmor # Install zizmor workflow security checker
install_zizmor: install_rs_build_toolchain
@zizmor --version > /dev/null 2>&1 || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked zizmor --version ~1.9 || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install zizmor --version ~1.9 || \
( echo "Unable to install zizmor, unknown error." && exit 1 )
.PHONY: install_cargo_cross # Install cross for big endian tests
.PHONY: install_cargo_cross # Install custom tfhe-rs lints
install_cargo_cross: install_rs_build_toolchain
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked cross
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install cross
.PHONY: setup_venv # Setup Python virtualenv for wasm tests
setup_venv:
@@ -249,9 +252,6 @@ install_mlc: install_rs_build_toolchain
.PHONY: fmt # Format rust code
fmt: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR) fmt
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt
.PHONY: fmt_js # Format javascript code
fmt_js: check_nvm_installed
@@ -273,9 +273,6 @@ fmt_c_tests:
.PHONY: check_fmt # Check rust code format
check_fmt: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR) fmt --check
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt --check
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt --check
.PHONY: check_fmt_c_tests # Check C tests format
check_fmt_c_tests:
@@ -486,22 +483,10 @@ clippy_param_dedup: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p param_dedup -- --no-deps -D warnings
.PHONY: clippy_backward_compat_data # Run clippy lints on tfhe-backward-compat-data
clippy_backward_compat_data: install_rs_check_toolchain # the toolchain is selected with toolchain.toml
@# Some old crates are x86 specific, only run in that case
@if uname -a | grep -q x86; then \
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options \
-C $(BACKWARD_COMPAT_DATA_DIR) clippy --all-targets \
-- --no-deps -D warnings; \
else \
echo "Cannot run clippy for backward compat crate on non x86 platform for now."; \
fi
.PHONY: clippy_all # Run all clippy targets
clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_trivium \
clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
clippy_backward_compat_data
clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup
.PHONY: clippy_fast # Run main clippy targets
clippy_fast: clippy_rustdoc clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks \
@@ -517,12 +502,6 @@ clippy_hpu_backend: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-hpu-backend -- --no-deps -D warnings
.PHONY: clippy_hpu_mockup # Run clippy lints on tfhe-hpu-mockup
clippy_hpu_mockup: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--all-targets \
-p tfhe-hpu-mockup -- --no-deps -D warnings
.PHONY: check_rust_bindings_did_not_change # Check rust bindings are up to date for tfhe-cuda-backend
check_rust_bindings_did_not_change:
cargo build -p tfhe-cuda-backend && "$(MAKE)" fmt_gpu && \
@@ -535,9 +514,6 @@ check_rust_bindings_did_not_change:
tfhe_lints: install_cargo_dylint
RUSTFLAGS="$(RUSTFLAGS)" cargo dylint --all -p tfhe --no-deps -- \
--features=boolean,shortint,integer,strings,zk-pok
RUSTFLAGS="$(RUSTFLAGS)" cargo dylint --all -p tfhe-zk-pok --no-deps -- \
--features=experimental
.PHONY: build_core # Build core_crypto without experimental features
build_core: install_rs_build_toolchain install_rs_check_toolchain
@@ -679,14 +655,6 @@ test_integer_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
.PHONY: test_integer_gpu_debug # Run the tests of the integer module with Debug flags for CUDA
test_integer_gpu_debug: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile release_lto_off \
--features=integer,gpu-debug -vv -p $(TFHE_SPEC) -- integer::gpu::server_key:: --test-threads=1 --nocapture
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile release_lto_off \
--features=integer,gpu-debug -p $(TFHE_SPEC) -- integer::gpu::server_key::
.PHONY: test_integer_long_run_gpu # Run the long run integer tests on the gpu backend
test_integer_long_run_gpu: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
@@ -695,12 +663,6 @@ test_integer_long_run_gpu: install_rs_check_toolchain install_cargo_nextest
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
--tfhe-package "$(TFHE_SPEC)" --backend "gpu"
.PHONY: test_integer_short_run_gpu # Run the long run integer tests on the gpu backend
test_integer_short_run_gpu: install_rs_check_toolchain install_cargo_nextest
TFHE_RS_TEST_LONG_TESTS_MINIMAL=TRUE \
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::radix::tests_long_run::test_random_op_sequence integer::gpu::server_key::radix::tests_long_run::test_signed_random_op_sequence --test-threads=1 --nocapture
.PHONY: test_integer_compression
test_integer_compression: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -1063,11 +1025,16 @@ test_tfhe_lints: install_cargo_dylint
# Here we use the "patch" functionality of Cargo to make sure the repo used for the data is the same as the one used for the code.
.PHONY: test_backward_compatibility_ci
test_backward_compatibility_ci: install_rs_build_toolchain
TFHE_BACKWARD_COMPAT_DATA_DIR="../$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
TFHE_BACKWARD_COMPAT_DATA_DIR="$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).path=\"tests/$(BACKWARD_COMPAT_DATA_DIR)\"" \
--features=shortint,integer,zk-pok -p tests test_backward_compatibility -- --nocapture
.PHONY: test_backward_compatibility # Same as test_backward_compatibility_ci but tries to clone the data repo first if needed
test_backward_compatibility: pull_backward_compat_data test_backward_compatibility_ci
test_backward_compatibility: tests/$(BACKWARD_COMPAT_DATA_DIR) test_backward_compatibility_ci
.PHONY: backward_compat_branch # Prints the required backward compatibility branch
backward_compat_branch:
@echo "$(BACKWARD_COMPAT_DATA_BRANCH)"
.PHONY: doc # Build rust doc
doc: install_rs_check_toolchain
@@ -1112,10 +1079,6 @@ check_intra_md_links: install_mlc
check_md_links: install_mlc
mlc --match-file-extension tfhe/docs
.PHONY: check_doc_paths_use_dash # Check paths use "-" instead of "_" in docs for gitbook compatibility
check_doc_paths_use_dash:
python3 ./scripts/check_doc_paths_use_dash.py
.PHONY: check_parameter_export_ok # Checks exported "current" shortint parameter module is correct
check_parameter_export_ok:
python3 ./scripts/check_current_param_export.py
@@ -1473,20 +1436,6 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --
.PHONY: bench_hlapi_noise_squash # Run benchmarks for noise squash operation
bench_hlapi_noise_squash: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-noise-squash \
--features=integer,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --
.PHONY: bench_hlapi_noise_squash_gpu # Run benchmarks for noise squash operation on GPU
bench_hlapi_noise_squash_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-noise-squash \
--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --
#
# Utility tools
#
@@ -1544,13 +1493,11 @@ write_params_to_file: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run \
--example write_params_to_file --features=boolean,shortint,hpu,internal-keycache
.PHONY: pull_backward_compat_data # Pull the data files needed for backward compatibility tests
pull_backward_compat_data:
./scripts/pull_lfs_data.sh $(BACKWARD_COMPAT_DATA_DIR)
.PHONY: clone_backward_compat_data # Clone the data repo needed for backward compatibility tests
clone_backward_compat_data:
./scripts/clone_backward_compat_data.sh $(BACKWARD_COMPAT_DATA_URL) $(BACKWARD_COMPAT_DATA_BRANCH) tests/$(BACKWARD_COMPAT_DATA_DIR)
.PHONY: pull_hpu_files # Pull the hpu files
pull_hpu_files:
./scripts/pull_lfs_data.sh backends/tfhe-hpu-backend/
tests/$(BACKWARD_COMPAT_DATA_DIR): clone_backward_compat_data
#
# Real use case examples
@@ -1576,8 +1523,7 @@ sha256_bool: install_rs_check_toolchain
.PHONY: pcc # pcc stands for pre commit checks (except GPU)
pcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
check_md_docs_are_tested check_intra_md_links check_doc_paths_use_dash \
clippy_all check_compile_tests test_tfhe_lints \
check_md_docs_are_tested check_intra_md_links clippy_all check_compile_tests test_tfhe_lints \
tfhe_lints
.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
@@ -1585,11 +1531,11 @@ pcc_gpu: check_rust_bindings_did_not_change clippy_rustdoc_gpu \
clippy_gpu clippy_cuda_backend clippy_bench_gpu check_compile_tests_benches_gpu
.PHONY: pcc_hpu # pcc stands for pre commit checks for HPU compilation
pcc_hpu: clippy_hpu clippy_hpu_backend clippy_hpu_mockup test_integer_hpu_mockup_ci_fast
pcc_hpu: clippy_hpu clippy_hpu_backend test_integer_hpu_mockup_ci_fast
.PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
fpcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
check_md_docs_are_tested check_intra_md_links check_doc_paths_use_dash clippy_fast check_compile_tests
check_md_docs_are_tested clippy_fast check_compile_tests
.PHONY: conformance # Automatically fix problems that can be fixed
conformance: fix_newline fmt fmt_js

View File

@@ -18,7 +18,6 @@
<a href="https://github.com/zama-ai/tfhe-rs/releases"><img src="https://img.shields.io/github/v/release/zama-ai/tfhe-rs?style=flat-square"></a>
<a href="LICENSE"><img src="https://img.shields.io/badge/License-BSD--3--Clause--Clear-%23ffb243?style=flat-square"></a>
<a href="https://github.com/zama-ai/bounty-program"><img src="https://img.shields.io/badge/Contribute-Zama%20Bounty%20Program-%23ffd208?style=flat-square"></a>
<a href="https://slsa.dev"><img alt="SLSA 3" src="https://slsa.dev/images/gh-badge-level3.svg" /></a>
</p>
## About
@@ -149,7 +148,7 @@ To run this code, use the following command:
> Note that when running code that uses `TFHE-rs`, it is highly recommended
to run in release mode with cargo's `--release` flag to have the best performances possible.
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick-start)*
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick_start)*
<p align="right">
<a href="#about" > ↑ Back to top </a>

View File

@@ -129,7 +129,7 @@ Other sizes than 64 bit are expected to be available in the future.
# FHE shortint Trivium implementation
The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
It uses a lower level API of tfhe-rs, so the syntax is a little bit different. It also implements the `TransCiphering` trait. For optimization purposes, it does not internally run
on the same cryptographic parameters as the high level API of tfhe-rs. As such, it requires the usage of a casting key, to switch from one parameter space to another, which makes
its setup a little more intricate.
@@ -137,10 +137,10 @@ its setup a little more intricate.
Example code:
```rust
use tfhe::shortint::prelude::*;
use tfhe::shortint::parameters::current_params::{
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
use tfhe::shortint::parameters::v1_2::{
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
};
use tfhe::{ConfigBuilder, generate_keys, FheUint64};
use tfhe::prelude::*;
@@ -148,17 +148,17 @@ use tfhe_trivium::TriviumStreamShortint;
fn test_shortint() {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128_2M128,
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128_2M128,
);
let key_string = "0053A6F94C9FF24598EB".to_string();

View File

@@ -1,9 +1,9 @@
use criterion::Criterion;
use tfhe::prelude::*;
use tfhe::shortint::parameters::current_params::{
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
use tfhe::shortint::parameters::v1_2::{
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
};
use tfhe::shortint::prelude::*;
use tfhe::{generate_keys, ConfigBuilder, FheUint64};
@@ -11,19 +11,19 @@ use tfhe_trivium::{KreyviumStreamShortint, TransCiphering};
pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -64,19 +64,19 @@ pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
pub fn kreyvium_shortint_gen(c: &mut Criterion) {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -112,19 +112,19 @@ pub fn kreyvium_shortint_gen(c: &mut Criterion) {
pub fn kreyvium_shortint_trans(c: &mut Criterion) {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();

View File

@@ -1,9 +1,9 @@
use criterion::Criterion;
use tfhe::prelude::*;
use tfhe::shortint::parameters::current_params::{
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
use tfhe::shortint::parameters::v1_2::{
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
};
use tfhe::shortint::prelude::*;
use tfhe::{generate_keys, ConfigBuilder, FheUint64};
@@ -11,19 +11,19 @@ use tfhe_trivium::{TransCiphering, TriviumStreamShortint};
pub fn trivium_shortint_warmup(c: &mut Criterion) {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -64,19 +64,19 @@ pub fn trivium_shortint_warmup(c: &mut Criterion) {
pub fn trivium_shortint_gen(c: &mut Criterion) {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -112,19 +112,19 @@ pub fn trivium_shortint_gen(c: &mut Criterion) {
pub fn trivium_shortint_trans(c: &mut Criterion) {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB".to_string();

View File

@@ -1,9 +1,9 @@
use crate::{KreyviumStream, KreyviumStreamByte, KreyviumStreamShortint, TransCiphering};
use tfhe::prelude::*;
use tfhe::shortint::parameters::current_params::{
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
use tfhe::shortint::parameters::v1_2::{
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
};
use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
// Values for these tests come from the github repo renaud1239/Kreyvium,
@@ -221,19 +221,19 @@ use tfhe::shortint::prelude::*;
#[test]
fn kreyvium_test_shortint_long() {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();

View File

@@ -1,9 +1,9 @@
use crate::{TransCiphering, TriviumStream, TriviumStreamByte, TriviumStreamShortint};
use tfhe::prelude::*;
use tfhe::shortint::parameters::current_params::{
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
use tfhe::shortint::parameters::v1_2::{
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
};
use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
// Values for these tests come from the github repo cantora/avr-crypto-lib, commit 2a5b018,
@@ -357,19 +357,19 @@ use tfhe::shortint::prelude::*;
#[test]
fn trivium_test_shortint_long() {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB".to_string();

View File

@@ -1,6 +1,6 @@
[package]
name = "tfhe-cuda-backend"
version = "0.11.0"
version = "0.10.0"
edition = "2021"
authors = ["Zama team"]
license = "BSD-3-Clause-Clear"
@@ -19,4 +19,3 @@ bindgen = "0.71"
[features]
experimental-multi-arch = []
profile = []
debug = []

View File

@@ -53,10 +53,6 @@ fn main() {
cmake_config.define("USE_NVTOOLS", "OFF");
}
if cfg!(feature = "debug") {
cmake_config.define("CMAKE_BUILD_TYPE", "Debug");
}
// Build the CMake project
let dest = cmake_config.build();
println!("cargo:rustc-link-search=native={}", dest.display());

View File

@@ -52,8 +52,6 @@ endif()
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
else()
message("Building CUDA backend in ${CMAKE_BUILD_TYPE}")
endif()
# Add OpenMP support

View File

@@ -37,6 +37,17 @@ void cuda_glwe_sample_extract_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
uint32_t lwe_per_glwe, uint32_t glwe_dimension, uint32_t polynomial_size);
void cuda_modulus_switch_multi_bit_64(void *stream, uint32_t gpu_index,
void *lwe_array_out, void *lwe_array_in,
uint32_t size, uint32_t log_modulus,
uint32_t degree,
uint32_t grouping_factor);
void cuda_modulus_switch_multi_bit_128(void *stream, uint32_t gpu_index,
void *lwe_array_out, void *lwe_array_in,
uint32_t size, uint32_t log_modulus,
uint32_t degree,
uint32_t grouping_factor);
}
#endif

View File

@@ -49,13 +49,12 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index);
void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
uint32_t gpu_index,
uint64_t &size_tracker,
uint64_t *size_tracker,
bool allocate_gpu_memory);
void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
uint64_t cuda_device_total_memory(uint32_t gpu_index);
void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
uint64_t size,

View File

@@ -40,6 +40,11 @@ void cleanup_cuda_integer_compress_radix_ciphertext_64(
void cleanup_cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
}
void cuda_integer_extract_glwe_64(
void *const *streams, uint32_t const *gpu_indexes, void *lwe_array_out,
void const *glwe_list, uint32_t const glwe_index,
uint32_t const log_modulus, uint32_t const polynomial_size,
uint32_t const glwe_dimension, uint32_t const body_count);
}
#endif

View File

@@ -20,7 +20,7 @@ template <typename Torus> struct int_compression {
uint32_t gpu_count, int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
uint32_t storage_log_modulus, bool allocate_gpu_memory,
uint64_t &size_tracker) {
uint64_t *size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->compression_params = compression_params;
this->lwe_per_glwe = lwe_per_glwe;
@@ -38,7 +38,7 @@ template <typename Torus> struct int_compression {
lwe_per_glwe * glwe_accumulator_size * sizeof(Torus), streams[0],
gpu_indexes[0], size_tracker, allocate_gpu_memory);
size_tracker += scratch_packing_keyswitch_lwe_list_to_glwe_64(
*size_tracker += scratch_packing_keyswitch_lwe_list_to_glwe_64(
streams[0], gpu_indexes[0], &fp_ks_buffer,
compression_params.small_lwe_dimension,
compression_params.glwe_dimension, compression_params.polynomial_size,
@@ -76,7 +76,7 @@ template <typename Torus> struct int_decompression {
int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t body_count,
uint32_t storage_log_modulus, bool allocate_gpu_memory,
uint64_t &size_tracker) {
uint64_t *size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->encryption_params = encryption_params;
this->compression_params = compression_params;

View File

@@ -48,34 +48,6 @@ typedef struct {
uint32_t lwe_dimension;
} CudaRadixCiphertextFFI;
typedef struct {
uint64_t const *chosen_multiplier_has_at_least_one_set;
uint64_t const *decomposed_chosen_multiplier;
uint32_t const num_scalars;
uint32_t const active_bits;
uint64_t const shift_pre;
uint32_t const shift_post;
uint32_t const ilog2_chosen_multiplier;
uint32_t const chosen_multiplier_num_bits;
bool const is_chosen_multiplier_zero;
bool const is_abs_chosen_multiplier_one;
bool const is_chosen_multiplier_negative;
bool const is_chosen_multiplier_pow2;
bool const chosen_multiplier_has_more_bits_than_numerator;
// if signed: test if chosen_multiplier >= 2^{num_bits - 1}
bool const is_chosen_multiplier_geq_two_pow_numerator;
uint32_t const ilog2_divisor;
bool const is_divisor_zero;
bool const is_abs_divisor_one;
bool const is_divisor_negative;
bool const is_divisor_pow2;
bool const divisor_has_more_bits_than_numerator;
} CudaScalarDivisorFFI;
uint64_t scratch_cuda_apply_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
@@ -423,8 +395,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
bool allocate_ms_array);
bool allocate_gpu_memory, bool allocate_ms_array);
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -443,8 +414,7 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool allocate_gpu_memory,
bool allocate_ms_array);
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array);
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -577,6 +547,27 @@ void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
void *const *streams,
uint32_t const *gpu_indexes);
uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool anticipated_buffer_drop, bool allocate_gpu_memory,
bool allocate_ms_array);
void cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, void *const *ksks,
uint64_t rhs, uint64_t const *decomposed_scalar,
uint64_t const *has_at_least_one_set,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_scalars);
void cleanup_cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_apply_noise_squashing_kb(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
@@ -599,6 +590,9 @@ void cleanup_cuda_apply_noise_squashing_kb(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void cuda_small_scalar_multiplication_integer_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array, uint64_t scalar);
uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -621,115 +615,5 @@ void cleanup_cuda_sub_and_propagate_single_carry(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
bool allocate_gpu_memory, bool allocate_ms_array);
void cuda_integer_unsigned_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi);
void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t num_additional_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, bool allocate_ms_array);
void cuda_extend_radix_with_sign_msb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
int8_t *mem_ptr, uint32_t num_additional_blocks, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
bool allocate_gpu_memory, bool allocate_ms_array);
void cuda_integer_signed_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits);
void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint32_t const active_bits_divisor, bool allocate_gpu_memory,
bool allocate_ms_array);
void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
void const *clear_blocks, void const *h_clear_blocks,
uint32_t num_clear_blocks);
void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint32_t const active_bits_divisor, bool allocate_gpu_memory,
bool allocate_ms_array);
void cuda_integer_signed_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
uint32_t numerator_bits);
void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
} // extern C
#endif // CUDA_INTEGER_H

View File

@@ -65,6 +65,9 @@ void cuda_add_lwe_ciphertext_vector_plaintext_64(
void const *lwe_array_in, const uint64_t plaintext_in,
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count);
void cuda_sub_lwe_ciphertext_vector_plaintext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void const *plaintext_array_in, const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count);
}
#endif // CUDA_LINALG_H_

View File

@@ -66,9 +66,6 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
uint32_t num_many_lut, uint32_t lut_stride);
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle(
uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
uint32_t polynomial_size);
template <typename Torus>
@@ -98,12 +95,8 @@ uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
template <typename Torus, class params>
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint64_t full_sm_keybundle);
template <typename Torus, class params>
uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint64_t full_sm_keybundle);
uint32_t polynomial_size);
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
int8_t *d_mem_keybundle = NULL;
int8_t *d_mem_acc_step_one = NULL;
@@ -122,7 +115,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
PBS_VARIANT pbs_variant, bool allocate_gpu_memory,
uint64_t &size_tracker) {
uint64_t *size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
cuda_set_device(gpu_index);
@@ -288,146 +281,4 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
}
};
template <typename InputTorus>
struct pbs_buffer_128<InputTorus, PBS_TYPE::MULTI_BIT> {
int8_t *d_mem_keybundle = NULL;
int8_t *d_mem_acc_step_one = NULL;
int8_t *d_mem_acc_step_two = NULL;
int8_t *d_mem_acc_cg = NULL;
int8_t *d_mem_acc_tbc = NULL;
uint32_t lwe_chunk_size;
double *keybundle_fft;
__uint128_t *global_accumulator;
double *global_join_buffer;
PBS_VARIANT pbs_variant;
bool gpu_memory_allocated;
pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t lwe_chunk_size, PBS_VARIANT pbs_variant,
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
cuda_set_device(gpu_index);
this->pbs_variant = pbs_variant;
this->lwe_chunk_size = lwe_chunk_size;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
// default
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle<
__uint128_t>(polynomial_size);
uint64_t full_sm_accumulate_step_one =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<
__uint128_t>(polynomial_size);
uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<
__uint128_t>(polynomial_size);
uint64_t partial_sm_accumulate_step_one =
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
__uint128_t>(polynomial_size);
// cg
uint64_t full_sm_cg_accumulate =
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<__uint128_t>(
polynomial_size);
uint64_t partial_sm_cg_accumulate =
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<
__uint128_t>(polynomial_size);
auto num_blocks_keybundle = input_lwe_ciphertext_count * lwe_chunk_size *
(glwe_dimension + 1) * (glwe_dimension + 1) *
level_count;
auto num_blocks_acc_step_one =
level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
auto num_blocks_acc_step_two =
input_lwe_ciphertext_count * (glwe_dimension + 1);
auto num_blocks_acc_cg =
level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
// Keybundle
if (max_shared_memory < full_sm_keybundle)
d_mem_keybundle = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_keybundle * full_sm_keybundle, stream, gpu_index,
size_tracker, allocate_gpu_memory);
switch (pbs_variant) {
case PBS_VARIANT::CG:
// Accumulator CG
if (max_shared_memory < partial_sm_cg_accumulate)
d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_acc_cg * full_sm_cg_accumulate, stream, gpu_index,
size_tracker, allocate_gpu_memory);
else if (max_shared_memory < full_sm_cg_accumulate)
d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_acc_cg * partial_sm_cg_accumulate, stream, gpu_index,
size_tracker, allocate_gpu_memory);
break;
case PBS_VARIANT::DEFAULT:
// Accumulator step one
if (max_shared_memory < partial_sm_accumulate_step_one)
d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_acc_step_one * full_sm_accumulate_step_one, stream,
gpu_index, size_tracker, allocate_gpu_memory);
else if (max_shared_memory < full_sm_accumulate_step_one)
d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream,
gpu_index, size_tracker, allocate_gpu_memory);
// Accumulator step two
if (max_shared_memory < full_sm_accumulate_step_two)
d_mem_acc_step_two = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_acc_step_two * full_sm_accumulate_step_two, stream,
gpu_index, size_tracker, allocate_gpu_memory);
break;
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
keybundle_fft = (double *)cuda_malloc_with_size_tracking_async(
num_blocks_keybundle * (polynomial_size / 2) * 4 * sizeof(double),
stream, gpu_index, size_tracker, allocate_gpu_memory);
global_accumulator = (__uint128_t *)cuda_malloc_with_size_tracking_async(
input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size *
sizeof(__uint128_t),
stream, gpu_index, size_tracker, allocate_gpu_memory);
global_join_buffer = (double *)cuda_malloc_with_size_tracking_async(
level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count *
(polynomial_size / 2) * 4 * sizeof(double),
stream, gpu_index, size_tracker, allocate_gpu_memory);
}
void release(cudaStream_t stream, uint32_t gpu_index) {
if (d_mem_keybundle)
cuda_drop_with_size_tracking_async(d_mem_keybundle, stream, gpu_index,
gpu_memory_allocated);
switch (pbs_variant) {
case DEFAULT:
if (d_mem_acc_step_one)
cuda_drop_with_size_tracking_async(d_mem_acc_step_one, stream,
gpu_index, gpu_memory_allocated);
if (d_mem_acc_step_two)
cuda_drop_with_size_tracking_async(d_mem_acc_step_two, stream,
gpu_index, gpu_memory_allocated);
break;
case CG:
if (d_mem_acc_cg)
cuda_drop_with_size_tracking_async(d_mem_acc_cg, stream, gpu_index,
gpu_memory_allocated);
break;
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
cuda_drop_with_size_tracking_async(keybundle_fft, stream, gpu_index,
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(global_join_buffer, stream, gpu_index,
gpu_memory_allocated);
}
};
#endif // CUDA_MULTI_BIT_UTILITIES_H

View File

@@ -90,7 +90,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
PBS_VARIANT pbs_variant, bool allocate_gpu_memory,
bool allocate_ms_array, uint64_t &size_tracker) {
bool allocate_ms_array, uint64_t *size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
cuda_set_device(gpu_index);
this->uses_noise_reduction = allocate_ms_array;
@@ -240,10 +240,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
}
};
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer_128;
template <typename InputTorus>
struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
template <typename InputTorus, PBS_TYPE pbs_type> struct pbs_buffer_128 {
int8_t *d_mem;
__uint128_t *global_accumulator;
@@ -260,7 +257,7 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
bool allocate_gpu_memory, bool allocate_ms_array,
uint64_t &size_tracker) {
uint64_t *size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
cuda_set_device(gpu_index);
this->pbs_variant = pbs_variant;

View File

@@ -15,11 +15,6 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor);
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor);
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -38,25 +33,6 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
uint32_t gpu_index,
int8_t **pbs_buffer);
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
const uint32_t gpu_index,
int8_t **buffer);
}
#endif // CUDA_MULTI_BIT_H

View File

@@ -10,8 +10,8 @@ extern "C" {
void cuda_lwe_expand_64(void *const stream, uint32_t gpu_index,
void *lwe_array_out, const void *lwe_compact_array_in,
uint32_t lwe_dimension, uint32_t num_lwe,
const uint32_t *lwe_compact_input_indexes,
const uint32_t *output_body_id_per_compact_list);
const void *lwe_compact_input_indexes,
const void *output_body_id_per_compact_list);
uint64_t scratch_cuda_expand_without_verification_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,

View File

@@ -27,7 +27,7 @@ template <typename Torus> struct zk_expand_mem {
int_radix_params casting_params, KS_TYPE casting_key_type,
const uint32_t *num_lwes_per_compact_list,
const bool *is_boolean_array, uint32_t num_compact_lists,
bool allocate_gpu_memory, uint64_t &size_tracker)
bool allocate_gpu_memory, uint64_t *size_tracker)
: computing_params(computing_params), casting_params(casting_params),
num_compact_lists(num_compact_lists),
casting_key_type(casting_key_type) {

View File

@@ -1,6 +1,5 @@
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES} pbs/programmable_bootstrap_multibit_128.cuh
pbs/programmable_bootstrap_multibit_128.cu)
add_library(tfhe_cuda_backend STATIC ${SOURCES})
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
target_include_directories(tfhe_cuda_backend PRIVATE .)

View File

@@ -1,5 +1,6 @@
#include "ciphertext.cuh"
#include "polynomial/parameters.cuh"
#include "torus.cuh"
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
uint32_t gpu_index,
@@ -142,3 +143,29 @@ void cuda_glwe_sample_extract_128(
"N's are powers of two in the interval [256..4096].")
}
}
void cuda_modulus_switch_multi_bit_64(void *stream, uint32_t gpu_index,
void *lwe_array_out, void *lwe_array_in,
uint32_t size, uint32_t log_modulus,
uint32_t degree,
uint32_t grouping_factor) {
host_modulus_switch_multi_bit<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in), size, log_modulus, degree,
grouping_factor);
}
void cuda_modulus_switch_multi_bit_128(void *stream, uint32_t gpu_index,
void *lwe_array_out, void *lwe_array_in,
uint32_t size, uint32_t log_modulus,
uint32_t degree,
uint32_t grouping_factor) {
host_modulus_switch_multi_bit<__uint128_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<__uint128_t *>(lwe_array_out),
static_cast<__uint128_t *>(lwe_array_in), size, log_modulus, degree,
grouping_factor);
}

View File

@@ -38,16 +38,6 @@ __device__ Torus *get_ith_block(Torus *ksk, int i, int level,
// Each thread in x are used to calculate one output.
// threads in y are used to paralelize the lwe_dimension_in loop.
// shared memory is used to store intermediate results of the reduction.
// Note: To reduce register pressure we have slightly changed the algorithm,
// the idea consists in calculating the negate value of the output. So, instead
// of accumulating subtractions using -=, we accumulate additions using += in
// the local_lwe_out. This seems to work better cause profits madd ops and save
// some regs. For this to work, we need to negate the input
// lwe_array_in[lwe_dimension_in], and negate back the output at the end to get
// the correct results. Additionally, we split the calculation of the ksk offset
// in two parts, a constant part is calculated before the loop, and a variable
// part is calculated inside the loop. This seems to help with the register
// pressure as well.
template <typename Torus>
__global__ void
keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
@@ -70,7 +60,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
lwe_array_in, lwe_input_indexes[blockIdx.x], lwe_dimension_in + 1);
if (tid == lwe_dimension_out && threadIdx.y == 0) {
local_lwe_out = -block_lwe_array_in[lwe_dimension_in];
local_lwe_out = block_lwe_array_in[lwe_dimension_in];
}
const Torus mask_mod_b = (1ll << base_log) - 1ll;
@@ -83,12 +73,12 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
for (int i = start_i; i < end_i; i++) {
Torus state =
init_decomposer_state(block_lwe_array_in[i], base_log, level_count);
uint32_t offset = i * level_count * (lwe_dimension_out + 1);
for (int j = 0; j < level_count; j++) {
for (int j = 0; j < level_count; j++) {
auto ksk_block =
get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
local_lwe_out +=
(Torus)ksk[tid + j * (lwe_dimension_out + 1) + offset] * decomposed;
local_lwe_out -= (Torus)ksk_block[tid] * decomposed;
}
}
@@ -103,7 +93,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
lwe_acc_out[shmem_index + offset * blockDim.x];
}
if (threadIdx.y == 0)
block_lwe_array_out[tid] = -lwe_acc_out[shmem_index];
block_lwe_array_out[tid] = lwe_acc_out[shmem_index];
}
}
@@ -186,10 +176,10 @@ __host__ uint64_t scratch_packing_keyswitch_lwe_list_to_glwe(
? glwe_accumulator_size
: lwe_dimension * 2;
uint64_t size_tracker = 0;
uint64_t size_tracker;
uint64_t buffer_size = 2 * num_lwes * memory_unit * sizeof(Torus);
*fp_ks_buffer = (int8_t *)cuda_malloc_with_size_tracking_async(
buffer_size, stream, gpu_index, size_tracker, allocate_gpu_memory);
buffer_size, stream, gpu_index, &size_tracker, allocate_gpu_memory);
return size_tracker;
}

View File

@@ -66,13 +66,6 @@ __device__ inline void typecast_torus_to_double<uint64_t>(uint64_t x,
r = __ll2double_rn(x);
}
template <>
__device__ inline void typecast_torus_to_double<__uint128_t>(__uint128_t x,
double &r) {
// We truncate x
r = __ll2double_rn(static_cast<uint64_t>(x));
}
template <typename T>
__device__ inline T init_decomposer_state(T input, uint32_t base_log,
uint32_t level_count) {
@@ -302,4 +295,87 @@ __host__ void host_improve_noise_modulus_switch(
check_cuda_error(cudaGetLastError());
}
template <typename Torus, class params>
__device__ uint32_t calculates_monomial_degree(const Torus *lwe_array_group,
uint32_t ggsw_idx,
uint32_t grouping_factor) {
Torus x = 0;
for (int i = 0; i < grouping_factor; i++) {
uint32_t mask_position = grouping_factor - (i + 1);
int selection_bit = (ggsw_idx >> mask_position) & 1;
x += selection_bit * lwe_array_group[i];
}
return modulus_switch(x, params::log2_degree + 1);
}
template <typename Torus, class params>
__global__ void
modulus_switch_multi_bit(Torus *array_out, const Torus *array_in, int size,
uint32_t log_modulus, uint32_t grouping_factor) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < size) {
int num_monomials = 1 << grouping_factor;
int input_offset = tid * grouping_factor;
int output_offset = tid * (num_monomials - 1); // First monomial is skipped
for (int ggsw_idx = 1; ggsw_idx < num_monomials; ggsw_idx++) {
array_out[ggsw_idx - 1 + output_offset] =
calculates_monomial_degree<Torus, params>(&array_in[input_offset],
ggsw_idx, grouping_factor);
}
}
}
template <typename Torus>
__host__ void host_modulus_switch_multi_bit(
cudaStream_t stream, uint32_t gpu_index, Torus *array_out, Torus *array_in,
int size, uint32_t log_modulus, uint32_t degree, uint32_t grouping_factor) {
cudaSetDevice(gpu_index);
int multibit_size = size / grouping_factor;
int num_threads = 0, num_blocks = 0;
getNumBlocksAndThreads(multibit_size, 1024, num_blocks, num_threads);
switch (degree) {
case 256:
modulus_switch_multi_bit<Torus, Degree<256>>
<<<num_blocks, num_threads, 0, stream>>>(
array_out, array_in, multibit_size, log_modulus, grouping_factor);
break;
case 512:
modulus_switch_multi_bit<Torus, Degree<512>>
<<<num_blocks, num_threads, 0, stream>>>(
array_out, array_in, multibit_size, log_modulus, grouping_factor);
break;
case 1024:
modulus_switch_multi_bit<Torus, Degree<1024>>
<<<num_blocks, num_threads, 0, stream>>>(
array_out, array_in, multibit_size, log_modulus, grouping_factor);
break;
case 2048:
modulus_switch_multi_bit<Torus, Degree<2048>>
<<<num_blocks, num_threads, 0, stream>>>(
array_out, array_in, multibit_size, log_modulus, grouping_factor);
break;
case 4096:
modulus_switch_multi_bit<Torus, Degree<4096>>
<<<num_blocks, num_threads, 0, stream>>>(
array_out, array_in, multibit_size, log_modulus, grouping_factor);
break;
case 8192:
modulus_switch_multi_bit<Torus, Degree<8192>>
<<<num_blocks, num_threads, 0, stream>>>(
array_out, array_in, multibit_size, log_modulus, grouping_factor);
break;
case 16384:
modulus_switch_multi_bit<Torus, Degree<16384>>
<<<num_blocks, num_threads, 0, stream>>>(
array_out, array_in, multibit_size, log_modulus, grouping_factor);
break;
default:
PANIC("Cuda error: unsupported polynomial size. Supported "
"N's are powers of two in the interval [256..16384].")
};
check_cuda_error(cudaGetLastError());
}
#endif // CNCRT_TORUS_H

View File

@@ -74,9 +74,10 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
/// asynchronously.
void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
uint32_t gpu_index,
uint64_t &size_tracker,
uint64_t *size_tracker,
bool allocate_gpu_memory) {
size_tracker += size;
if (size_tracker != nullptr)
*size_tracker += size;
void *ptr = nullptr;
if (!allocate_gpu_memory)
return ptr;
@@ -105,9 +106,8 @@ void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
/// asynchronously.
void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
uint32_t gpu_index) {
uint64_t size_tracker = 0;
return cuda_malloc_with_size_tracking_async(size, stream, gpu_index,
size_tracker, true);
return cuda_malloc_with_size_tracking_async(size, stream, gpu_index, nullptr,
true);
}
/// Check that allocation is valid
@@ -122,13 +122,6 @@ bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
}
}
uint64_t cuda_device_total_memory(uint32_t gpu_index) {
cuda_set_device(gpu_index);
size_t total_mem = 0, free_mem = 0;
check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
return total_mem;
}
/// Returns
/// false if Cooperative Groups is not supported.
/// true otherwise

View File

@@ -234,29 +234,6 @@ __device__ void convert_u128_to_f128_as_torus(
}
}
// params is expected to be full degree not half degree
// same as convert_u128_to_f128_as_torus() but expects input to be on registers
template <class params>
__device__ void convert_u128_on_regs_to_f128_as_torus(
double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
const __uint128_t *in_re_on_regs, const __uint128_t *in_im_on_regs) {
const double normalization = pow(2., -128.);
Index tid = threadIdx.x;
// #pragma unroll
for (Index i = 0; i < params::opt / 2; i++) {
auto out_re = u128_to_signed_to_f128(in_re_on_regs[i]);
auto out_im = u128_to_signed_to_f128(in_im_on_regs[i]);
out_re_hi[tid] = out_re.hi * normalization;
out_re_lo[tid] = out_re.lo * normalization;
out_im_hi[tid] = out_im.hi * normalization;
out_im_lo[tid] = out_im.lo * normalization;
tid += params::degree / params::opt;
}
}
template <class params>
__device__ void
convert_f128_to_u128_as_torus(__uint128_t *out_re, __uint128_t *out_im,
@@ -295,7 +272,7 @@ batch_convert_u128_to_f128_as_integer(double *out_re_hi, double *out_re_lo,
}
// params is expected to be full degree not half degree
// converts standard input into complex<128> represented by 4 double
// converts standqard input into complex<128> represented by 4 double
// with following pattern: [re_hi_0, re_hi_1, ... re_hi_n, re_lo_0, re_lo_1,
// ... re_lo_n, im_hi_0, im_hi_1, ..., im_hi_n, im_lo_0, im_lo_1, ..., im_lo_n]
template <class params>
@@ -314,7 +291,7 @@ batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
}
// params is expected to be full degree not half degree
// converts standard input into complex<128> represented by 4 double
// converts standqard input into complex<128> represented by 4 double
// with following pattern: [re_hi_0, re_lo_0, im_hi_0, im_lo_0, re_hi_1,
// re_lo_1, im_hi_1, im_lo_1,
// ...,re_hi_n, re_lo_n, im_hi_n, im_lo_n, ]

View File

@@ -26,7 +26,7 @@ __host__ uint64_t scratch_cuda_integer_abs_kb(
if (is_signed) {
*mem_ptr = new int_abs_buffer<Torus>(streams, gpu_indexes, gpu_count,
params, num_blocks,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, &size_tracker);
}
return size_tracker;
}
@@ -53,8 +53,7 @@ __host__ void host_integer_abs_kb(
streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key);
host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
ct->num_radix_blocks, mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus);
ct->num_radix_blocks);
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;

View File

@@ -61,7 +61,7 @@ __host__ uint64_t scratch_cuda_integer_radix_bitop_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_bitop_buffer<Torus>(streams, gpu_indexes, gpu_count, op,
params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, &size_tracker);
return size_tracker;
}

View File

@@ -15,48 +15,3 @@ void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
host_trim_radix_blocks_lsb<uint64_t>(output, input, (cudaStream_t *)streams,
gpu_indexes);
}
uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t num_additional_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, bool allocate_ms_array) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
allocate_ms_array);
return scratch_extend_radix_with_sign_msb<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count,
(int_extend_radix_with_sign_msb_buffer<uint64_t> **)mem_ptr, params,
num_blocks, num_additional_blocks, allocate_gpu_memory);
}
void cuda_extend_radix_with_sign_msb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
int8_t *mem_ptr, uint32_t num_additional_blocks, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
host_extend_radix_with_sign_msb<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, output, input,
(int_extend_radix_with_sign_msb_buffer<uint64_t> *)mem_ptr,
num_additional_blocks, bsks, (uint64_t **)ksks, ms_noise_reduction_key);
}
void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_extend_radix_with_sign_msb_buffer<uint64_t> *mem_ptr =
(int_extend_radix_with_sign_msb_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}

View File

@@ -33,62 +33,4 @@ __host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
input_start_lwe_index, input->num_radix_blocks);
}
template <typename Torus>
__host__ uint64_t scratch_extend_radix_with_sign_msb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_extend_radix_with_sign_msb_buffer<Torus> **mem_ptr,
const int_radix_params params, uint32_t num_radix_blocks,
uint32_t num_additional_blocks, const bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_extend_radix_with_sign_msb_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
num_additional_blocks, allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_extend_radix_with_sign_msb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
int_extend_radix_with_sign_msb_buffer<Torus> *mem_ptr,
uint32_t num_additional_blocks, void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
if (num_additional_blocks == 0) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], output,
input);
return;
}
const uint32_t input_blocks = input->num_radix_blocks;
if (input_blocks == 0) {
PANIC("Cuda error: input blocks cannot be zero");
}
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
0, input_blocks, input, 0,
input_blocks);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->last_block, 0, 1, input,
input_blocks - 1, input_blocks);
host_apply_univariate_lut_kb(
streams, gpu_indexes, gpu_count, mem_ptr->padding_block,
mem_ptr->last_block, mem_ptr->lut, ksks, ms_noise_reduction_key, bsks);
for (uint32_t i = 0; i < num_additional_blocks; ++i) {
uint32_t dst_block_idx = input_blocks + i;
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
dst_block_idx, dst_block_idx + 1,
mem_ptr->padding_block, 0, 1);
}
}
#endif

View File

@@ -84,8 +84,7 @@ __host__ void host_integer_radix_cmux_kb(
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], &mem_true, &mem_true,
&mem_false, num_radix_blocks, params.message_modulus,
params.carry_modulus);
&mem_false, num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, &mem_true, bsks, ksks,
@@ -101,7 +100,7 @@ __host__ uint64_t scratch_cuda_integer_radix_cmux_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_cmux_buffer<Torus>(
streams, gpu_indexes, gpu_count, predicate_lut_f, params,
num_radix_blocks, allocate_gpu_memory, size_tracker);
num_radix_blocks, allocate_gpu_memory, &size_tracker);
return size_tracker;
}
#endif

View File

@@ -684,7 +684,7 @@ __host__ uint64_t scratch_cuda_integer_radix_comparison_check_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_comparison_buffer<Torus>(
streams, gpu_indexes, gpu_count, op, params, num_radix_blocks, is_signed,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, &size_tracker);
return size_tracker;
}

View File

@@ -89,3 +89,16 @@ void cleanup_cuda_integer_decompress_radix_ciphertext_64(
(int_decompression<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}
void cuda_integer_extract_glwe_64(
void *const *streams, uint32_t const *gpu_indexes, void *lwe_array_out,
void const *glwe_list, uint32_t const glwe_index,
uint32_t const log_modulus, uint32_t const polynomial_size,
uint32_t const glwe_dimension, uint32_t const body_count) {
auto casted_streams = (cudaStream_t *)(streams);
host_extract_mem_alloc_free<uint64_t>(
casted_streams[0], gpu_indexes[0], static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(glwe_list), glwe_index, log_modulus,
polynomial_size, glwe_dimension, body_count);
}

View File

@@ -231,6 +231,84 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
check_cuda_error(cudaGetLastError());
}
/// Extracts the glwe_index-nth GLWE ciphertext
/// This function does not require memory allocation
template <typename Torus>
__host__ void host_extract_mem_alloc_free(
cudaStream_t stream, uint32_t gpu_index, Torus *glwe_array_out,
Torus const *array_in, const uint32_t glwe_index,
const uint32_t log_modulus, const uint32_t polynomial_size,
const uint32_t glwe_dimension, const uint32_t body_count_in) {
if (array_in == glwe_array_out)
PANIC("Cuda error: Input and output must be different");
cuda_set_device(gpu_index);
auto glwe_ciphertext_size = (glwe_dimension + 1) * polynomial_size;
auto num_glwes = (body_count_in + polynomial_size - 1) / polynomial_size;
// Compressed length of the compressed GLWE we want to extract
uint32_t body_count = 0;
if (body_count_in % polynomial_size == 0)
body_count = polynomial_size;
else if (glwe_index == num_glwes - 1)
body_count = body_count_in % polynomial_size;
else
body_count = polynomial_size;
auto initial_out_len = glwe_dimension * polynomial_size + body_count;
// Calculates how many bits this particular GLWE shall use
auto number_bits_to_unpack = initial_out_len * log_modulus;
auto nbits = sizeof(Torus) * 8;
auto input_len = (number_bits_to_unpack + nbits - 1) / nbits;
// Calculates how many bits a full-packed GLWE shall use
number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
auto len = (number_bits_to_unpack + nbits - 1) / nbits;
// Uses that length to set the input pointer
auto chunk_array_in = array_in + glwe_index * len;
// Ensure the tail of the GLWE is zeroed
if (initial_out_len < glwe_ciphertext_size) {
auto zeroed_slice = glwe_array_out + initial_out_len;
cuda_memset_async(glwe_array_out, 0,
(glwe_ciphertext_size - initial_out_len) * sizeof(Torus),
stream, gpu_index);
}
int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
dim3 grid(num_blocks);
dim3 threads(num_threads);
extract<Torus><<<grid, threads, 0, stream>>>(glwe_array_out, chunk_array_in,
log_modulus, initial_out_len);
// uint32_t body_count = std::min(body_count_in, polynomial_size);
// auto initial_out_len = glwe_dimension * polynomial_size + body_count;
// auto compressed_glwe_accumulator_size =
// (glwe_dimension + 1) * polynomial_size;
// auto number_bits_to_unpack = compressed_glwe_accumulator_size *
// log_modulus; auto nbits = sizeof(Torus) * 8;
// // number_bits_to_unpack.div_ceil(Scalar::BITS)
// auto input_len = (number_bits_to_unpack + nbits - 1) / nbits;
// // We assure the tail of the glwe is zeroed
// auto zeroed_slice = glwe_array_out + initial_out_len;
// cuda_memset_async(zeroed_slice, 0,
// (polynomial_size - body_count) * sizeof(Torus),
// streams[0], gpu_indexes[0]);
// int num_blocks = 0, num_threads = 0;
// getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
// dim3 grid(num_blocks);
// dim3 threads(num_threads);
// extract<Torus><<<grid, threads, 0, streams[0]>>>(glwe_array_out, array_in,
// glwe_index, log_modulus,
// input_len,
// initial_out_len);
check_cuda_error(cudaGetLastError());
}
template <typename Torus>
__host__ void host_integer_decompress(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -370,7 +448,7 @@ __host__ uint64_t scratch_cuda_compress_integer_radix_ciphertext(
uint64_t size_tracker = 0;
*mem_ptr = new int_compression<Torus>(
streams, gpu_indexes, gpu_count, compression_params, num_radix_blocks,
lwe_per_glwe, storage_log_modulus, allocate_gpu_memory, size_tracker);
lwe_per_glwe, storage_log_modulus, allocate_gpu_memory, &size_tracker);
return size_tracker;
}
@@ -386,7 +464,7 @@ __host__ uint64_t scratch_cuda_integer_decompress_radix_ciphertext(
*mem_ptr = new int_decompression<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, compression_params,
num_radix_blocks, body_count, storage_log_modulus, allocate_gpu_memory,
size_tracker);
&size_tracker);
return size_tracker;
}
#endif

View File

@@ -28,7 +28,7 @@ __host__ uint64_t scratch_cuda_integer_div_rem_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_div_rem_memory<Torus>(streams, gpu_indexes, gpu_count,
params, is_signed, num_blocks,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, &size_tracker);
return size_tracker;
}
@@ -268,11 +268,10 @@ __host__ void host_unsigned_integer_div_rem_kb(
// but in that position, interesting_remainder2 always has a 0
auto merged_interesting_remainder = interesting_remainder1;
host_addition<Torus>(
streams[0], gpu_indexes[0], merged_interesting_remainder,
merged_interesting_remainder, interesting_remainder2,
merged_interesting_remainder->num_radix_blocks,
radix_params.message_modulus, radix_params.carry_modulus);
host_addition<Torus>(streams[0], gpu_indexes[0],
merged_interesting_remainder,
merged_interesting_remainder, interesting_remainder2,
merged_interesting_remainder->num_radix_blocks);
// after create_clean_version_of_merged_remainder
// `merged_interesting_remainder` will be reused as
@@ -383,10 +382,9 @@ __host__ void host_unsigned_integer_div_rem_kb(
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
}
host_addition<Torus>(
streams[0], gpu_indexes[0], overflow_sum, subtraction_overflowed,
at_least_one_upper_block_is_non_zero, 1, radix_params.message_modulus,
radix_params.carry_modulus);
host_addition<Torus>(streams[0], gpu_indexes[0], overflow_sum,
subtraction_overflowed,
at_least_one_upper_block_is_non_zero, 1);
auto message_modulus = radix_params.message_modulus;
int factor = (i) ? message_modulus - 1 : message_modulus - 2;
@@ -436,9 +434,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
as_radix_ciphertext_slice<Torus>(&quotient_block, quotient, block_of_bit,
block_of_bit + 1);
host_addition<Torus>(streams[0], gpu_indexes[0], &quotient_block,
&quotient_block, mem_ptr->did_not_overflow, 1,
radix_params.message_modulus,
radix_params.carry_modulus);
&quotient_block, mem_ptr->did_not_overflow, 1);
};
for (uint j = 0; j < gpu_count; j++) {
@@ -481,9 +477,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
// Clean the quotient and remainder
// as even though they have no carries, they are not at nominal noise level
host_addition<Torus>(streams[0], gpu_indexes[0], remainder, remainder1,
remainder2, remainder1->num_radix_blocks,
radix_params.message_modulus,
radix_params.carry_modulus);
remainder2, remainder1->num_radix_blocks);
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);

View File

@@ -398,7 +398,7 @@ uint64_t scratch_cuda_apply_noise_squashing_mem(
*mem_ptr = new int_noise_squashing_lut<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, params, glwe_dimension,
polynomial_size, num_radix_blocks, original_num_blocks,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, &size_tracker);
return size_tracker;
}

View File

@@ -409,8 +409,7 @@ __host__ void host_pack_bivariate_blocks(
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
Torus const *lwe_indexes_out, CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, Torus const *lwe_indexes_in,
uint32_t shift, uint32_t num_radix_blocks, uint32_t const message_modulus,
uint32_t const carry_modulus) {
uint32_t shift, uint32_t num_radix_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
@@ -434,15 +433,6 @@ __host__ void host_pack_bivariate_blocks(
(Torus *)lwe_array_1->ptr, (Torus *)lwe_array_2->ptr, lwe_indexes_in,
lwe_dimension, shift, num_radix_blocks);
check_cuda_error(cudaGetLastError());
for (uint i = 0; i < num_radix_blocks; i++) {
lwe_array_out->degrees[i] =
lwe_array_1->degrees[i] * shift + lwe_array_2->degrees[i];
lwe_array_out->noise_levels[i] =
lwe_array_1->noise_levels[i] * shift + lwe_array_2->noise_levels[i];
CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], message_modulus,
carry_modulus);
}
}
// polynomial_size threads
@@ -531,7 +521,8 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
if (num_radix_blocks > lut->num_blocks)
PANIC("Cuda error: num radix blocks on which lut is applied should be "
"smaller or equal to the number of lut radix blocks")
if (num_radix_blocks > lwe_array_out->num_radix_blocks)
if (num_radix_blocks > lwe_array_out->num_radix_blocks ||
num_radix_blocks > lwe_array_in->num_radix_blocks)
PANIC("Cuda error: num radix blocks on which lut is applied should be "
"smaller or equal to the number of input & output radix blocks")
@@ -607,8 +598,6 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
auto degrees_index = lut->h_lut_indexes[i];
lwe_array_out->degrees[i] = lut->degrees[degrees_index];
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
params.carry_modulus);
}
POP_RANGE()
}
@@ -714,8 +703,6 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
auto degrees_index = lut->h_lut_indexes[i % lut->num_blocks];
lwe_array_out->degrees[i] = lut->degrees[degrees_index];
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
params.carry_modulus);
}
POP_RANGE()
}
@@ -764,7 +751,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
host_pack_bivariate_blocks<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_pbs_in,
lut->lwe_trivial_indexes, lwe_array_1, lwe_array_2, lut->lwe_indexes_in,
shift, num_radix_blocks, params.message_modulus, params.carry_modulus);
shift, num_radix_blocks);
check_cuda_error(cudaGetLastError());
/// For multi GPU execution we create vectors of pointers for inputs and
@@ -832,8 +819,6 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
auto degrees_index = lut->h_lut_indexes[i];
lwe_array_out->degrees[i] = lut->degrees[degrees_index];
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
params.carry_modulus);
}
POP_RANGE()
}
@@ -1462,8 +1447,6 @@ void host_full_propagate_inplace(
auto degrees_index = mem_ptr->lut->h_lut_indexes[0];
input_blocks->degrees[i] = mem_ptr->lut->degrees[degrees_index];
input_blocks->noise_levels[i] = NoiseLevel::NOMINAL;
CHECK_NOISE_LEVEL(input_blocks->noise_levels[i], params.message_modulus,
params.carry_modulus);
if (i < num_blocks - 1) {
CudaRadixCiphertextFFI next_input_block;
@@ -1474,8 +1457,7 @@ void host_full_propagate_inplace(
mem_ptr->tmp_big_lwe_vector, 1, 2);
host_addition<Torus>(streams[0], gpu_indexes[0], &next_input_block,
&next_input_block, &second_input, 1,
params.message_modulus, params.carry_modulus);
&next_input_block, &second_input, 1);
}
}
}
@@ -1491,7 +1473,7 @@ uint64_t scratch_cuda_full_propagation(cudaStream_t const *streams,
uint64_t size_tracker = 0;
*mem_ptr =
new int_fullprop_buffer<Torus>(streams, gpu_indexes, gpu_count, params,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, &size_tracker);
return size_tracker;
}
@@ -1726,7 +1708,7 @@ uint64_t scratch_cuda_apply_univariate_lut_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
1, num_radix_blocks, allocate_gpu_memory,
size_tracker);
&size_tracker);
// It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
// 0
cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1762,7 +1744,7 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
1, num_radix_blocks, num_many_lut,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, &size_tracker);
// It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
// 0
cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1798,7 +1780,7 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
1, num_radix_blocks, allocate_gpu_memory,
size_tracker);
&size_tracker);
// It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
// 0
cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1836,7 +1818,7 @@ uint64_t scratch_cuda_propagate_single_carry_kb_inplace(
uint64_t size_tracker = 0;
*mem_ptr = new int_sc_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag,
uses_carry, allocate_gpu_memory, size_tracker);
uses_carry, allocate_gpu_memory, &size_tracker);
return size_tracker;
}
// This function perform the three steps of Thomas' new carry propagation
@@ -1853,6 +1835,9 @@ void host_propagate_single_carry(
PUSH_RANGE("propagate sc")
auto num_radix_blocks = lwe_array->num_radix_blocks;
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
auto lut_stride = mem->lut_stride;
auto num_many_lut = mem->num_many_lut;
CudaRadixCiphertextFFI output_flag;
@@ -1866,10 +1851,8 @@ void host_propagate_single_carry(
"pointer")
if (uses_carry == 1) {
host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
input_carries, 1, params.message_modulus,
params.carry_modulus);
input_carries, 1);
}
// Step 1
host_compute_shifted_blocks_and_states<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, mem->shifted_blocks_state_mem,
@@ -1893,8 +1876,7 @@ void host_propagate_single_carry(
auto shifted_blocks = mem->shifted_blocks_state_mem->shifted_blocks;
host_addition<Torus>(
streams[0], gpu_indexes[0], prepared_blocks, shifted_blocks,
mem->prop_simu_group_carries_mem->simulators, num_radix_blocks,
params.message_modulus, params.carry_modulus);
mem->prop_simu_group_carries_mem->simulators, num_radix_blocks);
if (requested_flag == outputFlag::FLAG_OVERFLOW ||
requested_flag == outputFlag::FLAG_CARRY) {
@@ -1903,8 +1885,7 @@ void host_propagate_single_carry(
&shifted_simulators, mem->prop_simu_group_carries_mem->simulators,
num_radix_blocks - 1, num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], &output_flag, &output_flag,
&shifted_simulators, 1, params.message_modulus,
params.carry_modulus);
&shifted_simulators, 1);
}
host_radix_sum_in_groups<Torus>(
@@ -1918,8 +1899,7 @@ void host_propagate_single_carry(
mem->prop_simu_group_carries_mem->resolved_carries, mem->num_groups - 1,
mem->num_groups);
host_addition<Torus>(streams[0], gpu_indexes[0], &output_flag, &output_flag,
&shifted_resolved_carries, 1, params.message_modulus,
params.carry_modulus);
&shifted_resolved_carries, 1);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], prepared_blocks, num_radix_blocks,
@@ -1958,26 +1938,15 @@ void host_add_and_propagate_single_carry(
PUSH_RANGE("add & propagate sc")
if (lhs_array->num_radix_blocks != rhs_array->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be the same")
// Check input carries if used
if (uses_carry == 1) {
if (input_carries == nullptr)
PANIC("Cuda error: if uses_carry is enabled, input_carries cannot be a "
"null pointer");
if (lhs_array->lwe_dimension != input_carries->lwe_dimension)
PANIC(
"Cuda error: input and input_carries lwe dimension must be the same");
}
// Allow nullptr for carry_out if FLAG_NONE is requested
if (lhs_array->lwe_dimension != rhs_array->lwe_dimension ||
lhs_array->lwe_dimension != input_carries->lwe_dimension ||
lhs_array->lwe_dimension != carry_out->lwe_dimension)
PANIC("Cuda error: input and output lwe dimension must be the same")
if ((requested_flag == outputFlag::FLAG_OVERFLOW ||
requested_flag == outputFlag::FLAG_CARRY)) {
if (carry_out == nullptr)
PANIC("Cuda error: when requesting FLAG_CARRY or FLAG_OVERFLOW, "
"carry_out must be a valid pointer")
if (lhs_array->lwe_dimension != carry_out->lwe_dimension)
PANIC("Cuda error: input and carry_out lwe dimension must be the same")
}
requested_flag == outputFlag::FLAG_CARRY) &&
carry_out == nullptr)
PANIC("Cuda error: when requesting FLAG_CARRY, carry_out must be a valid "
"pointer")
auto num_radix_blocks = lhs_array->num_radix_blocks;
auto params = mem->params;
@@ -2000,13 +1969,11 @@ void host_add_and_propagate_single_carry(
}
host_addition<Torus>(streams[0], gpu_indexes[0], lhs_array, lhs_array,
rhs_array, num_radix_blocks, params.message_modulus,
params.carry_modulus);
rhs_array, num_radix_blocks);
if (uses_carry == 1) {
host_addition<Torus>(streams[0], gpu_indexes[0], lhs_array, lhs_array,
input_carries, 1, params.message_modulus,
params.carry_modulus);
input_carries, 1);
}
// Step 1
host_compute_shifted_blocks_and_states<Torus>(
@@ -2037,8 +2004,7 @@ void host_add_and_propagate_single_carry(
auto shifted_blocks = mem->shifted_blocks_state_mem->shifted_blocks;
host_addition<Torus>(
streams[0], gpu_indexes[0], prepared_blocks, shifted_blocks,
mem->prop_simu_group_carries_mem->simulators, num_radix_blocks,
params.message_modulus, params.carry_modulus);
mem->prop_simu_group_carries_mem->simulators, num_radix_blocks);
if (requested_flag == outputFlag::FLAG_OVERFLOW ||
requested_flag == outputFlag::FLAG_CARRY) {
@@ -2047,8 +2013,7 @@ void host_add_and_propagate_single_carry(
&shifted_simulators, mem->prop_simu_group_carries_mem->simulators,
num_radix_blocks - 1, num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], &output_flag, &output_flag,
&shifted_simulators, 1, params.message_modulus,
params.carry_modulus);
&shifted_simulators, 1);
}
// Step 3
@@ -2063,8 +2028,7 @@ void host_add_and_propagate_single_carry(
if (num_radix_blocks == 1 && requested_flag == outputFlag::FLAG_OVERFLOW &&
uses_carry == 1) {
host_addition<Torus>(streams[0], gpu_indexes[0], &output_flag,
&output_flag, input_carries, 1,
params.message_modulus, params.carry_modulus);
&output_flag, input_carries, 1);
} else {
CudaRadixCiphertextFFI shifted_resolved_carries;
@@ -2073,8 +2037,7 @@ void host_add_and_propagate_single_carry(
mem->prop_simu_group_carries_mem->resolved_carries,
mem->num_groups - 1, mem->num_groups);
host_addition<Torus>(streams[0], gpu_indexes[0], &output_flag,
&output_flag, &shifted_resolved_carries, 1,
params.message_modulus, params.carry_modulus);
&output_flag, &shifted_resolved_carries, 1);
}
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], prepared_blocks, num_radix_blocks,
@@ -2087,7 +2050,6 @@ void host_add_and_propagate_single_carry(
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], lhs_array, 0, num_radix_blocks,
mem->output_flag, 0, num_radix_blocks);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], carry_out, 0, 1, mem->output_flag,
num_radix_blocks, num_radix_blocks + 1);
@@ -2109,7 +2071,7 @@ uint64_t scratch_cuda_integer_overflowing_sub(
uint64_t size_tracker = 0;
*mem_ptr = new int_borrow_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
compute_overflow, allocate_gpu_memory, size_tracker);
compute_overflow, allocate_gpu_memory, &size_tracker);
return size_tracker;
}
@@ -2179,8 +2141,7 @@ void host_single_borrow_propagate(
&shifted_simulators, mem->prop_simu_group_carries_mem->simulators,
num_radix_blocks - 1, num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], mem->overflow_block,
mem->overflow_block, &shifted_simulators, 1,
params.message_modulus, params.carry_modulus);
mem->overflow_block, &shifted_simulators, 1);
}
CudaRadixCiphertextFFI resolved_borrows;
as_radix_ciphertext_slice<Torus>(
@@ -2192,8 +2153,7 @@ void host_single_borrow_propagate(
// borrows
if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
host_addition<Torus>(streams[0], gpu_indexes[0], mem->overflow_block,
mem->overflow_block, &resolved_borrows, 1,
params.message_modulus, params.carry_modulus);
mem->overflow_block, &resolved_borrows, 1);
}
cuda_event_record(mem->incoming_events[0], streams[0], gpu_indexes[0]);
@@ -2343,8 +2303,6 @@ __host__ void integer_radix_apply_noise_squashing_kb(
for (uint i = 0; i < lut->num_blocks; i++) {
lwe_array_out->degrees[i] = lut->degrees[0];
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
params.carry_modulus);
}
POP_RANGE()
}

View File

@@ -210,8 +210,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
bool allocate_ms_array) {
bool allocate_gpu_memory, bool allocate_ms_array) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -221,8 +220,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
return scratch_cuda_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
max_num_radix_in_vec, reduce_degrees_for_single_carry_propagation, params,
allocate_gpu_memory);
max_num_radix_in_vec, params, allocate_gpu_memory);
}
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
@@ -236,11 +234,69 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
PANIC("Cuda error: input vector length should be a multiple of the "
"output's number of radix blocks")
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
// FIXME: this should not be necessary, we should make sure sum_ctxt works in
// the general case
for (int i = 0; i < radix_lwe_vec->num_radix_blocks; i++) {
radix_lwe_vec->degrees[i] = mem->params.message_modulus - 1;
}
switch (mem->params.polynomial_size) {
case 512:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
break;
case 1024:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
break;
case 2048:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
break;
case 4096:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
break;
case 8192:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
break;
case 16384:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Supported N's are powers of two in the interval [256..16384].")
}
}
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(

View File

@@ -24,6 +24,24 @@
#include <string>
#include <vector>
template <typename Torus>
__global__ void smart_copy(Torus *dst, Torus *src, int32_t *id_out,
int32_t *id_in, size_t lwe_size) {
size_t tid = threadIdx.x;
size_t b_id = blockIdx.x;
size_t stride = blockDim.x;
auto input_id = id_in[b_id];
auto output_id = id_out[b_id];
auto cur_src = (input_id >= 0) ? &src[input_id * lwe_size] : nullptr;
auto cur_dst = &dst[output_id * lwe_size];
for (int i = tid; i < lwe_size; i += stride) {
cur_dst[i] = (input_id >= 0) ? cur_src[i] : 0;
}
}
template <typename Torus, class params>
__global__ void
all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
@@ -76,155 +94,33 @@ all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
}
}
__global__ inline void radix_vec_to_columns(uint32_t *const *const columns,
uint32_t *const columns_counter,
const uint64_t *const degrees,
const uint32_t num_radix_blocks,
const uint32_t num_radix_in_vec) {
template <typename Torus>
__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
uint32_t chunk_size, uint32_t block_size,
uint32_t num_blocks) {
const uint32_t idx = threadIdx.x;
size_t cnt = 0;
for (int i = 0; i < num_radix_in_vec; i++) {
size_t ct_id = i * num_radix_blocks + idx;
if (degrees[ct_id] != 0) {
columns[idx][cnt] = ct_id;
++cnt;
size_t stride = blockDim.x;
size_t chunk_id = blockIdx.x;
size_t chunk_elem_size = chunk_size * num_blocks * block_size;
size_t radix_elem_size = num_blocks * block_size;
auto src_chunk = &input_blocks[chunk_id * chunk_elem_size];
auto dst_radix = &result_blocks[chunk_id * radix_elem_size];
size_t block_stride = blockIdx.y * block_size;
auto result = &dst_radix[block_stride];
// init shared mem with first radix of chunk
size_t tid = threadIdx.x;
for (int i = tid; i < block_size; i += stride) {
result[i] = src_chunk[block_stride + i];
}
// accumulate rest of the radixes
for (int r_id = 1; r_id < chunk_size; r_id++) {
auto cur_src_radix = &src_chunk[r_id * radix_elem_size];
for (int i = tid; i < block_size; i += stride) {
result[i] += cur_src_radix[block_stride + i];
}
}
columns_counter[idx] = cnt;
}
template <typename Torus>
__global__ inline void prepare_new_columns_and_pbs_indexes(
uint32_t *const *const new_columns, uint32_t *const new_columns_counter,
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
Torus *const lut_indexes, const uint32_t *const *const columns,
const uint32_t *const columns_counter, const uint32_t chunk_size) {
__shared__ uint32_t counter;
if (threadIdx.x == 0) {
counter = 0;
}
__syncthreads();
const uint32_t base_id = threadIdx.x;
const uint32_t column_len = columns_counter[base_id];
uint32_t ct_count = 0;
for (uint32_t i = 0; i + chunk_size <= column_len; i += chunk_size) {
// those indexes are for message ciphertexts
// for message ciphertexts in and out index should be same
const uint32_t in_index = columns[base_id][i];
new_columns[base_id][ct_count] = in_index;
const uint32_t pbs_index = atomicAdd(&counter, 1);
pbs_indexes_in[pbs_index] = in_index;
pbs_indexes_out[pbs_index] = in_index;
lut_indexes[pbs_index] = 0;
++ct_count;
}
__syncthreads();
if (base_id > 0) {
const uint32_t prev_base_id = base_id - 1;
const uint32_t prev_column_len = columns_counter[prev_base_id];
for (uint32_t i = 0; i + chunk_size <= prev_column_len; i += chunk_size) {
// those indexes are for carry ciphertexts
// for carry ciphertexts input is same as for message
// output will be placed to next block in the column
const uint32_t in_index = columns[prev_base_id][i];
const uint32_t out_index = columns[prev_base_id][i + 1];
new_columns[base_id][ct_count] = out_index;
const uint32_t pbs_index = atomicAdd(&counter, 1);
pbs_indexes_in[pbs_index] = in_index;
pbs_indexes_out[pbs_index] = out_index;
lut_indexes[pbs_index] = 1;
++ct_count;
}
}
const uint32_t start_index = column_len - column_len % chunk_size;
for (uint32_t i = start_index; i < column_len; ++i) {
new_columns[base_id][ct_count] = columns[base_id][i];
++ct_count;
}
new_columns_counter[base_id] = ct_count;
}
template <typename Torus>
__global__ inline void prepare_final_pbs_indexes(
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
Torus *const lut_indexes, const uint32_t num_radix_blocks) {
int idx = threadIdx.x;
pbs_indexes_in[idx] = idx % num_radix_blocks;
pbs_indexes_out[idx] = idx + idx / num_radix_blocks;
lut_indexes[idx] = idx / num_radix_blocks;
}
template <typename Torus>
__global__ void calculate_chunks(Torus *const input_blocks,
const uint32_t *const *const columns,
const uint32_t *const columns_counter,
const uint32_t chunk_size,
const uint32_t block_size) {
const uint32_t part_size = blockDim.x;
const uint32_t base_id = blockIdx.x;
const uint32_t part_id = blockIdx.y;
const uint32_t coef_id = part_id * part_size + threadIdx.x;
if (coef_id >= block_size)
return;
const uint32_t column_len = columns_counter[base_id];
if (column_len >= chunk_size) {
const uint32_t num_chunks = column_len / chunk_size;
Torus result = 0;
for (uint32_t chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
const uint32_t first_ct_id = columns[base_id][chunk_id * chunk_size];
result = input_blocks[first_ct_id * block_size + coef_id];
for (uint32_t ct_id = 1; ct_id < chunk_size; ++ct_id) {
const uint32_t cur_ct_id =
columns[base_id][chunk_id * chunk_size + ct_id];
result += input_blocks[cur_ct_id * block_size + coef_id];
}
input_blocks[first_ct_id * block_size + coef_id] = result;
}
}
}
template <typename Torus>
__global__ void calculate_final_chunk_into_radix(
Torus *const out_radix, const Torus *const input_blocks,
const uint32_t *const *const columns, const uint32_t *const columns_counter,
const uint32_t chunk_size, const uint32_t block_size) {
const uint32_t part_size = blockDim.x;
const uint32_t base_id = blockIdx.x;
const uint32_t part_id = blockIdx.y;
const uint32_t coef_id = part_id * part_size + threadIdx.x;
if (coef_id >= block_size)
return;
const uint32_t column_len = columns_counter[base_id];
Torus result = 0;
if (column_len) {
const uint32_t first_ct_id = columns[base_id][0];
result = input_blocks[first_ct_id * block_size + coef_id];
for (uint32_t i = 1; i < column_len; ++i) {
const uint32_t cur_ct_it = columns[base_id][i];
result += input_blocks[cur_ct_it * block_size + coef_id];
}
}
out_radix[base_id * block_size + coef_id] = result;
}
template <typename Torus, class params>
@@ -271,33 +167,29 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
(process_msb) ? cur_msb_ct[params::degree] : 0;
}
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
bool reduce_degrees_for_single_carry_propagation, int_radix_params params,
bool allocate_gpu_memory) {
int_radix_params params, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks_in_radix,
max_num_radix_in_vec, reduce_degrees_for_single_carry_propagation,
allocate_gpu_memory, size_tracker);
max_num_radix_in_vec, allocate_gpu_memory, &size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, class params>
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
uint32_t num_radix_blocks, uint32_t num_radix_in_vec) {
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
uint32_t num_radix_blocks, uint32_t num_radix_in_vec,
int_radix_lut<Torus> *reused_lut) {
if (terms->lwe_dimension != radix_lwe_out->lwe_dimension)
PANIC("Cuda error: output and input radix ciphertexts should have the same "
@@ -307,235 +199,273 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
PANIC("Cuda error: input vector does not have enough blocks")
if (num_radix_blocks > radix_lwe_out->num_radix_blocks)
PANIC("Cuda error: output does not have enough blocks")
if (num_radix_in_vec == 0)
return;
auto current_blocks = mem_ptr->current_blocks;
auto new_blocks = mem_ptr->new_blocks;
auto new_blocks_copy = mem_ptr->new_blocks_copy;
auto old_blocks = mem_ptr->old_blocks;
auto small_lwe_vector = mem_ptr->small_lwe_vector;
auto d_degrees = mem_ptr->d_degrees;
auto d_columns = mem_ptr->d_columns;
auto d_columns_counter = mem_ptr->d_columns_counter;
auto d_new_columns = mem_ptr->d_new_columns;
auto d_new_columns_counter = mem_ptr->d_new_columns_counter;
auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
auto d_smart_copy_out = mem_ptr->d_smart_copy_out;
auto message_modulus = mem_ptr->params.message_modulus;
auto carry_modulus = mem_ptr->params.carry_modulus;
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
auto glwe_dimension = mem_ptr->params.glwe_dimension;
auto polynomial_size = mem_ptr->params.polynomial_size;
auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
auto chunk_size =
(mem_ptr->params.message_modulus * mem_ptr->params.carry_modulus - 1) /
(mem_ptr->params.message_modulus - 1);
size_t total_blocks_in_vec = num_radix_blocks * num_radix_in_vec;
auto small_lwe_size = small_lwe_dimension + 1;
// In the case of extracting a single LWE this parameters are dummy
uint32_t num_many_lut = 1;
uint32_t lut_stride = 0;
if (terms->num_radix_blocks == 0) {
if (terms->num_radix_blocks == 0)
return;
}
if (num_radix_in_vec == 1) {
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
radix_lwe_out, 0, num_radix_blocks,
terms, 0, num_radix_blocks);
return;
}
if (old_blocks != terms) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], old_blocks,
terms);
}
if (num_radix_in_vec == 2) {
CudaRadixCiphertextFFI terms_slice;
as_radix_ciphertext_slice<Torus>(&terms_slice, terms, num_radix_blocks,
2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, terms,
&terms_slice, num_radix_blocks,
mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus);
CudaRadixCiphertextFFI old_blocks_slice;
as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
&old_blocks_slice, num_radix_blocks);
return;
}
if (current_blocks != terms) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
current_blocks, terms);
size_t r = num_radix_in_vec;
size_t total_modulus = message_modulus * carry_modulus;
size_t message_max = message_modulus - 1;
size_t chunk_size = (total_modulus - 1) / message_max;
size_t h_lwe_idx_in[terms->num_radix_blocks];
size_t h_lwe_idx_out[terms->num_radix_blocks];
int32_t h_smart_copy_in[terms->num_radix_blocks];
int32_t h_smart_copy_out[terms->num_radix_blocks];
/// Here it is important to query the default max shared memory on device 0
/// instead of cuda_get_max_shared_memory,
/// to avoid bugs with tree_add_chunks trying to use too much shared memory
auto max_shared_memory = 0;
check_cuda_error(cudaDeviceGetAttribute(
&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));
// create lut object for message and carry
// we allocate luts_message_carry in the host function (instead of scratch)
// to reduce average memory consumption
int_radix_lut<Torus> *luts_message_carry;
size_t ch_amount = r / chunk_size;
if (!ch_amount)
ch_amount++;
if (reused_lut == nullptr) {
luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
2 * ch_amount * num_radix_blocks, true, nullptr);
} else {
luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
2 * ch_amount * num_radix_blocks, reused_lut, true, nullptr);
}
auto message_acc = luts_message_carry->get_lut(0, 0);
auto carry_acc = luts_message_carry->get_lut(0, 1);
cuda_memcpy_async_to_gpu(d_degrees, current_blocks->degrees,
total_blocks_in_vec * sizeof(uint64_t), streams[0],
gpu_indexes[0]);
// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
cuda_set_device(gpu_indexes[0]);
radix_vec_to_columns<<<1, num_radix_blocks, 0, streams[0]>>>(
d_columns, d_columns_counter, d_degrees, num_radix_blocks,
num_radix_in_vec);
// generate accumulators
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], message_acc,
luts_message_carry->get_degree(0), luts_message_carry->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
lut_f_message, true);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], carry_acc, luts_message_carry->get_degree(1),
luts_message_carry->get_max_degree(1), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, lut_f_carry, true);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
bool needs_processing = false;
radix_columns current_columns(current_blocks->degrees, num_radix_blocks,
num_radix_in_vec, chunk_size, needs_processing);
int number_of_threads = std::min(256, (int)mem_ptr->params.polynomial_size);
int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);
while (r > 2) {
size_t cur_total_blocks = r * num_radix_blocks;
size_t ch_amount = r / chunk_size;
if (!ch_amount)
ch_amount++;
dim3 add_grid(ch_amount, num_radix_blocks, 1);
mem_ptr->setup_lookup_tables(streams, gpu_indexes, gpu_count,
num_radix_in_vec, current_blocks->degrees);
cuda_set_device(gpu_indexes[0]);
tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
(Torus *)new_blocks->ptr, (Torus *)old_blocks->ptr,
std::min(r, chunk_size), big_lwe_size, num_radix_blocks);
while (needs_processing) {
auto luts_message_carry = mem_ptr->luts_message_carry;
auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
calculate_chunks<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
(Torus *)(current_blocks->ptr), d_columns, d_columns_counter,
chunk_size, big_lwe_size);
check_cuda_error(cudaGetLastError());
prepare_new_columns_and_pbs_indexes<<<1, num_radix_blocks, 0, streams[0]>>>(
d_new_columns, d_new_columns_counter, d_pbs_indexes_in,
d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0), d_columns,
d_columns_counter, chunk_size);
size_t total_count = 0;
size_t message_count = 0;
size_t carry_count = 0;
size_t sm_copy_count = 0;
uint32_t total_ciphertexts;
uint32_t total_messages;
current_columns.next_accumulation(total_ciphertexts, total_messages,
needs_processing);
generate_ids_update_degrees(
terms->degrees, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
h_smart_copy_out, ch_amount, r, num_radix_blocks, chunk_size,
message_max, total_count, message_count, carry_count, sm_copy_count);
auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
h_lwe_idx_in, h_lwe_idx_out);
auto active_gpu_count = get_active_gpu_count(total_ciphertexts, gpu_count);
size_t copy_size = sm_copy_count * sizeof(int32_t);
cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
streams[0], gpu_indexes[0]);
// inside d_smart_copy_in there are only -1 values
// it's fine to call smart_copy with same pointer
// as source and destination
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], new_blocks_copy, 0, r * num_radix_blocks,
new_blocks, 0, r * num_radix_blocks);
smart_copy<Torus><<<sm_copy_count, 1024, 0, streams[0]>>>(
(Torus *)new_blocks->ptr, (Torus *)new_blocks_copy->ptr,
d_smart_copy_out, d_smart_copy_in, big_lwe_size);
check_cuda_error(cudaGetLastError());
if (carry_count > 0)
cuda_set_value_async<Torus>(
streams[0], gpu_indexes[0],
luts_message_carry->get_lut_indexes(0, message_count), 1,
carry_count);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
std::vector<Torus *> small_lwe_vector_vec =
luts_message_carry->lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec =
luts_message_carry->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec =
luts_message_carry->lwe_trivial_indexes_vec;
auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
if (active_gpu_count == 1) {
/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, (Torus *)current_blocks->ptr, d_pbs_indexes_in,
lwe_indexes_in, (Torus *)new_blocks->ptr, lwe_indexes_in, ksks,
polynomial_size * glwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, message_count);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)new_blocks->ptr, lwe_indexes_out,
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
(Torus *)small_lwe_vector->ptr, lwe_indexes_in, bsks,
ms_noise_reduction_key, luts_message_carry->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, new_blocks_vec,
(Torus *)new_blocks->ptr, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, message_count,
big_lwe_size);
/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch_async<Torus>(
streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec,
lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
ksks, big_lwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
total_messages);
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count);
/// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
/// different configuration
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, gpu_count, (Torus *)small_lwe_vector->ptr,
small_lwe_vector_vec, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, message_count,
small_lwe_size);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
(Torus *)small_lwe_vector->ptr, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, total_count,
small_lwe_size);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
Torus *h_lwe_indexes_in_pinned;
Torus *h_lwe_indexes_out_pinned;
cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
total_ciphertexts * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
total_ciphertexts * sizeof(Torus));
for (uint32_t i = 0; i < total_ciphertexts; i++) {
h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
}
cuda_memcpy_async_to_cpu(
h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_cpu(
h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint32_t i = 0; i < total_ciphertexts; i++) {
luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
}
cudaFreeHost(h_lwe_indexes_in_pinned);
cudaFreeHost(h_lwe_indexes_out_pinned);
total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, current_blocks, current_blocks, bsks,
ksks, ms_noise_reduction_key, luts_message_carry, total_ciphertexts);
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)new_blocks->ptr,
lwe_after_pbs_vec, luts_message_carry->h_lwe_indexes_out,
luts_message_carry->using_trivial_lwe_indexes, total_count,
big_lwe_size);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
cuda_set_device(gpu_indexes[0]);
std::swap(d_columns, d_new_columns);
std::swap(d_columns_counter, d_new_columns_counter);
}
calculate_final_chunk_into_radix<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
(Torus *)(radix_lwe_out->ptr), (Torus *)(current_blocks->ptr),
d_columns, d_columns_counter, chunk_size, big_lwe_size);
if (mem_ptr->reduce_degrees_for_single_carry_propagation) {
auto luts_message_carry = mem_ptr->luts_message_carry;
auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
prepare_final_pbs_indexes<Torus>
<<<1, 2 * num_radix_blocks, 0, streams[0]>>>(
d_pbs_indexes_in, d_pbs_indexes_out,
luts_message_carry->get_lut_indexes(0, 0), num_radix_blocks);
set_zero_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], current_blocks, num_radix_blocks,
num_radix_blocks + 1);
auto active_gpu_count =
get_active_gpu_count(2 * num_radix_blocks, gpu_count);
if (active_gpu_count == 1) {
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, (Torus *)radix_lwe_out->ptr, d_pbs_indexes_in, ksks,
big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, num_radix_blocks);
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
2 * num_radix_blocks, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
uint32_t num_blocks_in_apply_lut = 2 * num_radix_blocks;
Torus *h_lwe_indexes_in_pinned;
Torus *h_lwe_indexes_out_pinned;
cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
num_blocks_in_apply_lut * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
num_blocks_in_apply_lut * sizeof(Torus));
for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
}
cuda_memcpy_async_to_cpu(
h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_cpu(
h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
}
cudaFreeHost(h_lwe_indexes_in_pinned);
cudaFreeHost(h_lwe_indexes_out_pinned);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, active_gpu_count, current_blocks, radix_lwe_out,
bsks, ksks, ms_noise_reduction_key, luts_message_carry,
num_blocks_in_apply_lut);
for (uint i = 0; i < total_count; i++) {
auto degrees_index = luts_message_carry->h_lut_indexes[i];
new_blocks->degrees[i] = luts_message_carry->degrees[degrees_index];
new_blocks->noise_levels[i] = NoiseLevel::NOMINAL;
}
calculate_final_degrees(radix_lwe_out->degrees, terms->degrees,
num_radix_blocks, num_radix_in_vec, chunk_size,
mem_ptr->params.message_modulus);
cuda_set_device(gpu_indexes[0]);
CudaRadixCiphertextFFI current_blocks_slice;
as_radix_ciphertext_slice<Torus>(&current_blocks_slice, current_blocks,
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out,
current_blocks, &current_blocks_slice,
num_radix_blocks, mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus);
int rem_blocks = (r > chunk_size) ? r % chunk_size * num_radix_blocks : 0;
int new_blocks_created = 2 * ch_amount * num_radix_blocks;
if (rem_blocks > 0)
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], new_blocks, new_blocks_created,
new_blocks_created + rem_blocks, old_blocks,
cur_total_blocks - rem_blocks, cur_total_blocks);
std::swap(new_blocks, old_blocks);
r = (new_blocks_created + rem_blocks) / num_radix_blocks;
}
luts_message_carry->release(streams, gpu_indexes, gpu_count);
delete (luts_message_carry);
CudaRadixCiphertextFFI old_blocks_slice;
as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
&old_blocks_slice, num_radix_blocks);
}
template <typename Torus, class params>
@@ -669,10 +599,10 @@ __host__ void host_integer_mult_radix_kb(
size_t b_id = i % num_blocks;
terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
}
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
host_integer_partial_sum_ciphertexts_vec_kb<Torus, params>(
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, bsks,
ksks, ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
2 * num_blocks);
2 * num_blocks, mem_ptr->luts_array);
auto scp_mem_ptr = mem_ptr->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;
@@ -693,7 +623,7 @@ __host__ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_mul_memory<Torus>(
streams, gpu_indexes, gpu_count, params, is_boolean_left,
is_boolean_right, num_radix_blocks, allocate_gpu_memory, size_tracker);
is_boolean_right, num_radix_blocks, allocate_gpu_memory, &size_tracker);
return size_tracker;
}

View File

@@ -106,8 +106,6 @@ __host__ void host_integer_radix_negation(
lwe_array_out->degrees[i] = z - static_cast<uint64_t>(zb);
lwe_array_out->noise_levels[i] = lwe_array_in->noise_levels[i];
CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], message_modulus,
carry_modulus);
zb = z / message_modulus;
}
}
@@ -123,7 +121,7 @@ __host__ uint64_t scratch_cuda_integer_overflowing_sub_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_overflowing_sub_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory,
allocate_ms_array, size_tracker);
allocate_ms_array, &size_tracker);
POP_RANGE()
return size_tracker;
}

View File

@@ -13,7 +13,7 @@ void create_zero_radix_ciphertext_async(cudaStream_t const stream,
CudaRadixCiphertextFFI *radix,
const uint32_t num_radix_blocks,
const uint32_t lwe_dimension,
uint64_t &size_tracker,
uint64_t *size_tracker,
bool allocate_gpu_memory) {
PUSH_RANGE("create zero radix ct");
radix->lwe_dimension = lwe_dimension;

View File

@@ -34,7 +34,7 @@ void update_degrees_after_scalar_bitor(uint64_t *output_degrees,
auto result = max;
for (uint j = 0; j < min + 1; j++) {
if ((max | j) > result) {
if (max | j > result) {
result = max | j;
}
}
@@ -52,7 +52,7 @@ void update_degrees_after_scalar_bitxor(uint64_t *output_degrees,
// Try every possibility to find the worst case
for (uint j = 0; j < min + 1; j++) {
if ((max ^ j) > result) {
if (max ^ j > result) {
result = max ^ j;
}
}

View File

@@ -277,9 +277,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
auto overflowed = x_0 < x_1;
return (Torus)(invert_flags.second ^ overflowed);
};
uint64_t size = 0;
int_radix_lut<Torus> *one_block_lut = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 1, 1, true, size);
streams, gpu_indexes, gpu_count, params, 1, 1, true, nullptr);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], one_block_lut->get_lut(0, 0),
@@ -579,9 +578,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
is_x_less_than_y_given_input_borrow<Torus>(x_0, x_1, 0,
message_modulus);
};
uint64_t size = 0;
int_radix_lut<Torus> *one_block_lut = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 1, 1, true, size);
streams, gpu_indexes, gpu_count, params, 1, 1, true, nullptr);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], one_block_lut->get_lut(0, 0),

View File

@@ -1,202 +0,0 @@
#include "scalar_div.cuh"
uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
bool allocate_gpu_memory, bool allocate_ms_array) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
allocate_ms_array);
return scratch_integer_unsigned_scalar_div_radix<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
(int_unsigned_scalar_div_mem<uint64_t> **)mem_ptr, num_blocks,
scalar_divisor_ffi, allocate_gpu_memory);
}
void cuda_integer_unsigned_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi) {
host_integer_unsigned_scalar_div_radix<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, numerator_ct,
(int_unsigned_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, scalar_divisor_ffi);
}
void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_unsigned_scalar_div_mem<uint64_t> *mem_ptr =
(int_unsigned_scalar_div_mem<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
delete mem_ptr;
*mem_ptr_void = nullptr;
}
uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
bool allocate_gpu_memory, bool allocate_ms_array) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
allocate_ms_array);
return scratch_integer_signed_scalar_div_radix_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
(int_signed_scalar_div_mem<uint64_t> **)mem_ptr, num_blocks,
scalar_divisor_ffi, allocate_gpu_memory);
}
void cuda_integer_signed_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits) {
host_integer_signed_scalar_div_radix_kb<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, numerator_ct,
(int_signed_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, scalar_divisor_ffi, numerator_bits);
}
void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_signed_scalar_div_mem<uint64_t> *mem_ptr =
(int_signed_scalar_div_mem<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
delete mem_ptr;
*mem_ptr_void = nullptr;
}
uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint32_t const active_bits_divisor, bool allocate_gpu_memory,
bool allocate_ms_array) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
allocate_ms_array);
return scratch_integer_unsigned_scalar_div_rem_radix<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
(int_unsigned_scalar_div_rem_buffer<uint64_t> **)mem_ptr, num_blocks,
scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
}
void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
void const *clear_blocks, void const *h_clear_blocks,
uint32_t num_clear_blocks) {
host_integer_unsigned_scalar_div_rem_radix<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, quotient_ct,
remainder_ct, (int_unsigned_scalar_div_rem_buffer<uint64_t> *)mem_ptr,
bsks, (uint64_t **)ksks, ms_noise_reduction_key, scalar_divisor_ffi,
divisor_has_at_least_one_set, decomposed_divisor, num_scalars_divisor,
(uint64_t *)clear_blocks, (uint64_t *)h_clear_blocks, num_clear_blocks);
}
void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_unsigned_scalar_div_rem_buffer<uint64_t> *mem_ptr =
(int_unsigned_scalar_div_rem_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
delete mem_ptr;
*mem_ptr_void = nullptr;
}
uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint32_t const active_bits_divisor, bool allocate_gpu_memory,
bool allocate_ms_array) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
allocate_ms_array);
return scratch_integer_signed_scalar_div_rem_radix<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
(int_signed_scalar_div_rem_buffer<uint64_t> **)mem_ptr, num_blocks,
scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
}
void cuda_integer_signed_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
uint32_t numerator_bits) {
host_integer_signed_scalar_div_rem_radix<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, quotient_ct,
remainder_ct, (int_signed_scalar_div_rem_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key, scalar_divisor_ffi,
divisor_has_at_least_one_set, decomposed_divisor, num_scalars_divisor,
numerator_bits);
}
void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_signed_scalar_div_rem_buffer<uint64_t> *mem_ptr =
(int_signed_scalar_div_rem_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
delete mem_ptr;
*mem_ptr_void = nullptr;
}

View File

@@ -1,415 +0,0 @@
#ifndef SCALAR_DIV_CUH
#define SCALAR_DIV_CUH
#include "integer/integer_utilities.h"
#include "integer/scalar_bitops.cuh"
#include "integer/scalar_mul.cuh"
#include "integer/scalar_shifts.cuh"
#include "integer/subtraction.cuh"
template <typename Torus>
__host__ uint64_t scratch_integer_unsigned_scalar_div_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const int_radix_params params,
int_unsigned_scalar_div_mem<Torus> **mem_ptr, uint32_t num_radix_blocks,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
const bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_unsigned_scalar_div_mem<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
scalar_divisor_ffi, allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_unsigned_scalar_div_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *numerator_ct,
int_unsigned_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi) {
if (scalar_divisor_ffi->is_abs_divisor_one) {
return;
}
if (scalar_divisor_ffi->is_divisor_pow2) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct,
scalar_divisor_ffi->ilog2_divisor, mem_ptr->logical_scalar_shift_mem,
bsks, ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
return;
}
if (scalar_divisor_ffi->divisor_has_more_bits_than_numerator) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
mem_ptr->tmp_ffi);
return;
}
if (scalar_divisor_ffi->is_chosen_multiplier_geq_two_pow_numerator) {
if (scalar_divisor_ffi->shift_pre != (uint64_t)0) {
PANIC("shift_pre should be == 0");
}
if (scalar_divisor_ffi->shift_post == (uint32_t)0) {
PANIC("shift_post should be > 0");
}
CudaRadixCiphertextFFI *numerator_cpy = mem_ptr->tmp_ffi;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
numerator_cpy, numerator_ct);
host_integer_radix_scalar_mul_high_kb<Torus>(
streams, gpu_indexes, gpu_count, numerator_cpy,
mem_ptr->scalar_mul_high_mem, ksks, ms_noise_reduction_key, bsks,
scalar_divisor_ffi);
host_sub_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct, numerator_cpy, nullptr,
nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct, (uint32_t)1,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
numerator_ct->num_radix_blocks);
host_add_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct, numerator_cpy, nullptr,
nullptr, mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key,
FLAG_NONE, (uint32_t)0);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct,
scalar_divisor_ffi->shift_post - (uint32_t)1,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
numerator_ct->num_radix_blocks);
return;
}
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct,
scalar_divisor_ffi->shift_pre, mem_ptr->logical_scalar_shift_mem, bsks,
ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
host_integer_radix_scalar_mul_high_kb<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct,
mem_ptr->scalar_mul_high_mem, ksks, ms_noise_reduction_key, bsks,
scalar_divisor_ffi);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct,
scalar_divisor_ffi->shift_post, mem_ptr->logical_scalar_shift_mem, bsks,
ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
}
template <typename Torus>
__host__ uint64_t scratch_integer_signed_scalar_div_radix_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
int_signed_scalar_div_mem<Torus> **mem_ptr, uint32_t num_radix_blocks,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
const bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_signed_scalar_div_mem<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
scalar_divisor_ffi, allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_signed_scalar_div_radix_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *numerator_ct,
int_signed_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits) {
if (scalar_divisor_ffi->is_abs_divisor_one) {
if (scalar_divisor_ffi->is_divisor_negative) {
CudaRadixCiphertextFFI *tmp = mem_ptr->tmp_ffi;
host_integer_radix_negation<Torus>(
streams, gpu_indexes, gpu_count, tmp, numerator_ct,
mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus,
numerator_ct->num_radix_blocks);
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
numerator_ct, tmp);
}
return;
}
if (scalar_divisor_ffi->chosen_multiplier_has_more_bits_than_numerator) {
set_zero_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], numerator_ct, 0,
numerator_ct->num_radix_blocks);
return;
}
CudaRadixCiphertextFFI *tmp = mem_ptr->tmp_ffi;
if (scalar_divisor_ffi->is_divisor_pow2) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
numerator_ct);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp,
scalar_divisor_ffi->chosen_multiplier_num_bits - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp,
numerator_bits - scalar_divisor_ffi->chosen_multiplier_num_bits,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
tmp->num_radix_blocks);
host_add_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, tmp, numerator_ct, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
(uint32_t)0);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp,
scalar_divisor_ffi->chosen_multiplier_num_bits,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
} else if (!scalar_divisor_ffi->is_chosen_multiplier_geq_two_pow_numerator) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
numerator_ct);
host_integer_radix_signed_scalar_mul_high_kb<Torus>(
streams, gpu_indexes, gpu_count, tmp, mem_ptr->scalar_mul_high_mem,
ksks, scalar_divisor_ffi, ms_noise_reduction_key, bsks);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp, scalar_divisor_ffi->shift_post,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
CudaRadixCiphertextFFI *xsign = mem_ptr->xsign_ffi;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], xsign,
numerator_ct);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, xsign, numerator_bits - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
host_sub_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, tmp, xsign, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
FLAG_NONE, (uint32_t)0);
} else {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
numerator_ct);
host_integer_radix_signed_scalar_mul_high_kb<Torus>(
streams, gpu_indexes, gpu_count, tmp, mem_ptr->scalar_mul_high_mem,
ksks, scalar_divisor_ffi, ms_noise_reduction_key, bsks);
host_add_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, tmp, numerator_ct, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
(uint32_t)0);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp, scalar_divisor_ffi->shift_post,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
CudaRadixCiphertextFFI *xsign = mem_ptr->xsign_ffi;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], xsign,
numerator_ct);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, xsign, numerator_bits - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
host_sub_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, tmp, xsign, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
FLAG_NONE, (uint32_t)0);
}
if (scalar_divisor_ffi->is_divisor_negative) {
host_integer_radix_negation<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct, tmp,
mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus,
numerator_ct->num_radix_blocks);
} else {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
tmp);
}
}
template <typename Torus>
__host__ uint64_t scratch_integer_unsigned_scalar_div_rem_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const int_radix_params params,
int_unsigned_scalar_div_rem_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint32_t const active_bits_divisor, const bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_unsigned_scalar_div_rem_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_unsigned_scalar_div_rem_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct,
int_unsigned_scalar_div_rem_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
Torus const *clear_blocks, Torus const *h_clear_blocks,
uint32_t num_clear_blocks) {
auto numerator_ct = mem_ptr->numerator_ct;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
quotient_ct);
host_integer_unsigned_scalar_div_radix(
streams, gpu_indexes, gpu_count, quotient_ct, mem_ptr->unsigned_div_mem,
bsks, ksks, ms_noise_reduction_key, scalar_divisor_ffi);
if (scalar_divisor_ffi->is_divisor_pow2) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
numerator_ct);
host_integer_radix_scalar_bitop_kb(
streams, gpu_indexes, gpu_count, remainder_ct, remainder_ct,
clear_blocks, h_clear_blocks, num_clear_blocks, mem_ptr->bitop_mem,
bsks, ksks, ms_noise_reduction_key);
} else {
if (!scalar_divisor_ffi->is_divisor_zero) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
remainder_ct, quotient_ct);
if (!scalar_divisor_ffi->is_abs_divisor_one &&
remainder_ct->num_radix_blocks != 0) {
host_integer_scalar_mul_radix<Torus>(
streams, gpu_indexes, gpu_count, remainder_ct, decomposed_divisor,
divisor_has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks, ksks,
ms_noise_reduction_key, mem_ptr->params.message_modulus,
num_scalars_divisor);
}
}
host_sub_and_propagate_single_carry(
streams, gpu_indexes, gpu_count, numerator_ct, remainder_ct, nullptr,
nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
numerator_ct);
}
}
template <typename Torus>
__host__ uint64_t scratch_integer_signed_scalar_div_rem_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const int_radix_params params,
int_signed_scalar_div_rem_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint32_t const active_bits_divisor, const bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_signed_scalar_div_rem_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_signed_scalar_div_rem_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct,
int_signed_scalar_div_rem_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
uint32_t numerator_bits) {
auto numerator_ct = mem_ptr->numerator_ct;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
quotient_ct);
host_integer_signed_scalar_div_radix_kb(
streams, gpu_indexes, gpu_count, quotient_ct, mem_ptr->signed_div_mem,
bsks, ksks, ms_noise_reduction_key, scalar_divisor_ffi, numerator_bits);
host_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, quotient_ct, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
(uint32_t)0);
if (!scalar_divisor_ffi->is_divisor_negative &&
scalar_divisor_ffi->is_divisor_pow2) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
quotient_ct);
host_integer_radix_logical_scalar_shift_kb_inplace(
streams, gpu_indexes, gpu_count, remainder_ct,
scalar_divisor_ffi->ilog2_divisor, mem_ptr->logical_scalar_shift_mem,
bsks, ksks, ms_noise_reduction_key, remainder_ct->num_radix_blocks);
} else if (!scalar_divisor_ffi->is_divisor_zero) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
quotient_ct);
bool is_divisor_one = scalar_divisor_ffi->is_abs_divisor_one &&
!scalar_divisor_ffi->is_divisor_negative;
if (!is_divisor_one && remainder_ct->num_radix_blocks != 0) {
host_integer_scalar_mul_radix<Torus>(
streams, gpu_indexes, gpu_count, remainder_ct, decomposed_divisor,
divisor_has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks, ksks,
ms_noise_reduction_key, mem_ptr->params.message_modulus,
num_scalars_divisor);
}
}
host_sub_and_propagate_single_carry(
streams, gpu_indexes, gpu_count, numerator_ct, remainder_ct, nullptr,
nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
numerator_ct);
}
#endif

View File

@@ -6,8 +6,7 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool allocate_gpu_memory,
bool allocate_ms_array) {
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -18,7 +17,28 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
return scratch_cuda_integer_radix_scalar_mul_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_scalar_mul_buffer<uint64_t> **)mem_ptr, num_blocks, params,
num_scalar_bits, allocate_gpu_memory);
allocate_gpu_memory);
}
uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool anticipated_buffer_drop, bool allocate_gpu_memory,
bool allocate_ms_array) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
allocate_ms_array);
return scratch_cuda_integer_radix_scalar_mul_high_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_scalar_mul_high<uint64_t> **)mem_ptr, num_blocks, params,
anticipated_buffer_drop, allocate_gpu_memory);
}
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
@@ -29,11 +49,73 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_scalars) {
host_integer_scalar_mul_radix<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
decomposed_scalar, has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
switch (polynomial_size) {
case 512:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
decomposed_scalar, has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
num_scalars);
break;
case 1024:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
decomposed_scalar, has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
num_scalars);
break;
case 2048:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
decomposed_scalar, has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
num_scalars);
break;
case 4096:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
decomposed_scalar, has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
num_scalars);
break;
case 8192:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
decomposed_scalar, has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
num_scalars);
break;
case 16384:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
decomposed_scalar, has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
num_scalars);
break;
default:
PANIC("Cuda error (scalar multiplication): unsupported polynomial size. "
"Only N = 512, 1024, 2048, 4096, 8192, 16384 are supported.")
}
}
void cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, void *const *ksks,
uint64_t rhs, uint64_t const *decomposed_scalar,
uint64_t const *has_at_least_one_set,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_scalars) {
host_integer_radix_scalar_mul_high_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, ct,
(int_scalar_mul_high<uint64_t> *)mem_ptr, (uint64_t **)ksks, rhs,
decomposed_scalar, has_at_least_one_set, ms_noise_reduction_key, bsks,
num_scalars);
}
@@ -47,3 +129,22 @@ void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}
void cleanup_cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_scalar_mul_high<uint64_t> *mem_ptr =
(int_scalar_mul_high<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
}
void cuda_small_scalar_multiplication_integer_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lwe_array, uint64_t scalar) {
host_integer_small_scalar_mul_radix<uint64_t>((cudaStream_t *)(streams),
gpu_indexes, gpu_count,
lwe_array, lwe_array, scalar);
}

View File

@@ -33,16 +33,16 @@ __host__ uint64_t scratch_cuda_integer_radix_scalar_mul_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_scalar_mul_buffer<T> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
uint32_t num_scalar_bits, bool allocate_gpu_memory) {
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_scalar_mul_buffer<T>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
num_scalar_bits, allocate_gpu_memory, true, size_tracker);
allocate_gpu_memory, true, &size_tracker);
return size_tracker;
}
template <typename T>
template <typename T, class params>
__host__ void host_integer_scalar_mul_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
@@ -80,7 +80,7 @@ __host__ void host_integer_scalar_mul_radix(
}
}
size_t j = 0;
for (size_t i = 0; i < std::min(num_scalars, num_ciphertext_bits); i++) {
for (size_t i = 0; i < min(num_scalars, num_ciphertext_bits); i++) {
if (decomposed_scalar[i] == 1) {
// Perform a block shift
CudaRadixCiphertextFFI preshifted_radix_ct;
@@ -116,10 +116,13 @@ __host__ void host_integer_scalar_mul_radix(
set_zero_radix_ciphertext_slice_async<T>(streams[0], gpu_indexes[0],
lwe_array, 0, num_radix_blocks);
} else {
host_integer_partial_sum_ciphertexts_vec_kb<T>(
for (int i = 0; i < j * num_radix_blocks; i++) {
all_shifted_buffer->degrees[i] = message_modulus - 1;
}
host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, bsks,
ksks, ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem,
num_radix_blocks, j);
num_radix_blocks, j, nullptr);
auto scp_mem_ptr = mem->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;
@@ -136,8 +139,7 @@ template <typename T>
__host__ void host_integer_small_scalar_mul_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *output_lwe_array,
CudaRadixCiphertextFFI *input_lwe_array, T scalar,
const uint32_t message_modulus, const uint32_t carry_modulus) {
CudaRadixCiphertextFFI *input_lwe_array, T scalar) {
if (output_lwe_array->num_radix_blocks != input_lwe_array->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be the same")
@@ -167,20 +169,35 @@ __host__ void host_integer_small_scalar_mul_radix(
output_lwe_array->noise_levels[i] =
input_lwe_array->noise_levels[i] * scalar;
output_lwe_array->degrees[i] = input_lwe_array->degrees[i] * scalar;
CHECK_NOISE_LEVEL(output_lwe_array->noise_levels[i], message_modulus,
carry_modulus);
}
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_scalar_mul_high<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
bool anticipated_buffer_drop, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_scalar_mul_high<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
allocate_gpu_memory, LEFT_SHIFT, anticipated_buffer_drop, &size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_radix_scalar_mul_high_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
int_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
int_scalar_mul_high<Torus> *mem_ptr, Torus *const *ksks, uint64_t rhs,
uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, const CudaScalarDivisorFFI *scalar_divisor_ffi) {
void *const *bsks, uint32_t num_scalars) {
if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
if (rhs == (uint64_t)0) {
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], ct,
0, ct->num_radix_blocks);
return;
@@ -191,71 +208,66 @@ __host__ void host_integer_radix_scalar_mul_high_kb(
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(tmp_ffi, ct, streams,
gpu_indexes);
if (scalar_divisor_ffi->active_bits != (uint32_t)0 &&
!scalar_divisor_ffi->is_abs_chosen_multiplier_one &&
tmp_ffi->num_radix_blocks != 0) {
if (rhs != (uint64_t)1 || tmp_ffi->num_radix_blocks != 0) {
if ((rhs & (rhs - 1)) == 0) {
uint32_t shift = std::log2(rhs);
if (scalar_divisor_ffi->is_chosen_multiplier_pow2) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi,
scalar_divisor_ffi->ilog2_chosen_multiplier,
streams, gpu_indexes, gpu_count, tmp_ffi, shift,
mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, tmp_ffi->num_radix_blocks);
} else {
host_integer_scalar_mul_radix<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi,
scalar_divisor_ffi->decomposed_chosen_multiplier,
scalar_divisor_ffi->chosen_multiplier_has_at_least_one_set,
mem_ptr->scalar_mul_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, mem_ptr->params.message_modulus,
scalar_divisor_ffi->num_scalars);
}
}
host_trim_radix_blocks_lsb<Torus>(ct, tmp_ffi, streams, gpu_indexes);
}
template <typename Torus>
__host__ void host_integer_radix_signed_scalar_mul_high_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
int_signed_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks) {
if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], ct,
0, ct->num_radix_blocks);
return;
}
CudaRadixCiphertextFFI *tmp_ffi = mem_ptr->tmp;
host_extend_radix_with_sign_msb<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi, ct, mem_ptr->extend_radix_mem,
ct->num_radix_blocks, bsks, (uint64_t **)ksks, ms_noise_reduction_key);
if (scalar_divisor_ffi->active_bits != (uint32_t)0 &&
!scalar_divisor_ffi->is_abs_chosen_multiplier_one &&
tmp_ffi->num_radix_blocks != 0) {
if (scalar_divisor_ffi->is_chosen_multiplier_pow2) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi,
scalar_divisor_ffi->ilog2_chosen_multiplier,
mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, tmp_ffi->num_radix_blocks);
} else {
host_integer_scalar_mul_radix<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi,
scalar_divisor_ffi->decomposed_chosen_multiplier,
scalar_divisor_ffi->chosen_multiplier_has_at_least_one_set,
mem_ptr->scalar_mul_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, mem_ptr->params.message_modulus,
scalar_divisor_ffi->num_scalars);
switch (mem_ptr->params.polynomial_size) {
case 512:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<512>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
case 1024:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<1024>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
case 2048:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<2048>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
case 4096:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<4096>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
case 8192:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<8192>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
case 16384:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<16384>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
default:
PANIC(
"Cuda error (scalar multiplication): unsupported polynomial size. "
"Only N = 512, 1024, 2048, 4096, 8192, 16384 are supported.")
}
}
}

View File

@@ -21,7 +21,7 @@ __host__ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, &size_tracker);
return size_tracker;
}

View File

@@ -21,7 +21,7 @@ __host__ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, &size_tracker);
return size_tracker;
}
@@ -133,7 +133,7 @@ __host__ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_arithmetic_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, &size_tracker);
return size_tracker;
}

View File

@@ -21,7 +21,7 @@ __host__ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_shift_and_rotate_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, is_signed, params,
num_radix_blocks, allocate_gpu_memory, size_tracker);
num_radix_blocks, allocate_gpu_memory, &size_tracker);
return size_tracker;
}
@@ -159,13 +159,11 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// control_bit|b|a
host_pack_bivariate_blocks<Torus>(
streams, gpu_indexes, gpu_count, mux_inputs, mux_lut->lwe_indexes_out,
rotated_input, input_bits_a, mux_lut->lwe_indexes_in, 2, total_nb_bits,
mem->params.message_modulus, mem->params.carry_modulus);
rotated_input, input_bits_a, mux_lut->lwe_indexes_in, 2, total_nb_bits);
// The shift bit is already properly aligned/positioned
host_add_the_same_block_to_all_blocks<Torus>(
streams[0], gpu_indexes[0], mux_inputs, mux_inputs, &shift_bit,
mem->params.message_modulus, mem->params.carry_modulus);
streams[0], gpu_indexes[0], mux_inputs, mux_inputs, &shift_bit);
// we have
// control_bit|b|a
@@ -185,9 +183,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// Bitshift and add the other bits
for (int i = bits_per_block - 2; i >= 0; i--) {
host_integer_small_scalar_mul_radix<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, lwe_array, 2,
mem->params.message_modulus, mem->params.carry_modulus);
host_integer_small_scalar_mul_radix<Torus>(streams, gpu_indexes, gpu_count,
lwe_array, lwe_array, 2);
for (int j = 0; j < num_radix_blocks; j++) {
CudaRadixCiphertextFFI block;
CudaRadixCiphertextFFI bit_to_add;
@@ -196,8 +193,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
i + j * bits_per_block,
i + j * bits_per_block + 1);
host_addition<Torus>(streams[0], gpu_indexes[0], &block, &block,
&bit_to_add, 1, mem->params.message_modulus,
mem->params.carry_modulus);
&bit_to_add, 1);
}
// To give back a clean ciphertext

View File

@@ -23,7 +23,7 @@ uint64_t scratch_cuda_sub_and_propagate_single_carry(
*mem_ptr = new int_sub_and_propagate<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, &size_tracker);
return size_tracker;
}
@@ -73,7 +73,6 @@ __host__ void host_integer_radix_subtraction(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in_2,
message_modulus, carry_modulus, num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array_out, lwe_array_out,
lwe_array_in_1, num_radix_blocks, message_modulus,
carry_modulus);
lwe_array_in_1, num_radix_blocks);
}
#endif

View File

@@ -10,7 +10,7 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
output->num_radix_blocks != input_2->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be the same")
host_addition<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index, output,
input_1, input_2, output->num_radix_blocks, 0, 0);
input_1, input_2, output->num_radix_blocks);
}
/*
@@ -48,7 +48,7 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
output->num_radix_blocks != input_2->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be the same")
host_addition<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index, output,
input_1, input_2, output->num_radix_blocks, 0, 0);
input_1, input_2, output->num_radix_blocks);
}
/*
@@ -147,3 +147,44 @@ void cuda_add_lwe_ciphertext_vector_plaintext_64(
static_cast<const uint64_t *>(lwe_array_in), plaintext_in,
input_lwe_dimension, input_lwe_ciphertext_count);
}
/*
* Perform the subtraction of a u64 input LWE ciphertext vector with a u64 input
* plaintext vector.
* - `stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - `lwe_array_out` is an array of size
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
* been allocated on the GPU before calling this function, and that will hold
* the result of the computation.
* - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
* been allocated and initialized before calling this function. It has the same
* size as the output array.
* - `plaintext_array_in` is the plaintext vector used as input, it should have
* been allocated and initialized before calling this function. It should be of
* size `input_lwe_ciphertext_count`.
* - `input_lwe_dimension` is the number of mask elements in the input and
* output LWE ciphertext vectors
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
* input LWE ciphertext vector, as well as in the output. It is also the number
* of plaintexts in the input plaintext vector.
*
* Each plaintext of the input plaintext vector is subtracted to the body of the
* corresponding LWE ciphertext in the LWE ciphertext vector. The result of the
* operation is stored in the output LWE ciphertext vector. The two input
* vectors are unchanged. This function is a wrapper to a device function that
* performs the operation on the GPU.
*/
void cuda_sub_lwe_ciphertext_vector_plaintext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void const *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_subtraction_plaintext<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(plaintext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}

View File

@@ -9,7 +9,6 @@
#include "device.h"
#include "helper_multi_gpu.h"
#include "integer/integer.h"
#include "integer/integer_utilities.h"
#include "linear_algebra.h"
#include "utils/kernel_dimensions.cuh"
#include <stdio.h>
@@ -103,12 +102,11 @@ __global__ void addition(T *output, T const *input_1, T const *input_2,
// Coefficient-wise addition
// num_radix_blocks selects the amount of blocks to be added from the inputs
template <typename T>
__host__ void
host_addition(cudaStream_t stream, uint32_t gpu_index,
CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input_1,
CudaRadixCiphertextFFI const *input_2, uint32_t num_radix_blocks,
const uint32_t message_modulus, const uint32_t carry_modulus) {
__host__ void host_addition(cudaStream_t stream, uint32_t gpu_index,
CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input_1,
CudaRadixCiphertextFFI const *input_2,
uint32_t num_radix_blocks) {
if (output->lwe_dimension != input_1->lwe_dimension ||
output->lwe_dimension != input_2->lwe_dimension)
PANIC("Cuda error: input and output num radix blocks must be the same")
@@ -137,7 +135,6 @@ host_addition(cudaStream_t stream, uint32_t gpu_index,
output->degrees[i] = input_1->degrees[i] + input_2->degrees[i];
output->noise_levels[i] =
input_1->noise_levels[i] + input_2->noise_levels[i];
CHECK_NOISE_LEVEL(output->noise_levels[i], message_modulus, carry_modulus);
}
}
@@ -163,8 +160,7 @@ template <typename T>
__host__ void host_add_the_same_block_to_all_blocks(
cudaStream_t stream, uint32_t gpu_index, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input_with_multiple_blocks,
CudaRadixCiphertextFFI const *input_with_single_block,
const uint32_t message_modulus, const uint32_t carry_modulus) {
CudaRadixCiphertextFFI const *input_with_single_block) {
if (output->num_radix_blocks != input_with_multiple_blocks->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be the same")
if (input_with_single_block->num_radix_blocks != 1)
@@ -196,7 +192,6 @@ __host__ void host_add_the_same_block_to_all_blocks(
input_with_single_block->degrees[0];
output->noise_levels[i] = input_with_multiple_blocks->noise_levels[i] +
input_with_single_block->noise_levels[0];
CHECK_NOISE_LEVEL(output->noise_levels[i], message_modulus, carry_modulus);
}
}
@@ -276,7 +271,8 @@ __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
}
template <typename T>
__global__ void radix_body_subtraction_inplace(T *lwe_ct, T *plaintext_input,
__global__ void radix_body_subtraction_inplace(T *lwe_ct,
const T *plaintext_input,
uint32_t input_lwe_dimension,
uint32_t num_entries) {
@@ -293,7 +289,7 @@ __global__ void radix_body_subtraction_inplace(T *lwe_ct, T *plaintext_input,
template <typename T>
__host__ void host_subtraction_plaintext(cudaStream_t stream,
uint32_t gpu_index, T *output,
T *lwe_input, T *plaintext_input,
T *lwe_input, const T *plaintext_input,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
@@ -385,7 +381,6 @@ __host__ void host_unchecked_sub_with_correcting_term(
output->noise_levels[i] =
input_1->noise_levels[i] + input_2->noise_levels[i];
zb = z / message_modulus;
CHECK_NOISE_LEVEL(output->noise_levels[i], message_modulus, carry_modulus);
}
}

View File

@@ -35,20 +35,6 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
static_cast<cudaStream_t>(stream), gpu_index);
}
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor) {
uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
level_count * (1 << grouping_factor) /
grouping_factor;
size_t buffer_size =
total_polynomials * polynomial_size * sizeof(__uint128_t);
cuda_memcpy_async_to_gpu((__uint128_t *)dest, (__uint128_t *)src, buffer_size,
static_cast<cudaStream_t>(stream), gpu_index);
}
// We need these lines so the compiler knows how to specialize these functions
template __device__ const uint64_t *
get_ith_mask_kth_block(const uint64_t *ptr, int i, int k, int level,
@@ -94,14 +80,6 @@ template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
int glwe_dimension,
uint32_t level_count);
template __device__ const __uint128_t *
get_multi_bit_ith_lwe_gth_group_kth_block(const __uint128_t *ptr, int g, int i,
int k, int level,
uint32_t grouping_factor,
uint32_t polynomial_size,
uint32_t glwe_dimension,
uint32_t level_count);
template __device__ const uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
const uint64_t *ptr, int g, int i, int k, int level,
uint32_t grouping_factor, uint32_t polynomial_size, uint32_t glwe_dimension,

View File

@@ -83,62 +83,6 @@ mul_ggsw_glwe_in_fourier_domain(double2 *fft, double2 *join_buffer,
__syncthreads();
}
/** Perform the matrix multiplication between the GGSW and the GLWE,
* each block operating on a single level for mask and body.
* Both operands should be at fourier domain
*
* This function assumes:
* - Thread blocks at dimension z relates to the decomposition level.
* - Thread blocks at dimension y relates to the glwe dimension.
* - polynomial_size / params::opt threads are available per block
*/
template <typename G, class params>
__device__ void mul_ggsw_glwe_in_fourier_domain_128(
double *fft, double *join_buffer,
const double *__restrict__ bootstrapping_key, int iteration, G &group,
bool support_dsm = false) {
const uint32_t polynomial_size = params::degree;
const uint32_t glwe_dimension = gridDim.y - 1;
const uint32_t level_count = gridDim.z;
// The first product is used to initialize level_join_buffer
auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
// Continues multiplying fft by every polynomial in that particular bsk level
// Each y-block accumulates in a different polynomial at each iteration
auto bsk_slice = get_ith_mask_kth_block_128(
bootstrapping_key, iteration, blockIdx.y, blockIdx.z, polynomial_size,
glwe_dimension, level_count);
for (int j = 0; j < glwe_dimension + 1; j++) {
int idx = (j + this_block_rank) % (glwe_dimension + 1);
auto bsk_poly = bsk_slice + idx * polynomial_size / 2 * 4;
auto buffer_slice = get_join_buffer_element_128<G>(
blockIdx.z, idx, group, join_buffer, polynomial_size, glwe_dimension,
support_dsm);
polynomial_product_accumulate_in_fourier_domain_128<params>(
buffer_slice, fft, bsk_poly, j == 0);
group.sync();
}
// -----------------------------------------------------------------
// All blocks are synchronized here; after this sync, level_join_buffer has
// the values needed from every other block
// accumulate rest of the products into fft buffer
for (int l = 0; l < level_count; l++) {
auto cur_src_acc = get_join_buffer_element_128<G>(
l, blockIdx.y, group, join_buffer, polynomial_size, glwe_dimension,
support_dsm);
polynomial_accumulate_in_fourier_domain_128<params>(fft, cur_src_acc,
l == 0);
}
__syncthreads();
}
template <typename Torus>
void execute_pbs_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -279,7 +223,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
bool allocate_gpu_memory, bool allocate_ms_array,
uint64_t &size_tracker) {
uint64_t *size_tracker) {
switch (sizeof(Torus)) {
case sizeof(uint32_t):
// 32 bits
@@ -287,7 +231,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
case MULTI_BIT:
PANIC("Error: 32-bit multibit PBS is not supported.\n")
case CLASSICAL:
size_tracker = scratch_cuda_programmable_bootstrap_32(
*size_tracker = scratch_cuda_programmable_bootstrap_32(
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, allocate_ms_array);
@@ -302,12 +246,12 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
case MULTI_BIT:
if (grouping_factor == 0)
PANIC("Multi-bit PBS error: grouping factor should be > 0.")
size_tracker = scratch_cuda_multi_bit_programmable_bootstrap_64(
*size_tracker = scratch_cuda_multi_bit_programmable_bootstrap_64(
stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
break;
case CLASSICAL:
size_tracker = scratch_cuda_programmable_bootstrap_64(
*size_tracker = scratch_cuda_programmable_bootstrap_64(
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, allocate_ms_array);

View File

@@ -8,9 +8,9 @@ execute_scratch_pbs_128(void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array,
uint64_t &size_tracker_on_gpu) {
uint64_t *size_tracker_on_gpu) {
// The squash noise function receives as input 64-bit integers
size_tracker_on_gpu = scratch_cuda_programmable_bootstrap_128_vector_64(
*size_tracker_on_gpu = scratch_cuda_programmable_bootstrap_128_vector_64(
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, allocate_ms_array);

View File

@@ -286,7 +286,7 @@ __host__ uint64_t scratch_programmable_bootstrap_amortized(
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_with_size_tracking_async(
buffer_size, stream, gpu_index, size_tracker, allocate_gpu_memory);
buffer_size, stream, gpu_index, &size_tracker, allocate_gpu_memory);
check_cuda_error(cudaGetLastError());
return size_tracker;
}

View File

@@ -225,7 +225,7 @@ __host__ uint64_t scratch_programmable_bootstrap_cg(
*buffer = new pbs_buffer<Torus, CLASSICAL>(
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, PBS_VARIANT::CG,
allocate_gpu_memory, allocate_ms_array, size_tracker);
allocate_gpu_memory, allocate_ms_array, &size_tracker);
return size_tracker;
}

View File

@@ -280,14 +280,13 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap(
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size =
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
polynomial_size, full_sm_keybundle);
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size);
uint64_t size_tracker = 0;
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::CG,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, &size_tracker);
return size_tracker;
}

View File

@@ -386,7 +386,7 @@ __host__ uint64_t scratch_programmable_bootstrap(
*buffer = new pbs_buffer<Torus, CLASSICAL>(
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT,
allocate_gpu_memory, allocate_ms_array, size_tracker);
allocate_gpu_memory, allocate_ms_array, &size_tracker);
return size_tracker;
}

View File

@@ -141,7 +141,7 @@ void host_programmable_bootstrap_lwe_ciphertext_vector_128(
PANIC("Cuda error (classical PBS): base log should be <= 64")
// If the parameters contain noise reduction key, then apply it
if (ms_noise_reduction_key->num_zeros != 0) {
if (buffer->uses_noise_reduction) {
uint32_t log_modulus = log2(polynomial_size) + 1;
host_improve_noise_modulus_switch<InputTorus>(
static_cast<cudaStream_t>(stream), gpu_index,

View File

@@ -18,6 +18,62 @@
#include "programmable_bootstrap.cuh"
#include "types/complex/operations.cuh"
/** Perform the matrix multiplication between the GGSW and the GLWE,
* each block operating on a single level for mask and body.
* Both operands should be at fourier domain
*
* This function assumes:
* - Thread blocks at dimension z relates to the decomposition level.
* - Thread blocks at dimension y relates to the glwe dimension.
* - polynomial_size / params::opt threads are available per block
*/
template <typename G, class params>
__device__ void mul_ggsw_glwe_in_fourier_domain_128(
double *fft, double *join_buffer,
const double *__restrict__ bootstrapping_key, int iteration, G &group,
bool support_dsm = false) {
const uint32_t polynomial_size = params::degree;
const uint32_t glwe_dimension = gridDim.y - 1;
const uint32_t level_count = gridDim.z;
// The first product is used to initialize level_join_buffer
auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
// Continues multiplying fft by every polynomial in that particular bsk level
// Each y-block accumulates in a different polynomial at each iteration
auto bsk_slice = get_ith_mask_kth_block_128(
bootstrapping_key, iteration, blockIdx.y, blockIdx.z, polynomial_size,
glwe_dimension, level_count);
for (int j = 0; j < glwe_dimension + 1; j++) {
int idx = (j + this_block_rank) % (glwe_dimension + 1);
auto bsk_poly = bsk_slice + idx * polynomial_size / 2 * 4;
auto buffer_slice = get_join_buffer_element_128<G>(
blockIdx.z, idx, group, join_buffer, polynomial_size, glwe_dimension,
support_dsm);
polynomial_product_accumulate_in_fourier_domain_128<params>(
buffer_slice, fft, bsk_poly, j == 0);
group.sync();
}
// -----------------------------------------------------------------
// All blocks are synchronized here; after this sync, level_join_buffer has
// the values needed from every other block
// accumulate rest of the products into fft buffer
for (int l = 0; l < level_count; l++) {
auto cur_src_acc = get_join_buffer_element_128<G>(
l, blockIdx.y, group, join_buffer, polynomial_size, glwe_dimension,
support_dsm);
polynomial_accumulate_in_fourier_domain_128<params>(fft, cur_src_acc,
l == 0);
}
__syncthreads();
}
template <typename InputTorus, class params, sharedMemDegree SMD,
bool first_iter>
__global__ void __launch_bounds__(params::degree / params::opt)
@@ -118,6 +174,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
accumulator);
gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
// Switch to the FFT space
auto acc_fft_re_hi = accumulator_fft + 0 * params::degree / 2;
auto acc_fft_re_lo = accumulator_fft + 1 * params::degree / 2;
@@ -396,7 +455,6 @@ __host__ uint64_t scratch_programmable_bootstrap_cg_128(
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array) {
cuda_set_device(gpu_index);
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_cg<__uint128_t>(
polynomial_size);
@@ -426,7 +484,7 @@ __host__ uint64_t scratch_programmable_bootstrap_cg_128(
*buffer = new pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>(
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, PBS_VARIANT::CG,
allocate_gpu_memory, allocate_ms_array, size_tracker);
allocate_gpu_memory, allocate_ms_array, &size_tracker);
return size_tracker;
}
@@ -533,7 +591,7 @@ __host__ uint64_t scratch_programmable_bootstrap_128(
*buffer = new pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>(
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT,
allocate_gpu_memory, allocate_ms_array, size_tracker);
allocate_gpu_memory, allocate_ms_array, &size_tracker);
return size_tracker;
}

View File

@@ -455,8 +455,11 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
*/
template <typename Torus, class params>
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint64_t full_sm_keybundle) {
uint32_t polynomial_size) {
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
polynomial_size);
int max_blocks_per_sm;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

View File

@@ -18,20 +18,6 @@
#include "types/complex/operations.cuh"
#include <vector>
template <typename Torus, class params>
__device__ uint32_t calculates_monomial_degree(const Torus *lwe_array_group,
uint32_t ggsw_idx,
uint32_t grouping_factor) {
Torus x = 0;
for (int i = 0; i < grouping_factor; i++) {
uint32_t mask_position = grouping_factor - (i + 1);
int selection_bit = (ggsw_idx >> mask_position) & 1;
x += selection_bit * lwe_array_group[i];
}
return modulus_switch(x, params::log2_degree + 1);
}
__device__ __forceinline__ int
get_start_ith_ggsw_offset(uint32_t polynomial_size, int glwe_dimension,
uint32_t level_count) {
@@ -521,14 +507,13 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size =
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
polynomial_size, full_sm_keybundle);
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size);
uint64_t size_tracker = 0;
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::DEFAULT,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, &size_tracker);
return size_tracker;
}

View File

@@ -1,361 +0,0 @@
#include "programmable_bootstrap_cg_multibit.cuh"
#include "programmable_bootstrap_multibit_128.cuh"
template <typename InputTorus>
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128(
void *stream, uint32_t gpu_index,
pbs_buffer_128<InputTorus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 512:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 1024:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 2048:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 4096:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..4096].")
}
}
template <typename InputTorus>
uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap_128(
void *stream, uint32_t gpu_index,
pbs_buffer_128<InputTorus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 512:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 1024:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 2048:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 4096:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..4096].")
}
}
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
bool supports_cg =
supports_cooperative_groups_on_multibit_programmable_bootstrap<
__uint128_t>(glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index));
if (supports_cg)
return scratch_cuda_cg_multi_bit_programmable_bootstrap_128<uint64_t>(
stream, gpu_index,
reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> **>(buffer),
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
else
return scratch_cuda_multi_bit_programmable_bootstrap_128<uint64_t>(
stream, gpu_index,
reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> **>(buffer),
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
}
template <typename InputTorus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
pbs_buffer_128<InputTorus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {
switch (polynomial_size) {
case 256:
host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 512:
host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 1024:
host_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 2048:
host_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 4096:
host_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..4096].")
}
}
template <typename InputTorus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
pbs_buffer_128<InputTorus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {
switch (polynomial_size) {
case 256:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 512:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 1024:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 2048:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 4096:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..4096].")
}
}
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
if (base_log > 64)
PANIC("Cuda error (multi-bit PBS): base log should be <= 64")
auto *buffer =
reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(mem_ptr);
switch (buffer->pbs_variant) {
case PBS_VARIANT::CG:
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128<
uint64_t>(stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const __uint128_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const __uint128_t *>(bootstrapping_key), buffer,
lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case PBS_VARIANT::DEFAULT:
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128<uint64_t>(
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const __uint128_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const __uint128_t *>(bootstrapping_key), buffer,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
}
}
void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
const uint32_t gpu_index,
int8_t **buffer) {
const auto x =
reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(*buffer);
x->release(static_cast<cudaStream_t>(stream), gpu_index);
}
/**
* Computes divisors of the product of num_sms (streaming multiprocessors on the
* GPU) and max_blocks_per_sm (maximum active blocks per SM to launch
* device_multi_bit_programmable_bootstrap_keybundle) smaller than its square
* root, based on max_num_pbs. If log2(max_num_pbs) <= 13, selects the first
* suitable divisor. If greater, calculates an offset as max(1,log2(max_num_pbs)
* - 13) for additional logic.
*
* The value 13 was empirically determined based on memory requirements for
* benchmarking on an RTX 4090 GPU, balancing performance and resource use.
*/
template <typename Torus, class params>
uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint64_t full_sm_keybundle) {
int max_blocks_per_sm;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
if (max_shared_memory < full_sm_keybundle)
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
device_multi_bit_programmable_bootstrap_keybundle_128<Torus, params,
NOSM>,
polynomial_size / params::opt, full_sm_keybundle);
else
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
device_multi_bit_programmable_bootstrap_keybundle_128<Torus, params,
FULLSM>,
polynomial_size / params::opt, 0);
int num_sms = 0;
check_cuda_error(cudaDeviceGetAttribute(
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
int x = num_sms * max_blocks_per_sm;
int count = 0;
int divisor = 1;
int ith_divisor = 0;
#if CUDA_ARCH < 900
// We pick a smaller divisor on GPUs other than H100, so 256-bit integer
// multiplication can run
int log2_max_num_pbs = log2_int(max_num_pbs);
if (log2_max_num_pbs > 13)
ith_divisor = log2_max_num_pbs - 11;
#endif
for (int i = sqrt(x); i >= 1; i--) {
if (x % i == 0) {
if (count == ith_divisor) {
divisor = i;
break;
} else {
count++;
}
}
}
return divisor;
}

View File

@@ -253,7 +253,7 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc(
*buffer = new pbs_buffer<Torus, CLASSICAL>(
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, PBS_VARIANT::TBC,
allocate_gpu_memory, allocate_ms_array, size_tracker);
allocate_gpu_memory, allocate_ms_array, &size_tracker);
return size_tracker;
}

View File

@@ -283,14 +283,13 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size =
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
polynomial_size, full_sm_keybundle);
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size);
uint64_t size_tracker = 0;
*buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::TBC,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, &size_tracker);
return size_tracker;
}

View File

@@ -5,15 +5,15 @@
#include <stdio.h>
#include <type_traits>
template <typename T> __device__ inline const char *get_format();
template <typename T> inline __device__ const char *get_format();
template <> __device__ inline const char *get_format<int>() { return "%d, "; }
template <> inline __device__ const char *get_format<int>() { return "%d, "; }
template <> __device__ inline const char *get_format<unsigned int>() {
template <> inline __device__ const char *get_format<unsigned int>() {
return "%u, ";
}
template <> __device__ inline const char *get_format<uint64_t>() {
template <> inline __device__ const char *get_format<uint64_t>() {
return "%lu, ";
}
@@ -23,15 +23,6 @@ template <typename T> __global__ void print_debug_kernel(const T *src, int N) {
}
}
template <>
__global__ inline void print_debug_kernel(const __uint128_t *src, int N) {
for (int i = 0; i < N; i++) {
uint64_t low = static_cast<uint64_t>(src[i]);
uint64_t high = static_cast<uint64_t>(src[i] >> 64);
printf("(%llu, %llu), ", high, low);
}
}
template <>
__global__ inline void print_debug_kernel(const double2 *src, int N) {
for (int i = 0; i < N; i++) {
@@ -47,19 +38,17 @@ template <typename T> void print_debug(const char *name, const T *src, int N) {
}
template <typename T>
__global__ void print_body_kernel(T *src, int N, int lwe_dimension, T delta) {
__global__ void print_body_kernel(T *src, int N, int lwe_dimension) {
for (int i = 0; i < N; i++) {
T body = src[i * (lwe_dimension + 1) + lwe_dimension];
T clear = body / delta;
printf("(%lu, %lu), ", body, clear);
printf("%lu, ", src[i * (lwe_dimension + 1) + lwe_dimension]);
}
}
template <typename T>
void print_body(const char *name, T *src, int n, int lwe_dimension, T delta) {
void print_body(const char *name, T *src, int n, int lwe_dimension) {
printf("%s: ", name);
cudaDeviceSynchronize();
print_body_kernel<<<1, 1>>>(src, n, lwe_dimension, delta);
print_body_kernel<<<1, 1>>>(src, n, lwe_dimension);
cudaDeviceSynchronize();
printf("\n");
}

View File

@@ -9,7 +9,7 @@ void multi_gpu_alloc_array_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest,
uint32_t elements_per_gpu,
uint64_t &size_tracker_on_gpu_0,
uint64_t *size_tracker_on_gpu_0,
bool allocate_gpu_memory) {
dest.resize(gpu_count);
@@ -17,10 +17,10 @@ void multi_gpu_alloc_array_async(cudaStream_t const *streams,
uint64_t size_tracker_on_gpu_i = 0;
Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
elements_per_gpu * sizeof(Torus), streams[i], gpu_indexes[i],
size_tracker_on_gpu_i, allocate_gpu_memory);
&size_tracker_on_gpu_i, allocate_gpu_memory);
dest[i] = d_array;
if (i == 0) {
size_tracker_on_gpu_0 += size_tracker_on_gpu_i;
if (i == 0 && size_tracker_on_gpu_0 != nullptr) {
*size_tracker_on_gpu_0 = size_tracker_on_gpu_i;
}
}
}
@@ -46,7 +46,7 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
std::vector<Torus *> &dest, uint32_t num_inputs,
uint32_t lwe_size,
uint64_t &size_tracker_on_gpu_0,
uint64_t *size_tracker_on_gpu_0,
bool allocate_gpu_memory) {
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
@@ -54,10 +54,10 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
inputs_on_gpu * lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
size_tracker_on_gpu_i, allocate_gpu_memory);
&size_tracker_on_gpu_i, allocate_gpu_memory);
dest[i] = d_array;
if (i == 0) {
size_tracker_on_gpu_0 += size_tracker_on_gpu_i;
if (i == 0 && size_tracker_on_gpu_0 != nullptr) {
*size_tracker_on_gpu_0 = size_tracker_on_gpu_i;
}
}
}
@@ -65,7 +65,7 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
template void multi_gpu_alloc_lwe_async<__uint128_t>(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<__uint128_t *> &dest, uint32_t num_inputs,
uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0,
uint32_t lwe_size, uint64_t *size_tracker_on_gpu_0,
bool allocate_gpu_memory);
/// Allocates the input/output vector for all devices
@@ -75,7 +75,7 @@ template <typename Torus>
void multi_gpu_alloc_lwe_many_lut_output_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest, uint32_t num_inputs,
uint32_t num_many_lut, uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0,
uint32_t num_many_lut, uint32_t lwe_size, uint64_t *size_tracker_on_gpu_0,
bool allocate_gpu_memory) {
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
@@ -83,10 +83,10 @@ void multi_gpu_alloc_lwe_many_lut_output_async(
auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
num_many_lut * inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
gpu_indexes[i], size_tracker, allocate_gpu_memory);
gpu_indexes[i], &size_tracker, allocate_gpu_memory);
dest[i] = d_array;
if (i == 0) {
size_tracker_on_gpu_0 += size_tracker;
if (i == 0 && size_tracker_on_gpu_0 != nullptr) {
*size_tracker_on_gpu_0 = size_tracker;
}
}
}

View File

@@ -3,8 +3,8 @@
void cuda_lwe_expand_64(void *const stream, uint32_t gpu_index,
void *lwe_array_out, const void *lwe_compact_array_in,
uint32_t lwe_dimension, uint32_t num_lwe,
const uint32_t *lwe_compact_input_indexes,
const uint32_t *output_body_id_per_compact_list) {
const void *lwe_compact_input_indexes,
const void *output_body_id_per_compact_list) {
switch (lwe_dimension) {
case 256:
@@ -12,49 +12,56 @@ void cuda_lwe_expand_64(void *const stream, uint32_t gpu_index,
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
lwe_compact_input_indexes, output_body_id_per_compact_list);
static_cast<const uint32_t *>(lwe_compact_input_indexes),
static_cast<const uint32_t *>(output_body_id_per_compact_list));
break;
case 512:
host_lwe_expand<uint64_t, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
lwe_compact_input_indexes, output_body_id_per_compact_list);
static_cast<const uint32_t *>(lwe_compact_input_indexes),
static_cast<const uint32_t *>(output_body_id_per_compact_list));
break;
case 1024:
host_lwe_expand<uint64_t, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
lwe_compact_input_indexes, output_body_id_per_compact_list);
static_cast<const uint32_t *>(lwe_compact_input_indexes),
static_cast<const uint32_t *>(output_body_id_per_compact_list));
break;
case 2048:
host_lwe_expand<uint64_t, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
lwe_compact_input_indexes, output_body_id_per_compact_list);
static_cast<const uint32_t *>(lwe_compact_input_indexes),
static_cast<const uint32_t *>(output_body_id_per_compact_list));
break;
case 4096:
host_lwe_expand<uint64_t, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
lwe_compact_input_indexes, output_body_id_per_compact_list);
static_cast<const uint32_t *>(lwe_compact_input_indexes),
static_cast<const uint32_t *>(output_body_id_per_compact_list));
break;
case 8192:
host_lwe_expand<uint64_t, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
lwe_compact_input_indexes, output_body_id_per_compact_list);
static_cast<const uint32_t *>(lwe_compact_input_indexes),
static_cast<const uint32_t *>(output_body_id_per_compact_list));
break;
case 16384:
host_lwe_expand<uint64_t, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
lwe_compact_input_indexes, output_body_id_per_compact_list);
static_cast<const uint32_t *>(lwe_compact_input_indexes),
static_cast<const uint32_t *>(output_body_id_per_compact_list));
break;
default:
PANIC("CUDA error: lwe_dimension not supported."

View File

@@ -34,7 +34,8 @@ __host__ void host_expand_without_verification(
if (sizeof(Torus) == 8) {
cuda_lwe_expand_64(streams[0], gpu_indexes[0], expanded_lwes,
lwe_flattened_compact_array_in, lwe_dimension, num_lwes,
d_lwe_compact_input_indexes, d_body_id_per_compact_list);
(void *)d_lwe_compact_input_indexes,
(void *)d_body_id_per_compact_list);
} else
PANIC("Cuda error: expand is only supported on 64 bits")
@@ -94,7 +95,7 @@ __host__ uint64_t scratch_cuda_expand_without_verification(
*mem_ptr = new zk_expand_mem<Torus>(
streams, gpu_indexes, gpu_count, computing_params, casting_params,
casting_key_type, num_lwes_per_compact_list, is_boolean_array,
num_compact_lists, allocate_gpu_memory, size_tracker);
num_compact_lists, allocate_gpu_memory, &size_tracker);
return size_tracker;
}

Some files were not shown because too many files have changed in this diff Show More