mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-11 15:48:20 -05:00
Compare commits
62 Commits
jb/tmp
...
gitbook-ed
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bddc35459d | ||
|
|
27c421b359 | ||
|
|
2adeff44f3 | ||
|
|
d0042aed54 | ||
|
|
5eabdeab55 | ||
|
|
0152c212af | ||
|
|
9a2c4a3784 | ||
|
|
c14aad5656 | ||
|
|
702e0ef306 | ||
|
|
515d2e009f | ||
|
|
711b5151dc | ||
|
|
ceaee2f910 | ||
|
|
41015db7a1 | ||
|
|
485b2a7693 | ||
|
|
7d903d5f7a | ||
|
|
19ac6eb123 | ||
|
|
5b653864b7 | ||
|
|
a1d189b415 | ||
|
|
c59434f183 | ||
|
|
83239e6afa | ||
|
|
ef8cb0273f | ||
|
|
9b353bac2d | ||
|
|
46d65f1f87 | ||
|
|
a63a2cb725 | ||
|
|
c45af05ec6 | ||
|
|
584eaeb4ed | ||
|
|
8d94ed2512 | ||
|
|
b8d9dbe85b | ||
|
|
ad25340c33 | ||
|
|
ad1ae0c8c2 | ||
|
|
ee40906b8b | ||
|
|
bf6b4cc541 | ||
|
|
24404567a4 | ||
|
|
052dd4a60e | ||
|
|
f8d829d076 | ||
|
|
d9761ca17e | ||
|
|
8d2e15347b | ||
|
|
a368257bc7 | ||
|
|
76d23d0c91 | ||
|
|
ddc5002232 | ||
|
|
c08c479616 | ||
|
|
f26afc16de | ||
|
|
13f533f6fb | ||
|
|
d9541e472b | ||
|
|
3453e45258 | ||
|
|
55de96f046 | ||
|
|
9747c06f6e | ||
|
|
00f72d2c13 | ||
|
|
01f5cb9056 | ||
|
|
d66e313fa4 | ||
|
|
c9d530e642 | ||
|
|
6c2096fe52 | ||
|
|
1e94134dda | ||
|
|
c76a60111c | ||
|
|
18ff400df2 | ||
|
|
3d31d09be5 | ||
|
|
76322606f2 | ||
|
|
bf58a9f0c6 | ||
|
|
64461c82b4 | ||
|
|
339c84fbd9 | ||
|
|
bc682a5ffb | ||
|
|
2920daf2d9 |
2
.github/ISSUE_TEMPLATE/bug_report.md
vendored
2
.github/ISSUE_TEMPLATE/bug_report.md
vendored
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: Bug report
|
||||
about: Report a problem with TFHE-rs
|
||||
about: Report a problem with concrete
|
||||
title: ''
|
||||
labels: triage_required
|
||||
assignees: ''
|
||||
|
||||
2
.github/ISSUE_TEMPLATE/feature_request.md
vendored
2
.github/ISSUE_TEMPLATE/feature_request.md
vendored
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: Feature request
|
||||
about: Suggest an idea for TFHE-rs
|
||||
about: Suggest an idea for concrete
|
||||
title: ''
|
||||
labels: feature_request
|
||||
assignees: ''
|
||||
|
||||
9
.github/actionlint.yaml
vendored
9
.github/actionlint.yaml
vendored
@@ -1,9 +0,0 @@
|
||||
self-hosted-runner:
|
||||
# Labels of self-hosted runner in array of strings.
|
||||
labels:
|
||||
- m1mac
|
||||
- 4090-desktop
|
||||
# Configuration variables in array of strings defined in your repository or
|
||||
# organization. `null` means disabling configuration variables check.
|
||||
# Empty array means no configuration variable is allowed.
|
||||
config-variables: null
|
||||
34
.github/workflows/approve_label.yml
vendored
34
.github/workflows/approve_label.yml
vendored
@@ -1,34 +0,0 @@
|
||||
# Manage approved label in pull request
|
||||
name: PR approved label manager
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
pull_request_review:
|
||||
types: [submitted]
|
||||
|
||||
jobs:
|
||||
trigger-tests:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: write
|
||||
steps:
|
||||
- name: Get current labels
|
||||
uses: snnaplab/get-labels-action@f426df40304808ace3b5282d4f036515f7609576
|
||||
|
||||
# Remove label if a push is performed after an approval
|
||||
- name: Remove approved label
|
||||
if: ${{ github.event_name == 'pull_request' && contains(fromJSON(env.LABELS), 'approved') }}
|
||||
uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
|
||||
with:
|
||||
# We use a PAT to have the same user (zama-bot) for label deletion as for creation.
|
||||
github_token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
|
||||
labels: approved
|
||||
|
||||
# Add label only if the review is approved and if the label doesn't already exist
|
||||
- name: Add approved label
|
||||
uses: actions-ecosystem/action-add-labels@18f1af5e3544586314bbe15c0273249c770b2daf
|
||||
if: ${{ github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && !contains(fromJSON(env.LABELS), 'approved') }}
|
||||
with:
|
||||
# We need to use a PAT to be able to trigger `labeled` event for the other workflow.
|
||||
github_token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
|
||||
labels: approved
|
||||
95
.github/workflows/aws_tfhe_fast_tests.yml
vendored
95
.github/workflows/aws_tfhe_fast_tests.yml
vendored
@@ -6,46 +6,56 @@ env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
# All the inputs are provided by Slab
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "AWS instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "AWS instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "AWS instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: 'Slab request ID'
|
||||
type: string
|
||||
fork_repo:
|
||||
description: 'Name of forked repo as user/repo'
|
||||
type: string
|
||||
fork_git_sha:
|
||||
description: 'Git SHA to checkout from fork'
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
setup-ec2:
|
||||
name: Setup EC2 instance (fast-tests)
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
|
||||
aws-region: ${{ steps.start-instance.outputs.aws-region }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
profile: cpu-big
|
||||
|
||||
fast-tests:
|
||||
name: Fast CPU tests
|
||||
needs: setup-ec2
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
runs-on: ${{ inputs.runner_name }}
|
||||
steps:
|
||||
# Step used for log purpose.
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "ID: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
echo "Fork repo: ${{ inputs.fork_repo }}"
|
||||
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: ${{ inputs.fork_repo }}
|
||||
ref: ${{ inputs.fork_git_sha }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
@@ -110,29 +120,8 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Fast AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-ec2:
|
||||
name: Teardown EC2 instance (fast-tests)
|
||||
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
|
||||
needs: [ setup-ec2, fast-tests ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
region: ${{ needs.setup-ec2.outputs.aws-region }}
|
||||
label: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "EC2 teardown (fast-tests) failed. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
75
.github/workflows/aws_tfhe_gpu_4090_tests.yml
vendored
75
.github/workflows/aws_tfhe_gpu_4090_tests.yml
vendored
@@ -1,75 +0,0 @@
|
||||
# Compile and test tfhe-cuda-backend on an RTX 4090 machine
|
||||
name: TFHE Cuda Backend - 4090 full tests
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [labeled]
|
||||
|
||||
jobs:
|
||||
cuda-tests-linux:
|
||||
name: CUDA tests (RTX 4090)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, '4090_test') }}
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ["self-hosted", "4090-desktop"]
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Run fmt checks
|
||||
run: |
|
||||
make check_fmt_gpu
|
||||
|
||||
- name: Run clippy checks
|
||||
run: |
|
||||
make pcc_gpu
|
||||
|
||||
- name: Run core crypto, integer and internal CUDA backend tests
|
||||
run: |
|
||||
make test_gpu
|
||||
|
||||
- name: Run user docs tests
|
||||
run: |
|
||||
make test_user_doc_gpu
|
||||
|
||||
- name: Test C API
|
||||
run: |
|
||||
make test_c_api_gpu
|
||||
|
||||
- name: Run High Level API Tests
|
||||
run: |
|
||||
make test_high_level_api_gpu
|
||||
|
||||
- uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
|
||||
if: ${{ github.event_name == 'pull_request' }}
|
||||
with:
|
||||
labels: 4090_test
|
||||
github_token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "CUDA RTX 4090 tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
128
.github/workflows/aws_tfhe_gpu_tests.yml
vendored
128
.github/workflows/aws_tfhe_gpu_tests.yml
vendored
@@ -1,48 +1,46 @@
|
||||
# Compile and test tfhe-cuda-backend on an AWS instance
|
||||
name: TFHE Cuda Backend - Full tests
|
||||
# Compile and test Concrete-cuda on an AWS instance
|
||||
name: Concrete Cuda - Full tests
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
# All the inputs are provided by Slab
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "AWS instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "AWS instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "AWS instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: 'Slab request ID'
|
||||
type: string
|
||||
fork_repo:
|
||||
description: 'Name of forked repo as user/repo'
|
||||
type: string
|
||||
fork_git_sha:
|
||||
description: 'Git SHA to checkout from fork'
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
setup-ec2:
|
||||
name: Setup EC2 instance (cuda-tests)
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
|
||||
aws-region: ${{ steps.start-instance.outputs.aws-region }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
profile: gpu-test
|
||||
|
||||
cuda-tests-linux:
|
||||
name: CUDA tests
|
||||
needs: setup-ec2
|
||||
run-cuda-tests-linux:
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
group: tfhe_cuda_backend_test-${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
name: Test code in EC2
|
||||
runs-on: ${{ inputs.runner_name }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
@@ -55,8 +53,21 @@ jobs:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
|
||||
steps:
|
||||
# Step used for log purpose.
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "ID: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
echo "Fork repo: ${{ inputs.fork_repo }}"
|
||||
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: ${{ inputs.fork_repo }}
|
||||
ref: ${{ inputs.fork_git_sha }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
@@ -79,22 +90,16 @@ jobs:
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run fmt checks
|
||||
run: |
|
||||
make check_fmt_gpu
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run clippy checks
|
||||
run: |
|
||||
make pcc_gpu
|
||||
make clippy_gpu
|
||||
|
||||
- name: Run core crypto, integer and internal CUDA backend tests
|
||||
- name: Run all tests
|
||||
run: |
|
||||
make test_gpu
|
||||
|
||||
@@ -105,40 +110,3 @@ jobs:
|
||||
- name: Test C API
|
||||
run: |
|
||||
make test_c_api_gpu
|
||||
|
||||
- name: Run High Level API Tests
|
||||
run: |
|
||||
make test_high_level_api_gpu
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "CUDA AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-ec2:
|
||||
name: Teardown EC2 instance (cuda-tests)
|
||||
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
|
||||
needs: [ setup-ec2, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
region: ${{ needs.setup-ec2.outputs.aws-region }}
|
||||
label: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "EC2 teardown (cuda-tests) failed. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
101
.github/workflows/aws_tfhe_integer_tests.yml
vendored
101
.github/workflows/aws_tfhe_integer_tests.yml
vendored
@@ -5,48 +5,56 @@ env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
# All the inputs are provided by Slab
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "AWS instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "AWS instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "AWS instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
fork_repo:
|
||||
description: "Name of forked repo as user/repo"
|
||||
type: string
|
||||
fork_git_sha:
|
||||
description: "Git SHA to checkout from fork"
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
setup-ec2:
|
||||
name: Setup EC2 instance (unsigned-integer-tests)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
|
||||
aws-region: ${{ steps.start-instance.outputs.aws-region }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
profile: cpu-big
|
||||
|
||||
unsigned-integer-tests:
|
||||
name: Unsigned integer tests
|
||||
needs: setup-ec2
|
||||
integer-tests:
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
runs-on: ${{ inputs.runner_name }}
|
||||
steps:
|
||||
# Step used for log purpose.
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "ID: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
echo "Fork repo: ${{ inputs.fork_repo }}"
|
||||
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: ${{ inputs.fork_repo }}
|
||||
ref: ${{ inputs.fork_git_sha }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
@@ -79,29 +87,8 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Unsigned Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-ec2:
|
||||
name: Teardown EC2 instance (unsigned-integer-tests)
|
||||
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
|
||||
needs: [ setup-ec2, unsigned-integer-tests ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
region: ${{ needs.setup-ec2.outputs.aws-region }}
|
||||
label: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "EC2 teardown (unsigned-integer-tests) failed. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
101
.github/workflows/aws_tfhe_signed_integer_tests.yml
vendored
101
.github/workflows/aws_tfhe_signed_integer_tests.yml
vendored
@@ -5,48 +5,56 @@ env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
# All the inputs are provided by Slab
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "AWS instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "AWS instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "AWS instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
fork_repo:
|
||||
description: "Name of forked repo as user/repo"
|
||||
type: string
|
||||
fork_git_sha:
|
||||
description: "Git SHA to checkout from fork"
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
setup-ec2:
|
||||
name: Setup EC2 instance (signed-integer-tests)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
|
||||
aws-region: ${{ steps.start-instance.outputs.aws-region }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
profile: cpu-big
|
||||
|
||||
signed-integer-tests:
|
||||
name: Signed integer tests
|
||||
needs: setup-ec2
|
||||
multi-bit-tests:
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
runs-on: ${{ inputs.runner_name }}
|
||||
steps:
|
||||
# Step used for log purpose.
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "ID: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
echo "Fork repo: ${{ inputs.fork_repo }}"
|
||||
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: ${{ inputs.fork_repo }}
|
||||
ref: ${{ inputs.fork_git_sha }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
@@ -83,29 +91,8 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Signed Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-ec2:
|
||||
name: Teardown EC2 instance (signed-integer-tests)
|
||||
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
|
||||
needs: [ setup-ec2, signed-integer-tests ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
region: ${{ needs.setup-ec2.outputs.aws-region }}
|
||||
label: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "EC2 teardown (signed-integer-tests) failed. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Shortint tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
101
.github/workflows/aws_tfhe_tests.yml
vendored
101
.github/workflows/aws_tfhe_tests.yml
vendored
@@ -5,48 +5,56 @@ env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
# All the inputs are provided by Slab
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "AWS instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "AWS instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "AWS instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: 'Slab request ID'
|
||||
type: string
|
||||
fork_repo:
|
||||
description: 'Name of forked repo as user/repo'
|
||||
type: string
|
||||
fork_git_sha:
|
||||
description: 'Git SHA to checkout from fork'
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
setup-ec2:
|
||||
name: Setup EC2 instance (cpu-tests)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
|
||||
aws-region: ${{ steps.start-instance.outputs.aws-region }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
profile: cpu-big
|
||||
|
||||
cpu-tests:
|
||||
name: CPU tests
|
||||
needs: setup-ec2
|
||||
shortint-tests:
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
runs-on: ${{ inputs.runner_name }}
|
||||
steps:
|
||||
# Step used for log purpose.
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "ID: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
echo "Fork repo: ${{ inputs.fork_repo }}"
|
||||
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: ${{ inputs.fork_repo }}
|
||||
ref: ${{ inputs.fork_git_sha }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
@@ -105,29 +113,8 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "CPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-ec2:
|
||||
name: Teardown EC2 instance (cpu-tests)
|
||||
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
|
||||
needs: [ setup-ec2, cpu-tests ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
region: ${{ needs.setup-ec2.outputs.aws-region }}
|
||||
label: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "EC2 teardown (cpu-tests) failed. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Shortint tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
97
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
97
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
@@ -5,48 +5,56 @@ env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
# All the inputs are provided by Slab
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "AWS instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "AWS instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "AWS instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: 'Slab request ID'
|
||||
type: string
|
||||
fork_repo:
|
||||
description: 'Name of forked repo as user/repo'
|
||||
type: string
|
||||
fork_git_sha:
|
||||
description: 'Git SHA to checkout from fork'
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
setup-ec2:
|
||||
name: Setup EC2 instance (wasm-tests)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
|
||||
aws-region: ${{ steps.start-instance.outputs.aws-region }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
profile: cpu-small
|
||||
|
||||
wasm-tests:
|
||||
name: WASM tests
|
||||
needs: setup-ec2
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
runs-on: ${{ inputs.runner_name }}
|
||||
steps:
|
||||
# Step used for logging purposes.
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "ID: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
echo "Fork repo: ${{ inputs.fork_repo }}"
|
||||
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: ${{ inputs.fork_repo }}
|
||||
ref: ${{ inputs.fork_git_sha }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
@@ -72,29 +80,8 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "WASM tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-ec2:
|
||||
name: Teardown EC2 instance (wasm-tests)
|
||||
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
|
||||
needs: [ setup-ec2, wasm-tests ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
region: ${{ needs.setup-ec2.outputs.aws-region }}
|
||||
label: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "EC2 teardown (wasm-tests) failed. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
5
.github/workflows/boolean_benchmark.yml
vendored
5
.github/workflows/boolean_benchmark.yml
vendored
@@ -33,7 +33,6 @@ env:
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-boolean-benchmarks:
|
||||
@@ -69,7 +68,7 @@ jobs:
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make bench_boolean
|
||||
make AVX512_SUPPORT=ON bench_boolean
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
@@ -97,7 +96,7 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_boolean
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
5
.github/workflows/cargo_build.yml
vendored
5
.github/workflows/cargo_build.yml
vendored
@@ -7,7 +7,6 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref }}
|
||||
@@ -68,9 +67,5 @@ jobs:
|
||||
run: |
|
||||
make build_c_api
|
||||
|
||||
- name: Build coverage tests
|
||||
run: |
|
||||
make build_tfhe_coverage
|
||||
|
||||
# The wasm build check is a bit annoying to set up here and is done during the tests in
|
||||
# aws_tfhe_tests.yml
|
||||
|
||||
27
.github/workflows/ci_lint.yml
vendored
27
.github/workflows/ci_lint.yml
vendored
@@ -1,27 +0,0 @@
|
||||
# Lint and check CI
|
||||
name: CI Lint and Checks
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
|
||||
env:
|
||||
ACTIONLINT_VERSION: 1.6.27
|
||||
|
||||
jobs:
|
||||
lint-check:
|
||||
name: Lint and checks
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
|
||||
- name: Get actionlint
|
||||
run: |
|
||||
bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash) ${{ env.ACTIONLINT_VERSION }}
|
||||
echo "f2ee6d561ce00fa93aab62a7791c1a0396ec7e8876b2a8f2057475816c550782 actionlint" > checksum
|
||||
sha256sum -c checksum
|
||||
ln -s "$(pwd)/actionlint" /usr/local/bin/
|
||||
|
||||
- name: Lint workflows
|
||||
run: |
|
||||
make lint_workflow
|
||||
21
.github/workflows/code_coverage.yml
vendored
21
.github/workflows/code_coverage.yml
vendored
@@ -5,7 +5,6 @@ env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
@@ -40,7 +39,7 @@ jobs:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ inputs.runner_name }}
|
||||
timeout-minutes: 11520 # 8 days
|
||||
timeout-minutes: 1080
|
||||
steps:
|
||||
# Step used for logging purposes.
|
||||
- name: Instance configuration used
|
||||
@@ -69,7 +68,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@aa08304bd477b800d468db44fe10f6c61f7f7b11
|
||||
uses: tj-actions/changed-files@90a06d6ba9543371ab4df8eeca0be07ca6054959
|
||||
with:
|
||||
files_yaml: |
|
||||
tfhe:
|
||||
@@ -99,7 +98,7 @@ jobs:
|
||||
make test_shortint_cov
|
||||
|
||||
- name: Upload tfhe coverage to Codecov
|
||||
uses: codecov/codecov-action@54bcd8715eee62d40e33596ef5e8f0f48dbbccab
|
||||
uses: codecov/codecov-action@4fe8c5f003fae66aa5ebb77cfd3e7bfbbda0b6b0
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
@@ -107,20 +106,6 @@ jobs:
|
||||
fail_ci_if_error: true
|
||||
files: shortint/cobertura.xml,boolean/cobertura.xml,core_crypto/cobertura.xml,core_crypto_avx512/cobertura.xml
|
||||
|
||||
- name: Run integer coverage
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
run: |
|
||||
make test_integer_cov
|
||||
|
||||
- name: Upload tfhe coverage to Codecov
|
||||
uses: codecov/codecov-action@54bcd8715eee62d40e33596ef5e8f0f48dbbccab
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
directory: ./coverage/
|
||||
fail_ci_if_error: true
|
||||
files: integer/cobertura.xml
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
|
||||
74
.github/workflows/csprng_randomness_testing.yml
vendored
Normal file
74
.github/workflows/csprng_randomness_testing.yml
vendored
Normal file
@@ -0,0 +1,74 @@
|
||||
name: CSPRNG randomness testing Workflow
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
# All the inputs are provided by Slab
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "AWS instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "AWS instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "AWS instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: 'Slab request ID'
|
||||
type: string
|
||||
fork_repo:
|
||||
description: 'Name of forked repo as user/repo'
|
||||
type: string
|
||||
fork_git_sha:
|
||||
description: 'Git SHA to checkout from fork'
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
csprng-randomness-teting:
|
||||
name: CSPRNG randomness testing
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ inputs.runner_name }}
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: ${{ inputs.fork_repo }}
|
||||
ref: ${{ inputs.fork_git_sha }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Dieharder randomness test suite
|
||||
run: |
|
||||
make dieharder_csprng
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "concrete-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
96
.github/workflows/csprng_randomness_tests.yml
vendored
96
.github/workflows/csprng_randomness_tests.yml
vendored
@@ -1,96 +0,0 @@
|
||||
name: CSPRNG randomness testing Workflow
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
|
||||
|
||||
jobs:
|
||||
setup-ec2:
|
||||
name: Setup EC2 instance (csprng-randomness-tests)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
|
||||
aws-region: ${{ steps.start-instance.outputs.aws-region }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
profile: cpu-small
|
||||
|
||||
csprng-randomness-tests:
|
||||
name: CSPRNG randomness tests
|
||||
needs: setup-ec2
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Dieharder randomness test suite
|
||||
run: |
|
||||
make dieharder_csprng
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "concrete-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-ec2:
|
||||
name: Teardown EC2 instance (csprng-randomness-tests)
|
||||
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
|
||||
needs: [ setup-ec2, csprng-randomness-tests ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
region: ${{ needs.setup-ec2.outputs.aws-region }}
|
||||
label: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "EC2 teardown (csprng-randomness-tests) failed. (${{ env.ACTION_RUN_URL }})"
|
||||
202
.github/workflows/gpu_4090_full_benchmark.yml
vendored
202
.github/workflows/gpu_4090_full_benchmark.yml
vendored
@@ -1,202 +0,0 @@
|
||||
# Run all benchmarks on an RTX 4090 machine and return parsed results to Slab CI bot.
|
||||
name: TFHE Cuda Backend - 4090 full benchmarks
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [labeled]
|
||||
schedule:
|
||||
# Weekly benchmarks will be triggered each Friday at 9p.m.
|
||||
- cron: "0 21 * * 5"
|
||||
|
||||
jobs:
|
||||
cuda-integer-benchmarks:
|
||||
name: Cuda integer benchmarks for all operations flavor (RTX 4090)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_cuda_integer_bench
|
||||
cancel-in-progress: true
|
||||
runs-on: ["self-hosted", "4090-desktop"]
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
command: [integer, integer_multi_bit]
|
||||
op_flavor: [default, unchecked]
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Run integer benchmarks
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware "rtx4090" \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Integer RTX 4090 full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
cuda-core-crypto-benchmarks:
|
||||
name: Cuda core crypto benchmarks (RTX 4090)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
|
||||
needs: cuda-integer-benchmarks
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_cuda_core_crypto_bench
|
||||
cancel-in-progress: true
|
||||
runs-on: ["self-hosted", "4090-desktop"]
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Run integer benchmarks
|
||||
run: |
|
||||
make bench_pbs_gpu
|
||||
make bench_ks_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware "rtx4090" \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_core_crypto
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ !success() && !cancelled() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Core crypto RTX 4090 full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
remove_github_label:
|
||||
name: Remove 4090 bench label
|
||||
if: ${{ github.event_name == 'pull_request' }}
|
||||
needs: [cuda-integer-benchmarks, cuda-core-crypto-benchmarks]
|
||||
runs-on: ["self-hosted", "4090-desktop"]
|
||||
steps:
|
||||
- uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
|
||||
with:
|
||||
labels: 4090_bench
|
||||
github_token: ${{ secrets.GITHUB_TOKEN }}
|
||||
7
.github/workflows/integer_benchmark.yml
vendored
7
.github/workflows/integer_benchmark.yml
vendored
@@ -26,7 +26,6 @@ env:
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-integer-benchmarks:
|
||||
@@ -62,7 +61,7 @@ jobs:
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make FAST_BENCH=TRUE bench_integer
|
||||
make AVX512_SUPPORT=ON FAST_BENCH=TRUE bench_integer
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
@@ -70,7 +69,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -91,7 +90,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
19
.github/workflows/integer_full_benchmark.yml
vendored
19
.github/workflows/integer_full_benchmark.yml
vendored
@@ -29,7 +29,6 @@ env:
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
prepare-matrix:
|
||||
@@ -41,17 +40,17 @@ jobs:
|
||||
- name: Weekly benchmarks
|
||||
if: ${{ github.event.inputs.user_inputs == 'weekly_benchmarks' }}
|
||||
run: |
|
||||
echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
|
||||
echo "OP_FLAVOR=[\"default\"]" >> ${GITHUB_ENV}
|
||||
|
||||
- name: Quarterly benchmarks
|
||||
if: ${{ github.event.inputs.user_inputs == 'quarterly_benchmarks' }}
|
||||
run: |
|
||||
echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> "${GITHUB_ENV}"
|
||||
echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> ${GITHUB_ENV}
|
||||
|
||||
- name: Set operation flavor output
|
||||
id: set_op_flavor
|
||||
run: |
|
||||
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
|
||||
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> ${GITHUB_OUTPUT}
|
||||
|
||||
integer-benchmarks:
|
||||
name: Execute integer benchmarks for all operations flavor
|
||||
@@ -80,11 +79,9 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step requires root user to have a HOME directory, which is not set.
|
||||
@@ -105,7 +102,7 @@ jobs:
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}
|
||||
make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
@@ -121,7 +118,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
26
.github/workflows/integer_gpu_benchmark.yml
vendored
26
.github/workflows/integer_gpu_benchmark.yml
vendored
@@ -26,7 +26,6 @@ env:
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-integer-benchmarks:
|
||||
@@ -73,26 +72,23 @@ jobs:
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CUDA_PATH=$CUDA_PATH";
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_gpu
|
||||
make AVX512_SUPPORT=ON FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_gpu
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
@@ -100,7 +96,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -122,7 +118,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -149,7 +145,7 @@ jobs:
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ !success() && !cancelled() }}
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
|
||||
37
.github/workflows/integer_gpu_full_benchmark.yml
vendored
37
.github/workflows/integer_gpu_full_benchmark.yml
vendored
@@ -33,21 +33,19 @@ env:
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
integer-benchmarks:
|
||||
name: Execute integer benchmarks for all operations flavor
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
if: ${{ !cancelled() }}
|
||||
continue-on-error: true
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
command: [integer, integer_multi_bit]
|
||||
op_flavor: [default, unchecked]
|
||||
command: [ integer, integer_multi_bit]
|
||||
op_flavor: [ default, unchecked ]
|
||||
# explicit include-based build matrix, of known valid options
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
@@ -70,11 +68,9 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
@@ -89,22 +85,19 @@ jobs:
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CUDA_PATH=$CUDA_PATH";
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
@@ -115,7 +108,7 @@ jobs:
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
|
||||
make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
@@ -132,7 +125,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -154,7 +147,7 @@ jobs:
|
||||
slack-notification:
|
||||
name: Slack Notification
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !success() && !cancelled() }}
|
||||
if: ${{ failure() }}
|
||||
needs: integer-benchmarks
|
||||
steps:
|
||||
- name: Notify
|
||||
|
||||
@@ -26,7 +26,6 @@ env:
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-integer-benchmarks:
|
||||
@@ -62,7 +61,7 @@ jobs:
|
||||
|
||||
- name: Run multi-bit benchmarks with AVX512
|
||||
run: |
|
||||
make FAST_BENCH=TRUE bench_integer_multi_bit
|
||||
make AVX512_SUPPORT=ON FAST_BENCH=TRUE bench_integer_multi_bit
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
@@ -70,7 +69,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -91,7 +90,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# Run integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: Integer GPU Multi-bit benchmarks
|
||||
name: Integer Multi-bit benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
@@ -26,13 +26,11 @@ env:
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
cuda-integer-benchmarks:
|
||||
run-integer-benchmarks:
|
||||
name: Execute integer multi-bit benchmarks in EC2
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
if: ${{ !cancelled() }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -40,7 +38,8 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
cuda: "11.8"
|
||||
cuda_arch: "70"
|
||||
gcc: 9
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
@@ -74,26 +73,23 @@ jobs:
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CUDA_PATH=$CUDA_PATH";
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run multi-bit benchmarks with AVX512
|
||||
run: |
|
||||
make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu
|
||||
make AVX512_SUPPORT=ON FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
@@ -101,7 +97,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -123,7 +119,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -150,7 +146,7 @@ jobs:
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ !success() && !cancelled() }}
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
|
||||
3
.github/workflows/m1_tests.yml
vendored
3
.github/workflows/m1_tests.yml
vendored
@@ -15,7 +15,6 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
FAST_TESTS: "TRUE"
|
||||
|
||||
@@ -27,8 +26,6 @@ jobs:
|
||||
cargo-builds:
|
||||
if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'm1_test') }}
|
||||
runs-on: ["self-hosted", "m1mac"]
|
||||
# 12 hours, default is 6 hours, hopefully this is more than enough
|
||||
timeout-minutes: 720
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
|
||||
1
.github/workflows/parameters_check.yml
vendored
1
.github/workflows/parameters_check.yml
vendored
@@ -24,7 +24,6 @@ jobs:
|
||||
with:
|
||||
repository: malb/lattice-estimator
|
||||
path: lattice_estimator
|
||||
ref: '53508253629d3b5d31a2ad110e85dc69391ccb95'
|
||||
|
||||
- name: Install Sage
|
||||
run: |
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# Run core crypto benchmarks on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: Core crypto benchmarks
|
||||
# Run PBS benchmarks on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: PBS benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
@@ -33,11 +33,10 @@ env:
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-core-crypto-benchmarks:
|
||||
name: Execute core crypto benchmarks in EC2
|
||||
run-pbs-benchmarks:
|
||||
name: Execute PBS benchmarks in EC2
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
steps:
|
||||
@@ -69,8 +68,7 @@ jobs:
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make bench_pbs
|
||||
make bench_ks
|
||||
make AVX512_SUPPORT=ON bench_pbs
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
@@ -88,9 +86,9 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_core_crypto
|
||||
name: ${{ github.sha }}_pbs
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
@@ -1,5 +1,5 @@
|
||||
# Run core crypto benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
|
||||
name: Core crypto GPU benchmarks
|
||||
# Run PBS benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
|
||||
name: PBS GPU benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
@@ -34,20 +34,10 @@ env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
|
||||
jobs:
|
||||
run-core-crypto-benchmarks:
|
||||
name: Execute GPU core crypto benchmarks in EC2
|
||||
run-pbs-benchmarks:
|
||||
name: Execute PBS benchmarks in EC2
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
@@ -78,28 +68,23 @@ jobs:
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CUDA_PATH=$CUDA_PATH";
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make bench_pbs_gpu
|
||||
make bench_ks_gpu
|
||||
make AVX512_SUPPORT=ON bench_pbs_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
@@ -118,9 +103,9 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_core_crypto
|
||||
name: ${{ github.sha }}_pbs
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
5
.github/workflows/shortint_benchmark.yml
vendored
5
.github/workflows/shortint_benchmark.yml
vendored
@@ -25,7 +25,6 @@ env:
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-shortint-benchmarks:
|
||||
@@ -61,7 +60,7 @@ jobs:
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make bench_shortint
|
||||
make AVX512_SUPPORT=ON bench_shortint
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
@@ -89,7 +88,7 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_shortint
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
13
.github/workflows/shortint_full_benchmark.yml
vendored
13
.github/workflows/shortint_full_benchmark.yml
vendored
@@ -33,7 +33,6 @@ env:
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
shortint-benchmarks:
|
||||
@@ -59,11 +58,9 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
@@ -84,7 +81,7 @@ jobs:
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_shortint
|
||||
make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_shortint
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
@@ -115,7 +112,7 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
@@ -26,7 +26,6 @@ env:
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-integer-benchmarks:
|
||||
@@ -62,7 +61,7 @@ jobs:
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make FAST_BENCH=TRUE bench_signed_integer
|
||||
make AVX512_SUPPORT=ON FAST_BENCH=TRUE bench_signed_integer
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
@@ -70,7 +69,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -91,7 +90,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
@@ -29,7 +29,6 @@ env:
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
integer-benchmarks:
|
||||
@@ -58,11 +57,9 @@ jobs:
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
@@ -83,7 +80,7 @@ jobs:
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_signed_${{ matrix.command }}
|
||||
make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_signed_${{ matrix.command }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
@@ -99,7 +96,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
@@ -26,7 +26,6 @@ env:
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-integer-benchmarks:
|
||||
@@ -62,7 +61,7 @@ jobs:
|
||||
|
||||
- name: Run multi-bit benchmarks with AVX512
|
||||
run: |
|
||||
make FAST_BENCH=TRUE bench_signed_integer_multi_bit
|
||||
make AVX512_SUPPORT=ON FAST_BENCH=TRUE bench_signed_integer_multi_bit
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
@@ -70,7 +69,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -91,7 +90,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
16
.github/workflows/start_benchmarks.yml
vendored
16
.github/workflows/start_benchmarks.yml
vendored
@@ -32,12 +32,12 @@ on:
|
||||
description: "Run signed integer multi bit benches"
|
||||
type: boolean
|
||||
default: true
|
||||
core_crypto_bench:
|
||||
description: "Run core crypto benches"
|
||||
pbs_bench:
|
||||
description: "Run PBS benches"
|
||||
type: boolean
|
||||
default: true
|
||||
core_crypto_gpu_bench:
|
||||
description: "Run core crypto benches on GPU"
|
||||
pbs_gpu_bench:
|
||||
description: "Run PBS benches on GPU"
|
||||
type: boolean
|
||||
default: true
|
||||
wasm_client_bench:
|
||||
@@ -54,7 +54,7 @@ jobs:
|
||||
integer_bench, integer_multi_bit_bench,
|
||||
signed_integer_bench, signed_integer_multi_bit_bench,
|
||||
integer_gpu_bench, integer_multi_bit_gpu_bench,
|
||||
core_crypto_bench, core_crypto_gpu_bench, wasm_client_bench ]
|
||||
pbs_bench, pbs_gpu_bench, wasm_client_bench ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
@@ -64,7 +64,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@aa08304bd477b800d468db44fe10f6c61f7f7b11
|
||||
uses: tj-actions/changed-files@90a06d6ba9543371ab4df8eeca0be07ca6054959
|
||||
with:
|
||||
files_yaml: |
|
||||
common_benches:
|
||||
@@ -102,10 +102,10 @@ jobs:
|
||||
- tfhe/src/integer/**
|
||||
- tfhe/benches/integer/signed_bench.rs
|
||||
- .github/workflows/signed_integer_multi_bit_benchmark.yml
|
||||
core_crypto_bench:
|
||||
pbs_bench:
|
||||
- tfhe/src/core_crypto/**
|
||||
- tfhe/benches/core_crypto/**
|
||||
- .github/workflows/core_crypto_benchmark.yml
|
||||
- .github/workflows/pbs_benchmark.yml
|
||||
wasm_client_bench:
|
||||
- tfhe/web_wasm_parallel_tests/**
|
||||
- .github/workflows/wasm_client_benchmark.yml
|
||||
|
||||
2
.github/workflows/start_full_benchmarks.yml
vendored
2
.github/workflows/start_full_benchmarks.yml
vendored
@@ -26,7 +26,7 @@ jobs:
|
||||
matrix:
|
||||
command: [ boolean_bench, shortint_full_bench,
|
||||
integer_full_bench, signed_integer_full_bench, integer_gpu_full_bench,
|
||||
core_crypto_bench, core_crypto_gpu_bench, wasm_client_bench ]
|
||||
pbs_bench, pbs_gpu_bench, wasm_client_bench ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
|
||||
2
.github/workflows/sync_on_push.yml
vendored
2
.github/workflows/sync_on_push.yml
vendored
@@ -17,7 +17,7 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Save repo
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: repo-archive
|
||||
path: '.'
|
||||
|
||||
55
.github/workflows/trigger_aws_tests_on_pr.yml
vendored
Normal file
55
.github/workflows/trigger_aws_tests_on_pr.yml
vendored
Normal file
@@ -0,0 +1,55 @@
|
||||
# Trigger an AWS build each time commits are pushed to a pull request.
|
||||
name: PR AWS build trigger
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
pull_request_review:
|
||||
types: [submitted]
|
||||
|
||||
jobs:
|
||||
trigger-tests:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: write
|
||||
steps:
|
||||
- name: Get current labels
|
||||
uses: snnaplab/get-labels-action@f426df40304808ace3b5282d4f036515f7609576
|
||||
|
||||
- name: Remove approved label
|
||||
if: ${{ github.event_name == 'pull_request' && contains(fromJSON(env.LABELS), 'approved') }}
|
||||
uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
|
||||
with:
|
||||
github_token: ${{ secrets.GITHUB_TOKEN }}
|
||||
labels: approved
|
||||
|
||||
- name: Launch fast tests
|
||||
if: ${{ github.event_name == 'pull_request' }}
|
||||
uses: mshick/add-pr-comment@a65df5f64fc741e91c59b8359a4bc56e57aaf5b1
|
||||
with:
|
||||
allow-repeats: true
|
||||
message: |
|
||||
@slab-ci cpu_fast_test
|
||||
@slab-ci gpu_test
|
||||
|
||||
- name: Add approved label
|
||||
uses: actions-ecosystem/action-add-labels@18f1af5e3544586314bbe15c0273249c770b2daf
|
||||
if: ${{ github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && !contains(fromJSON(env.LABELS), 'approved') }}
|
||||
with:
|
||||
github_token: ${{ secrets.GITHUB_TOKEN }}
|
||||
labels: approved
|
||||
|
||||
# PR label 'approved' presence is checked to avoid running the full test suite several times
|
||||
# in case of multiple approvals without new commits in between.
|
||||
- name: Launch full tests suite
|
||||
if: ${{ github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && !contains(fromJSON(env.LABELS), 'approved') }}
|
||||
uses: mshick/add-pr-comment@a65df5f64fc741e91c59b8359a4bc56e57aaf5b1
|
||||
with:
|
||||
allow-repeats: true
|
||||
message: |
|
||||
Pull Request has been approved :tada:
|
||||
Launching full test suite...
|
||||
@slab-ci cpu_test
|
||||
@slab-ci cpu_unsigned_integer_test
|
||||
@slab-ci cpu_signed_integer_test
|
||||
@slab-ci cpu_wasm_test
|
||||
@slab-ci csprng_randomness_testing
|
||||
3
.github/workflows/wasm_client_benchmark.yml
vendored
3
.github/workflows/wasm_client_benchmark.yml
vendored
@@ -33,7 +33,6 @@ env:
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-wasm-client-benchmarks:
|
||||
@@ -98,7 +97,7 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_wasm
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -19,6 +19,3 @@ dieharder_run.log
|
||||
|
||||
# Coverage reports
|
||||
/coverage/
|
||||
|
||||
# Cuda local build
|
||||
backends/tfhe-cuda-backend/cuda/cmake-build-debug/
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[workspace]
|
||||
resolver = "2"
|
||||
members = ["tfhe", "tasks", "apps/trivium", "concrete-csprng", "backends/tfhe-cuda-backend"]
|
||||
members = ["tfhe", "tasks", "apps/trivium", "concrete-csprng"]
|
||||
|
||||
[profile.bench]
|
||||
lto = "fat"
|
||||
|
||||
131
Makefile
131
Makefile
@@ -61,7 +61,7 @@ REGEX_STRING?=''
|
||||
REGEX_PATTERN?=''
|
||||
|
||||
# tfhe-cuda-backend
|
||||
TFHECUDA_SRC=backends/tfhe-cuda-backend/cuda
|
||||
TFHECUDA_SRC="backends/tfhe-cuda-backend/cuda"
|
||||
TFHECUDA_BUILD=$(TFHECUDA_SRC)/build
|
||||
|
||||
# Exclude these files from coverage reports
|
||||
@@ -144,11 +144,6 @@ check_linelint_installed:
|
||||
@printf "\n" | linelint - > /dev/null 2>&1 || \
|
||||
( echo "Unable to locate linelint. Try installing it: https://github.com/fernandrone/linelint/releases" && exit 1 )
|
||||
|
||||
.PHONY: check_actionlint_installed # Check if actionlint workflow linter is installed
|
||||
check_actionlint_installed:
|
||||
@actionlint --version > /dev/null 2>&1 || \
|
||||
( echo "Unable to locate actionlint. Try installing it: https://github.com/rhysd/actionlint/releases" && exit 1 )
|
||||
|
||||
.PHONY: fmt # Format rust code
|
||||
fmt: install_rs_check_toolchain
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
|
||||
@@ -162,16 +157,10 @@ fmt_gpu: install_rs_check_toolchain
|
||||
check_fmt: install_rs_check_toolchain
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
|
||||
|
||||
.PHONY: check_fmt_gpu # Check rust and cuda code format
|
||||
check_fmt_gpu: install_rs_check_toolchain
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
|
||||
cd "$(TFHECUDA_SRC)" && ./format_tfhe_cuda_backend.sh -c
|
||||
|
||||
.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
|
||||
.PHONY: clippy_gpu # Run clippy lints on the gpu backend
|
||||
clippy_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu \
|
||||
--all-targets \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,shortint,gpu \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
|
||||
@@ -182,10 +171,6 @@ fix_newline: check_linelint_installed
|
||||
check_newline: check_linelint_installed
|
||||
linelint .
|
||||
|
||||
.PHONY: lint_workflow # Run static linter on GitHub workflows
|
||||
lint_workflow: check_actionlint_installed
|
||||
actionlint
|
||||
|
||||
.PHONY: clippy_core # Run clippy lints on core_crypto with and without experimental features
|
||||
clippy_core: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
@@ -267,11 +252,6 @@ clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_triviu
|
||||
clippy_fast: clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core \
|
||||
clippy_concrete_csprng
|
||||
|
||||
.PHONY: clippy_cuda_backend # Run clippy lints on the tfhe-cuda-backend
|
||||
clippy_cuda_backend: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
-p tfhe-cuda-backend -- --no-deps -D warnings
|
||||
|
||||
.PHONY: build_core # Build core_crypto without experimental features
|
||||
build_core: install_rs_build_toolchain install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
@@ -310,11 +290,6 @@ build_tfhe_full: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p $(TFHE_SPEC) --all-targets
|
||||
|
||||
.PHONY: build_tfhe_coverage # Build with test coverage enabled
|
||||
build_tfhe_coverage: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) --tests
|
||||
|
||||
.PHONY: symlink_c_libs_without_fingerprint # Link the .a and .so files without the changing hash part in target
|
||||
symlink_c_libs_without_fingerprint:
|
||||
@./scripts/symlink_c_libs_without_fingerprint.sh \
|
||||
@@ -384,33 +359,25 @@ test_core_crypto_cov: install_rs_build_toolchain install_rs_check_toolchain inst
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
|
||||
--out xml --output-dir coverage/core_crypto --line --engine llvm --timeout 500 \
|
||||
--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,__coverage \
|
||||
-p $(TFHE_SPEC) -- core_crypto::
|
||||
@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
|
||||
--out xml --output-dir coverage/core_crypto_avx512 --line --engine llvm --timeout 500 \
|
||||
--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,$(AVX512_FEATURE) \
|
||||
-p $(TFHE_SPEC) -- -Z unstable-options --report-time core_crypto::; \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,__coverage,$(AVX512_FEATURE) \
|
||||
-p $(TFHE_SPEC) -- core_crypto::; \
|
||||
fi
|
||||
|
||||
.PHONY: test_cuda_backend # Run the internal tests of the CUDA backend
|
||||
test_cuda_backend:
|
||||
mkdir -p "$(TFHECUDA_BUILD)" && \
|
||||
cd "$(TFHECUDA_BUILD)" && \
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Release -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON && \
|
||||
make -j && \
|
||||
make test
|
||||
|
||||
.PHONY: test_gpu # Run the core_crypto and integer tests on the gpu backend
|
||||
test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
|
||||
test_gpu: test_core_crypto_gpu test_integer_gpu
|
||||
|
||||
.PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
|
||||
test_core_crypto_gpu: install_rs_build_toolchain install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
|
||||
|
||||
.PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
|
||||
test_integer_gpu: install_rs_build_toolchain install_rs_check_toolchain
|
||||
@@ -429,8 +396,8 @@ test_boolean_cov: install_rs_check_toolchain install_tarpaulin
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
|
||||
--out xml --output-dir coverage/boolean --line --engine llvm --timeout 500 \
|
||||
$(COVERAGE_EXCLUDED_FILES) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache \
|
||||
-p $(TFHE_SPEC) -- -Z unstable-options --report-time boolean::
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,__coverage \
|
||||
-p $(TFHE_SPEC) -- boolean::
|
||||
|
||||
.PHONY: test_c_api_rs # Run the rust tests for the C API
|
||||
test_c_api_rs: install_rs_check_toolchain
|
||||
@@ -474,8 +441,8 @@ test_shortint_cov: install_rs_check_toolchain install_tarpaulin
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
|
||||
--out xml --output-dir coverage/shortint --line --engine llvm --timeout 500 \
|
||||
$(COVERAGE_EXCLUDED_FILES) \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache \
|
||||
-p $(TFHE_SPEC) -- -Z unstable-options --report-time shortint::
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,__coverage \
|
||||
-p $(TFHE_SPEC) -- shortint::
|
||||
|
||||
.PHONY: test_integer_ci # Run the tests for integer ci
|
||||
test_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
@@ -535,26 +502,12 @@ test_integer: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache -p $(TFHE_SPEC) -- integer::
|
||||
|
||||
.PHONY: test_integer_cov # Run the tests of the integer module with code coverage
|
||||
test_integer_cov: install_rs_check_toolchain install_tarpaulin
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
|
||||
--out xml --output-dir coverage/integer --line --engine llvm --timeout 500 \
|
||||
--implicit-test-threads \
|
||||
--exclude-files $(COVERAGE_EXCLUDED_FILES) \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache \
|
||||
-p $(TFHE_SPEC) -- -Z unstable-options --report-time integer::
|
||||
|
||||
.PHONY: test_high_level_api # Run all the tests for high_level_api
|
||||
test_high_level_api: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) \
|
||||
-- high_level_api::
|
||||
|
||||
test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) \
|
||||
-E "test(/high_level_api::.*gpu.*/)"
|
||||
|
||||
.PHONY: test_user_doc # Run tests from the .md documentation
|
||||
test_user_doc: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
|
||||
@@ -567,12 +520,6 @@ test_user_doc_gpu: install_rs_build_toolchain
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu -p $(TFHE_SPEC) \
|
||||
-- test_user_docs::
|
||||
|
||||
.PHONY: test_fhe_strings # Run tests for fhe_strings example
|
||||
test_fhe_strings: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--example fhe_strings \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer
|
||||
|
||||
.PHONY: test_regex_engine # Run tests for regex_engine example
|
||||
test_regex_engine: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
@@ -641,16 +588,6 @@ check_compile_tests:
|
||||
./scripts/c_api_tests.sh --build-only; \
|
||||
fi
|
||||
|
||||
.PHONY: check_compile_tests_benches_gpu # Build tests in debug without running them
|
||||
check_compile_tests_benches_gpu: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache,gpu \
|
||||
-p $(TFHE_SPEC)
|
||||
mkdir -p "$(TFHECUDA_BUILD)" && \
|
||||
cd "$(TFHECUDA_BUILD)" && \
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Debug -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON -DTFHE_CUDA_BACKEND_BUILD_BENCHMARKS=ON && \
|
||||
make -j
|
||||
|
||||
.PHONY: build_nodejs_test_docker # Build a docker image with tools to run nodejs tests for wasm API
|
||||
build_nodejs_test_docker:
|
||||
DOCKER_BUILDKIT=1 docker build --build-arg RUST_TOOLCHAIN="$(RS_BUILD_TOOLCHAIN)" \
|
||||
@@ -703,21 +640,21 @@ bench_integer: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_signed_integer # Run benchmarks for signed integer
|
||||
bench_signed_integer: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-signed-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
|
||||
bench_integer_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
|
||||
bench_integer_multi_bit: install_rs_check_toolchain
|
||||
@@ -725,7 +662,7 @@ bench_integer_multi_bit: install_rs_check_toolchain
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters
|
||||
bench_signed_integer_multi_bit: install_rs_check_toolchain
|
||||
@@ -733,7 +670,7 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-signed-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters
|
||||
bench_integer_multi_bit_gpu: install_rs_check_toolchain
|
||||
@@ -741,25 +678,25 @@ bench_integer_multi_bit_gpu: install_rs_check_toolchain
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_shortint # Run benchmarks for shortint
|
||||
bench_shortint: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench shortint-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: bench_oprf # Run OPRF benchmarks for shortint and integer
|
||||
bench_oprf: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench oprf-shortint-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
|
||||
RUSTFLAGS="$(RUSTFLAGS)" \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench oprf-integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
|
||||
|
||||
|
||||
|
||||
@@ -769,38 +706,26 @@ bench_shortint_multi_bit: install_rs_check_toolchain
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench shortint-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
|
||||
|
||||
|
||||
.PHONY: bench_boolean # Run benchmarks for boolean
|
||||
bench_boolean: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench boolean-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: bench_pbs # Run benchmarks for PBS
|
||||
bench_pbs: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench pbs-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: bench_pbs_gpu # Run benchmarks for PBS on GPU backend
|
||||
bench_pbs_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench pbs-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: bench_ks # Run benchmarks for keyswitch
|
||||
bench_ks: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench ks-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: bench_ks_gpu # Run benchmarks for keyswitch on GPU backend
|
||||
bench_ks_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench ks-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: bench_web_js_api_parallel # Run benchmarks for the web wasm api
|
||||
bench_web_js_api_parallel: build_web_js_api_parallel
|
||||
@@ -817,7 +742,7 @@ ci_bench_web_js_api_parallel: build_web_js_api_parallel
|
||||
#
|
||||
.PHONY: gen_key_cache # Run the script to generate keys and cache them for shortint tests
|
||||
gen_key_cache: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
|
||||
--example generates_test_keys \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache -- \
|
||||
$(MULTI_BIT_ONLY) $(COVERAGE_ONLY)
|
||||
@@ -893,7 +818,7 @@ sha256_bool: install_rs_check_toolchain
|
||||
pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_all check_compile_tests
|
||||
|
||||
.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
|
||||
pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu
|
||||
pcc_gpu: pcc clippy_gpu
|
||||
|
||||
.PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
|
||||
fpcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_fast check_compile_tests
|
||||
|
||||
199
README.md
199
README.md
@@ -2,66 +2,40 @@
|
||||
<!-- product name logo -->
|
||||
<img width=600 src="https://user-images.githubusercontent.com/5758427/231206749-8f146b97-3c5a-4201-8388-3ffa88580415.png">
|
||||
</p>
|
||||
|
||||
<hr/>
|
||||
<p align="center">
|
||||
<a href="https://docs.zama.ai/tfhe-rs"> 📒 Read documentation</a> | <a href="https://zama.ai/community"> 💛 Community support</a> | <a href="https://github.com/zama-ai/awesome-zama"> 📚 FHE resources</a>
|
||||
</p>
|
||||
<p align="center">
|
||||
<!-- Version badge using shields.io -->
|
||||
<a href="https://github.com/zama-ai/tfhe-rs/releases">
|
||||
<img src="https://img.shields.io/github/v/release/zama-ai/tfhe-rs?style=flat-square">
|
||||
</a>
|
||||
<!-- License badge using shields.io -->
|
||||
<a href="#license">
|
||||
<img src="https://img.shields.io/badge/License-BSD--3--Clause--Clear-orange?style=flat-square">
|
||||
</a>
|
||||
<!-- Zama Bounty Program -->
|
||||
<a href="https://github.com/zama-ai/bounty-program">
|
||||
<img src="https://img.shields.io/badge/Contribute-Zama%20Bounty%20Program-yellow?style=flat-square">
|
||||
</a>
|
||||
</p>
|
||||
<hr/>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://docs.zama.ai/tfhe-rs"> 📒 Documentation</a> | <a href="https://zama.ai/community"> 💛 Community support</a> | <a href="https://github.com/zama-ai/awesome-zama"> 📚 FHE resources by Zama</a>
|
||||
</p>
|
||||
|
||||
**TFHE-rs** is a pure Rust implementation of TFHE for boolean and integer
|
||||
arithmetics over encrypted data. It includes:
|
||||
- a **Rust** API
|
||||
- a **C** API
|
||||
- and a **client-side WASM** API
|
||||
|
||||
<p align="center">
|
||||
<a href="https://github.com/zama-ai/tfhe-rs/releases"><img src="https://img.shields.io/github/v/release/zama-ai/tfhe-rs?style=flat-square"></a>
|
||||
<a href="LICENSE"><img src="https://img.shields.io/badge/License-BSD--3--Clause--Clear-%23ffb243?style=flat-square"></a>
|
||||
<a href="https://github.com/zama-ai/bounty-program"><img src="https://img.shields.io/badge/Contribute-Zama%20Bounty%20Program-%23ffd208?style=flat-square"></a>
|
||||
</p>
|
||||
|
||||
## About
|
||||
|
||||
### What is TFHE-rs
|
||||
|
||||
**TFHE-rs** is a pure Rust implementation of TFHE for boolean and integer arithmetics over encrypted data.
|
||||
|
||||
It includes:
|
||||
- a **Rust** API
|
||||
- a **C** API
|
||||
- and a **client-side WASM** API
|
||||
|
||||
TFHE-rs is designed for developers and researchers who want full control over
|
||||
what they can do with TFHE, while not having to worry about the low-level
|
||||
**TFHE-rs** is meant for developers and researchers who want full control over
|
||||
what they can do with TFHE, while not having to worry about the low-level
|
||||
implementation. The goal is to have a stable, simple, high-performance, and
|
||||
production-ready library for all the advanced features of TFHE.
|
||||
<br></br>
|
||||
|
||||
### Main features
|
||||
|
||||
- **Low-level cryptographic library** that implements Zama’s variant of TFHE, including programmable bootstrapping
|
||||
- **Implementation of the original TFHE boolean API** that can be used as a drop-in replacement for other TFHE libraries
|
||||
- **Short integer API** that enables exact, unbounded FHE integer arithmetics with up to 8 bits of message space
|
||||
- **Size-efficient public key encryption**
|
||||
- **Ciphertext and server key compression** for efficient data transfer
|
||||
- **Full Rust API, C bindings to the Rust High-Level API, and client-side Javascript API using WASM**.
|
||||
|
||||
*Learn more about TFHE-rs features in the [documentation](https://docs.zama.ai/tfhe-rs/readme).*
|
||||
<br></br>
|
||||
|
||||
## Table of Contents
|
||||
- **[Getting Started](#getting-started)**
|
||||
- [Cargo.toml configuration](#cargotoml-configuration)
|
||||
- [A simple example](#a-simple-example)
|
||||
- **[Resources](#resources)**
|
||||
- [TFHE deep dive](#tfhe-deep-dive)
|
||||
- [Tutorials](#tutorials)
|
||||
- [Documentation](#documentation)
|
||||
- **[Working with TFHE-rs](#working-with-tfhe-rs)**
|
||||
- [Disclaimers](#disclaimers)
|
||||
- [Citations](#citations)
|
||||
- [Contributing](#contributing)
|
||||
- [License](#license)
|
||||
- **[Support](#support)**
|
||||
<br></br>
|
||||
|
||||
## Getting Started
|
||||
The steps to run a first example are described below.
|
||||
|
||||
### Cargo.toml configuration
|
||||
To use the latest version of `TFHE-rs` in your project, you first need to add it as a dependency in your `Cargo.toml`:
|
||||
@@ -77,24 +51,20 @@ tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64-un
|
||||
```toml
|
||||
tfhe = { version = "*", features = ["boolean", "shortint", "integer", "aarch64-unix"] }
|
||||
```
|
||||
Note: users with ARM devices must compile `TFHE-rs` using a stable toolchain with version >= 1.72.
|
||||
|
||||
+ For x86_64-based machines with the [`rdseed instruction`](https://en.wikipedia.org/wiki/RDRAND) running Windows:
|
||||
|
||||
+ For x86_64-based machines with the [`rdseed instruction`](https://en.wikipedia.org/wiki/RDRAND)
|
||||
running Windows:
|
||||
|
||||
```toml
|
||||
tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64"] }
|
||||
```
|
||||
|
||||
> [!Note]
|
||||
> Note: You need to use a Rust version >= 1.73 to compile TFHE-rs.
|
||||
Note: aarch64-based machines are not yet supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.
|
||||
|
||||
> [!Note]
|
||||
> Note: aarch64-based machines are not yet supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.
|
||||
|
||||
<p align="right">
|
||||
<a href="#about" > ↑ Back to top </a>
|
||||
</p>
|
||||
|
||||
### A simple example
|
||||
## A simple example
|
||||
|
||||
Here is a full example:
|
||||
|
||||
@@ -151,64 +121,34 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
To run this code, use the following command:
|
||||
<p align="center"> <code> cargo run --release </code> </p>
|
||||
|
||||
> [!Note]
|
||||
> Note that when running code that uses `TFHE-rs`, it is highly recommended
|
||||
Note that when running code that uses `tfhe-rs`, it is highly recommended
|
||||
to run in release mode with cargo's `--release` flag to have the best possible performance.
|
||||
|
||||
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/getting-started/quick_start)*
|
||||
|
||||
<p align="right">
|
||||
<a href="#about" > ↑ Back to top </a>
|
||||
</p>
|
||||
## Contributing
|
||||
|
||||
There are two ways to contribute to TFHE-rs:
|
||||
|
||||
- you can open issues to report bugs or typos, or to suggest new ideas
|
||||
- you can ask to become an official contributor by emailing [hello@zama.ai](mailto:hello@zama.ai).
|
||||
(becoming an approved contributor involves signing our Contributor License Agreement (CLA))
|
||||
|
||||
Only approved contributors can send pull requests, so please make sure to get in touch before you do!
|
||||
|
||||
## Credits
|
||||
|
||||
This library uses several dependencies and we would like to thank the contributors of those
|
||||
libraries.
|
||||
|
||||
## Need support?
|
||||
<a target="_blank" href="https://community.zama.ai">
|
||||
<img src="https://github.com/zama-ai/tfhe-rs/assets/157474013/33d856dc-f25d-454b-a010-af12bff2aa7d">
|
||||
</a>
|
||||
|
||||
|
||||
|
||||
## Resources
|
||||
## Citing TFHE-rs
|
||||
|
||||
### TFHE deep dive
|
||||
- [TFHE Deep Dive - Part I - Ciphertext types](https://www.zama.ai/post/tfhe-deep-dive-part-1)
|
||||
- [TFHE Deep Dive - Part II - Encodings and linear leveled operations](https://www.zama.ai/post/tfhe-deep-dive-part-2)
|
||||
- [TFHE Deep Dive - Part III - Key switching and leveled multiplications](https://www.zama.ai/post/tfhe-deep-dive-part-3)
|
||||
- [TFHE Deep Dive - Part IV - Programmable Bootstrapping](https://www.zama.ai/post/tfhe-deep-dive-part-4)
|
||||
<br></br>
|
||||
|
||||
### Tutorials
|
||||
- [Homomorphic Parity Bit](https://docs.zama.ai/tfhe-rs/tutorials/parity_bit)
|
||||
- [Homomorphic Case Changing on Ascii String](https://docs.zama.ai/tfhe-rs/tutorials/ascii_fhe_string)
|
||||
- [Boolean SHA256 with TFHE-rs](https://www.zama.ai/post/boolean-sha256-tfhe-rs)
|
||||
- [Dark Market with TFHE-rs](https://www.zama.ai/post/dark-market-tfhe-rs)
|
||||
- [Regular Expression Engine with TFHE-rs](https://www.zama.ai/post/regex-engine-tfhe-rs)
|
||||
|
||||
|
||||
*Explore more useful resources in [TFHE-rs tutorials](https://docs.zama.ai/tfhe-rs/tutorials) and [Awesome Zama repo](https://github.com/zama-ai/awesome-zama)*
|
||||
<br></br>
|
||||
### Documentation
|
||||
|
||||
Full, comprehensive documentation is available here: [https://docs.zama.ai/tfhe-rs](https://docs.zama.ai/tfhe-rs).
|
||||
<p align="right">
|
||||
<a href="#about" > ↑ Back to top </a>
|
||||
</p>
|
||||
|
||||
|
||||
## Working with TFHE-rs
|
||||
|
||||
### Disclaimers
|
||||
|
||||
#### Security Estimation
|
||||
|
||||
Security estimations are done using the
|
||||
[Lattice Estimator](https://github.com/malb/lattice-estimator)
|
||||
with `red_cost_model = reduction.RC.BDGL16`.
|
||||
|
||||
When a new update is published in the Lattice Estimator, we update parameters accordingly.
|
||||
|
||||
#### Side-Channel Attacks
|
||||
|
||||
Mitigation for side-channel attacks has not yet been implemented in TFHE-rs,
|
||||
and will be released in upcoming versions.
|
||||
<br></br>
|
||||
|
||||
### Citations
|
||||
To cite TFHE-rs in academic papers, please use the following entry:
|
||||
|
||||
```text
|
||||
@@ -220,31 +160,22 @@ To cite TFHE-rs in academic papers, please use the following entry:
|
||||
}
|
||||
```
|
||||
|
||||
### Contributing
|
||||
## License
|
||||
|
||||
There are two ways to contribute to TFHE-rs:
|
||||
This software is distributed under the BSD-3-Clause-Clear license. If you have any questions,
|
||||
please contact us at `hello@zama.ai`.
|
||||
|
||||
- [Open issues](https://github.com/zama-ai/tfhe-rs/issues/new/choose) to report bugs and typos, or to suggest new ideas
|
||||
- Request to become an official contributor by emailing [hello@zama.ai](mailto:hello@zama.ai).
|
||||
## Disclaimers
|
||||
|
||||
Becoming an approved contributor involves signing our Contributor License Agreement (CLA). Only approved contributors can send pull requests, so please make sure to get in touch before you do!
|
||||
<br></br>
|
||||
### Security Estimation
|
||||
|
||||
### License
|
||||
This software is distributed under the **BSD-3-Clause-Clear** license. If you have any questions, please contact us at hello@zama.ai.
|
||||
<p align="right">
|
||||
<a href="#about" > ↑ Back to top </a>
|
||||
</p>
|
||||
Security estimations are done using the
|
||||
[Lattice Estimator](https://github.com/malb/lattice-estimator)
|
||||
with `red_cost_model = reduction.RC.BDGL16`.
|
||||
|
||||
When a new update is published in the Lattice Estimator, we update parameters accordingly.
|
||||
|
||||
## Support
|
||||
### Side-Channel Attacks
|
||||
|
||||
<a target="_blank" href="https://community.zama.ai">
|
||||
<img src="https://github.com/zama-ai/tfhe-rs/assets/157474013/8da6cf5b-51a0-4c86-9e75-fd0e4a4c64a4">
|
||||
</a>
|
||||
|
||||
🌟 If you find this project helpful or interesting, please consider giving it a star on GitHub! Your support helps to grow the community and motivates further development.
|
||||
|
||||
<p align="right">
|
||||
<a href="#about" > ↑ Back to top </a>
|
||||
</p>
|
||||
Mitigation for side-channel attacks has not yet been implemented in TFHE-rs,
|
||||
and will be released in upcoming versions.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tfhe-cuda-backend"
|
||||
version = "0.2.0"
|
||||
version = "0.1.3"
|
||||
edition = "2021"
|
||||
authors = ["Zama team"]
|
||||
license = "BSD-3-Clause-Clear"
|
||||
|
||||
@@ -30,7 +30,8 @@ The cryptographic operations it provides are:
|
||||
|
||||
## Build
|
||||
|
||||
The Cuda project held in `tfhe-cuda-backend` can be compiled independently from TFHE-rs in the following way:
|
||||
The Cuda project held in `tfhe-cuda-backend` can be compiled independently from Concrete in the
|
||||
following way:
|
||||
```
|
||||
git clone git@github.com:zama-ai/tfhe-rs
|
||||
cd backends/tfhe-cuda-backend/cuda
|
||||
|
||||
1
backends/tfhe-cuda-backend/cuda/.gitignore
vendored
1
backends/tfhe-cuda-backend/cuda/.gitignore
vendored
@@ -1 +0,0 @@
|
||||
/build/
|
||||
@@ -71,13 +71,10 @@ set(CMAKE_CUDA_FLAGS
|
||||
set(INCLUDE_DIR include)
|
||||
|
||||
add_subdirectory(src)
|
||||
enable_testing()
|
||||
add_subdirectory(tests_and_benchmarks)
|
||||
target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})
|
||||
|
||||
# This is required for rust cargo build
|
||||
install(TARGETS tfhe_cuda_backend DESTINATION .)
|
||||
|
||||
install(TARGETS tfhe_cuda_backend DESTINATION lib)
|
||||
|
||||
# Define a function to add a lint target.
|
||||
@@ -89,3 +86,5 @@ if(CPPLINT)
|
||||
set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_ALL TRUE)
|
||||
# set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE)
|
||||
endif()
|
||||
|
||||
enable_testing()
|
||||
|
||||
@@ -1,19 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
while getopts ":c" option; do
|
||||
case $option in
|
||||
c)
|
||||
# code to execute when flag1 is provided
|
||||
find ./{include,src,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file' --dry-run --Werror
|
||||
cmake-format -i CMakeLists.txt -c .cmake-format-config.py
|
||||
find ./{include,src,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
|
||||
git diff --exit-code
|
||||
exit
|
||||
;;
|
||||
esac
|
||||
done
|
||||
find ./{include,src,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file'
|
||||
find ./{include,src} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file'
|
||||
cmake-format -i CMakeLists.txt -c .cmake-format-config.py
|
||||
find ./{include,src,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
|
||||
|
||||
find ./{include,src} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
#include <cstdint>
|
||||
|
||||
enum PBS_TYPE { MULTI_BIT = 0, LOW_LAT = 1, AMORTIZED = 2 };
|
||||
enum PBS_VARIANT { DEFAULT = 0, FAST = 1 };
|
||||
|
||||
extern "C" {
|
||||
void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
|
||||
@@ -55,13 +54,13 @@ void cleanup_cuda_bootstrap_amortized(cuda_stream_t *stream,
|
||||
int8_t **pbs_buffer);
|
||||
|
||||
void scratch_cuda_bootstrap_low_latency_32(
|
||||
cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void scratch_cuda_bootstrap_low_latency_64(
|
||||
cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
@@ -69,7 +68,7 @@ void scratch_cuda_bootstrap_low_latency_64(
|
||||
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
@@ -77,16 +76,13 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
|
||||
void cleanup_cuda_bootstrap_low_latency_32(cuda_stream_t *stream,
|
||||
int8_t **pbs_buffer);
|
||||
|
||||
void cleanup_cuda_bootstrap_low_latency_64(cuda_stream_t *stream,
|
||||
int8_t **pbs_buffer);
|
||||
void cleanup_cuda_bootstrap_low_latency(cuda_stream_t *stream,
|
||||
int8_t **pbs_buffer);
|
||||
|
||||
uint64_t get_buffer_size_bootstrap_amortized_64(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
@@ -97,212 +93,6 @@ uint64_t get_buffer_size_bootstrap_low_latency_64(
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_bootstrap_low_latency_step_one(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size + // accumulator_rotated
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
}
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_bootstrap_low_latency_step_two(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size + // accumulator
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_bootstrap_low_latency(uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_bootstrap_fast_low_latency(uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size + // accumulator_rotated
|
||||
sizeof(Torus) * polynomial_size + // accumulator
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_bootstrap_fast_low_latency(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
|
||||
}
|
||||
|
||||
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;
|
||||
|
||||
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::LOW_LAT> {
|
||||
int8_t *d_mem;
|
||||
|
||||
Torus *global_accumulator;
|
||||
double2 *global_accumulator_fft;
|
||||
|
||||
PBS_VARIANT pbs_variant;
|
||||
|
||||
pbs_buffer(cuda_stream_t *stream, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
|
||||
bool allocate_gpu_memory) {
|
||||
this->pbs_variant = pbs_variant;
|
||||
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
switch (pbs_variant) {
|
||||
case PBS_VARIANT::DEFAULT: {
|
||||
uint64_t full_sm_step_one =
|
||||
get_buffer_size_full_sm_bootstrap_low_latency_step_one<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t full_sm_step_two =
|
||||
get_buffer_size_full_sm_bootstrap_low_latency_step_two<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
|
||||
uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
|
||||
uint64_t full_dm = full_sm_step_one;
|
||||
|
||||
uint64_t device_mem = 0;
|
||||
if (max_shared_memory < partial_sm) {
|
||||
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
|
||||
(glwe_dimension + 1);
|
||||
} else if (max_shared_memory < full_sm_step_two) {
|
||||
device_mem =
|
||||
(partial_dm_step_two + partial_dm_step_one * level_count) *
|
||||
input_lwe_ciphertext_count * (glwe_dimension + 1);
|
||||
} else if (max_shared_memory < full_sm_step_one) {
|
||||
device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
|
||||
level_count * (glwe_dimension + 1);
|
||||
}
|
||||
// Otherwise, both kernels run all in shared memory
|
||||
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream);
|
||||
|
||||
global_accumulator_fft = (double2 *)cuda_malloc_async(
|
||||
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
|
||||
(polynomial_size / 2) * sizeof(double2),
|
||||
stream);
|
||||
|
||||
global_accumulator = (Torus *)cuda_malloc_async(
|
||||
(glwe_dimension + 1) * input_lwe_ciphertext_count *
|
||||
polynomial_size * sizeof(Torus),
|
||||
stream);
|
||||
} break;
|
||||
case PBS_VARIANT::FAST: {
|
||||
uint64_t full_sm =
|
||||
get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
uint64_t partial_dm = full_sm - partial_sm;
|
||||
uint64_t full_dm = full_sm;
|
||||
uint64_t device_mem = 0;
|
||||
|
||||
if (max_shared_memory < partial_sm) {
|
||||
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
|
||||
(glwe_dimension + 1);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
|
||||
(glwe_dimension + 1);
|
||||
}
|
||||
|
||||
// Otherwise, both kernels run all in shared memory
|
||||
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream);
|
||||
|
||||
global_accumulator_fft = (double2 *)cuda_malloc_async(
|
||||
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
|
||||
polynomial_size / 2 * sizeof(double2),
|
||||
stream);
|
||||
} break;
|
||||
default:
|
||||
PANIC("Cuda error (PBS): unsupported implementation variant.")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void release(cuda_stream_t *stream) {
|
||||
cuda_drop_async(d_mem, stream);
|
||||
cuda_drop_async(global_accumulator_fft, stream);
|
||||
|
||||
if (pbs_variant == DEFAULT)
|
||||
cuda_drop_async(global_accumulator, stream);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t get_buffer_size_bootstrap_fast_low_latency(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
|
||||
|
||||
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_dm = full_sm - partial_sm;
|
||||
uint64_t full_dm = full_sm;
|
||||
uint64_t device_mem = 0;
|
||||
if (max_shared_memory < partial_sm) {
|
||||
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
|
||||
(glwe_dimension + 1);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
|
||||
(glwe_dimension + 1);
|
||||
}
|
||||
uint64_t buffer_size = device_mem + (glwe_dimension + 1) * level_count *
|
||||
input_lwe_ciphertext_count *
|
||||
polynomial_size / 2 * sizeof(double2);
|
||||
return buffer_size + buffer_size % sizeof(double2);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
bool has_support_to_cuda_bootstrap_fast_low_latency(uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t level_count,
|
||||
uint32_t num_samples,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_bootstrap_fast_low_latency_lwe_ciphertext_vector(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, LOW_LAT> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_bootstrap_low_latency_lwe_ciphertext_vector(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, LOW_LAT> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
void scratch_cuda_fast_bootstrap_low_latency(
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, LOW_LAT> **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
void scratch_cuda_bootstrap_low_latency(
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, LOW_LAT> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
#ifdef __CUDACC__
|
||||
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
|
||||
@@ -1,22 +1,23 @@
|
||||
#ifndef CUDA_MULTI_BIT_H
|
||||
#define CUDA_MULTI_BIT_H
|
||||
|
||||
#include "bootstrap.h"
|
||||
#include <cstdint>
|
||||
|
||||
extern "C" {
|
||||
|
||||
bool has_support_to_cuda_bootstrap_fast_multi_bit(uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t level_count,
|
||||
uint32_t num_samples,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
void cuda_convert_lwe_multi_bit_bootstrap_key_64(
|
||||
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
|
||||
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor);
|
||||
|
||||
void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t chunk_size = 0);
|
||||
|
||||
void scratch_cuda_multi_bit_pbs_64(
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
@@ -24,118 +25,8 @@ void scratch_cuda_multi_bit_pbs_64(
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory,
|
||||
uint32_t chunk_size = 0);
|
||||
|
||||
void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
|
||||
|
||||
void scratch_cuda_generic_multi_bit_pbs_64(
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size = 0);
|
||||
|
||||
void cuda_generic_multi_bit_pbs_lwe_ciphertext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
|
||||
|
||||
void cleanup_cuda_multi_bit_pbs_32(cuda_stream_t *stream, int8_t **pbs_buffer);
|
||||
void cleanup_cuda_multi_bit_pbs_64(cuda_stream_t *stream, int8_t **pbs_buffer);
|
||||
void cleanup_cuda_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer);
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
void scratch_cuda_fast_multi_bit_pbs(
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_fast_multi_bit_pbs_lwe_ciphertext_vector(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t lwe_chunk_size = 0);
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
void scratch_cuda_multi_bit_pbs(
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_multi_bit_pbs_lwe_ciphertext_vector(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t lwe_chunk_size = 0);
|
||||
|
||||
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
double2 *keybundle_fft;
|
||||
Torus *global_accumulator;
|
||||
double2 *global_accumulator_fft;
|
||||
|
||||
PBS_VARIANT pbs_variant;
|
||||
|
||||
pbs_buffer(cuda_stream_t *stream, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
|
||||
PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
|
||||
this->pbs_variant = pbs_variant;
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
switch (pbs_variant) {
|
||||
case DEFAULT:
|
||||
case FAST:
|
||||
keybundle_fft = (double2 *)cuda_malloc_async(
|
||||
input_lwe_ciphertext_count * lwe_chunk_size * level_count *
|
||||
(glwe_dimension + 1) * (glwe_dimension + 1) *
|
||||
(polynomial_size / 2) * sizeof(double2),
|
||||
stream);
|
||||
global_accumulator = (Torus *)cuda_malloc_async(
|
||||
input_lwe_ciphertext_count * (glwe_dimension + 1) *
|
||||
polynomial_size * sizeof(Torus),
|
||||
stream);
|
||||
global_accumulator_fft = (double2 *)cuda_malloc_async(
|
||||
input_lwe_ciphertext_count * (glwe_dimension + 1) * level_count *
|
||||
(polynomial_size / 2) * sizeof(double2),
|
||||
stream);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (PBS): unsupported implementation variant.")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void release(cuda_stream_t *stream) {
|
||||
cuda_drop_async(keybundle_fft, stream);
|
||||
cuda_drop_async(global_accumulator, stream);
|
||||
cuda_drop_async(global_accumulator_fft, stream);
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef __CUDACC__
|
||||
__host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
|
||||
uint32_t level_count,
|
||||
|
||||
@@ -11,22 +11,6 @@
|
||||
|
||||
extern "C" {
|
||||
|
||||
#define check_cuda_error(ans) \
|
||||
{ cuda_error((ans), __FILE__, __LINE__); }
|
||||
inline void cuda_error(cudaError_t code, const char *file, int line) {
|
||||
if (code != cudaSuccess) {
|
||||
std::fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code),
|
||||
file, line);
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
#define PANIC(format, ...) \
|
||||
{ \
|
||||
std::fprintf(stderr, "%s::%d::%s: panic.\n" format "\n", __FILE__, \
|
||||
__LINE__, __func__, ##__VA_ARGS__); \
|
||||
std::abort(); \
|
||||
}
|
||||
|
||||
struct cuda_stream_t {
|
||||
cudaStream_t stream;
|
||||
uint32_t gpu_index;
|
||||
@@ -34,58 +18,68 @@ struct cuda_stream_t {
|
||||
cuda_stream_t(uint32_t gpu_index) {
|
||||
this->gpu_index = gpu_index;
|
||||
|
||||
check_cuda_error(cudaStreamCreate(&stream));
|
||||
cudaStreamCreate(&stream);
|
||||
}
|
||||
|
||||
void release() {
|
||||
check_cuda_error(cudaSetDevice(gpu_index));
|
||||
check_cuda_error(cudaStreamDestroy(stream));
|
||||
cudaSetDevice(gpu_index);
|
||||
cudaStreamDestroy(stream);
|
||||
}
|
||||
|
||||
void synchronize() { check_cuda_error(cudaStreamSynchronize(stream)); }
|
||||
void synchronize() { cudaStreamSynchronize(stream); }
|
||||
};
|
||||
|
||||
cuda_stream_t *cuda_create_stream(uint32_t gpu_index);
|
||||
|
||||
void cuda_destroy_stream(cuda_stream_t *stream);
|
||||
int cuda_destroy_stream(cuda_stream_t *stream);
|
||||
|
||||
void *cuda_malloc(uint64_t size, uint32_t gpu_index);
|
||||
|
||||
void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream);
|
||||
|
||||
void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
|
||||
int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
|
||||
|
||||
bool cuda_check_support_cooperative_groups();
|
||||
int cuda_check_support_cooperative_groups();
|
||||
|
||||
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
|
||||
cuda_stream_t *stream);
|
||||
int cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size);
|
||||
|
||||
void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
|
||||
cuda_stream_t *stream);
|
||||
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
|
||||
cuda_stream_t *stream);
|
||||
|
||||
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
|
||||
cuda_stream_t *stream);
|
||||
int cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
|
||||
cuda_stream_t *stream);
|
||||
|
||||
void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
|
||||
cuda_stream_t *stream);
|
||||
int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size);
|
||||
|
||||
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
|
||||
cuda_stream_t *stream);
|
||||
|
||||
int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
|
||||
cuda_stream_t *stream);
|
||||
|
||||
int cuda_get_number_of_gpus();
|
||||
|
||||
void cuda_synchronize_device(uint32_t gpu_index);
|
||||
int cuda_synchronize_device(uint32_t gpu_index);
|
||||
|
||||
void cuda_drop(void *ptr, uint32_t gpu_index);
|
||||
int cuda_drop(void *ptr, uint32_t gpu_index);
|
||||
|
||||
void cuda_drop_async(void *ptr, cuda_stream_t *stream);
|
||||
int cuda_drop_async(void *ptr, cuda_stream_t *stream);
|
||||
|
||||
int cuda_get_max_shared_memory(uint32_t gpu_index);
|
||||
|
||||
void cuda_synchronize_stream(cuda_stream_t *stream);
|
||||
int cuda_synchronize_stream(cuda_stream_t *stream);
|
||||
|
||||
void cuda_stream_add_callback(cuda_stream_t *stream,
|
||||
cudaStreamCallback_t callback, void *user_data);
|
||||
|
||||
void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
|
||||
void *host_pointer);
|
||||
#define check_cuda_error(ans) \
|
||||
{ cuda_error((ans), __FILE__, __LINE__); }
|
||||
inline void cuda_error(cudaError_t code, const char *file, int line,
|
||||
bool abort = true) {
|
||||
if (code != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code), file,
|
||||
line);
|
||||
if (abort)
|
||||
exit(code);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
|
||||
#include "bootstrap.h"
|
||||
#include "bootstrap_multibit.h"
|
||||
#include "pbs/bootstrap.cuh"
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <functional>
|
||||
@@ -274,7 +273,7 @@ template <typename Torus> struct int_radix_lut {
|
||||
uint32_t num_blocks;
|
||||
bool mem_reuse = false;
|
||||
|
||||
int8_t *buffer;
|
||||
int8_t *pbs_buffer;
|
||||
|
||||
Torus *lut_indexes;
|
||||
Torus *lwe_indexes;
|
||||
@@ -298,11 +297,31 @@ template <typename Torus> struct int_radix_lut {
|
||||
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);
|
||||
|
||||
///////////////
|
||||
execute_scratch_pbs<Torus>(
|
||||
stream, &buffer, params.glwe_dimension, params.small_lwe_dimension,
|
||||
params.polynomial_size, params.pbs_level, params.grouping_factor,
|
||||
num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
|
||||
params.pbs_type, allocate_gpu_memory);
|
||||
// PBS
|
||||
if (params.pbs_type == MULTI_BIT) {
|
||||
// Only 64 bits is supported
|
||||
static_assert(
|
||||
sizeof(Torus) == 8,
|
||||
"Error (GPU multi bit PBS): only 64 bits Torus is supported");
|
||||
scratch_cuda_multi_bit_pbs_64(
|
||||
stream, &pbs_buffer, params.small_lwe_dimension,
|
||||
params.glwe_dimension, params.polynomial_size, params.pbs_level,
|
||||
params.grouping_factor, num_radix_blocks,
|
||||
cuda_get_max_shared_memory(stream->gpu_index), allocate_gpu_memory);
|
||||
} else {
|
||||
// Classic
|
||||
// We only use low latency for classic mode
|
||||
if (sizeof(Torus) == sizeof(uint32_t))
|
||||
scratch_cuda_bootstrap_low_latency_32(
|
||||
stream, &pbs_buffer, params.glwe_dimension, params.polynomial_size,
|
||||
params.pbs_level, num_radix_blocks,
|
||||
cuda_get_max_shared_memory(stream->gpu_index), allocate_gpu_memory);
|
||||
else
|
||||
scratch_cuda_bootstrap_low_latency_64(
|
||||
stream, &pbs_buffer, params.glwe_dimension, params.polynomial_size,
|
||||
params.pbs_level, num_radix_blocks,
|
||||
cuda_get_max_shared_memory(stream->gpu_index), allocate_gpu_memory);
|
||||
}
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
// Allocate LUT
|
||||
@@ -325,8 +344,8 @@ template <typename Torus> struct int_radix_lut {
|
||||
for (int i = 0; i < num_radix_blocks; i++)
|
||||
h_lwe_indexes[i] = i;
|
||||
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes,
|
||||
num_radix_blocks * sizeof(Torus), stream);
|
||||
cuda_memcpy_to_gpu(lwe_indexes, h_lwe_indexes,
|
||||
num_radix_blocks * sizeof(Torus));
|
||||
free(h_lwe_indexes);
|
||||
|
||||
// Keyswitch
|
||||
@@ -338,17 +357,21 @@ template <typename Torus> struct int_radix_lut {
|
||||
// constructor to reuse memory
|
||||
int_radix_lut(cuda_stream_t *stream, int_radix_params params,
|
||||
uint32_t num_luts, uint32_t num_radix_blocks,
|
||||
int_radix_lut *base_lut_object) {
|
||||
int_radix_lut<Torus> *base_lut_object) {
|
||||
this->params = params;
|
||||
this->num_blocks = num_radix_blocks;
|
||||
Torus lut_indexes_size = num_radix_blocks * sizeof(Torus);
|
||||
Torus big_size =
|
||||
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
|
||||
Torus small_size =
|
||||
(params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
|
||||
Torus lut_buffer_size =
|
||||
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);
|
||||
|
||||
// base lut object should have bigger or equal memory than current one
|
||||
assert(num_radix_blocks <= base_lut_object->num_blocks);
|
||||
// pbs
|
||||
buffer = base_lut_object->buffer;
|
||||
pbs_buffer = base_lut_object->pbs_buffer;
|
||||
// Keyswitch
|
||||
tmp_lwe_before_ks = base_lut_object->tmp_lwe_before_ks;
|
||||
tmp_lwe_after_ks = base_lut_object->tmp_lwe_after_ks;
|
||||
@@ -375,9 +398,8 @@ template <typename Torus> struct int_radix_lut {
|
||||
for (int i = 0; i < num_radix_blocks; i++)
|
||||
h_lwe_indexes[i] = i;
|
||||
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes,
|
||||
num_radix_blocks * sizeof(Torus), stream);
|
||||
cuda_synchronize_stream(stream);
|
||||
cuda_memcpy_to_gpu(lwe_indexes, h_lwe_indexes,
|
||||
num_radix_blocks * sizeof(Torus));
|
||||
free(h_lwe_indexes);
|
||||
}
|
||||
|
||||
@@ -392,41 +414,7 @@ template <typename Torus> struct int_radix_lut {
|
||||
cuda_drop_async(lwe_indexes, stream);
|
||||
cuda_drop_async(lut, stream);
|
||||
if (!mem_reuse) {
|
||||
switch (params.pbs_type) {
|
||||
case MULTI_BIT:
|
||||
switch (sizeof(Torus)) {
|
||||
case sizeof(uint32_t):
|
||||
cleanup_cuda_multi_bit_pbs_32(stream, &buffer);
|
||||
break;
|
||||
case sizeof(uint64_t):
|
||||
cleanup_cuda_multi_bit_pbs_64(stream, &buffer);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit "
|
||||
"integer "
|
||||
"moduli are supported.")
|
||||
}
|
||||
break;
|
||||
case LOW_LAT:
|
||||
switch (sizeof(Torus)) {
|
||||
case sizeof(uint32_t):
|
||||
cleanup_cuda_bootstrap_low_latency_32(stream, &buffer);
|
||||
break;
|
||||
case sizeof(uint64_t):
|
||||
cleanup_cuda_bootstrap_low_latency_64(stream, &buffer);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit "
|
||||
"integer "
|
||||
"moduli are supported.")
|
||||
}
|
||||
break;
|
||||
case AMORTIZED:
|
||||
cleanup_cuda_bootstrap_amortized(stream, &buffer);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (PBS): unknown PBS type. ")
|
||||
}
|
||||
cuda_drop_async(pbs_buffer, stream);
|
||||
cuda_drop_async(tmp_lwe_before_ks, stream);
|
||||
cuda_drop_async(tmp_lwe_after_ks, stream);
|
||||
}
|
||||
@@ -500,12 +488,12 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
};
|
||||
|
||||
// create lut objects
|
||||
luts_array = new int_radix_lut<Torus>(stream, params, 2, num_radix_blocks,
|
||||
allocate_gpu_memory);
|
||||
luts_carry_propagation_sum = new int_radix_lut<Torus>(
|
||||
stream, params, 1, num_radix_blocks, luts_array);
|
||||
message_acc = new int_radix_lut<Torus>(stream, params, 1, num_radix_blocks,
|
||||
luts_array);
|
||||
luts_array = new int_radix_lut<Torus>(
|
||||
stream, params, 2, num_radix_blocks, allocate_gpu_memory);
|
||||
luts_carry_propagation_sum = new struct int_radix_lut<Torus>(
|
||||
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
|
||||
message_acc = new struct int_radix_lut<Torus>(
|
||||
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
|
||||
|
||||
auto lut_does_block_generate_carry = luts_array->get_lut(0);
|
||||
auto lut_does_block_generate_or_propagate = luts_array->get_lut(1);
|
||||
@@ -523,9 +511,8 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
num_radix_blocks - 1);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
stream, luts_carry_propagation_sum->lut, glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus,
|
||||
f_luts_carry_propagation_sum);
|
||||
stream, luts_carry_propagation_sum->lut, glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, f_luts_carry_propagation_sum);
|
||||
|
||||
generate_device_accumulator<Torus>(stream, message_acc->lut, glwe_dimension,
|
||||
polynomial_size, message_modulus,
|
||||
@@ -550,6 +537,7 @@ template <typename Torus> struct int_mul_memory {
|
||||
Torus *vector_result_sb;
|
||||
Torus *block_mul_res;
|
||||
Torus *small_lwe_vector;
|
||||
Torus *lwe_pbs_out_array;
|
||||
int_radix_lut<Torus> *luts_array; // lsb msb
|
||||
int_radix_lut<Torus> *luts_message;
|
||||
int_radix_lut<Torus> *luts_carry;
|
||||
@@ -589,15 +577,19 @@ template <typename Torus> struct int_mul_memory {
|
||||
stream);
|
||||
small_lwe_vector = (Torus *)cuda_malloc_async(
|
||||
total_block_count * (lwe_dimension + 1) * sizeof(Torus), stream);
|
||||
lwe_pbs_out_array =
|
||||
(Torus *)cuda_malloc_async((glwe_dimension * polynomial_size + 1) *
|
||||
total_block_count * sizeof(Torus),
|
||||
stream);
|
||||
|
||||
// create int_radix_lut objects for lsb, msb, message, carry
|
||||
// luts_array -> lut = {lsb_acc, msb_acc}
|
||||
luts_array = new int_radix_lut<Torus>(stream, params, 2, total_block_count,
|
||||
allocate_gpu_memory);
|
||||
luts_message = new int_radix_lut<Torus>(stream, params, 1,
|
||||
total_block_count, luts_array);
|
||||
luts_carry = new int_radix_lut<Torus>(stream, params, 1, total_block_count,
|
||||
luts_array);
|
||||
luts_array = new int_radix_lut<Torus>(
|
||||
stream, params, 2, total_block_count, allocate_gpu_memory);
|
||||
luts_message = new int_radix_lut<Torus>(
|
||||
stream, params, 1, total_block_count, luts_array);
|
||||
luts_carry = new int_radix_lut<Torus>(
|
||||
stream, params, 1, total_block_count, luts_array);
|
||||
|
||||
auto lsb_acc = luts_array->get_lut(0);
|
||||
auto msb_acc = luts_array->get_lut(1);
|
||||
@@ -645,6 +637,7 @@ template <typename Torus> struct int_mul_memory {
|
||||
cuda_drop_async(vector_result_sb, stream);
|
||||
cuda_drop_async(block_mul_res, stream);
|
||||
cuda_drop_async(small_lwe_vector, stream);
|
||||
cuda_drop_async(lwe_pbs_out_array, stream);
|
||||
|
||||
luts_array->release(stream);
|
||||
luts_message->release(stream);
|
||||
@@ -681,10 +674,7 @@ template <typename Torus> struct int_shift_buffer {
|
||||
uint32_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
|
||||
|
||||
tmp_rotated = (Torus *)cuda_malloc_async(
|
||||
(max_amount_of_pbs + 2) * big_lwe_size_bytes, stream);
|
||||
|
||||
cuda_memset_async(tmp_rotated, 0,
|
||||
(max_amount_of_pbs + 2) * big_lwe_size_bytes, stream);
|
||||
max_amount_of_pbs * big_lwe_size_bytes, stream);
|
||||
|
||||
uint32_t num_bits_in_block = (uint32_t)std::log2(params.message_modulus);
|
||||
|
||||
@@ -694,9 +684,9 @@ template <typename Torus> struct int_shift_buffer {
|
||||
// here we generate 'num_bits_in_block' times lut
|
||||
// one for each 'shift_within_block' = 'shift' % 'num_bits_in_block'
|
||||
// even though lut_left contains 'num_bits_in_block' lut
|
||||
// lut_indexes will have indexes for single lut only and those indexes
|
||||
// will be 0 it means for pbs corresponding lut should be selected and
|
||||
// pass along lut_indexes filled with zeros
|
||||
// lut_indexes will have indexes for single lut only and those indexes will be 0
|
||||
// it means for pbs corresponding lut should be selected and pass along
|
||||
// lut_indexes filled with zeros
|
||||
|
||||
// calculate bivariate lut for each 'shift_within_block'
|
||||
for (int s_w_b = 1; s_w_b < num_bits_in_block; s_w_b++) {
|
||||
@@ -747,6 +737,34 @@ template <typename Torus> struct int_shift_buffer {
|
||||
|
||||
lut_buffers_bivariate.push_back(cur_lut_bivariate);
|
||||
}
|
||||
|
||||
// here we generate 'message_modulus' times lut
|
||||
// one for each 'shift'
|
||||
// lut_indexes will have indexes for single lut only and those indexes will be 0
|
||||
// it means for pbs corresponding lut should be selected and pass along
|
||||
// lut_indexes filled with zeros
|
||||
|
||||
// calculate lut for each 'shift'
|
||||
for (int shift = 0; shift < params.message_modulus; shift++) {
|
||||
auto cur_lut =
|
||||
new int_radix_lut<Torus>(stream, params, 1, 1, allocate_gpu_memory);
|
||||
|
||||
std::function<Torus(Torus)> shift_lut_f;
|
||||
if (shift_type == LEFT_SHIFT)
|
||||
shift_lut_f = [shift, params](Torus x) -> Torus {
|
||||
return (x << shift) % params.message_modulus;
|
||||
};
|
||||
else
|
||||
shift_lut_f = [shift, params](Torus x) -> Torus {
|
||||
return (x >> shift) % params.message_modulus;
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
stream, cur_lut->lut, params.glwe_dimension, params.polynomial_size,
|
||||
params.message_modulus, params.carry_modulus, shift_lut_f);
|
||||
|
||||
lut_buffers_univariate.push_back(cur_lut);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -816,6 +834,8 @@ template <typename Torus> struct int_cmux_buffer {
|
||||
if (allocate_gpu_memory) {
|
||||
Torus big_size =
|
||||
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
|
||||
Torus small_size =
|
||||
(params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
|
||||
|
||||
tmp_true_ct = (Torus *)cuda_malloc_async(big_size, stream);
|
||||
tmp_false_ct = (Torus *)cuda_malloc_async(big_size, stream);
|
||||
@@ -884,10 +904,8 @@ template <typename Torus> struct int_are_all_block_true_buffer {
|
||||
COMPARISON_TYPE op;
|
||||
int_radix_params params;
|
||||
|
||||
// This map store LUTs that checks the equality between some input and values
|
||||
// of interest in are_all_block_true(), as with max_value (the maximum message
|
||||
// value).
|
||||
std::unordered_map<int, int_radix_lut<Torus> *> is_equal_to_lut_map;
|
||||
int_radix_lut<Torus> *is_max_value_lut;
|
||||
int_radix_lut<Torus> *is_equal_to_num_blocks_lut;
|
||||
|
||||
Torus *tmp_block_accumulated;
|
||||
|
||||
@@ -905,14 +923,34 @@ template <typename Torus> struct int_are_all_block_true_buffer {
|
||||
int max_chunks = (num_radix_blocks + max_value - 1) / max_value;
|
||||
tmp_block_accumulated = (Torus *)cuda_malloc_async(
|
||||
(params.big_lwe_dimension + 1) * max_chunks * sizeof(Torus), stream);
|
||||
|
||||
// LUT
|
||||
// We need three LUTs:
|
||||
// (x & max_value as u64) == max_value
|
||||
// x != 0
|
||||
// (x & max_value as u64) == blocks.len()
|
||||
|
||||
auto is_max_value_lut_f = [total_modulus](Torus x) -> Torus {
|
||||
Torus max_value = total_modulus - 1;
|
||||
return (x & max_value) == max_value;
|
||||
};
|
||||
|
||||
is_max_value_lut = new int_radix_lut<Torus>(
|
||||
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
|
||||
is_equal_to_num_blocks_lut = new int_radix_lut<Torus>(
|
||||
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
|
||||
generate_device_accumulator<Torus>(
|
||||
stream, is_max_value_lut->lut, params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
is_max_value_lut_f);
|
||||
}
|
||||
}
|
||||
|
||||
void release(cuda_stream_t *stream) {
|
||||
for (auto &lut : is_equal_to_lut_map) {
|
||||
lut.second->release(stream);
|
||||
}
|
||||
is_equal_to_lut_map.clear();
|
||||
is_max_value_lut->release(stream);
|
||||
delete is_max_value_lut;
|
||||
is_equal_to_num_blocks_lut->release(stream);
|
||||
delete is_equal_to_num_blocks_lut;
|
||||
|
||||
cuda_drop_async(tmp_block_accumulated, stream);
|
||||
}
|
||||
@@ -927,8 +965,6 @@ template <typename Torus> struct int_comparison_eq_buffer {
|
||||
|
||||
int_are_all_block_true_buffer<Torus> *are_all_block_true_buffer;
|
||||
|
||||
int_radix_lut<Torus> *scalar_comparison_luts;
|
||||
|
||||
int_comparison_eq_buffer(cuda_stream_t *stream, COMPARISON_TYPE op,
|
||||
int_radix_params params, uint32_t num_radix_blocks,
|
||||
bool allocate_gpu_memory) {
|
||||
@@ -971,22 +1007,6 @@ template <typename Torus> struct int_comparison_eq_buffer {
|
||||
stream, is_non_zero_lut->lut, params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
is_non_zero_lut_f);
|
||||
|
||||
// Scalar may have up to num_radix_blocks blocks
|
||||
scalar_comparison_luts = new int_radix_lut<Torus>(
|
||||
stream, params, total_modulus, num_radix_blocks, allocate_gpu_memory);
|
||||
|
||||
for (int i = 0; i < total_modulus; i++) {
|
||||
auto lut_f = [i, operator_f](Torus x) -> Torus {
|
||||
return operator_f(i, x);
|
||||
};
|
||||
|
||||
Torus *lut = scalar_comparison_luts->lut +
|
||||
i * (params.glwe_dimension + 1) * params.polynomial_size;
|
||||
generate_device_accumulator<Torus>(
|
||||
stream, lut, params.glwe_dimension, params.polynomial_size,
|
||||
params.message_modulus, params.carry_modulus, lut_f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -998,9 +1018,6 @@ template <typename Torus> struct int_comparison_eq_buffer {
|
||||
|
||||
are_all_block_true_buffer->release(stream);
|
||||
delete are_all_block_true_buffer;
|
||||
|
||||
scalar_comparison_luts->release(stream);
|
||||
delete scalar_comparison_luts;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1031,6 +1048,13 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
|
||||
return msb;
|
||||
};
|
||||
|
||||
auto last_leaf_noop_lut_f = [this](Torus x) -> Torus {
|
||||
int msb = (x >> 2) & 3;
|
||||
int lsb = x & 3;
|
||||
|
||||
return this->block_selector_f(msb, lsb);
|
||||
};
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
tmp_x = (Torus *)cuda_malloc_async((params.big_lwe_dimension + 1) *
|
||||
num_radix_blocks * sizeof(Torus),
|
||||
@@ -1077,8 +1101,14 @@ template <typename Torus> struct int_comparison_diff_buffer {
|
||||
|
||||
std::function<Torus(Torus)> operator_f;
|
||||
|
||||
int_radix_lut<Torus> *is_zero_lut;
|
||||
|
||||
int_tree_sign_reduction_buffer<Torus> *tree_buffer;
|
||||
|
||||
// Used for scalar comparisons
|
||||
cuda_stream_t *lsb_stream;
|
||||
cuda_stream_t *msb_stream;
|
||||
|
||||
int_comparison_diff_buffer(cuda_stream_t *stream, COMPARISON_TYPE op,
|
||||
int_radix_params params, uint32_t num_radix_blocks,
|
||||
bool allocate_gpu_memory) {
|
||||
@@ -1102,6 +1132,8 @@ template <typename Torus> struct int_comparison_diff_buffer {
|
||||
};
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
lsb_stream = cuda_create_stream(stream->gpu_index);
|
||||
msb_stream = cuda_create_stream(stream->gpu_index);
|
||||
|
||||
Torus big_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
|
||||
|
||||
@@ -1111,17 +1143,36 @@ template <typename Torus> struct int_comparison_diff_buffer {
|
||||
tmp_packed_right =
|
||||
(Torus *)cuda_malloc_async(big_size * (num_radix_blocks / 2), stream);
|
||||
|
||||
// LUTs
|
||||
uint32_t total_modulus = params.message_modulus * params.carry_modulus;
|
||||
auto is_zero_f = [total_modulus](Torus x) -> Torus {
|
||||
return (x % total_modulus) == 0;
|
||||
};
|
||||
|
||||
is_zero_lut = new int_radix_lut<Torus>(
|
||||
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
stream, is_zero_lut->lut, params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
is_zero_f);
|
||||
|
||||
tree_buffer = new int_tree_sign_reduction_buffer<Torus>(
|
||||
stream, operator_f, params, num_radix_blocks, allocate_gpu_memory);
|
||||
}
|
||||
}
|
||||
|
||||
void release(cuda_stream_t *stream) {
|
||||
is_zero_lut->release(stream);
|
||||
delete is_zero_lut;
|
||||
tree_buffer->release(stream);
|
||||
delete tree_buffer;
|
||||
|
||||
cuda_drop_async(tmp_packed_left, stream);
|
||||
cuda_drop_async(tmp_packed_right, stream);
|
||||
|
||||
cuda_destroy_stream(lsb_stream);
|
||||
cuda_destroy_stream(msb_stream);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1134,24 +1185,15 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
int_radix_lut<Torus> *cleaning_lut;
|
||||
std::function<Torus(Torus)> cleaning_lut_f;
|
||||
|
||||
int_radix_lut<Torus> *is_zero_lut;
|
||||
|
||||
int_comparison_eq_buffer<Torus> *eq_buffer;
|
||||
int_comparison_diff_buffer<Torus> *diff_buffer;
|
||||
|
||||
Torus *tmp_block_comparisons;
|
||||
Torus *tmp_lwe_array_out;
|
||||
|
||||
// Scalar EQ / NE
|
||||
Torus *tmp_packed_input;
|
||||
|
||||
// Max Min
|
||||
Torus *tmp_lwe_array_out;
|
||||
int_cmux_buffer<Torus> *cmux_buffer;
|
||||
|
||||
// Used for scalar comparisons
|
||||
cuda_stream_t *lsb_stream;
|
||||
cuda_stream_t *msb_stream;
|
||||
|
||||
int_comparison_buffer(cuda_stream_t *stream, COMPARISON_TYPE op,
|
||||
int_radix_params params, uint32_t num_radix_blocks,
|
||||
bool allocate_gpu_memory) {
|
||||
@@ -1161,17 +1203,10 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
cleaning_lut_f = [](Torus x) -> Torus { return x; };
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
lsb_stream = cuda_create_stream(stream->gpu_index);
|
||||
msb_stream = cuda_create_stream(stream->gpu_index);
|
||||
|
||||
tmp_lwe_array_out = (Torus *)cuda_malloc_async(
|
||||
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus),
|
||||
stream);
|
||||
|
||||
tmp_packed_input = (Torus *)cuda_malloc_async(
|
||||
(params.big_lwe_dimension + 1) * 2 * num_radix_blocks * sizeof(Torus),
|
||||
stream);
|
||||
|
||||
// Block comparisons
|
||||
tmp_block_comparisons = (Torus *)cuda_malloc_async(
|
||||
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus),
|
||||
@@ -1186,19 +1221,6 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
cleaning_lut_f);
|
||||
|
||||
uint32_t total_modulus = params.message_modulus * params.carry_modulus;
|
||||
auto is_zero_f = [total_modulus](Torus x) -> Torus {
|
||||
return (x % total_modulus) == 0;
|
||||
};
|
||||
|
||||
is_zero_lut = new int_radix_lut<Torus>(
|
||||
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
stream, is_zero_lut->lut, params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
is_zero_f);
|
||||
|
||||
switch (op) {
|
||||
case COMPARISON_TYPE::MAX:
|
||||
case COMPARISON_TYPE::MIN:
|
||||
@@ -1242,14 +1264,8 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
break;
|
||||
}
|
||||
cleaning_lut->release(stream);
|
||||
is_zero_lut->release(stream);
|
||||
delete is_zero_lut;
|
||||
cuda_drop_async(tmp_lwe_array_out, stream);
|
||||
cuda_drop_async(tmp_block_comparisons, stream);
|
||||
cuda_drop_async(tmp_packed_input, stream);
|
||||
|
||||
cuda_destroy_stream(lsb_stream);
|
||||
cuda_destroy_stream(msb_stream);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -106,23 +106,23 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
constexpr int ideal_threads = 128;
|
||||
|
||||
int lwe_size = lwe_dimension_out + 1;
|
||||
int lwe_dim = lwe_dimension_out + 1;
|
||||
int lwe_lower, lwe_upper, cutoff;
|
||||
if (lwe_size % ideal_threads == 0) {
|
||||
lwe_lower = lwe_size / ideal_threads;
|
||||
lwe_upper = lwe_size / ideal_threads;
|
||||
if (lwe_dim % ideal_threads == 0) {
|
||||
lwe_lower = lwe_dim / ideal_threads;
|
||||
lwe_upper = lwe_dim / ideal_threads;
|
||||
cutoff = 0;
|
||||
} else {
|
||||
int y = ceil((double)lwe_size / (double)ideal_threads) * ideal_threads -
|
||||
lwe_size;
|
||||
int y =
|
||||
ceil((double)lwe_dim / (double)ideal_threads) * ideal_threads - lwe_dim;
|
||||
cutoff = ideal_threads - y;
|
||||
lwe_lower = lwe_size / ideal_threads;
|
||||
lwe_upper = (int)ceil((double)lwe_size / (double)ideal_threads);
|
||||
lwe_lower = lwe_dim / ideal_threads;
|
||||
lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
|
||||
}
|
||||
|
||||
int lwe_size_after = lwe_size * num_samples;
|
||||
int lwe_size_after = (lwe_dimension_out + 1) * num_samples;
|
||||
|
||||
int shared_mem = sizeof(Torus) * lwe_size;
|
||||
int shared_mem = sizeof(Torus) * (lwe_dimension_out + 1);
|
||||
|
||||
cuda_memset_async(lwe_array_out, 0, sizeof(Torus) * lwe_size_after, stream);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
@@ -130,7 +130,11 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
|
||||
dim3 grid(num_samples, 1, 1);
|
||||
dim3 threads(ideal_threads, 1, 1);
|
||||
|
||||
keyswitch<Torus><<<grid, threads, shared_mem, stream->stream>>>(
|
||||
// cudaFuncSetAttribute(keyswitch<Torus>,
|
||||
// cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
// shared_mem);
|
||||
|
||||
keyswitch<<<grid, threads, shared_mem, stream->stream>>>(
|
||||
lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count, lwe_lower,
|
||||
lwe_upper, cutoff);
|
||||
|
||||
@@ -4,21 +4,25 @@
|
||||
|
||||
/// Unsafe function to create a CUDA stream, must check first that GPU exists
|
||||
cuda_stream_t *cuda_create_stream(uint32_t gpu_index) {
|
||||
check_cuda_error(cudaSetDevice(gpu_index));
|
||||
cudaSetDevice(gpu_index);
|
||||
cuda_stream_t *stream = new cuda_stream_t(gpu_index);
|
||||
return stream;
|
||||
}
|
||||
|
||||
/// Unsafe function to destroy CUDA stream, must check first the GPU exists
|
||||
void cuda_destroy_stream(cuda_stream_t *stream) { stream->release(); }
|
||||
int cuda_destroy_stream(cuda_stream_t *stream) {
|
||||
stream->release();
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Unsafe function that will try to allocate even if gpu_index is invalid
|
||||
/// or if there's not enough memory. A safe wrapper around it must call
|
||||
/// cuda_check_valid_malloc() first
|
||||
void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
|
||||
check_cuda_error(cudaSetDevice(gpu_index));
|
||||
cudaSetDevice(gpu_index);
|
||||
void *ptr;
|
||||
check_cuda_error(cudaMalloc((void **)&ptr, size));
|
||||
cudaMalloc((void **)&ptr, size);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
return ptr;
|
||||
}
|
||||
@@ -26,7 +30,7 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
|
||||
/// Allocates a size-byte array at the device memory. Tries to do it
|
||||
/// asynchronously.
|
||||
void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) {
|
||||
check_cuda_error(cudaSetDevice(stream->gpu_index));
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
void *ptr;
|
||||
|
||||
#ifndef CUDART_VERSION
|
||||
@@ -48,88 +52,184 @@ void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) {
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/// Check that allocation is valid
|
||||
void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
|
||||
check_cuda_error(cudaSetDevice(gpu_index));
|
||||
size_t total_mem, free_mem;
|
||||
check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
|
||||
if (size > free_mem) {
|
||||
PANIC("Cuda error: not enough memory on device. "
|
||||
"Available: %zu vs Requested: %lu",
|
||||
free_mem, size)
|
||||
/// Checks that allocation is valid
|
||||
/// 0: valid
|
||||
/// -1: invalid, not enough memory in device
|
||||
/// -2: invalid, gpu index doesn't exist
|
||||
int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
|
||||
|
||||
if (gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaSetDevice(gpu_index);
|
||||
size_t total_mem, free_mem;
|
||||
cudaMemGetInfo(&free_mem, &total_mem);
|
||||
if (size > free_mem) {
|
||||
// error code: not enough memory
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Returns
|
||||
/// false if Cooperative Groups is not supported.
|
||||
/// true otherwise
|
||||
bool cuda_check_support_cooperative_groups() {
|
||||
/// -> 0 if Cooperative Groups is not supported.
|
||||
/// -> 1 otherwise
|
||||
int cuda_check_support_cooperative_groups() {
|
||||
int cooperative_groups_supported = 0;
|
||||
check_cuda_error(cudaDeviceGetAttribute(&cooperative_groups_supported,
|
||||
cudaDevAttrCooperativeLaunch, 0));
|
||||
cudaDeviceGetAttribute(&cooperative_groups_supported,
|
||||
cudaDevAttrCooperativeLaunch, 0);
|
||||
|
||||
return cooperative_groups_supported > 0;
|
||||
}
|
||||
|
||||
/// Copy memory to the GPU asynchronously
|
||||
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
|
||||
cuda_stream_t *stream) {
|
||||
if (size == 0)
|
||||
return;
|
||||
cudaPointerAttributes attr;
|
||||
check_cuda_error(cudaPointerGetAttributes(&attr, dest));
|
||||
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
|
||||
PANIC("Cuda error: invalid device pointer in async copy to GPU.")
|
||||
/// Tries to copy memory to the GPU asynchronously
|
||||
/// 0: success
|
||||
/// -1: error, invalid device pointer
|
||||
/// -2: error, gpu index doesn't exist
|
||||
/// -3: error, zero copy size
|
||||
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
|
||||
cuda_stream_t *stream) {
|
||||
if (size == 0) {
|
||||
// error code: zero copy size
|
||||
return -3;
|
||||
}
|
||||
|
||||
check_cuda_error(cudaSetDevice(stream->gpu_index));
|
||||
if (stream->gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaPointerAttributes attr;
|
||||
cudaPointerGetAttributes(&attr, dest);
|
||||
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
|
||||
// error code: invalid device pointer
|
||||
return -1;
|
||||
}
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
check_cuda_error(
|
||||
cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream->stream));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Copy memory within a GPU asynchronously
|
||||
void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
|
||||
cuda_stream_t *stream) {
|
||||
if (size == 0)
|
||||
return;
|
||||
cudaPointerAttributes attr_dest;
|
||||
check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
|
||||
if (attr_dest.device != stream->gpu_index &&
|
||||
attr_dest.type != cudaMemoryTypeDevice) {
|
||||
PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
|
||||
}
|
||||
cudaPointerAttributes attr_src;
|
||||
check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
|
||||
if (attr_src.device != stream->gpu_index &&
|
||||
attr_src.type != cudaMemoryTypeDevice) {
|
||||
PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
|
||||
}
|
||||
if (attr_src.device != attr_dest.device) {
|
||||
PANIC("Cuda error: different devices specified in copy from GPU to GPU.")
|
||||
/// Tries to copy memory to the GPU synchronously
|
||||
/// 0: success
|
||||
/// -1: error, invalid device pointer
|
||||
/// -2: error, gpu index doesn't exist
|
||||
/// -3: error, zero copy size
|
||||
int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size) {
|
||||
if (size == 0) {
|
||||
// error code: zero copy size
|
||||
return -3;
|
||||
}
|
||||
|
||||
check_cuda_error(cudaSetDevice(stream->gpu_index));
|
||||
cudaPointerAttributes attr;
|
||||
cudaPointerGetAttributes(&attr, dest);
|
||||
if (attr.type != cudaMemoryTypeDevice) {
|
||||
// error code: invalid device pointer
|
||||
return -1;
|
||||
}
|
||||
|
||||
check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyHostToDevice));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Tries to copy memory to the CPU synchronously
|
||||
/// 0: success
|
||||
/// -1: error, invalid device pointer
|
||||
/// -2: error, gpu index doesn't exist
|
||||
/// -3: error, zero copy size
|
||||
int cuda_memcpy_to_cpu(void *dest, void *src, uint64_t size) {
|
||||
if (size == 0) {
|
||||
// error code: zero copy size
|
||||
return -3;
|
||||
}
|
||||
|
||||
cudaPointerAttributes attr;
|
||||
cudaPointerGetAttributes(&attr, src);
|
||||
if (attr.type != cudaMemoryTypeDevice) {
|
||||
// error code: invalid device pointer
|
||||
return -1;
|
||||
}
|
||||
|
||||
check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToHost));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Tries to copy memory within a GPU asynchronously
|
||||
/// 0: success
|
||||
/// -1: error, invalid device pointer
|
||||
/// -2: error, gpu index doesn't exist
|
||||
/// -3: error, zero copy size
|
||||
int cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
|
||||
cuda_stream_t *stream) {
|
||||
if (size == 0) {
|
||||
// error code: zero copy size
|
||||
return -3;
|
||||
}
|
||||
|
||||
if (stream->gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaPointerAttributes attr_dest;
|
||||
cudaPointerGetAttributes(&attr_dest, dest);
|
||||
if (attr_dest.device != stream->gpu_index &&
|
||||
attr_dest.type != cudaMemoryTypeDevice) {
|
||||
// error code: invalid device pointer
|
||||
return -1;
|
||||
}
|
||||
cudaPointerAttributes attr_src;
|
||||
cudaPointerGetAttributes(&attr_src, src);
|
||||
if (attr_src.device != stream->gpu_index &&
|
||||
attr_src.type != cudaMemoryTypeDevice) {
|
||||
// error code: invalid device pointer
|
||||
return -1;
|
||||
}
|
||||
if (attr_src.device != attr_dest.device) {
|
||||
// error code: different devices
|
||||
return -1;
|
||||
}
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
check_cuda_error(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice,
|
||||
stream->stream));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Synchronizes device
|
||||
void cuda_synchronize_device(uint32_t gpu_index) {
|
||||
check_cuda_error(cudaSetDevice(gpu_index));
|
||||
check_cuda_error(cudaDeviceSynchronize());
|
||||
/// 0: success
|
||||
/// -2: error, gpu index doesn't exist
|
||||
int cuda_synchronize_device(uint32_t gpu_index) {
|
||||
if (gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaSetDevice(gpu_index);
|
||||
cudaDeviceSynchronize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
|
||||
cuda_stream_t *stream) {
|
||||
if (size == 0)
|
||||
return;
|
||||
cudaPointerAttributes attr;
|
||||
check_cuda_error(cudaPointerGetAttributes(&attr, dest));
|
||||
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
|
||||
PANIC("Cuda error: invalid dest device pointer in cuda memset.")
|
||||
int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
|
||||
cuda_stream_t *stream) {
|
||||
if (size == 0) {
|
||||
// error code: zero copy size
|
||||
return -3;
|
||||
}
|
||||
check_cuda_error(cudaSetDevice(stream->gpu_index));
|
||||
|
||||
if (stream->gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaPointerAttributes attr;
|
||||
cudaPointerGetAttributes(&attr, dest);
|
||||
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
|
||||
// error code: invalid device pointer
|
||||
return -1;
|
||||
}
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
check_cuda_error(cudaMemsetAsync(dest, val, size, stream->stream));
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -142,18 +242,12 @@ __global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
|
||||
template <typename Torus>
|
||||
void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
|
||||
Torus n) {
|
||||
cudaPointerAttributes attr;
|
||||
check_cuda_error(cudaPointerGetAttributes(&attr, d_array));
|
||||
if (attr.type != cudaMemoryTypeDevice) {
|
||||
PANIC("Cuda error: invalid dest device pointer in cuda set value.")
|
||||
}
|
||||
int block_size = 256;
|
||||
int num_blocks = (n + block_size - 1) / block_size;
|
||||
|
||||
// Launch the kernel
|
||||
cuda_set_value_kernel<<<num_blocks, block_size, 0, *stream>>>(d_array, value,
|
||||
n);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
/// Explicitly instantiate cuda_set_value_async for 32 and 64 bits
|
||||
@@ -162,39 +256,57 @@ template void cuda_set_value_async(cudaStream_t *stream, uint64_t *d_array,
|
||||
template void cuda_set_value_async(cudaStream_t *stream, uint32_t *d_array,
|
||||
uint32_t value, uint32_t n);
|
||||
|
||||
/// Copy memory to the CPU asynchronously
|
||||
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
|
||||
cuda_stream_t *stream) {
|
||||
if (size == 0)
|
||||
return;
|
||||
cudaPointerAttributes attr;
|
||||
check_cuda_error(cudaPointerGetAttributes(&attr, src));
|
||||
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
|
||||
PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
|
||||
/// Tries to copy memory to the GPU asynchronously
|
||||
/// 0: success
|
||||
/// -1: error, invalid device pointer
|
||||
/// -2: error, gpu index doesn't exist
|
||||
/// -3: error, zero copy size
|
||||
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
|
||||
cuda_stream_t *stream) {
|
||||
if (size == 0) {
|
||||
// error code: zero copy size
|
||||
return -3;
|
||||
}
|
||||
|
||||
check_cuda_error(cudaSetDevice(stream->gpu_index));
|
||||
if (stream->gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaPointerAttributes attr;
|
||||
cudaPointerGetAttributes(&attr, src);
|
||||
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
|
||||
// error code: invalid device pointer
|
||||
return -1;
|
||||
}
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
check_cuda_error(
|
||||
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream->stream));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Return number of GPUs available
|
||||
int cuda_get_number_of_gpus() {
|
||||
int num_gpus;
|
||||
check_cuda_error(cudaGetDeviceCount(&num_gpus));
|
||||
cudaGetDeviceCount(&num_gpus);
|
||||
return num_gpus;
|
||||
}
|
||||
|
||||
/// Drop a cuda array
|
||||
void cuda_drop(void *ptr, uint32_t gpu_index) {
|
||||
check_cuda_error(cudaSetDevice(gpu_index));
|
||||
int cuda_drop(void *ptr, uint32_t gpu_index) {
|
||||
if (gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaSetDevice(gpu_index);
|
||||
check_cuda_error(cudaFree(ptr));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Drop a cuda array asynchronously, if supported on the device
|
||||
void cuda_drop_async(void *ptr, cuda_stream_t *stream) {
|
||||
/// Drop a cuda array. Tries to do it asynchronously
|
||||
int cuda_drop_async(void *ptr, cuda_stream_t *stream) {
|
||||
|
||||
check_cuda_error(cudaSetDevice(stream->gpu_index));
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
#ifndef CUDART_VERSION
|
||||
#error CUDART_VERSION Undefined!
|
||||
#elif (CUDART_VERSION >= 11020)
|
||||
@@ -211,13 +323,18 @@ void cuda_drop_async(void *ptr, cuda_stream_t *stream) {
|
||||
#else
|
||||
check_cuda_error(cudaFree(ptr));
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Get the maximum size for the shared memory
|
||||
int cuda_get_max_shared_memory(uint32_t gpu_index) {
|
||||
check_cuda_error(cudaSetDevice(gpu_index));
|
||||
if (gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaSetDevice(gpu_index);
|
||||
cudaDeviceProp prop;
|
||||
check_cuda_error(cudaGetDeviceProperties(&prop, gpu_index));
|
||||
cudaGetDeviceProperties(&prop, gpu_index);
|
||||
int max_shared_memory = 0;
|
||||
if (prop.major >= 6) {
|
||||
max_shared_memory = prop.sharedMemPerMultiprocessor;
|
||||
@@ -227,16 +344,7 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
|
||||
return max_shared_memory;
|
||||
}
|
||||
|
||||
void cuda_synchronize_stream(cuda_stream_t *stream) { stream->synchronize(); }
|
||||
|
||||
void cuda_stream_add_callback(cuda_stream_t *stream,
|
||||
cudaStreamCallback_t callback, void *user_data) {
|
||||
|
||||
check_cuda_error(
|
||||
cudaStreamAddCallback(stream->stream, callback, user_data, 0));
|
||||
}
|
||||
|
||||
void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
|
||||
void *host_pointer) {
|
||||
free(host_pointer);
|
||||
int cuda_synchronize_stream(cuda_stream_t *stream) {
|
||||
stream->synchronize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -44,7 +44,6 @@ __host__ void scratch_cuda_integer_radix_bitop_kb(
|
||||
uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
*mem_ptr = new int_bitop_buffer<Torus>(stream, op, params, num_radix_blocks,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
@@ -10,7 +10,6 @@ __host__ void zero_out_if(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
int_zero_out_if_buffer<Torus> *mem_ptr,
|
||||
int_radix_lut<Torus> *predicate, void *bsk,
|
||||
Torus *ksk, uint32_t num_radix_blocks) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
auto params = mem_ptr->params;
|
||||
|
||||
int big_lwe_size = params.big_lwe_dimension + 1;
|
||||
@@ -95,7 +94,6 @@ __host__ void scratch_cuda_integer_radix_cmux_kb(
|
||||
std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
|
||||
int_radix_params params, bool allocate_gpu_memory) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
*mem_ptr = new int_cmux_buffer<Torus>(stream, predicate_lut_f, params,
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
@@ -70,7 +70,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
|
||||
static_cast<uint64_t *>(ksk), lwe_ciphertext_count);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error: integer operation not supported")
|
||||
printf("Not implemented\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -37,7 +37,6 @@ __host__ void accumulate_all_blocks(cuda_stream_t *stream, Torus *output,
|
||||
Torus *input, uint32_t lwe_dimension,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
int num_entries = (lwe_dimension + 1);
|
||||
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
|
||||
@@ -47,13 +46,6 @@ __host__ void accumulate_all_blocks(cuda_stream_t *stream, Torus *output,
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
/* This takes an array of lwe ciphertexts, where each is an encryption of
|
||||
* either 0 or 1.
|
||||
*
|
||||
* It writes in lwe_array_out a single lwe ciphertext encrypting 1 if all input
|
||||
* blocks are 1 otherwise the block encrypts 0
|
||||
*
|
||||
*/
|
||||
template <typename Torus>
|
||||
__host__ void
|
||||
are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
@@ -61,7 +53,6 @@ are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *bsk,
|
||||
Torus *ksk, uint32_t num_radix_blocks) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
auto params = mem_ptr->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
@@ -99,34 +90,29 @@ are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
input_blocks += (big_lwe_dimension + 1) * chunk_length;
|
||||
}
|
||||
accumulator = are_all_block_true_buffer->tmp_block_accumulated;
|
||||
auto is_equal_to_num_blocks_map =
|
||||
&are_all_block_true_buffer->is_equal_to_lut_map;
|
||||
|
||||
// Selects a LUT
|
||||
int_radix_lut<Torus> *lut;
|
||||
if (are_all_block_true_buffer->op == COMPARISON_TYPE::NE) {
|
||||
// is_non_zero_lut_buffer LUT
|
||||
lut = mem_ptr->eq_buffer->is_non_zero_lut;
|
||||
} else if (chunk_length == max_value) {
|
||||
// is_max_value LUT
|
||||
lut = are_all_block_true_buffer->is_max_value_lut;
|
||||
} else {
|
||||
if ((*is_equal_to_num_blocks_map).find(chunk_length) !=
|
||||
(*is_equal_to_num_blocks_map).end()) {
|
||||
// The LUT is already computed
|
||||
lut = (*is_equal_to_num_blocks_map)[chunk_length];
|
||||
} else {
|
||||
// LUT needs to be computed
|
||||
auto new_lut = new int_radix_lut<Torus>(stream, params, max_value,
|
||||
num_radix_blocks, true);
|
||||
|
||||
// is_equal_to_num_blocks LUT
|
||||
lut = are_all_block_true_buffer->is_equal_to_num_blocks_lut;
|
||||
if (chunk_length != lut_num_blocks) {
|
||||
auto is_equal_to_num_blocks_lut_f = [max_value,
|
||||
chunk_length](Torus x) -> Torus {
|
||||
return (x & max_value) == chunk_length;
|
||||
};
|
||||
generate_device_accumulator<Torus>(
|
||||
stream, new_lut->lut, glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, is_equal_to_num_blocks_lut_f);
|
||||
stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
|
||||
carry_modulus, is_equal_to_num_blocks_lut_f);
|
||||
|
||||
(*is_equal_to_num_blocks_map)[chunk_length] = new_lut;
|
||||
lut = new_lut;
|
||||
// We don't have to generate this lut again
|
||||
lut_num_blocks = chunk_length;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -136,60 +122,6 @@ are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
}
|
||||
}
|
||||
|
||||
/* This takes an array of lwe ciphertexts, where each is an encryption of
|
||||
* either 0 or 1.
|
||||
*
|
||||
* It writes in lwe_array_out a single lwe ciphertext encrypting 1 if at least
|
||||
* one input ciphertext encrypts 1 otherwise encrypts 0
|
||||
*/
|
||||
template <typename Torus>
|
||||
__host__ void is_at_least_one_comparisons_block_true(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
|
||||
uint32_t num_radix_blocks) {
|
||||
auto params = mem_ptr->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
|
||||
auto buffer = mem_ptr->eq_buffer->are_all_block_true_buffer;
|
||||
|
||||
uint32_t total_modulus = message_modulus * carry_modulus;
|
||||
uint32_t max_value = total_modulus - 1;
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
lwe_array_out, lwe_array_in,
|
||||
num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);
|
||||
|
||||
uint32_t remaining_blocks = num_radix_blocks;
|
||||
while (remaining_blocks > 1) {
|
||||
// Split in max_value chunks
|
||||
uint32_t chunk_length = std::min(max_value, remaining_blocks);
|
||||
int num_chunks = remaining_blocks / chunk_length;
|
||||
|
||||
// Since all blocks encrypt either 0 or 1, we can sum max_value of them
|
||||
// as in the worst case we will be adding `max_value` ones
|
||||
auto input_blocks = lwe_array_out;
|
||||
auto accumulator = buffer->tmp_block_accumulated;
|
||||
for (int i = 0; i < num_chunks; i++) {
|
||||
accumulate_all_blocks(stream, accumulator, input_blocks,
|
||||
big_lwe_dimension, chunk_length);
|
||||
|
||||
accumulator += (big_lwe_dimension + 1);
|
||||
remaining_blocks -= (chunk_length - 1);
|
||||
input_blocks += (big_lwe_dimension + 1) * chunk_length;
|
||||
}
|
||||
accumulator = buffer->tmp_block_accumulated;
|
||||
|
||||
// Selects a LUT
|
||||
int_radix_lut<Torus> *lut = mem_ptr->eq_buffer->is_non_zero_lut;
|
||||
|
||||
// Applies the LUT
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
stream, lwe_array_out, accumulator, bsk, ksk, num_chunks, lut);
|
||||
}
|
||||
}
|
||||
|
||||
// This takes an input slice of blocks.
|
||||
//
|
||||
// Each block can encrypt any value as long as its < message_modulus.
|
||||
@@ -213,9 +145,8 @@ template <typename Torus>
|
||||
__host__ void host_compare_with_zero_equality(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
|
||||
int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {
|
||||
int32_t num_radix_blocks) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
auto params = mem_ptr->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto message_modulus = params.message_modulus;
|
||||
@@ -244,6 +175,7 @@ __host__ void host_compare_with_zero_equality(
|
||||
num_sum_blocks = 1;
|
||||
} else {
|
||||
uint32_t remainder_blocks = num_radix_blocks;
|
||||
|
||||
auto sum_i = sum;
|
||||
auto chunk = lwe_array_in;
|
||||
while (remainder_blocks > 1) {
|
||||
@@ -262,8 +194,9 @@ __host__ void host_compare_with_zero_equality(
|
||||
}
|
||||
}
|
||||
|
||||
auto is_equal_to_zero_lut = mem_ptr->diff_buffer->is_zero_lut;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
stream, sum, sum, bsk, ksk, num_sum_blocks, zero_comparison);
|
||||
stream, sum, sum, bsk, ksk, num_sum_blocks, is_equal_to_zero_lut);
|
||||
are_all_comparisons_block_true(stream, lwe_array_out, sum, mem_ptr, bsk, ksk,
|
||||
num_sum_blocks);
|
||||
|
||||
@@ -310,7 +243,6 @@ __host__ void scratch_cuda_integer_radix_equality_check_kb(
|
||||
uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
*mem_ptr = new int_comparison_buffer<Torus>(
|
||||
stream, op, params, num_radix_blocks, allocate_gpu_memory);
|
||||
}
|
||||
@@ -370,7 +302,6 @@ tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
std::function<Torus(Torus)> sign_handler_f, void *bsk,
|
||||
Torus *ksk, uint32_t num_radix_blocks) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
auto params = tree_buffer->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
|
||||
@@ -59,9 +59,7 @@ void cuda_full_propagation_64_inplace(
|
||||
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (full propagation inplace): unsupported polynomial size. "
|
||||
"Supported N's are powers of two"
|
||||
" in the interval [256..16384].")
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -88,21 +86,10 @@ void cleanup_cuda_full_propagation(cuda_stream_t *stream,
|
||||
cuda_drop_async(mem_ptr->lut_buffer, stream);
|
||||
cuda_drop_async(mem_ptr->lut_indexes, stream);
|
||||
|
||||
cuda_drop_async(mem_ptr->pbs_buffer, stream);
|
||||
|
||||
cuda_drop_async(mem_ptr->tmp_small_lwe_vector, stream);
|
||||
cuda_drop_async(mem_ptr->tmp_big_lwe_vector, stream);
|
||||
|
||||
switch (mem_ptr->pbs_type) {
|
||||
case LOW_LAT: {
|
||||
auto x = (pbs_buffer<uint64_t, LOW_LAT> *)(mem_ptr->pbs_buffer);
|
||||
x->release(stream);
|
||||
} break;
|
||||
case MULTI_BIT: {
|
||||
auto x = (pbs_buffer<uint64_t, MULTI_BIT> *)(mem_ptr->pbs_buffer);
|
||||
x->release(stream);
|
||||
} break;
|
||||
default:
|
||||
PANIC("Cuda error (PBS): unsupported implementation variant.")
|
||||
}
|
||||
}
|
||||
|
||||
void scratch_cuda_propagate_single_carry_low_latency_kb_64_inplace(
|
||||
|
||||
@@ -1,17 +1,89 @@
|
||||
#ifndef CUDA_INTEGER_CUH
|
||||
#define CUDA_INTEGER_CUH
|
||||
|
||||
#include "bootstrap.h"
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "integer.h"
|
||||
#include "integer/scalar_addition.cuh"
|
||||
#include "linear_algebra.h"
|
||||
#include "linearalgebra/addition.cuh"
|
||||
#include "pbs/bootstrap_low_latency.cuh"
|
||||
#include "pbs/bootstrap_multibit.cuh"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <functional>
|
||||
|
||||
template <typename Torus>
|
||||
void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, PBS_TYPE pbs_type) {
|
||||
if (sizeof(Torus) == sizeof(uint32_t)) {
|
||||
// 32 bits
|
||||
switch (pbs_type) {
|
||||
case MULTI_BIT:
|
||||
printf("multibit\n");
|
||||
printf("Error: 32-bit multibit PBS is not supported.\n");
|
||||
break;
|
||||
case LOW_LAT:
|
||||
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
|
||||
num_luts, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case AMORTIZED:
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
|
||||
num_luts, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// 64 bits
|
||||
switch (pbs_type) {
|
||||
case MULTI_BIT:
|
||||
cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, grouping_factor, base_log, level_count,
|
||||
input_lwe_ciphertext_count, num_luts, lwe_idx,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case LOW_LAT:
|
||||
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
|
||||
num_luts, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case AMORTIZED:
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
|
||||
num_luts, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// function rotates right radix ciphertext with specific value
|
||||
// grid is one dimensional
|
||||
// blockIdx.x represents x_th block of radix ciphertext
|
||||
@@ -83,7 +155,6 @@ __host__ void pack_bivariate_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
uint32_t message_modulus,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
// Left message is shifted
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
int num_entries = num_radix_blocks * (lwe_dimension + 1);
|
||||
@@ -98,7 +169,6 @@ template <typename Torus>
|
||||
__host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in, void *bsk,
|
||||
Torus *ksk, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
// apply_lookup_table
|
||||
auto params = lut->params;
|
||||
auto pbs_type = params.pbs_type;
|
||||
@@ -118,12 +188,12 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
lut->lwe_indexes, ksk, big_lwe_dimension, small_lwe_dimension,
|
||||
ks_base_log, ks_level, num_radix_blocks);
|
||||
|
||||
execute_pbs<Torus>(stream, lwe_array_out, lut->lwe_indexes, lut->lut,
|
||||
lut->lut_indexes, lut->tmp_lwe_after_ks, lut->lwe_indexes,
|
||||
bsk, lut->buffer, glwe_dimension, small_lwe_dimension,
|
||||
polynomial_size, pbs_base_log, pbs_level, grouping_factor,
|
||||
num_radix_blocks, 1, 0,
|
||||
cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
|
||||
execute_pbs(stream, lwe_array_out, lut->lwe_indexes, lut->lut,
|
||||
lut->lut_indexes, lut->tmp_lwe_after_ks, lut->lwe_indexes, bsk,
|
||||
lut->pbs_buffer, glwe_dimension, small_lwe_dimension,
|
||||
polynomial_size, pbs_base_log, pbs_level, grouping_factor,
|
||||
num_radix_blocks, 1, 0,
|
||||
cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -131,7 +201,6 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
|
||||
Torus *lwe_array_2, void *bsk, Torus *ksk, uint32_t num_radix_blocks,
|
||||
int_radix_lut<Torus> *lut) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
// apply_lookup_table_bivariate
|
||||
|
||||
auto params = lut->params;
|
||||
@@ -239,8 +308,8 @@ void generate_device_accumulator_bivariate(
|
||||
acc_bivariate, h_lut,
|
||||
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream);
|
||||
|
||||
// Release memory when possible
|
||||
cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
|
||||
cuda_synchronize_stream(stream);
|
||||
free(h_lut);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -271,8 +340,8 @@ void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
|
||||
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
|
||||
stream);
|
||||
|
||||
// Release memory when possible
|
||||
cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
|
||||
cuda_synchronize_stream(stream);
|
||||
free(h_lut);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -294,6 +363,7 @@ void host_propagate_single_carry_low_latency(cuda_stream_t *stream,
|
||||
auto params = mem->params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto big_lwe_size = glwe_dimension * polynomial_size + 1;
|
||||
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
|
||||
|
||||
@@ -344,9 +414,10 @@ void host_propagate_single_carry_low_latency(cuda_stream_t *stream,
|
||||
/*
|
||||
* input_blocks: input radix ciphertext propagation will happen inplace
|
||||
* acc_message_carry: list of two lut s, [(message_acc), (carry_acc)]
|
||||
* lut_indexes_message_carry: lut_indexes for message and carry, should always
|
||||
* be {0, 1} small_lwe_vector: output of keyswitch should have size = 2 *
|
||||
* (lwe_dimension + 1) * sizeof(Torus) big_lwe_vector: output of pbs should have
|
||||
* lut_indexes_message_carry: lut_indexes for message and carry, should always be {0, 1}
|
||||
* small_lwe_vector: output of keyswitch should have
|
||||
* size = 2 * (lwe_dimension + 1) * sizeof(Torus)
|
||||
* big_lwe_vector: output of pbs should have
|
||||
* size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
|
||||
*/
|
||||
template <typename Torus, typename STorus, class params>
|
||||
@@ -403,12 +474,31 @@ void scratch_cuda_full_propagation(
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
// PBS
|
||||
int8_t *pbs_buffer;
|
||||
execute_scratch_pbs<Torus>(stream, &pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, pbs_level, grouping_factor,
|
||||
num_radix_blocks,
|
||||
cuda_get_max_shared_memory(stream->gpu_index),
|
||||
pbs_type, allocate_gpu_memory);
|
||||
if (pbs_type == MULTI_BIT) {
|
||||
uint32_t lwe_chunk_size = get_average_lwe_chunk_size(
|
||||
lwe_dimension, pbs_level, glwe_dimension, num_radix_blocks);
|
||||
// Only 64 bits is supported
|
||||
scratch_cuda_multi_bit_pbs_64(stream, &pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, pbs_level,
|
||||
grouping_factor, num_radix_blocks,
|
||||
cuda_get_max_shared_memory(stream->gpu_index),
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
} else {
|
||||
// Classic
|
||||
// We only use low latency for classic mode
|
||||
if (sizeof(Torus) == sizeof(uint32_t))
|
||||
scratch_cuda_bootstrap_low_latency_32(
|
||||
stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level,
|
||||
num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
|
||||
allocate_gpu_memory);
|
||||
else
|
||||
scratch_cuda_bootstrap_low_latency_64(
|
||||
stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level,
|
||||
num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
// LUT
|
||||
Torus *lut_buffer;
|
||||
@@ -461,8 +551,8 @@ void scratch_cuda_full_propagation(
|
||||
h_lwe_indexes[i] = i;
|
||||
cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
|
||||
stream);
|
||||
cuda_stream_add_callback(stream, host_free_on_stream_callback,
|
||||
h_lwe_indexes);
|
||||
cuda_synchronize_stream(stream);
|
||||
free(h_lwe_indexes);
|
||||
}
|
||||
|
||||
// Temporary arrays
|
||||
@@ -531,11 +621,7 @@ template <typename Torus>
|
||||
__host__ void pack_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
Torus *lwe_array_in, uint32_t lwe_dimension,
|
||||
uint32_t num_radix_blocks, uint32_t factor) {
|
||||
if (lwe_array_out == lwe_array_in)
|
||||
PANIC("Cuda error in pack blocks: input and output pointers must be "
|
||||
"different.");
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
assert(lwe_array_out != lwe_array_in);
|
||||
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
int num_entries = (lwe_dimension + 1);
|
||||
@@ -565,7 +651,6 @@ create_trivial_radix(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
uint32_t num_radix_blocks, uint32_t num_scalar_blocks,
|
||||
uint64_t message_modulus, uint64_t carry_modulus) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
size_t radix_size = (lwe_dimension + 1) * num_radix_blocks;
|
||||
cuda_memset_async(lwe_array_out, 0, radix_size * sizeof(Torus), stream);
|
||||
|
||||
|
||||
@@ -24,8 +24,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
|
||||
"Only N = 2048 is supported")
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -76,8 +75,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
num_blocks);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
|
||||
"Only N = 2048 is supported")
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -7,11 +7,15 @@
|
||||
#endif
|
||||
|
||||
#include "bootstrap.h"
|
||||
#include "bootstrap_multibit.h"
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "integer.h"
|
||||
#include "integer/integer.cuh"
|
||||
#include "linear_algebra.h"
|
||||
#include "pbs/bootstrap_amortized.cuh"
|
||||
#include "pbs/bootstrap_low_latency.cuh"
|
||||
#include "pbs/bootstrap_multibit.cuh"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <fstream>
|
||||
@@ -78,7 +82,6 @@ void compress_device_array_with_map(cuda_stream_t *stream, Torus *src,
|
||||
Torus *dst, int *S, int *F, int num_blocks,
|
||||
uint32_t map_size, uint32_t unit_size,
|
||||
int &total_copied, bool is_message) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
for (int i = 0; i < map_size; i++) {
|
||||
int s_index = i * num_blocks + S[i];
|
||||
int number_of_unit = F[i] - S[i] + is_message;
|
||||
@@ -97,7 +100,6 @@ void extract_message_carry_to_full_radix(cuda_stream_t *stream, Torus *src,
|
||||
int &total_copied,
|
||||
int &total_radix_copied,
|
||||
int num_blocks, bool is_message) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
size_t radix_size = unit_size * num_blocks;
|
||||
for (int i = 0; i < map_size; i++) {
|
||||
auto cur_dst_radix = &dst[total_radix_copied * radix_size];
|
||||
@@ -225,7 +227,6 @@ __host__ void host_integer_mult_radix_kb(
|
||||
uint64_t *radix_lwe_right, void *bsk, uint64_t *ksk,
|
||||
int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
auto glwe_dimension = mem_ptr->params.glwe_dimension;
|
||||
auto polynomial_size = mem_ptr->params.polynomial_size;
|
||||
auto lwe_dimension = mem_ptr->params.small_lwe_dimension;
|
||||
@@ -276,6 +277,11 @@ __host__ void host_integer_mult_radix_kb(
|
||||
// lwe_dimension +1 coefficients
|
||||
auto small_lwe_vector = mem_ptr->small_lwe_vector;
|
||||
|
||||
// buffer to keep pbs result for num_blocks^2 lwe_ciphertext
|
||||
// in total it has num_blocks^2 big lwe ciphertexts with
|
||||
// glwe_dimension * polynomial_size + 1 coefficients
|
||||
auto lwe_pbs_out_array = mem_ptr->lwe_pbs_out_array;
|
||||
|
||||
// it contains two lut, first for lsb extraction,
|
||||
// second for msb extraction, with total length =
|
||||
// 2 * (glwe_dimension + 1) * polynomial_size
|
||||
@@ -402,19 +408,19 @@ __host__ void host_integer_mult_radix_kb(
|
||||
polynomial_size * glwe_dimension, lwe_dimension,
|
||||
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_copied);
|
||||
|
||||
execute_pbs<Torus>(stream, message_blocks_vector, lwe_indexes,
|
||||
luts_message->lut, luts_message->lut_indexes,
|
||||
small_lwe_vector, lwe_indexes, bsk, luts_message->buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size,
|
||||
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
|
||||
mem_ptr->params.grouping_factor, message_count, 1, 0,
|
||||
max_shared_memory, mem_ptr->params.pbs_type);
|
||||
execute_pbs<Torus>(
|
||||
stream, message_blocks_vector, lwe_indexes, luts_message->lut,
|
||||
luts_message->lut_indexes, small_lwe_vector, lwe_indexes, bsk,
|
||||
luts_message->pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, mem_ptr->params.pbs_base_log,
|
||||
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
|
||||
message_count, 1, 0, max_shared_memory, mem_ptr->params.pbs_type);
|
||||
|
||||
execute_pbs<Torus>(stream, carry_blocks_vector, lwe_indexes,
|
||||
luts_carry->lut, luts_carry->lut_indexes,
|
||||
&small_lwe_vector[message_count * (lwe_dimension + 1)],
|
||||
lwe_indexes, bsk, luts_carry->buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size,
|
||||
lwe_indexes, bsk, luts_carry->pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size,
|
||||
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
|
||||
mem_ptr->params.grouping_factor, carry_count, 1, 0,
|
||||
max_shared_memory, mem_ptr->params.pbs_type);
|
||||
@@ -457,7 +463,7 @@ __host__ void host_integer_mult_radix_kb(
|
||||
cuda_memset_async(block_mul_res, 0, big_lwe_size * sizeof(Torus), stream);
|
||||
|
||||
host_addition(stream, radix_lwe_out, vector_result_sb, block_mul_res,
|
||||
big_lwe_dimension, num_blocks);
|
||||
big_lwe_size, num_blocks);
|
||||
|
||||
host_propagate_single_carry_low_latency<Torus>(
|
||||
stream, radix_lwe_out, mem_ptr->scp_mem, bsk, ksk, num_blocks);
|
||||
@@ -468,7 +474,6 @@ __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
|
||||
cuda_stream_t *stream, int_mul_memory<Torus> **mem_ptr,
|
||||
uint32_t num_radix_blocks, int_radix_params params,
|
||||
bool allocate_gpu_memory) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
*mem_ptr = new int_mul_memory<Torus>(stream, params, num_radix_blocks,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
@@ -542,13 +547,13 @@ void apply_lookup_table(Torus *input_ciphertexts, Torus *output_ciphertexts,
|
||||
Torus *cur_lut_indexes;
|
||||
if (lsb_msb_mode) {
|
||||
cur_lut_indexes = (big_lwe_start_index < lsb_message_blocks_count)
|
||||
? mem_ptr->lut_indexes_lsb_multi_gpu[i]
|
||||
: mem_ptr->lut_indexes_msb_multi_gpu[i];
|
||||
? mem_ptr->lut_indexes_lsb_multi_gpu[i]
|
||||
: mem_ptr->lut_indexes_msb_multi_gpu[i];
|
||||
|
||||
} else {
|
||||
cur_lut_indexes = (big_lwe_start_index < lsb_message_blocks_count)
|
||||
? mem_ptr->lut_indexes_message_multi_gpu[i]
|
||||
: mem_ptr->lut_indexes_carry_multi_gpu[i];
|
||||
? mem_ptr->lut_indexes_message_multi_gpu[i]
|
||||
: mem_ptr->lut_indexes_carry_multi_gpu[i];
|
||||
}
|
||||
|
||||
// execute keyswitch on a current gpu with corresponding input and output
|
||||
|
||||
@@ -11,7 +11,6 @@ __host__ void host_integer_radix_scalar_bitop_kb(
|
||||
int_bitop_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
|
||||
uint32_t num_radix_blocks, BITOP_TYPE op) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
auto lut = mem_ptr->lut;
|
||||
auto params = lut->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
@@ -20,6 +19,7 @@ __host__ void host_integer_radix_scalar_bitop_kb(
|
||||
|
||||
if (num_clear_blocks == 0) {
|
||||
if (op == SCALAR_BITAND) {
|
||||
auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
|
||||
cuda_memset_async(lwe_array_out, 0,
|
||||
num_radix_blocks * lwe_size * sizeof(Torus), stream);
|
||||
} else {
|
||||
@@ -28,6 +28,7 @@ __host__ void host_integer_radix_scalar_bitop_kb(
|
||||
stream);
|
||||
}
|
||||
} else {
|
||||
auto lut_buffer = lut->lut;
|
||||
// We have all possible LUTs pre-computed and we use the decomposed scalar
|
||||
// as index to recover the right one
|
||||
cuda_memcpy_async_gpu_to_gpu(lut->lut_indexes, clear_blocks,
|
||||
@@ -37,7 +38,7 @@ __host__ void host_integer_radix_scalar_bitop_kb(
|
||||
stream, lwe_array_out, lwe_array_input, bsk, ksk, num_clear_blocks,
|
||||
lut);
|
||||
|
||||
if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) {
|
||||
if (op == SCALAR_BITAND) {
|
||||
auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
|
||||
cuda_memset_async(lwe_array_out_block, 0,
|
||||
(num_radix_blocks - num_clear_blocks) * lwe_size *
|
||||
|
||||
@@ -8,14 +8,17 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
int_comparison_buffer<uint64_t> *buffer =
|
||||
(int_comparison_buffer<uint64_t> *)mem_ptr;
|
||||
switch (buffer->op) {
|
||||
case EQ:
|
||||
case NE:
|
||||
host_integer_radix_scalar_equality_check_kb<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(scalar_blocks), buffer, bsk,
|
||||
static_cast<uint64_t *>(ksk), lwe_ciphertext_count, num_scalar_blocks);
|
||||
break;
|
||||
// case EQ:
|
||||
// case NE:
|
||||
// host_integer_radix_equality_check_kb<uint64_t>(
|
||||
// stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
// static_cast<uint64_t *>(lwe_array_1),
|
||||
// static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
|
||||
// static_cast<uint64_t *>(ksk), glwe_dimension, polynomial_size,
|
||||
// big_lwe_dimension, small_lwe_dimension, ks_level, ks_base_log,
|
||||
// pbs_level, pbs_base_log, grouping_factor, lwe_ciphertext_count,
|
||||
// message_modulus, carry_modulus);
|
||||
// break;
|
||||
case GT:
|
||||
case GE:
|
||||
case LT:
|
||||
@@ -36,6 +39,6 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
static_cast<uint64_t *>(ksk), lwe_ciphertext_count, num_scalar_blocks);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error: integer operation not supported")
|
||||
printf("Not implemented\n");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,7 +11,6 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
|
||||
std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
|
||||
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
auto params = mem_ptr->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
@@ -47,9 +46,9 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
|
||||
if (total_num_scalar_blocks == 0) {
|
||||
// We only have to compare blocks with zero
|
||||
// means scalar is zero
|
||||
host_compare_with_zero_equality(
|
||||
stream, mem_ptr->tmp_lwe_array_out, lwe_array_in, mem_ptr, bsk, ksk,
|
||||
total_num_radix_blocks, mem_ptr->is_zero_lut);
|
||||
host_compare_with_zero_equality(stream, mem_ptr->tmp_lwe_array_out,
|
||||
lwe_array_in, mem_ptr, bsk, ksk,
|
||||
total_num_radix_blocks);
|
||||
|
||||
auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
|
||||
x = (x == 1 ? IS_EQUAL : IS_SUPERIOR);
|
||||
@@ -85,8 +84,8 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
|
||||
auto lwe_array_msb_out = lwe_array_lsb_out + big_lwe_size;
|
||||
|
||||
cuda_synchronize_stream(stream);
|
||||
auto lsb_stream = mem_ptr->lsb_stream;
|
||||
auto msb_stream = mem_ptr->msb_stream;
|
||||
auto lsb_stream = diff_buffer->lsb_stream;
|
||||
auto msb_stream = diff_buffer->msb_stream;
|
||||
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
@@ -129,8 +128,8 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
|
||||
//////////////
|
||||
// msb
|
||||
host_compare_with_zero_equality(msb_stream, lwe_array_msb_out, msb,
|
||||
mem_ptr, bsk, ksk, num_msb_radix_blocks,
|
||||
mem_ptr->is_zero_lut);
|
||||
mem_ptr, bsk, ksk,
|
||||
num_msb_radix_blocks);
|
||||
}
|
||||
}
|
||||
cuda_synchronize_stream(lsb_stream);
|
||||
@@ -210,9 +209,17 @@ scalar_compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *bsk,
|
||||
Torus *ksk, uint32_t num_radix_blocks) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
auto params = mem_ptr->params;
|
||||
auto pbs_type = params.pbs_type;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto small_lwe_dimension = params.small_lwe_dimension;
|
||||
auto ks_level = params.ks_level;
|
||||
auto ks_base_log = params.ks_base_log;
|
||||
auto pbs_level = params.pbs_level;
|
||||
auto pbs_base_log = params.pbs_base_log;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto grouping_factor = params.grouping_factor;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
|
||||
@@ -260,7 +267,6 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
|
||||
Torus *ksk, uint32_t total_num_radix_blocks,
|
||||
uint32_t total_num_scalar_blocks) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
auto params = mem_ptr->params;
|
||||
|
||||
// Calculates the difference sign between the ciphertext and the scalar
|
||||
@@ -289,115 +295,4 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
|
||||
stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
|
||||
lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_scalar_equality_check_kb(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
|
||||
Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
|
||||
Torus *ksk, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
|
||||
|
||||
auto params = mem_ptr->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto message_modulus = params.message_modulus;
|
||||
|
||||
auto eq_buffer = mem_ptr->eq_buffer;
|
||||
|
||||
size_t big_lwe_size = big_lwe_dimension + 1;
|
||||
size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
|
||||
|
||||
auto scalar_comparison_luts = eq_buffer->scalar_comparison_luts;
|
||||
|
||||
uint32_t num_halved_scalar_blocks =
|
||||
(num_scalar_blocks / 2) + (num_scalar_blocks % 2);
|
||||
|
||||
uint32_t num_lsb_radix_blocks =
|
||||
std::min(num_radix_blocks, 2 * num_halved_scalar_blocks);
|
||||
uint32_t num_msb_radix_blocks = num_radix_blocks - num_lsb_radix_blocks;
|
||||
uint32_t num_halved_lsb_radix_blocks =
|
||||
(num_lsb_radix_blocks / 2) + (num_lsb_radix_blocks % 2);
|
||||
|
||||
auto lsb = lwe_array_in;
|
||||
auto msb = lwe_array_in + big_lwe_size * num_lsb_radix_blocks;
|
||||
|
||||
auto lwe_array_lsb_out = mem_ptr->tmp_lwe_array_out;
|
||||
auto lwe_array_msb_out =
|
||||
lwe_array_lsb_out + big_lwe_size * num_halved_lsb_radix_blocks;
|
||||
|
||||
cuda_synchronize_stream(stream);
|
||||
|
||||
auto lsb_stream = mem_ptr->lsb_stream;
|
||||
auto msb_stream = mem_ptr->msb_stream;
|
||||
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
// Both sections may be executed in parallel
|
||||
#pragma omp section
|
||||
{
|
||||
if (num_halved_scalar_blocks > 0) {
|
||||
auto packed_blocks = mem_ptr->tmp_packed_input;
|
||||
auto packed_scalar =
|
||||
packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;
|
||||
|
||||
pack_blocks(lsb_stream, packed_blocks, lsb, big_lwe_dimension,
|
||||
num_lsb_radix_blocks, message_modulus);
|
||||
pack_blocks(lsb_stream, packed_scalar, scalar_blocks, 0,
|
||||
num_scalar_blocks, message_modulus);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
scalar_comparison_luts->lut_indexes, packed_scalar,
|
||||
num_halved_scalar_blocks * sizeof(Torus), lsb_stream);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
lsb_stream, lwe_array_lsb_out, packed_blocks, bsk, ksk,
|
||||
num_halved_lsb_radix_blocks, scalar_comparison_luts);
|
||||
}
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
//////////////
|
||||
// msb
|
||||
if (num_msb_radix_blocks > 0) {
|
||||
int_radix_lut<Torus> *msb_lut;
|
||||
switch (mem_ptr->op) {
|
||||
case COMPARISON_TYPE::EQ:
|
||||
msb_lut = mem_ptr->is_zero_lut;
|
||||
break;
|
||||
case COMPARISON_TYPE::NE:
|
||||
msb_lut = mem_ptr->eq_buffer->is_non_zero_lut;
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error: integer operation not supported")
|
||||
}
|
||||
|
||||
host_compare_with_zero_equality(msb_stream, lwe_array_msb_out, msb,
|
||||
mem_ptr, bsk, ksk, num_msb_radix_blocks,
|
||||
msb_lut);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cuda_synchronize_stream(lsb_stream);
|
||||
cuda_synchronize_stream(msb_stream);
|
||||
|
||||
switch (mem_ptr->op) {
|
||||
case COMPARISON_TYPE::EQ:
|
||||
are_all_comparisons_block_true(
|
||||
stream, lwe_array_out, lwe_array_lsb_out, mem_ptr, bsk, ksk,
|
||||
num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
|
||||
break;
|
||||
case COMPARISON_TYPE::NE:
|
||||
is_at_least_one_comparisons_block_true(
|
||||
stream, lwe_array_out, lwe_array_lsb_out, mem_ptr, bsk, ksk,
|
||||
num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error: integer operation not supported")
|
||||
}
|
||||
|
||||
// The result will be in the two first block. Everything else is
|
||||
// garbage.
|
||||
if (num_radix_blocks > 1)
|
||||
cuda_memset_async(lwe_array_out + big_lwe_size, 0,
|
||||
big_lwe_size_bytes * (num_radix_blocks - 1), stream);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -30,7 +30,6 @@ __host__ void scratch_cuda_integer_radix_scalar_rotate_kb(
|
||||
uint32_t num_radix_blocks, int_radix_params params, SHIFT_TYPE shift_type,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
*mem_ptr = new int_shift_buffer<Torus>(stream, shift_type, params,
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
}
|
||||
@@ -40,7 +39,6 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
|
||||
cuda_stream_t *stream, Torus *lwe_array, uint32_t n,
|
||||
int_shift_buffer<Torus> *mem, void *bsk, Torus *ksk, uint32_t num_blocks) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
auto params = mem->params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
|
||||
@@ -17,7 +17,6 @@ __host__ void scratch_cuda_integer_radix_scalar_shift_kb(
|
||||
uint32_t num_radix_blocks, int_radix_params params, SHIFT_TYPE shift_type,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
*mem_ptr = new int_shift_buffer<Torus>(stream, shift_type, params,
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
}
|
||||
@@ -27,7 +26,6 @@ __host__ void host_integer_radix_scalar_shift_kb_inplace(
|
||||
cuda_stream_t *stream, Torus *lwe_array, uint32_t shift,
|
||||
int_shift_buffer<Torus> *mem, void *bsk, Torus *ksk, uint32_t num_blocks) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
auto params = mem->params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
@@ -46,10 +44,10 @@ __host__ void host_integer_radix_scalar_shift_kb_inplace(
|
||||
size_t rotations = std::min(shift / num_bits_in_block, (size_t)num_blocks);
|
||||
size_t shift_within_block = shift % num_bits_in_block;
|
||||
|
||||
Torus *full_rotated_buffer = mem->tmp_rotated;
|
||||
Torus *rotated_buffer = &full_rotated_buffer[big_lwe_size];
|
||||
Torus *rotated_buffer = mem->tmp_rotated;
|
||||
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
auto lut_univariate = mem->lut_buffers_univariate[shift_within_block];
|
||||
|
||||
// rotate right all the blocks in radix ciphertext
|
||||
// copy result in new buffer
|
||||
@@ -70,15 +68,23 @@ __host__ void host_integer_radix_scalar_shift_kb_inplace(
|
||||
return;
|
||||
}
|
||||
|
||||
auto partial_current_blocks = &lwe_array[rotations * big_lwe_size];
|
||||
auto partial_previous_blocks =
|
||||
&full_rotated_buffer[rotations * big_lwe_size];
|
||||
// check if we have enough blocks for partial processing
|
||||
if (rotations < num_blocks - 1) {
|
||||
auto partial_current_blocks = &lwe_array[(rotations + 1) * big_lwe_size];
|
||||
auto partial_previous_blocks = &lwe_array[rotations * big_lwe_size];
|
||||
|
||||
size_t partial_block_count = num_blocks - rotations;
|
||||
size_t partial_block_count = num_blocks - rotations - 1;
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
stream, partial_current_blocks, partial_current_blocks,
|
||||
partial_previous_blocks, bsk, ksk, partial_block_count, lut_bivariate);
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
stream, partial_current_blocks, partial_current_blocks,
|
||||
partial_previous_blocks, bsk, ksk, partial_block_count,
|
||||
lut_bivariate);
|
||||
}
|
||||
|
||||
auto rest = &lwe_array[rotations * big_lwe_size];
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
stream, rest, rest, bsk, ksk, 1, lut_univariate);
|
||||
|
||||
} else {
|
||||
// right shift
|
||||
@@ -96,14 +102,23 @@ __host__ void host_integer_radix_scalar_shift_kb_inplace(
|
||||
return;
|
||||
}
|
||||
|
||||
auto partial_current_blocks = lwe_array;
|
||||
auto partial_next_blocks = &rotated_buffer[big_lwe_size];
|
||||
// check if we have enough blocks for partial processing
|
||||
if (rotations < num_blocks - 1) {
|
||||
auto partial_current_blocks = lwe_array;
|
||||
auto partial_next_blocks = &lwe_array[big_lwe_size];
|
||||
|
||||
size_t partial_block_count = num_blocks - rotations;
|
||||
size_t partial_block_count = num_blocks - rotations - 1;
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
stream, partial_current_blocks, partial_current_blocks,
|
||||
partial_next_blocks, bsk, ksk, partial_block_count, lut_bivariate);
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
stream, partial_current_blocks, partial_current_blocks,
|
||||
partial_next_blocks, bsk, ksk, partial_block_count, lut_bivariate);
|
||||
}
|
||||
|
||||
// The right-most block is done separately as it does not
|
||||
// need to recuperate the shifted bits from its right neighbour.
|
||||
auto last_block = &lwe_array[(num_blocks - rotations - 1) * big_lwe_size];
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
stream, last_block, last_block, bsk, ksk, 1, lut_univariate);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1 +1 @@
|
||||
#include "bootstrap.cuh"
|
||||
#include "bootstrapping_key.cuh"
|
||||
|
||||
@@ -1,136 +0,0 @@
|
||||
#include "../../include/bootstrap.h"
|
||||
#include "../../include/device.h"
|
||||
#include "../include/device.h"
|
||||
#include "bootstrap_low_latency.cuh"
|
||||
#include "bootstrap_multibit.cuh"
|
||||
|
||||
template <typename Torus>
|
||||
void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, PBS_TYPE pbs_type) {
|
||||
switch (sizeof(Torus)) {
|
||||
case sizeof(uint32_t):
|
||||
// 32 bits
|
||||
switch (pbs_type) {
|
||||
case MULTI_BIT:
|
||||
PANIC("Error: 32-bit multibit PBS is not supported.\n")
|
||||
case LOW_LAT:
|
||||
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
|
||||
num_luts, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case AMORTIZED:
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
|
||||
num_luts, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case sizeof(uint64_t):
|
||||
// 64 bits
|
||||
switch (pbs_type) {
|
||||
case MULTI_BIT:
|
||||
cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, grouping_factor, base_log, level_count,
|
||||
input_lwe_ciphertext_count, num_luts, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case LOW_LAT:
|
||||
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
|
||||
num_luts, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case AMORTIZED:
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
|
||||
num_luts, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Error: unsupported cuda PBS type.")
|
||||
}
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer "
|
||||
"moduli are supported.")
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void execute_scratch_pbs(cuda_stream_t *stream, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory) {
|
||||
switch (sizeof(Torus)) {
|
||||
case sizeof(uint32_t):
|
||||
// 32 bits
|
||||
switch (pbs_type) {
|
||||
case MULTI_BIT:
|
||||
PANIC("Error: 32-bit multibit PBS is not supported.\n")
|
||||
case LOW_LAT:
|
||||
scratch_cuda_bootstrap_low_latency_32(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case AMORTIZED:
|
||||
scratch_cuda_bootstrap_amortized_32(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Error: unsupported cuda PBS type.")
|
||||
}
|
||||
break;
|
||||
case sizeof(uint64_t):
|
||||
// 64 bits
|
||||
switch (pbs_type) {
|
||||
case MULTI_BIT:
|
||||
scratch_cuda_multi_bit_pbs_64(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, grouping_factor, input_lwe_ciphertext_count,
|
||||
max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case LOW_LAT:
|
||||
scratch_cuda_bootstrap_low_latency_64(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case AMORTIZED:
|
||||
scratch_cuda_bootstrap_amortized_64(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Error: unsupported cuda PBS type.")
|
||||
}
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer "
|
||||
"moduli are supported.")
|
||||
}
|
||||
}
|
||||
@@ -11,9 +11,31 @@ uint64_t get_buffer_size_bootstrap_amortized_64(
|
||||
max_shared_memory);
|
||||
}
|
||||
|
||||
/*
 * Validates the polynomial size given to the amortized PBS entry points.
 *
 * Asserts that `polynomial_size` is one of 256, 512, 1024, 2048, 4096, 8192
 * or 16384. The check is an `assert`, so it is compiled out when NDEBUG is
 * defined. The string literal is combined with the condition through the
 * comma operator so that it shows up in the assertion diagnostic.
 */
|
||||
void checks_fast_bootstrap_amortized(int polynomial_size) {
|
||||
assert(
|
||||
("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
|
||||
"1024, 2048, 4096, 8192, 16384",
|
||||
polynomial_size == 256 || polynomial_size == 512 ||
|
||||
polynomial_size == 1024 || polynomial_size == 2048 ||
|
||||
polynomial_size == 4096 || polynomial_size == 8192 ||
|
||||
polynomial_size == 16384));
|
||||
}
|
||||
|
||||
/*
 * Validates the inputs of an amortized PBS execution.
 *
 * nbits:           width in bits of the ciphertext representation (the
 *                  callers visible in this file pass 32 or 64).
 * base_log:        decomposition base log; asserted to be <= nbits.
 * polynomial_size: forwarded to checks_fast_bootstrap_amortized.
 *
 * The base_log check is an `assert`, so it is compiled out when NDEBUG is
 * defined.
 */
|
||||
void checks_bootstrap_amortized(int nbits, int base_log, int polynomial_size) {
|
||||
assert(("Error (GPU amortized PBS): base log should be <= nbits",
|
||||
base_log <= nbits));
|
||||
// The polynomial size check is shared with the scratch functions.
checks_fast_bootstrap_amortized(polynomial_size);
|
||||
}
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the amortized PBS on 32 bits inputs, into `buffer`. It also
|
||||
* the amortized PBS on 32 bits inputs, into `pbs_buffer`. It also
|
||||
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
|
||||
* be used.
|
||||
*/
|
||||
@@ -21,6 +43,7 @@ void scratch_cuda_bootstrap_amortized_32(
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory) {
|
||||
checks_fast_bootstrap_amortized(polynomial_size);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -59,15 +82,13 @@ void scratch_cuda_bootstrap_amortized_32(
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
" in the interval [256..16384].")
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the amortized PBS on 64 bits inputs, into `buffer`. It also
|
||||
* the amortized PBS on 64 bits inputs, into `pbs_buffer`. It also
|
||||
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
|
||||
* be used.
|
||||
*/
|
||||
@@ -75,6 +96,7 @@ void scratch_cuda_bootstrap_amortized_64(
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory) {
|
||||
checks_fast_bootstrap_amortized(polynomial_size);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -113,9 +135,7 @@ void scratch_cuda_bootstrap_amortized_64(
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
" in the interval [256..16384].")
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -130,9 +150,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
|
||||
|
||||
if (base_log > 32)
|
||||
PANIC("Cuda error (amortized PBS): base log should be > number of bits in "
|
||||
"the ciphertext representation (32)");
|
||||
checks_bootstrap_amortized(32, base_log, polynomial_size);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -141,8 +159,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory);
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 512:
|
||||
host_bootstrap_amortized<uint32_t, AmortizedDegree<512>>(
|
||||
@@ -150,8 +168,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory);
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_bootstrap_amortized<uint32_t, AmortizedDegree<1024>>(
|
||||
@@ -159,8 +177,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory);
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_bootstrap_amortized<uint32_t, AmortizedDegree<2048>>(
|
||||
@@ -168,8 +186,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory);
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_bootstrap_amortized<uint32_t, AmortizedDegree<4096>>(
|
||||
@@ -177,8 +195,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory);
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_bootstrap_amortized<uint32_t, AmortizedDegree<8192>>(
|
||||
@@ -186,8 +204,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory);
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 16384:
|
||||
host_bootstrap_amortized<uint32_t, AmortizedDegree<16384>>(
|
||||
@@ -195,13 +213,11 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory);
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
" in the interval [256..16384].")
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -278,9 +294,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
|
||||
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (amortized PBS): base log should be > number of bits in "
|
||||
"the ciphertext representation (64)");
|
||||
checks_bootstrap_amortized(64, base_log, polynomial_size);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -289,8 +303,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory);
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 512:
|
||||
host_bootstrap_amortized<uint64_t, AmortizedDegree<512>>(
|
||||
@@ -298,8 +312,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory);
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_bootstrap_amortized<uint64_t, AmortizedDegree<1024>>(
|
||||
@@ -307,8 +321,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory);
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_bootstrap_amortized<uint64_t, AmortizedDegree<2048>>(
|
||||
@@ -316,8 +330,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory);
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_bootstrap_amortized<uint64_t, AmortizedDegree<4096>>(
|
||||
@@ -325,8 +339,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory);
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_bootstrap_amortized<uint64_t, AmortizedDegree<8192>>(
|
||||
@@ -334,8 +348,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory);
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 16384:
|
||||
host_bootstrap_amortized<uint64_t, AmortizedDegree<16384>>(
|
||||
@@ -343,19 +357,17 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory);
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
" in the interval [256..16384].")
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This cleanup function frees the data for the amortized PBS on GPU in
|
||||
* buffer for 32 or 64 bits inputs.
|
||||
* pbs_buffer for 32 or 64 bits inputs.
|
||||
*/
|
||||
void cleanup_cuda_bootstrap_amortized(cuda_stream_t *stream,
|
||||
int8_t **pbs_buffer) {
|
||||
|
||||
@@ -288,8 +288,8 @@ __host__ void host_bootstrap_amortized(
|
||||
Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory) {
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
uint64_t SM_FULL = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
|
||||
|
||||
@@ -245,10 +245,51 @@ __global__ void device_bootstrap_fast_low_latency(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_bootstrap_fast_low_latency(uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size + // accumulator_rotated
|
||||
sizeof(Torus) * polynomial_size + // accumulator
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_bootstrap_fast_low_latency(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t get_buffer_size_bootstrap_fast_low_latency(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
|
||||
|
||||
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_dm = full_sm - partial_sm;
|
||||
uint64_t full_dm = full_sm;
|
||||
uint64_t device_mem = 0;
|
||||
if (max_shared_memory < partial_sm) {
|
||||
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
|
||||
(glwe_dimension + 1);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
|
||||
(glwe_dimension + 1);
|
||||
}
|
||||
uint64_t buffer_size = device_mem + (glwe_dimension + 1) * level_count *
|
||||
input_lwe_ciphertext_count *
|
||||
polynomial_size / 2 * sizeof(double2);
|
||||
return buffer_size + buffer_size % sizeof(double2);
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
__host__ void scratch_bootstrap_fast_low_latency(
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, LOW_LAT> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
@@ -275,10 +316,13 @@ __host__ void scratch_bootstrap_fast_low_latency(
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
*buffer = new pbs_buffer<Torus, LOW_LAT>(
|
||||
stream, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, PBS_VARIANT::FAST, allocate_gpu_memory);
|
||||
if (allocate_gpu_memory) {
|
||||
uint64_t buffer_size = get_buffer_size_bootstrap_fast_low_latency<Torus>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory);
|
||||
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -289,11 +333,11 @@ template <typename Torus, class params>
|
||||
__host__ void host_bootstrap_fast_low_latency(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, LOW_LAT> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t num_luts, uint32_t max_shared_memory) {
|
||||
Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t num_luts,
|
||||
uint32_t max_shared_memory) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
// With SM each block corresponds to either the mask or body, no need to
|
||||
@@ -309,8 +353,15 @@ __host__ void host_bootstrap_fast_low_latency(
|
||||
|
||||
uint64_t partial_dm = full_dm - partial_sm;
|
||||
|
||||
int8_t *d_mem = buffer->d_mem;
|
||||
double2 *buffer_fft = buffer->global_accumulator_fft;
|
||||
int8_t *d_mem = pbs_buffer;
|
||||
double2 *buffer_fft =
|
||||
(double2 *)d_mem +
|
||||
(ptrdiff_t)(get_buffer_size_bootstrap_fast_low_latency<Torus>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory) /
|
||||
sizeof(double2) -
|
||||
(glwe_dimension + 1) * level_count *
|
||||
input_lwe_ciphertext_count * polynomial_size / 2);
|
||||
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
|
||||
@@ -385,12 +436,12 @@ __host__ bool verify_cuda_bootstrap_fast_low_latency_grid_size(
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
|
||||
thds, partial_sm);
|
||||
thds, 0);
|
||||
} else {
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_bootstrap_fast_low_latency<Torus, params, FULLSM>, thds,
|
||||
full_sm);
|
||||
0);
|
||||
}
|
||||
|
||||
// Get the number of streaming multiprocessors
|
||||
@@ -399,46 +450,4 @@ __host__ bool verify_cuda_bootstrap_fast_low_latency_grid_size(
|
||||
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
|
||||
}
|
||||
|
||||
// Verify if the grid size for the low latency kernel satisfies the cooperative
|
||||
// group constraints
|
||||
template <typename Torus>
|
||||
__host__ bool supports_cooperative_groups_on_lowlat_pbs(
|
||||
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return verify_cuda_bootstrap_fast_low_latency_grid_size<
|
||||
Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
case 512:
|
||||
return verify_cuda_bootstrap_fast_low_latency_grid_size<
|
||||
Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
case 1024:
|
||||
return verify_cuda_bootstrap_fast_low_latency_grid_size<
|
||||
Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
case 2048:
|
||||
return verify_cuda_bootstrap_fast_low_latency_grid_size<
|
||||
Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
case 4096:
|
||||
return verify_cuda_bootstrap_fast_low_latency_grid_size<
|
||||
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
case 8192:
|
||||
return verify_cuda_bootstrap_fast_low_latency_grid_size<
|
||||
Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
case 16384:
|
||||
return verify_cuda_bootstrap_fast_low_latency_grid_size<
|
||||
Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
default:
|
||||
PANIC("Cuda error (low latency PBS): unsupported polynomial size. "
|
||||
"Supported N's are powers of two"
|
||||
" in the interval [256..16384].")
|
||||
}
|
||||
}
|
||||
|
||||
#endif // LOWLAT_FAST_PBS_H
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
#include "bootstrap.h"
|
||||
#include "bootstrap_multibit.cuh"
|
||||
#include "bootstrap_multibit.h"
|
||||
#include "cooperative_groups.h"
|
||||
#include "crypto/gadget.cuh"
|
||||
#include "crypto/ggsw.cuh"
|
||||
@@ -154,11 +155,11 @@ __host__ __device__ uint64_t get_buffer_size_fast_multibit_bootstrap(
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
__host__ void scratch_fast_multi_bit_pbs(
|
||||
cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t grouping_factor, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size = 0) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
@@ -183,25 +184,30 @@ __host__ void scratch_fast_multi_bit_pbs(
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size = get_average_lwe_chunk_size(
|
||||
lwe_dimension, level_count, glwe_dimension, input_lwe_ciphertext_count);
|
||||
*buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
|
||||
stream, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::FAST,
|
||||
allocate_gpu_memory);
|
||||
if (allocate_gpu_memory) {
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size =
|
||||
get_average_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
|
||||
uint64_t buffer_size = get_buffer_size_fast_multibit_bootstrap<Torus>(
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, lwe_chunk_size,
|
||||
max_shared_memory);
|
||||
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, class params>
|
||||
__host__ void host_fast_multi_bit_pbs(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t lwe_chunk_size = 0) {
|
||||
Torus *lwe_input_indexes, uint64_t *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
if (!lwe_chunk_size)
|
||||
@@ -209,9 +215,15 @@ __host__ void host_fast_multi_bit_pbs(
|
||||
glwe_dimension, num_samples);
|
||||
|
||||
//
|
||||
double2 *keybundle_fft = pbs_buffer->keybundle_fft;
|
||||
Torus *global_accumulator = pbs_buffer->global_accumulator;
|
||||
double2 *buffer_fft = pbs_buffer->global_accumulator_fft;
|
||||
double2 *keybundle_fft = (double2 *)pbs_buffer;
|
||||
double2 *buffer_fft = (double2 *)keybundle_fft +
|
||||
num_samples * lwe_chunk_size * level_count *
|
||||
(glwe_dimension + 1) * (glwe_dimension + 1) *
|
||||
(polynomial_size / 2);
|
||||
Torus *global_accumulator =
|
||||
(Torus *)buffer_fft +
|
||||
(ptrdiff_t)(sizeof(double2) * num_samples * (glwe_dimension + 1) *
|
||||
level_count * (polynomial_size / 2) / sizeof(Torus));
|
||||
|
||||
//
|
||||
uint64_t full_sm_keybundle =
|
||||
@@ -307,46 +319,4 @@ verify_cuda_bootstrap_fast_multi_bit_grid_size(int glwe_dimension,
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
|
||||
}
|
||||
|
||||
// Verify if the grid size for the multi-bit kernel satisfies the cooperative
|
||||
// group constraints
|
||||
template <typename Torus>
|
||||
__host__ bool supports_cooperative_groups_on_multibit_pbs(
|
||||
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return verify_cuda_bootstrap_fast_multi_bit_grid_size<Torus,
|
||||
AmortizedDegree<256>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory);
|
||||
case 512:
|
||||
return verify_cuda_bootstrap_fast_multi_bit_grid_size<Torus,
|
||||
AmortizedDegree<512>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory);
|
||||
case 1024:
|
||||
return verify_cuda_bootstrap_fast_multi_bit_grid_size<
|
||||
Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
case 2048:
|
||||
return verify_cuda_bootstrap_fast_multi_bit_grid_size<
|
||||
Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
case 4096:
|
||||
return verify_cuda_bootstrap_fast_multi_bit_grid_size<
|
||||
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
case 8192:
|
||||
return verify_cuda_bootstrap_fast_multi_bit_grid_size<
|
||||
Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
case 16384:
|
||||
return verify_cuda_bootstrap_fast_multi_bit_grid_size<
|
||||
Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
" in the interval [256..16384].")
|
||||
}
|
||||
}
|
||||
#endif // FASTMULTIBIT_PBS_H
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -221,6 +221,27 @@ __global__ void device_bootstrap_low_latency_step_two(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_bootstrap_low_latency_step_one(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size + // accumulator_rotated
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
}
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_bootstrap_low_latency_step_two(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size + // accumulator
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_bootstrap_low_latency(uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t get_buffer_size_bootstrap_low_latency(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
@@ -264,8 +285,8 @@ __host__ __device__ uint64_t get_buffer_size_bootstrap_low_latency(
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
__host__ void scratch_bootstrap_low_latency(
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, LOW_LAT> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
@@ -317,9 +338,13 @@ __host__ void scratch_bootstrap_low_latency(
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
*buffer = new pbs_buffer<Torus, LOW_LAT>(
|
||||
stream, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT, allocate_gpu_memory);
|
||||
if (allocate_gpu_memory) {
|
||||
uint64_t buffer_size = get_buffer_size_bootstrap_low_latency<Torus>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory);
|
||||
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
@@ -407,11 +432,11 @@ template <typename Torus, class params>
|
||||
__host__ void host_bootstrap_low_latency(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, LOW_LAT> *pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t num_luts, uint32_t max_shared_memory) {
|
||||
Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t num_luts,
|
||||
uint32_t max_shared_memory) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
// With SM each block corresponds to either the mask or body, no need to
|
||||
@@ -431,9 +456,16 @@ __host__ void host_bootstrap_low_latency(
|
||||
uint64_t full_dm_step_one = full_sm_step_one;
|
||||
uint64_t full_dm_step_two = full_sm_step_two;
|
||||
|
||||
Torus *global_accumulator = pbs_buffer->global_accumulator;
|
||||
double2 *global_accumulator_fft = pbs_buffer->global_accumulator_fft;
|
||||
int8_t *d_mem = pbs_buffer->d_mem;
|
||||
double2 *global_accumulator_fft = (double2 *)pbs_buffer;
|
||||
Torus *global_accumulator =
|
||||
(Torus *)global_accumulator_fft +
|
||||
(ptrdiff_t)(sizeof(double2) * (glwe_dimension + 1) * level_count *
|
||||
input_lwe_ciphertext_count * (polynomial_size / 2) /
|
||||
sizeof(Torus));
|
||||
int8_t *d_mem = (int8_t *)global_accumulator +
|
||||
(ptrdiff_t)(sizeof(Torus) * (glwe_dimension + 1) *
|
||||
input_lwe_ciphertext_count * polynomial_size /
|
||||
sizeof(int8_t));
|
||||
|
||||
for (int i = 0; i < lwe_dimension; i++) {
|
||||
execute_low_latency_step_one<Torus, params>(
|
||||
|
||||
@@ -3,357 +3,365 @@
|
||||
#include "bootstrap_multibit.cuh"
|
||||
#include "bootstrap_multibit.h"
|
||||
|
||||
bool has_support_to_cuda_bootstrap_fast_multi_bit(uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t level_count,
|
||||
uint32_t num_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
return supports_cooperative_groups_on_multibit_pbs<uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_fast_multi_bit_pbs_lwe_ciphertext_vector(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t lwe_chunk_size) {
|
||||
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
|
||||
"the ciphertext representation (64)");
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 512:
|
||||
host_fast_multi_bit_pbs<Torus, int64_t, AmortizedDegree<512>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 1024:
|
||||
host_fast_multi_bit_pbs<Torus, int64_t, AmortizedDegree<1024>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 2048:
|
||||
host_fast_multi_bit_pbs<Torus, int64_t, AmortizedDegree<2048>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 4096:
|
||||
host_fast_multi_bit_pbs<Torus, int64_t, AmortizedDegree<4096>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 8192:
|
||||
host_fast_multi_bit_pbs<Torus, int64_t, AmortizedDegree<8192>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 16384:
|
||||
host_fast_multi_bit_pbs<Torus, int64_t, AmortizedDegree<16384>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
" in the interval [256..16384].")
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_multi_bit_pbs_lwe_ciphertext_vector(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t lwe_chunk_size) {
|
||||
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
|
||||
"the ciphertext representation (64)");
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 512:
|
||||
host_multi_bit_pbs<Torus, int64_t, AmortizedDegree<512>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 1024:
|
||||
host_multi_bit_pbs<Torus, int64_t, AmortizedDegree<1024>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 2048:
|
||||
host_multi_bit_pbs<Torus, int64_t, AmortizedDegree<2048>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 4096:
|
||||
host_multi_bit_pbs<Torus, int64_t, AmortizedDegree<4096>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 8192:
|
||||
host_multi_bit_pbs<Torus, int64_t, AmortizedDegree<8192>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 16384:
|
||||
host_multi_bit_pbs<Torus, int64_t, AmortizedDegree<16384>>(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
" in the interval [256..16384].")
|
||||
}
|
||||
void checks_multi_bit_pbs(int polynomial_size) {
|
||||
assert(
|
||||
("Error (GPU multi-bit PBS): polynomial size should be one of 256, 512, "
|
||||
"1024, 2048, 4096, 8192, 16384",
|
||||
polynomial_size == 256 || polynomial_size == 512 ||
|
||||
polynomial_size == 1024 || polynomial_size == 2048 ||
|
||||
polynomial_size == 4096 || polynomial_size == 8192 ||
|
||||
polynomial_size == 16384));
|
||||
}
|
||||
|
||||
void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
|
||||
|
||||
if (supports_cooperative_groups_on_multibit_pbs<uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count, num_samples,
|
||||
max_shared_memory))
|
||||
cuda_fast_multi_bit_pbs_lwe_ciphertext_vector<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key),
|
||||
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
else
|
||||
cuda_multi_bit_pbs_lwe_ciphertext_vector<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key),
|
||||
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
void scratch_cuda_fast_multi_bit_pbs(
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
|
||||
checks_multi_bit_pbs(polynomial_size);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<256>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<256>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory)) {
|
||||
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
} else {
|
||||
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 512:
|
||||
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<512>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<512>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory)) {
|
||||
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
} else {
|
||||
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 1024:
|
||||
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<1024>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<1024>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory)) {
|
||||
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
} else {
|
||||
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 2048:
|
||||
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<2048>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<2048>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory)) {
|
||||
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
} else {
|
||||
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 4096:
|
||||
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<4096>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<4096>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory)) {
|
||||
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
} else {
|
||||
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 8192:
|
||||
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<8192>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<8192>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory)) {
|
||||
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
} else {
|
||||
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 16384:
|
||||
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<16384>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<16384>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory)) {
|
||||
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
} else {
|
||||
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
" in the interval [256..16384].")
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
void scratch_cuda_multi_bit_pbs(
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_multi_bit_pbs<Torus, STorus, AmortizedDegree<256>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 512:
|
||||
scratch_multi_bit_pbs<Torus, STorus, AmortizedDegree<512>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_multi_bit_pbs<Torus, STorus, AmortizedDegree<1024>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_multi_bit_pbs<Torus, STorus, AmortizedDegree<2048>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_multi_bit_pbs<Torus, STorus, AmortizedDegree<4096>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_multi_bit_pbs<Torus, STorus, AmortizedDegree<8192>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_multi_bit_pbs<Torus, STorus, AmortizedDegree<16384>>(
|
||||
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
" in the interval [256..16384].")
|
||||
}
|
||||
}
|
||||
|
||||
void scratch_cuda_multi_bit_pbs_64(
|
||||
cuda_stream_t *stream, int8_t **buffer, uint32_t lwe_dimension,
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size) {
|
||||
|
||||
if (supports_cooperative_groups_on_multibit_pbs<uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory))
|
||||
scratch_cuda_fast_multi_bit_pbs<uint64_t, int64_t>(
|
||||
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count, grouping_factor,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
else
|
||||
scratch_cuda_multi_bit_pbs<uint64_t, int64_t>(
|
||||
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count, grouping_factor,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<256>>(
|
||||
glwe_dimension, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory)) {
|
||||
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
} else {
|
||||
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 512:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<512>>(
|
||||
glwe_dimension, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory)) {
|
||||
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
} else {
|
||||
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 1024:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<1024>>(
|
||||
glwe_dimension, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory)) {
|
||||
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
} else {
|
||||
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 2048:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<2048>>(
|
||||
glwe_dimension, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory)) {
|
||||
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
} else {
|
||||
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 4096:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<4096>>(
|
||||
glwe_dimension, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory)) {
|
||||
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
} else {
|
||||
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 8192:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<8192>>(
|
||||
glwe_dimension, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory)) {
|
||||
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
} else {
|
||||
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 16384:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<16384>>(
|
||||
glwe_dimension, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory)) {
|
||||
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
} else {
|
||||
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Releases the 32-bit multi-bit PBS scratch buffer.
 *
 * `*buffer` is reinterpreted as a pbs_buffer<uint32_t, MULTI_BIT> and its
 * release(stream) method is invoked. The pointer stored in `*buffer` is left
 * untouched by this function.
 */
void cleanup_cuda_multi_bit_pbs_32(cuda_stream_t *stream, int8_t **buffer) {
  using buffer_t = pbs_buffer<uint32_t, MULTI_BIT>;
  reinterpret_cast<buffer_t *>(*buffer)->release(stream);
}
|
||||
void cleanup_cuda_multi_bit_pbs_64(cuda_stream_t *stream, int8_t **buffer) {
|
||||
auto x = (pbs_buffer<uint64_t, MULTI_BIT> *)(*buffer);
|
||||
x->release(stream);
|
||||
void cleanup_cuda_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer) {
|
||||
|
||||
// Free memory
|
||||
cuda_drop_async(*pbs_buffer, stream);
|
||||
}
|
||||
|
||||
// Pick the best possible chunk size for each GPU
|
||||
@@ -419,12 +427,7 @@ __host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
|
||||
return 9;
|
||||
} else if (std::strstr(deviceProp.name, h100Name) != nullptr) {
|
||||
// Tesla H100
|
||||
if (num_samples < 1024)
|
||||
return 128;
|
||||
else if (num_samples < 4096)
|
||||
return 64;
|
||||
else
|
||||
return 32;
|
||||
return 45;
|
||||
}
|
||||
|
||||
// Generic case
|
||||
@@ -452,11 +455,11 @@ __host__ uint32_t get_average_lwe_chunk_size(uint32_t lwe_dimension,
|
||||
return (ct_count > 10000) ? 30 : 45;
|
||||
} else if (std::strstr(deviceProp.name, h100Name) != nullptr) {
|
||||
// Tesla H100
|
||||
return 64;
|
||||
return (ct_count > 10000) ? 30 : 45;
|
||||
}
|
||||
|
||||
// Generic case
|
||||
return (ct_count > 10000) ? 2 : 1;
|
||||
return (ct_count > 10000) ? 2 : 10;
|
||||
}
|
||||
|
||||
// Returns the maximum buffer size required to execute batches up to
|
||||
@@ -470,51 +473,14 @@ __host__ uint64_t get_max_buffer_size_multibit_bootstrap(
|
||||
for (uint32_t input_lwe_ciphertext_count = 1;
|
||||
input_lwe_ciphertext_count <= max_input_lwe_ciphertext_count;
|
||||
input_lwe_ciphertext_count *= 2) {
|
||||
max_buffer_size =
|
||||
std::max(max_buffer_size,
|
||||
get_buffer_size_multibit_bootstrap<uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count,
|
||||
get_average_lwe_chunk_size(lwe_dimension, level_count,
|
||||
glwe_dimension,
|
||||
input_lwe_ciphertext_count)));
|
||||
max_buffer_size = std::max(
|
||||
max_buffer_size,
|
||||
get_buffer_size_multibit_bootstrap<uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count,
|
||||
get_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension,
|
||||
input_lwe_ciphertext_count)));
|
||||
}
|
||||
|
||||
return max_buffer_size;
|
||||
}
|
||||
|
||||
template void scratch_cuda_multi_bit_pbs<uint64_t, int64_t>(
|
||||
cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
|
||||
|
||||
template void cuda_multi_bit_pbs_lwe_ciphertext_vector<uint64_t>(
|
||||
cuda_stream_t *stream, uint64_t *lwe_array_out,
|
||||
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
|
||||
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
|
||||
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
|
||||
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t lwe_chunk_size);
|
||||
|
||||
template void scratch_cuda_fast_multi_bit_pbs<uint64_t, int64_t>(
|
||||
cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
|
||||
|
||||
template void cuda_fast_multi_bit_pbs_lwe_ciphertext_vector<uint64_t>(
|
||||
cuda_stream_t *stream, uint64_t *lwe_array_out,
|
||||
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
|
||||
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
|
||||
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
|
||||
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t lwe_chunk_size);
|
||||
|
||||
@@ -329,12 +329,13 @@ __host__ __device__ uint64_t get_buffer_size_multibit_bootstrap(
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
__host__ void scratch_multi_bit_pbs(
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t grouping_factor, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
|
||||
__host__ void
|
||||
scratch_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t grouping_factor, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
@@ -373,25 +374,29 @@ __host__ void scratch_multi_bit_pbs(
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size = get_average_lwe_chunk_size(
|
||||
lwe_dimension, level_count, glwe_dimension, input_lwe_ciphertext_count);
|
||||
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
|
||||
stream, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::DEFAULT,
|
||||
allocate_gpu_memory);
|
||||
if (allocate_gpu_memory) {
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size =
|
||||
get_average_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
|
||||
uint64_t buffer_size = get_buffer_size_multibit_bootstrap<Torus>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, lwe_chunk_size);
|
||||
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, class params>
|
||||
__host__ void host_multi_bit_pbs(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t lwe_chunk_size = 0) {
|
||||
Torus *lwe_input_indexes, uint64_t *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
// If a chunk size is not passed to this function, select one.
|
||||
@@ -399,9 +404,15 @@ __host__ void host_multi_bit_pbs(
|
||||
lwe_chunk_size = get_average_lwe_chunk_size(lwe_dimension, level_count,
|
||||
glwe_dimension, num_samples);
|
||||
//
|
||||
double2 *keybundle_fft = buffer->keybundle_fft;
|
||||
Torus *global_accumulator = buffer->global_accumulator;
|
||||
double2 *global_accumulator_fft = buffer->global_accumulator_fft;
|
||||
double2 *keybundle_fft = (double2 *)pbs_buffer;
|
||||
double2 *global_accumulator_fft =
|
||||
(double2 *)keybundle_fft +
|
||||
num_samples * lwe_chunk_size * level_count * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) * (polynomial_size / 2);
|
||||
Torus *global_accumulator =
|
||||
(Torus *)global_accumulator_fft +
|
||||
(ptrdiff_t)(sizeof(double2) * num_samples * (glwe_dimension + 1) *
|
||||
level_count * (polynomial_size / 2) / sizeof(Torus));
|
||||
|
||||
//
|
||||
uint64_t full_sm_keybundle =
|
||||
|
||||
@@ -1,78 +0,0 @@
|
||||
#include "bootstrapping_key.cuh"
|
||||
|
||||
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src,
|
||||
cuda_stream_t *stream,
|
||||
uint32_t input_lwe_dim,
|
||||
uint32_t glwe_dim, uint32_t level_count,
|
||||
uint32_t polynomial_size) {
|
||||
uint32_t total_polynomials =
|
||||
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
|
||||
cuda_convert_lwe_bootstrap_key<uint32_t, int32_t>(
|
||||
(double2 *)dest, (int32_t *)src, stream, input_lwe_dim, glwe_dim,
|
||||
level_count, polynomial_size, total_polynomials);
|
||||
}
|
||||
|
||||
void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src,
|
||||
cuda_stream_t *stream,
|
||||
uint32_t input_lwe_dim,
|
||||
uint32_t glwe_dim, uint32_t level_count,
|
||||
uint32_t polynomial_size) {
|
||||
uint32_t total_polynomials =
|
||||
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
|
||||
cuda_convert_lwe_bootstrap_key<uint64_t, int64_t>(
|
||||
(double2 *)dest, (int64_t *)src, stream, input_lwe_dim, glwe_dim,
|
||||
level_count, polynomial_size, total_polynomials);
|
||||
}
|
||||
|
||||
void cuda_convert_lwe_multi_bit_bootstrap_key_64(
|
||||
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
|
||||
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor) {
|
||||
uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
|
||||
level_count * (1 << grouping_factor) /
|
||||
grouping_factor;
|
||||
size_t buffer_size = total_polynomials * polynomial_size * sizeof(uint64_t);
|
||||
|
||||
cuda_memcpy_async_to_gpu((uint64_t *)dest, (uint64_t *)src, buffer_size,
|
||||
stream);
|
||||
}
|
||||
|
||||
// We need these lines so the compiler knows how to specialize these functions
|
||||
template __device__ uint64_t *get_ith_mask_kth_block(uint64_t *ptr, int i,
|
||||
int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
template __device__ uint32_t *get_ith_mask_kth_block(uint32_t *ptr, int i,
|
||||
int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
template __device__ double2 *get_ith_mask_kth_block(double2 *ptr, int i, int k,
|
||||
int level,
|
||||
uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
template __device__ uint64_t *get_ith_body_kth_block(uint64_t *ptr, int i,
|
||||
int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
template __device__ uint32_t *get_ith_body_kth_block(uint32_t *ptr, int i,
|
||||
int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
|
||||
int level,
|
||||
uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
|
||||
template __device__ uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
uint64_t *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
|
||||
|
||||
template __device__ double2 *get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
double2 *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
|
||||
@@ -100,10 +100,11 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
|
||||
|
||||
cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream);
|
||||
|
||||
double2 *buffer = (double2 *)cuda_malloc_async(0, stream);
|
||||
double2 *buffer;
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
@@ -122,6 +123,7 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
|
||||
break;
|
||||
case 512:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
@@ -140,6 +142,7 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
|
||||
break;
|
||||
case 1024:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
@@ -158,6 +161,7 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
|
||||
break;
|
||||
case 2048:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
@@ -176,6 +180,7 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
|
||||
break;
|
||||
case 4096:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
@@ -194,6 +199,7 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
|
||||
break;
|
||||
case 8192:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
@@ -212,6 +218,7 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
|
||||
break;
|
||||
case 16384:
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, stream);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
@@ -229,8 +236,7 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
|
||||
}
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (convert KSK): unsupported polynomial size. Supported "
|
||||
"N's are powers of two in the interval [256..16384].")
|
||||
break;
|
||||
}
|
||||
|
||||
cuda_drop_async(d_bsk, stream);
|
||||
@@ -238,6 +244,43 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
|
||||
free(h_bsk);
|
||||
}
|
||||
|
||||
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src,
|
||||
cuda_stream_t *stream,
|
||||
uint32_t input_lwe_dim,
|
||||
uint32_t glwe_dim, uint32_t level_count,
|
||||
uint32_t polynomial_size) {
|
||||
uint32_t total_polynomials =
|
||||
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
|
||||
cuda_convert_lwe_bootstrap_key<uint32_t, int32_t>(
|
||||
(double2 *)dest, (int32_t *)src, stream, input_lwe_dim, glwe_dim,
|
||||
level_count, polynomial_size, total_polynomials);
|
||||
}
|
||||
|
||||
void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src,
|
||||
cuda_stream_t *stream,
|
||||
uint32_t input_lwe_dim,
|
||||
uint32_t glwe_dim, uint32_t level_count,
|
||||
uint32_t polynomial_size) {
|
||||
uint32_t total_polynomials =
|
||||
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
|
||||
cuda_convert_lwe_bootstrap_key<uint64_t, int64_t>(
|
||||
(double2 *)dest, (int64_t *)src, stream, input_lwe_dim, glwe_dim,
|
||||
level_count, polynomial_size, total_polynomials);
|
||||
}
|
||||
|
||||
void cuda_convert_lwe_multi_bit_bootstrap_key_64(
|
||||
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
|
||||
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor) {
|
||||
uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
|
||||
level_count * (1 << grouping_factor) /
|
||||
grouping_factor;
|
||||
size_t buffer_size = total_polynomials * polynomial_size * sizeof(uint64_t);
|
||||
|
||||
cuda_memcpy_async_to_gpu((uint64_t *)dest, (uint64_t *)src, buffer_size,
|
||||
stream);
|
||||
}
|
||||
|
||||
void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
|
||||
cuda_stream_t *stream,
|
||||
uint32_t polynomial_size,
|
||||
@@ -415,4 +458,43 @@ void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
|
||||
cuda_drop_async(buffer, stream);
|
||||
}
|
||||
|
||||
// We need these lines so the compiler knows how to specialize these functions
|
||||
template __device__ uint64_t *get_ith_mask_kth_block(uint64_t *ptr, int i,
|
||||
int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
template __device__ uint32_t *get_ith_mask_kth_block(uint32_t *ptr, int i,
|
||||
int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
template __device__ double2 *get_ith_mask_kth_block(double2 *ptr, int i, int k,
|
||||
int level,
|
||||
uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
template __device__ uint64_t *get_ith_body_kth_block(uint64_t *ptr, int i,
|
||||
int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
template __device__ uint32_t *get_ith_body_kth_block(uint32_t *ptr, int i,
|
||||
int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
|
||||
int level,
|
||||
uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
|
||||
template __device__ uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
uint64_t *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
|
||||
|
||||
template __device__ double2 *get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
double2 *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
|
||||
#endif // CNCRT_BSK_H
|
||||
|
||||
@@ -1,12 +0,0 @@
|
||||
option(TFHE_CUDA_BACKEND_BUILD_TESTS "Build the test tool" OFF)
|
||||
option(TFHE_CUDA_BACKEND_BUILD_BENCHMARKS "Build the benchmark tool" OFF)
|
||||
|
||||
if(TFHE_CUDA_BACKEND_BUILD_TESTS)
|
||||
message(STATUS "Building the test tool")
|
||||
add_subdirectory(tests)
|
||||
endif()
|
||||
|
||||
if(TFHE_CUDA_BACKEND_BUILD_BENCHMARKS)
|
||||
message(STATUS "Building the benchmark tool")
|
||||
add_subdirectory(benchmarks)
|
||||
endif()
|
||||
@@ -1,88 +0,0 @@
|
||||
project(benchmark_tfhe_cuda_backend LANGUAGES CXX)
|
||||
|
||||
# See if the minimum CUDA version is available. If not, only enable documentation building.
|
||||
set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
|
||||
include(CheckLanguage)
|
||||
# See if CUDA is available
|
||||
check_language(CUDA)
|
||||
# If so, enable CUDA to check the version.
|
||||
if(CMAKE_CUDA_COMPILER)
|
||||
enable_language(CUDA)
|
||||
find_package(CUDAToolkit)
|
||||
endif()
|
||||
# If CUDA is not available, or the minimum version is too low do not build
|
||||
if(NOT CMAKE_CUDA_COMPILER)
|
||||
message(FATAL_ERROR "Cuda compiler not found.")
|
||||
endif()
|
||||
|
||||
if(NOT CMAKE_BUILD_TYPE)
|
||||
set(CMAKE_BUILD_TYPE Release)
|
||||
endif()
|
||||
|
||||
# Disable the Google Benchmark requirement on Google Test
|
||||
set(BENCHMARK_ENABLE_GTEST_TESTS OFF)
|
||||
set(BENCHMARK_ENABLE_TESTING OFF)
|
||||
|
||||
include(FetchContent)
|
||||
FetchContent_Declare(
|
||||
googlebenchmark
|
||||
GIT_REPOSITORY https://github.com/google/benchmark.git
|
||||
GIT_TAG v1.7.1)
|
||||
FetchContent_MakeAvailable(googlebenchmark)
|
||||
|
||||
# Enable ExternalProject CMake module
|
||||
include(ExternalProject)
|
||||
|
||||
set(CONCRETE_CUDA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../")
|
||||
|
||||
# Enable ExternalProject CMake module
|
||||
include(ExternalProject)
|
||||
set(TFHE_RS_SOURCE_DIR "${CMAKE_BINARY_DIR}/../../../../")
|
||||
set(TFHE_RS_BINARY_DIR "${TFHE_RS_SOURCE_DIR}/target/release")
|
||||
|
||||
if(NOT TARGET tfhe-rs)
|
||||
ExternalProject_Add(
|
||||
tfhe-rs
|
||||
SOURCE_DIR ${TFHE_RS_SOURCE_DIR}
|
||||
BUILD_IN_SOURCE 1
|
||||
BUILD_ALWAYS 1
|
||||
UPDATE_COMMAND ""
|
||||
CONFIGURE_COMMAND ""
|
||||
DOWNLOAD_COMMAND ""
|
||||
BUILD_COMMAND make build_c_api
|
||||
INSTALL_COMMAND ""
|
||||
LOG_BUILD ON)
|
||||
endif()
|
||||
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)
|
||||
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/include)
|
||||
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/src)
|
||||
include_directories(${TFHE_RS_BINARY_DIR})
|
||||
include_directories(${TFHE_RS_BINARY_DIR}/deps)
|
||||
include_directories("${CMAKE_CURRENT_SOURCE_DIR}")
|
||||
|
||||
find_package(OpenMP REQUIRED)
|
||||
# Add the OpenMP flag to the compiler flags
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
|
||||
|
||||
add_library(tfhe_rs_lib STATIC IMPORTED)
|
||||
add_dependencies(tfhe_rs_lib tfhe-rs)
|
||||
set_target_properties(tfhe_rs_lib PROPERTIES IMPORTED_LOCATION ${TFHE_RS_BINARY_DIR}/libtfhe.a)
|
||||
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-as-needed -ldl")
|
||||
|
||||
set(BINARY benchmark_tfhe_cuda_backend)
|
||||
|
||||
file(
|
||||
GLOB_RECURSE BENCH_SOURCES
|
||||
LIST_DIRECTORIES false
|
||||
benchmark*.cpp main.cpp)
|
||||
|
||||
add_executable(${BINARY} ${BENCH_SOURCES} ../utils.cpp ../setup_and_teardown.cpp)
|
||||
|
||||
set_target_properties(benchmark_tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS
|
||||
ON)
|
||||
target_link_libraries(
|
||||
benchmark_tfhe_cuda_backend
|
||||
PUBLIC benchmark::benchmark tfhe_rs_lib tfhe_cuda_backend OpenMP::OpenMP_CXX
|
||||
PRIVATE CUDA::cudart)
|
||||
@@ -1,73 +0,0 @@
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <cstdint>
|
||||
#include <setup_and_teardown.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
typedef struct {
|
||||
size_t polynomial_size;
|
||||
int samples;
|
||||
} FourierTransformTestParams;
|
||||
|
||||
class FourierTransformTestPrimitives_u64 : public benchmark::Fixture {
|
||||
protected:
|
||||
size_t polynomial_size;
|
||||
int num_samples;
|
||||
cuda_stream_t *stream;
|
||||
int gpu_index = 0;
|
||||
|
||||
double *poly1;
|
||||
double *poly2; // will be used as extracted result for cuda mult
|
||||
double2 *h_cpoly1;
|
||||
double2 *h_cpoly2; // will be used as a result poly
|
||||
double2 *d_cpoly1;
|
||||
double2 *d_cpoly2; // will be used as a result poly
|
||||
|
||||
public:
|
||||
void SetUp(const ::benchmark::State &state) {
|
||||
stream = cuda_create_stream(0);
|
||||
|
||||
// get test params
|
||||
polynomial_size = state.range(0);
|
||||
num_samples = state.range(1);
|
||||
|
||||
fft_setup(stream, &poly1, &poly2, &h_cpoly1, &h_cpoly2, &d_cpoly1,
|
||||
&d_cpoly2, polynomial_size, num_samples);
|
||||
}
|
||||
|
||||
void TearDown(const ::benchmark::State &state) {
|
||||
fft_teardown(stream, poly1, poly2, h_cpoly1, h_cpoly2, d_cpoly1, d_cpoly2);
|
||||
}
|
||||
};
|
||||
|
||||
BENCHMARK_DEFINE_F(FourierTransformTestPrimitives_u64, cuda_fft_mult)
|
||||
(benchmark::State &st) {
|
||||
|
||||
for (auto _ : st) {
|
||||
cuda_fourier_polynomial_mul(d_cpoly1, d_cpoly2, d_cpoly2, stream,
|
||||
polynomial_size, num_samples);
|
||||
cuda_synchronize_stream(stream);
|
||||
}
|
||||
}
|
||||
|
||||
static void FFTBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
|
||||
// Define the parameters to benchmark
|
||||
// n, input_lwe_ciphertext_count
|
||||
std::vector<FourierTransformTestParams> params = {
|
||||
(FourierTransformTestParams){256, 100},
|
||||
(FourierTransformTestParams){512, 100},
|
||||
(FourierTransformTestParams){1024, 100},
|
||||
(FourierTransformTestParams){2048, 100},
|
||||
(FourierTransformTestParams){4096, 100},
|
||||
(FourierTransformTestParams){8192, 100},
|
||||
(FourierTransformTestParams){16384, 100},
|
||||
};
|
||||
|
||||
// Add to the list of parameters to benchmark
|
||||
for (auto x : params)
|
||||
b->Args({x.polynomial_size, x.samples});
|
||||
}
|
||||
|
||||
BENCHMARK_REGISTER_F(FourierTransformTestPrimitives_u64, cuda_fft_mult)
|
||||
->Apply(FFTBenchmarkGenerateParams)
|
||||
->ArgNames({"polynomial_size", "samples"});
|
||||
@@ -1,372 +0,0 @@
|
||||
#include <benchmark/benchmark.h>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <omp.h>
|
||||
#include <setup_and_teardown.h>
|
||||
|
||||
typedef struct {
|
||||
int lwe_dimension;
|
||||
int glwe_dimension;
|
||||
int polynomial_size;
|
||||
int pbs_base_log;
|
||||
int pbs_level;
|
||||
int input_lwe_ciphertext_count;
|
||||
int grouping_factor;
|
||||
int chunk_size;
|
||||
} MultiBitPBSBenchmarkParams;
|
||||
|
||||
typedef struct {
|
||||
int lwe_dimension;
|
||||
int glwe_dimension;
|
||||
int polynomial_size;
|
||||
int pbs_base_log;
|
||||
int pbs_level;
|
||||
int input_lwe_ciphertext_count;
|
||||
} BootstrapBenchmarkParams;
|
||||
|
||||
class MultiBitBootstrap_u64 : public benchmark::Fixture {
|
||||
protected:
|
||||
int lwe_dimension;
|
||||
int glwe_dimension;
|
||||
int polynomial_size;
|
||||
int input_lwe_ciphertext_count;
|
||||
int grouping_factor;
|
||||
DynamicDistribution lwe_modular_variance;
|
||||
DynamicDistribution glwe_modular_variance;
|
||||
int pbs_base_log;
|
||||
int pbs_level;
|
||||
int message_modulus = 4;
|
||||
int carry_modulus = 4;
|
||||
int payload_modulus;
|
||||
uint64_t delta;
|
||||
cuda_stream_t *stream;
|
||||
uint64_t *lwe_sk_in_array;
|
||||
uint64_t *lwe_sk_out_array;
|
||||
uint64_t *plaintexts;
|
||||
uint64_t *d_bsk;
|
||||
uint64_t *d_lut_pbs_identity;
|
||||
uint64_t *d_lut_pbs_indexes;
|
||||
uint64_t *d_lwe_ct_in_array;
|
||||
uint64_t *d_lwe_ct_out_array;
|
||||
uint64_t *lwe_ct_out_array;
|
||||
uint64_t *d_lwe_input_indexes;
|
||||
uint64_t *d_lwe_output_indexes;
|
||||
int8_t *buffer;
|
||||
|
||||
int chunk_size;
|
||||
|
||||
public:
|
||||
void SetUp(const ::benchmark::State &state) {
|
||||
int gpu_index = 0;
|
||||
stream = cuda_create_stream(gpu_index);
|
||||
|
||||
lwe_dimension = state.range(0);
|
||||
glwe_dimension = state.range(1);
|
||||
polynomial_size = state.range(2);
|
||||
pbs_base_log = state.range(3);
|
||||
pbs_level = state.range(4);
|
||||
input_lwe_ciphertext_count = state.range(5);
|
||||
grouping_factor = state.range(6);
|
||||
chunk_size = state.range(7);
|
||||
|
||||
DynamicDistribution lwe_modular_variance =
|
||||
new_gaussian_from_std_dev(sqrt(0.000007069849454709433));
|
||||
DynamicDistribution glwe_modular_variance =
|
||||
new_gaussian_from_std_dev(sqrt(0.00000000000000029403601535432533));
|
||||
|
||||
Seed seed;
|
||||
init_seed(&seed);
|
||||
|
||||
bootstrap_multibit_setup(
|
||||
stream, &seed, &lwe_sk_in_array, &lwe_sk_out_array, &d_bsk, &plaintexts,
|
||||
&d_lut_pbs_identity, &d_lut_pbs_indexes, &d_lwe_ct_in_array,
|
||||
&d_lwe_input_indexes, &d_lwe_ct_out_array, &d_lwe_output_indexes,
|
||||
&buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, lwe_modular_variance, glwe_modular_variance,
|
||||
pbs_base_log, pbs_level, message_modulus, carry_modulus,
|
||||
&payload_modulus, &delta, input_lwe_ciphertext_count, 1, 1);
|
||||
}
|
||||
|
||||
void TearDown(const ::benchmark::State &state) {
|
||||
bootstrap_multibit_teardown(
|
||||
stream, lwe_sk_in_array, lwe_sk_out_array, d_bsk, plaintexts,
|
||||
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
|
||||
d_lwe_input_indexes, d_lwe_ct_out_array, d_lwe_output_indexes);
|
||||
cudaDeviceReset();
|
||||
}
|
||||
};
|
||||
|
||||
class ClassicalBootstrap_u64 : public benchmark::Fixture {
|
||||
protected:
|
||||
int lwe_dimension;
|
||||
int glwe_dimension;
|
||||
int polynomial_size;
|
||||
int input_lwe_ciphertext_count;
|
||||
DynamicDistribution lwe_modular_variance;
|
||||
DynamicDistribution glwe_modular_variance;
|
||||
int pbs_base_log;
|
||||
int pbs_level;
|
||||
int message_modulus = 4;
|
||||
int carry_modulus = 4;
|
||||
int payload_modulus;
|
||||
uint64_t delta;
|
||||
double *d_fourier_bsk;
|
||||
uint64_t *d_lut_pbs_identity;
|
||||
uint64_t *d_lut_pbs_indexes;
|
||||
uint64_t *d_lwe_input_indexes;
|
||||
uint64_t *d_lwe_output_indexes;
|
||||
uint64_t *d_lwe_ct_in_array;
|
||||
uint64_t *d_lwe_ct_out_array;
|
||||
uint64_t *lwe_ct_array;
|
||||
uint64_t *lwe_sk_in_array;
|
||||
uint64_t *lwe_sk_out_array;
|
||||
uint64_t *plaintexts;
|
||||
int8_t *buffer;
|
||||
|
||||
cuda_stream_t *stream;
|
||||
|
||||
public:
|
||||
void SetUp(const ::benchmark::State &state) {
|
||||
int gpu_index = 0;
|
||||
stream = cuda_create_stream(gpu_index);
|
||||
|
||||
lwe_dimension = state.range(0);
|
||||
glwe_dimension = state.range(1);
|
||||
polynomial_size = state.range(2);
|
||||
pbs_base_log = state.range(3);
|
||||
pbs_level = state.range(4);
|
||||
input_lwe_ciphertext_count = state.range(5);
|
||||
|
||||
DynamicDistribution lwe_modular_variance =
|
||||
new_gaussian_from_std_dev(sqrt(0.000007069849454709433));
|
||||
DynamicDistribution glwe_modular_variance =
|
||||
new_gaussian_from_std_dev(sqrt(0.00000000000000029403601535432533));
|
||||
|
||||
Seed seed;
|
||||
init_seed(&seed);
|
||||
|
||||
bootstrap_classical_setup(
|
||||
stream, &seed, &lwe_sk_in_array, &lwe_sk_out_array, &d_fourier_bsk,
|
||||
&plaintexts, &d_lut_pbs_identity, &d_lut_pbs_indexes,
|
||||
&d_lwe_ct_in_array, &d_lwe_input_indexes, &d_lwe_ct_out_array,
|
||||
&d_lwe_output_indexes, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
lwe_modular_variance, glwe_modular_variance, pbs_base_log, pbs_level,
|
||||
message_modulus, carry_modulus, &payload_modulus, &delta,
|
||||
input_lwe_ciphertext_count, 1, 1);
|
||||
}
|
||||
|
||||
void TearDown(const ::benchmark::State &state) {
|
||||
bootstrap_classical_teardown(
|
||||
stream, lwe_sk_in_array, lwe_sk_out_array, d_fourier_bsk, plaintexts,
|
||||
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
|
||||
d_lwe_input_indexes, d_lwe_ct_out_array, d_lwe_output_indexes);
|
||||
|
||||
cudaDeviceReset();
|
||||
}
|
||||
};
|
||||
|
||||
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, FastMultiBit)
|
||||
(benchmark::State &st) {
|
||||
if (!has_support_to_cuda_bootstrap_fast_multi_bit(
|
||||
glwe_dimension, polynomial_size, pbs_level,
|
||||
input_lwe_ciphertext_count,
|
||||
cuda_get_max_shared_memory(stream->gpu_index))) {
|
||||
st.SkipWithError("Configuration not supported for fast operation");
|
||||
return;
|
||||
}
|
||||
|
||||
scratch_cuda_fast_multi_bit_pbs<uint64_t, int64_t>(
|
||||
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, pbs_level, grouping_factor,
|
||||
input_lwe_ciphertext_count, cuda_get_max_shared_memory(stream->gpu_index),
|
||||
true, chunk_size);
|
||||
|
||||
for (auto _ : st) {
|
||||
// Execute PBS
|
||||
cuda_fast_multi_bit_pbs_lwe_ciphertext_vector(
|
||||
stream, d_lwe_ct_out_array, d_lwe_output_indexes, d_lut_pbs_identity,
|
||||
d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk,
|
||||
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
|
||||
pbs_level, input_lwe_ciphertext_count, 1, 0,
|
||||
cuda_get_max_shared_memory(stream->gpu_index), chunk_size);
|
||||
cuda_synchronize_stream(stream);
|
||||
}
|
||||
|
||||
cleanup_cuda_multi_bit_pbs_64(stream, &buffer);
|
||||
}
|
||||
|
||||
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
|
||||
(benchmark::State &st) {
|
||||
scratch_cuda_multi_bit_pbs<uint64_t, int64_t>(
|
||||
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, pbs_level, grouping_factor,
|
||||
input_lwe_ciphertext_count, cuda_get_max_shared_memory(stream->gpu_index),
|
||||
true, chunk_size);
|
||||
|
||||
for (auto _ : st) {
|
||||
// Execute PBS
|
||||
cuda_multi_bit_pbs_lwe_ciphertext_vector(
|
||||
stream, d_lwe_ct_out_array, d_lwe_output_indexes, d_lut_pbs_identity,
|
||||
d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk,
|
||||
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
|
||||
pbs_level, input_lwe_ciphertext_count, 1, 0,
|
||||
cuda_get_max_shared_memory(stream->gpu_index), chunk_size);
|
||||
cuda_synchronize_stream(stream);
|
||||
}
|
||||
|
||||
cleanup_cuda_multi_bit_pbs_64(stream, &buffer);
|
||||
}
|
||||
|
||||
BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, FastLowLatencyPBS)
|
||||
(benchmark::State &st) {
|
||||
if (!has_support_to_cuda_bootstrap_fast_low_latency<uint64_t>(
|
||||
glwe_dimension, polynomial_size, pbs_level,
|
||||
input_lwe_ciphertext_count,
|
||||
cuda_get_max_shared_memory(stream->gpu_index))) {
|
||||
st.SkipWithError("Configuration not supported for fast operation");
|
||||
return;
|
||||
}
|
||||
|
||||
scratch_cuda_fast_bootstrap_low_latency<uint64_t, int64_t>(
|
||||
stream, (pbs_buffer<uint64_t, LOW_LAT> **)&buffer, glwe_dimension,
|
||||
polynomial_size, pbs_level, input_lwe_ciphertext_count,
|
||||
cuda_get_max_shared_memory(stream->gpu_index), true);
|
||||
|
||||
for (auto _ : st) {
|
||||
// Execute PBS
|
||||
cuda_bootstrap_fast_low_latency_lwe_ciphertext_vector<uint64_t>(
|
||||
stream, (uint64_t *)d_lwe_ct_out_array,
|
||||
(uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity,
|
||||
(uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array,
|
||||
(uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
|
||||
(pbs_buffer<uint64_t, LOW_LAT> *)buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, pbs_base_log, pbs_level, input_lwe_ciphertext_count, 1,
|
||||
0, cuda_get_max_shared_memory(stream->gpu_index));
|
||||
cuda_synchronize_stream(stream);
|
||||
}
|
||||
|
||||
cleanup_cuda_bootstrap_low_latency_64(stream, &buffer);
|
||||
}
|
||||
|
||||
BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, DefaultLowLatencyPBS)
|
||||
(benchmark::State &st) {
|
||||
|
||||
scratch_cuda_bootstrap_low_latency<uint64_t, int64_t>(
|
||||
stream, (pbs_buffer<uint64_t, LOW_LAT> **)&buffer, glwe_dimension,
|
||||
polynomial_size, pbs_level, input_lwe_ciphertext_count,
|
||||
cuda_get_max_shared_memory(stream->gpu_index), true);
|
||||
|
||||
for (auto _ : st) {
|
||||
// Execute PBS
|
||||
cuda_bootstrap_low_latency_lwe_ciphertext_vector<uint64_t>(
|
||||
stream, (uint64_t *)d_lwe_ct_out_array,
|
||||
(uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity,
|
||||
(uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array,
|
||||
(uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
|
||||
(pbs_buffer<uint64_t, LOW_LAT> *)buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, pbs_base_log, pbs_level, input_lwe_ciphertext_count, 1,
|
||||
0, cuda_get_max_shared_memory(stream->gpu_index));
|
||||
cuda_synchronize_stream(stream);
|
||||
}
|
||||
|
||||
cleanup_cuda_bootstrap_low_latency_64(stream, &buffer);
|
||||
}
|
||||
|
||||
BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, AmortizedPBS)
|
||||
(benchmark::State &st) {
|
||||
|
||||
scratch_cuda_bootstrap_amortized_64(
|
||||
stream, &buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, cuda_get_max_shared_memory(stream->gpu_index),
|
||||
true);
|
||||
|
||||
for (auto _ : st) {
|
||||
// Execute PBS
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
stream, (void *)d_lwe_ct_out_array, (void *)d_lwe_output_indexes,
|
||||
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
|
||||
(void *)d_lwe_ct_in_array, (void *)d_lwe_input_indexes,
|
||||
(void *)d_fourier_bsk, buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, pbs_base_log, pbs_level, input_lwe_ciphertext_count, 1,
|
||||
0, cuda_get_max_shared_memory(stream->gpu_index));
|
||||
cuda_synchronize_stream(stream);
|
||||
}
|
||||
|
||||
cleanup_cuda_bootstrap_amortized(stream, &buffer);
|
||||
}
|
||||
|
||||
static void
|
||||
MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
|
||||
// Define the parameters to benchmark
|
||||
// lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
// input_lwe_ciphertext_count
|
||||
std::vector<MultiBitPBSBenchmarkParams> params = {
|
||||
// 4_bits_multi_bit_group_2
|
||||
(MultiBitPBSBenchmarkParams){818, 1, 2048, 22, 1, 1, 2, 0},
|
||||
// 4_bits_multi_bit_group_3
|
||||
(MultiBitPBSBenchmarkParams){888, 1, 2048, 21, 1, 1, 3, 0},
|
||||
};
|
||||
|
||||
// Add to the list of parameters to benchmark
|
||||
for (auto x : params) {
|
||||
for (int input_lwe_ciphertext_count = 1; input_lwe_ciphertext_count <= 4096;
|
||||
input_lwe_ciphertext_count *= 2) {
|
||||
for (int lwe_chunk_size = 1;
|
||||
lwe_chunk_size <= x.lwe_dimension / x.grouping_factor;
|
||||
lwe_chunk_size *= 2)
|
||||
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
|
||||
x.pbs_base_log, x.pbs_level, input_lwe_ciphertext_count,
|
||||
x.grouping_factor, lwe_chunk_size});
|
||||
|
||||
int lwe_chunk_size = x.lwe_dimension / x.grouping_factor;
|
||||
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
|
||||
x.pbs_base_log, x.pbs_level, input_lwe_ciphertext_count,
|
||||
x.grouping_factor, lwe_chunk_size});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
BootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
|
||||
// Define the parameters to benchmark
|
||||
// lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
// input_lwe_ciphertext_count
|
||||
|
||||
// PARAM_MESSAGE_2_CARRY_2_KS_PBS
|
||||
std::vector<BootstrapBenchmarkParams> params = {
|
||||
(BootstrapBenchmarkParams){742, 1, 2048, 23, 1, 1},
|
||||
};
|
||||
|
||||
// Add to the list of parameters to benchmark
|
||||
for (int num_samples = 1; num_samples <= 4096; num_samples *= 2)
|
||||
for (auto x : params) {
|
||||
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
|
||||
x.pbs_base_log, x.pbs_level, num_samples});
|
||||
}
|
||||
}
|
||||
|
||||
BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, FastMultiBit)
|
||||
->Apply(MultiBitPBSBenchmarkGenerateParams)
|
||||
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
|
||||
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count",
|
||||
"grouping_factor", "chunk_size"});
|
||||
|
||||
BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, DefaultMultiBit)
|
||||
->Apply(MultiBitPBSBenchmarkGenerateParams)
|
||||
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
|
||||
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count",
|
||||
"grouping_factor", "chunk_size"});
|
||||
|
||||
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, DefaultLowLatencyPBS)
|
||||
->Apply(BootstrapBenchmarkGenerateParams)
|
||||
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
|
||||
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"});
|
||||
|
||||
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, AmortizedPBS)
|
||||
->Apply(BootstrapBenchmarkGenerateParams)
|
||||
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
|
||||
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"});
|
||||
@@ -1,3 +0,0 @@
|
||||
#include <benchmark/benchmark.h>
|
||||
|
||||
BENCHMARK_MAIN();
|
||||
@@ -1,71 +0,0 @@
|
||||
#ifndef SETUP_AND_TEARDOWN_H
|
||||
#define SETUP_AND_TEARDOWN_H
|
||||
|
||||
#include <bootstrap.h>
|
||||
#include <bootstrap_multibit.h>
|
||||
#include <device.h>
|
||||
#include <keyswitch.h>
|
||||
#include <utils.h>
|
||||
|
||||
void bootstrap_classical_setup(
|
||||
cuda_stream_t *stream, Seed *seed, uint64_t **lwe_sk_in_array,
|
||||
uint64_t **lwe_sk_out_array, double **d_fourier_bsk_array,
|
||||
uint64_t **plaintexts, uint64_t **d_lut_pbs_identity,
|
||||
uint64_t **d_lut_pbs_indexes, uint64_t **d_lwe_ct_in_array,
|
||||
uint64_t **d_lwe_input_indexes, uint64_t **d_lwe_ct_out_array,
|
||||
uint64_t **d_lwe_output_indexes, int lwe_dimension, int glwe_dimension,
|
||||
int polynomial_size, DynamicDistribution lwe_noise_distribution,
|
||||
DynamicDistribution glwe_noise_distribution, int pbs_base_log,
|
||||
int pbs_level, int message_modulus, int carry_modulus, int *payload_modulus,
|
||||
uint64_t *delta, int number_of_inputs, int repetitions, int samples);
|
||||
void bootstrap_classical_teardown(
|
||||
cuda_stream_t *stream, uint64_t *lwe_sk_in_array,
|
||||
uint64_t *lwe_sk_out_array, double *d_fourier_bsk_array,
|
||||
uint64_t *plaintexts, uint64_t *d_lut_pbs_identity,
|
||||
uint64_t *d_lut_pbs_indexes, uint64_t *d_lwe_ct_in_array,
|
||||
uint64_t *d_lwe_input_indexes, uint64_t *d_lwe_ct_out_array,
|
||||
uint64_t *d_lwe_output_indexes);
|
||||
void bootstrap_multibit_setup(
|
||||
cuda_stream_t *stream, Seed *seed, uint64_t **lwe_sk_in_array,
|
||||
uint64_t **lwe_sk_out_array, uint64_t **d_bsk_array, uint64_t **plaintexts,
|
||||
uint64_t **d_lut_pbs_identity, uint64_t **d_lut_pbs_indexes,
|
||||
uint64_t **d_lwe_ct_in_array, uint64_t **d_lwe_input_indexes,
|
||||
uint64_t **d_lwe_ct_out_array, uint64_t **d_lwe_output_indexes,
|
||||
int8_t **pbs_buffer, int lwe_dimension, int glwe_dimension,
|
||||
int polynomial_size, int grouping_factor,
|
||||
DynamicDistribution lwe_noise_distribution,
|
||||
DynamicDistribution glwe_noise_distribution, int pbs_base_log,
|
||||
int pbs_level, int message_modulus, int carry_modulus, int *payload_modulus,
|
||||
uint64_t *delta, int number_of_inputs, int repetitions, int samples,
|
||||
int chunk_size = 0);
|
||||
void bootstrap_multibit_teardown(
|
||||
cuda_stream_t *stream, uint64_t *lwe_sk_in_array,
|
||||
uint64_t *lwe_sk_out_array, uint64_t *d_bsk_array, uint64_t *plaintexts,
|
||||
uint64_t *d_lut_pbs_identity, uint64_t *d_lut_pbs_indexes,
|
||||
uint64_t *d_lwe_ct_in_array, uint64_t *d_lwe_input_indexes,
|
||||
uint64_t *d_lwe_ct_out_array, uint64_t *d_lwe_output_indexes);
|
||||
void keyswitch_setup(
|
||||
cuda_stream_t *stream, Seed *seed, uint64_t **lwe_sk_in_array,
|
||||
uint64_t **lwe_sk_out_array, uint64_t **d_ksk_array, uint64_t **plaintexts,
|
||||
uint64_t **d_lwe_ct_in_array, uint64_t **d_lwe_input_indexes,
|
||||
uint64_t **d_lwe_ct_out_array, uint64_t **d_lwe_output_indexes,
|
||||
int input_lwe_dimension, int output_lwe_dimension,
|
||||
DynamicDistribution lwe_noise_distribution, int ksk_base_log, int ksk_level,
|
||||
int message_modulus, int carry_modulus, int *payload_modulus,
|
||||
uint64_t *delta, int number_of_inputs, int repetitions, int samples);
|
||||
void keyswitch_teardown(cuda_stream_t *stream, uint64_t *lwe_sk_in_array,
|
||||
uint64_t *lwe_sk_out_array, uint64_t *d_ksk_array,
|
||||
uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array,
|
||||
uint64_t *lwe_input_indexes,
|
||||
uint64_t *d_lwe_ct_out_array,
|
||||
uint64_t *lwe_output_indexes);
|
||||
|
||||
void fft_setup(cuda_stream_t *stream, double **poly1, double **poly2,
|
||||
double2 **h_cpoly1, double2 **h_cpoly2, double2 **d_cpoly1,
|
||||
double2 **d_cpoly2, size_t polynomial_size, int samples);
|
||||
|
||||
void fft_teardown(cuda_stream_t *stream, double *poly1, double *poly2,
|
||||
double2 *h_cpoly1, double2 *h_cpoly2, double2 *d_cpoly1,
|
||||
double2 *d_cpoly2);
|
||||
|
||||
#endif // SETUP_AND_TEARDOWN_H
|
||||
@@ -1,54 +0,0 @@
|
||||
#ifndef UTILS_H
|
||||
#define UTILS_H
|
||||
|
||||
#include <device.h>
|
||||
#include <functional>
|
||||
#include <tfhe.h>
|
||||
|
||||
// Seed carried as two 64-bit words. The setup helpers shown alongside this
// header pass `lo` and `hi` as the last two arguments of
// core_crypto_lwe_encrypt and call shuffle_seed() between uses.
typedef struct Seed {
|
||||
uint64_t lo;
|
||||
uint64_t hi;
|
||||
} Seed;
|
||||
|
||||
void init_seed(Seed *seed);
|
||||
|
||||
void shuffle_seed(Seed *seed);
|
||||
|
||||
uint64_t *generate_plaintexts(uint64_t payload_modulus, uint64_t delta,
|
||||
int number_of_inputs, const unsigned repetitions,
|
||||
const unsigned samples);
|
||||
|
||||
uint64_t *generate_identity_lut_pbs(int polynomial_size, int glwe_dimension,
|
||||
int message_modulus, int carry_modulus,
|
||||
std::function<uint64_t(uint64_t)> func);
|
||||
|
||||
void generate_lwe_secret_keys(uint64_t **lwe_sk_array, int lwe_dimension,
|
||||
Seed *seed, const unsigned repetitions);
|
||||
|
||||
void generate_glwe_secret_keys(uint64_t **glwe_sk_array, int glwe_dimension,
|
||||
int polynomial_size, Seed *seed,
|
||||
const unsigned repetitions);
|
||||
|
||||
void generate_lwe_bootstrap_keys(cuda_stream_t *stream,
|
||||
double **d_fourier_bsk_array,
|
||||
uint64_t *lwe_sk_in_array,
|
||||
uint64_t *lwe_sk_out_array, int lwe_dimension,
|
||||
int glwe_dimension, int polynomial_size,
|
||||
int pbs_level, int pbs_base_log, Seed *seed,
|
||||
DynamicDistribution noise_distribution,
|
||||
const unsigned repetitions);
|
||||
|
||||
// Generates the multi-bit PBS bootstrapping keys and stores the resulting
// buffer in *d_bsk_array.
// NOTE(review): the parameters after polynomial_size are named
// (pbs_level, pbs_base_log, grouping_factor) here, but bootstrap_multibit_setup
// passes (grouping_factor, pbs_level, pbs_base_log) in those positions. All
// three are `int`, so the compiler cannot catch a mix-up - confirm against the
// definition which order is the intended one and align the other side.
void generate_lwe_multi_bit_pbs_keys(
|
||||
cuda_stream_t *stream, uint64_t **d_bsk_array, uint64_t *lwe_sk_in_array,
|
||||
uint64_t *lwe_sk_out_array, int lwe_dimension, int glwe_dimension,
|
||||
int polynomial_size, int pbs_level, int pbs_base_log, int grouping_factor,
|
||||
Seed *seed, DynamicDistribution noise_distribution,
|
||||
const unsigned repetitions);
|
||||
|
||||
void generate_lwe_keyswitch_keys(
|
||||
cuda_stream_t *stream, uint64_t **d_ksk_array, uint64_t *lwe_sk_in_array,
|
||||
uint64_t *lwe_sk_out_array, int input_lwe_dimension,
|
||||
int output_lwe_dimension, int ksk_level, int ksk_base_log, Seed *seed,
|
||||
DynamicDistribution noise_distribution, const unsigned repetitions);
|
||||
|
||||
#endif
|
||||
@@ -1,438 +0,0 @@
|
||||
#include <cmath>
|
||||
#include <random>
|
||||
#include <setup_and_teardown.h>
|
||||
|
||||
void bootstrap_classical_setup(
|
||||
cuda_stream_t *stream, Seed *seed, uint64_t **lwe_sk_in_array,
|
||||
uint64_t **lwe_sk_out_array, double **d_fourier_bsk_array,
|
||||
uint64_t **plaintexts, uint64_t **d_lut_pbs_identity,
|
||||
uint64_t **d_lut_pbs_indexes, uint64_t **d_lwe_ct_in_array,
|
||||
uint64_t **d_lwe_input_indexes, uint64_t **d_lwe_ct_out_array,
|
||||
uint64_t **d_lwe_output_indexes, int lwe_dimension, int glwe_dimension,
|
||||
int polynomial_size, DynamicDistribution lwe_noise_distribution,
|
||||
DynamicDistribution glwe_noise_distribution, int pbs_base_log,
|
||||
int pbs_level, int message_modulus, int carry_modulus, int *payload_modulus,
|
||||
uint64_t *delta, int number_of_inputs, int repetitions, int samples) {
|
||||
|
||||
*payload_modulus = message_modulus * carry_modulus;
|
||||
// Value of the shift we multiply our messages by
|
||||
*delta = ((uint64_t)(1) << 63) / (uint64_t)(*payload_modulus);
|
||||
|
||||
// Generate the keys
|
||||
shuffle_seed(seed);
|
||||
generate_lwe_secret_keys(lwe_sk_in_array, lwe_dimension, seed, repetitions);
|
||||
shuffle_seed(seed);
|
||||
generate_lwe_secret_keys(lwe_sk_out_array, glwe_dimension * polynomial_size,
|
||||
seed, repetitions);
|
||||
shuffle_seed(seed);
|
||||
generate_lwe_bootstrap_keys(stream, d_fourier_bsk_array, *lwe_sk_in_array,
|
||||
*lwe_sk_out_array, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, pbs_level, pbs_base_log, seed,
|
||||
glwe_noise_distribution, repetitions);
|
||||
shuffle_seed(seed);
|
||||
*plaintexts = generate_plaintexts(*payload_modulus, *delta, number_of_inputs,
|
||||
repetitions, samples);
|
||||
|
||||
// Create the LUT
|
||||
uint64_t *lut_pbs_identity = generate_identity_lut_pbs(
|
||||
polynomial_size, glwe_dimension, message_modulus, carry_modulus,
|
||||
[](int x) -> int { return x; });
|
||||
uint64_t *lwe_ct_in_array =
|
||||
(uint64_t *)malloc((lwe_dimension + 1) * number_of_inputs * repetitions *
|
||||
samples * sizeof(uint64_t));
|
||||
// Create the input/output ciphertexts
|
||||
for (int r = 0; r < repetitions; r++) {
|
||||
uint64_t *lwe_sk_in = *lwe_sk_in_array + (ptrdiff_t)(r * lwe_dimension);
|
||||
for (int s = 0; s < samples; s++) {
|
||||
for (int i = 0; i < number_of_inputs; i++) {
|
||||
uint64_t plaintext = (*plaintexts)[r * samples * number_of_inputs +
|
||||
s * number_of_inputs + i];
|
||||
uint64_t *lwe_ct_in =
|
||||
lwe_ct_in_array + (ptrdiff_t)((r * samples * number_of_inputs +
|
||||
s * number_of_inputs + i) *
|
||||
(lwe_dimension + 1));
|
||||
core_crypto_lwe_encrypt(lwe_ct_in, plaintext, lwe_sk_in, lwe_dimension,
|
||||
lwe_noise_distribution, seed->lo, seed->hi);
|
||||
shuffle_seed(seed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize and copy things in/to the device
|
||||
*d_lut_pbs_identity = (uint64_t *)cuda_malloc_async(
|
||||
(glwe_dimension + 1) * polynomial_size * sizeof(uint64_t), stream);
|
||||
cuda_memcpy_async_to_gpu(
|
||||
*d_lut_pbs_identity, lut_pbs_identity,
|
||||
polynomial_size * (glwe_dimension + 1) * sizeof(uint64_t), stream);
|
||||
*d_lut_pbs_indexes = (uint64_t *)cuda_malloc_async(
|
||||
number_of_inputs * sizeof(uint64_t), stream);
|
||||
cuda_memset_async(*d_lut_pbs_indexes, 0, number_of_inputs * sizeof(uint64_t),
|
||||
stream);
|
||||
|
||||
// Input and output LWEs
|
||||
*d_lwe_ct_out_array =
|
||||
(uint64_t *)cuda_malloc_async((glwe_dimension * polynomial_size + 1) *
|
||||
number_of_inputs * sizeof(uint64_t),
|
||||
stream);
|
||||
*d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(
|
||||
(lwe_dimension + 1) * number_of_inputs * repetitions * samples *
|
||||
sizeof(uint64_t),
|
||||
stream);
|
||||
|
||||
cuda_memcpy_async_to_gpu(*d_lwe_ct_in_array, lwe_ct_in_array,
|
||||
repetitions * samples * number_of_inputs *
|
||||
(lwe_dimension + 1) * sizeof(uint64_t),
|
||||
stream);
|
||||
|
||||
uint64_t *h_lwe_indexes =
|
||||
(uint64_t *)malloc(number_of_inputs * sizeof(uint64_t));
|
||||
*d_lwe_input_indexes = (uint64_t *)cuda_malloc_async(
|
||||
number_of_inputs * sizeof(uint64_t), stream);
|
||||
*d_lwe_output_indexes = (uint64_t *)cuda_malloc_async(
|
||||
number_of_inputs * sizeof(uint64_t), stream);
|
||||
for (int i = 0; i < number_of_inputs; i++)
|
||||
h_lwe_indexes[i] = i;
|
||||
cuda_memcpy_async_to_gpu(*d_lwe_input_indexes, h_lwe_indexes,
|
||||
number_of_inputs * sizeof(uint64_t), stream);
|
||||
cuda_memcpy_async_to_gpu(*d_lwe_output_indexes, h_lwe_indexes,
|
||||
number_of_inputs * sizeof(uint64_t), stream);
|
||||
|
||||
stream->synchronize();
|
||||
|
||||
free(lwe_ct_in_array);
|
||||
free(lut_pbs_identity);
|
||||
free(h_lwe_indexes);
|
||||
}
|
||||
|
||||
// Counterpart of bootstrap_classical_setup: waits for the stream, frees the
// host buffers, drops the device buffers on the stream, then synchronizes
// and releases the stream.
void bootstrap_classical_teardown(
    cuda_stream_t *stream, uint64_t *lwe_sk_in_array,
    uint64_t *lwe_sk_out_array, double *d_fourier_bsk_array,
    uint64_t *plaintexts, uint64_t *d_lut_pbs_identity,
    uint64_t *d_lut_pbs_indexes, uint64_t *d_lwe_ct_in_array,
    uint64_t *d_lwe_input_indexes, uint64_t *d_lwe_ct_out_array,
    uint64_t *d_lwe_output_indexes) {
  // Make sure no pending work still uses the buffers
  cuda_synchronize_stream(stream);

  // Host side
  void *host_buffers[] = {lwe_sk_in_array, lwe_sk_out_array, plaintexts};
  for (void *buffer : host_buffers)
    free(buffer);

  // Device side, dropped in the same order as before
  void *device_buffers[] = {
      d_fourier_bsk_array, d_lut_pbs_identity,  d_lut_pbs_indexes,
      d_lwe_ct_in_array,   d_lwe_ct_out_array,  d_lwe_input_indexes,
      d_lwe_output_indexes};
  for (void *buffer : device_buffers)
    cuda_drop_async(buffer, stream);

  stream->synchronize();
  stream->release();
}
|
||||
|
||||
void bootstrap_multibit_setup(
|
||||
cuda_stream_t *stream, Seed *seed, uint64_t **lwe_sk_in_array,
|
||||
uint64_t **lwe_sk_out_array, uint64_t **d_bsk_array, uint64_t **plaintexts,
|
||||
uint64_t **d_lut_pbs_identity, uint64_t **d_lut_pbs_indexes,
|
||||
uint64_t **d_lwe_ct_in_array, uint64_t **d_lwe_input_indexes,
|
||||
uint64_t **d_lwe_ct_out_array, uint64_t **d_lwe_output_indexes,
|
||||
int8_t **pbs_buffer, int lwe_dimension, int glwe_dimension,
|
||||
int polynomial_size, int grouping_factor,
|
||||
DynamicDistribution lwe_noise_distribution,
|
||||
DynamicDistribution glwe_noise_distribution, int pbs_base_log,
|
||||
int pbs_level, int message_modulus, int carry_modulus, int *payload_modulus,
|
||||
uint64_t *delta, int number_of_inputs, int repetitions, int samples,
|
||||
int lwe_chunk_size) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
*payload_modulus = message_modulus * carry_modulus;
|
||||
// Value of the shift we multiply our messages by
|
||||
*delta = ((uint64_t)(1) << 63) / (uint64_t)(*payload_modulus);
|
||||
|
||||
// Generate the keys
|
||||
shuffle_seed(seed);
|
||||
generate_lwe_secret_keys(lwe_sk_in_array, lwe_dimension, seed, repetitions);
|
||||
shuffle_seed(seed);
|
||||
generate_lwe_secret_keys(lwe_sk_out_array, glwe_dimension * polynomial_size,
|
||||
seed, repetitions);
|
||||
shuffle_seed(seed);
|
||||
generate_lwe_multi_bit_pbs_keys(
|
||||
stream, d_bsk_array, *lwe_sk_in_array, *lwe_sk_out_array, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, pbs_level, pbs_base_log,
|
||||
seed, glwe_noise_distribution, repetitions);
|
||||
shuffle_seed(seed);
|
||||
|
||||
*plaintexts = generate_plaintexts(*payload_modulus, *delta, number_of_inputs,
|
||||
repetitions, samples);
|
||||
|
||||
// Create the LUT
|
||||
uint64_t *lut_pbs_identity = generate_identity_lut_pbs(
|
||||
polynomial_size, glwe_dimension, message_modulus, carry_modulus,
|
||||
[](int x) -> int { return x; });
|
||||
uint64_t *lwe_ct_in_array =
|
||||
(uint64_t *)malloc((lwe_dimension + 1) * number_of_inputs * repetitions *
|
||||
samples * sizeof(uint64_t));
|
||||
// Create the input/output ciphertexts
|
||||
for (int r = 0; r < repetitions; r++) {
|
||||
uint64_t *lwe_sk_in = *lwe_sk_in_array + (ptrdiff_t)(r * lwe_dimension);
|
||||
for (int s = 0; s < samples; s++) {
|
||||
for (int i = 0; i < number_of_inputs; i++) {
|
||||
uint64_t plaintext = (*plaintexts)[r * samples * number_of_inputs +
|
||||
s * number_of_inputs + i];
|
||||
uint64_t *lwe_ct_in =
|
||||
lwe_ct_in_array + (ptrdiff_t)((r * samples * number_of_inputs +
|
||||
s * number_of_inputs + i) *
|
||||
(lwe_dimension + 1));
|
||||
core_crypto_lwe_encrypt(lwe_ct_in, plaintext, lwe_sk_in, lwe_dimension,
|
||||
lwe_noise_distribution, seed->lo, seed->hi);
|
||||
shuffle_seed(seed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize and copy things in/to the device
|
||||
*d_lut_pbs_identity = (uint64_t *)cuda_malloc_async(
|
||||
(glwe_dimension + 1) * polynomial_size * sizeof(uint64_t), stream);
|
||||
cuda_memcpy_async_to_gpu(
|
||||
*d_lut_pbs_identity, lut_pbs_identity,
|
||||
polynomial_size * (glwe_dimension + 1) * sizeof(uint64_t), stream);
|
||||
*d_lut_pbs_indexes = (uint64_t *)cuda_malloc_async(
|
||||
number_of_inputs * sizeof(uint64_t), stream);
|
||||
cuda_memset_async(*d_lut_pbs_indexes, 0, number_of_inputs * sizeof(uint64_t),
|
||||
stream);
|
||||
|
||||
// Input and output LWEs
|
||||
*d_lwe_ct_out_array =
|
||||
(uint64_t *)cuda_malloc_async((glwe_dimension * polynomial_size + 1) *
|
||||
number_of_inputs * sizeof(uint64_t),
|
||||
stream);
|
||||
*d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(
|
||||
(lwe_dimension + 1) * number_of_inputs * repetitions * samples *
|
||||
sizeof(uint64_t),
|
||||
stream);
|
||||
|
||||
cuda_memcpy_async_to_gpu(*d_lwe_ct_in_array, lwe_ct_in_array,
|
||||
repetitions * samples * number_of_inputs *
|
||||
(lwe_dimension + 1) * sizeof(uint64_t),
|
||||
stream);
|
||||
|
||||
uint64_t *h_lwe_indexes =
|
||||
(uint64_t *)malloc(number_of_inputs * sizeof(uint64_t));
|
||||
*d_lwe_input_indexes = (uint64_t *)cuda_malloc_async(
|
||||
number_of_inputs * sizeof(uint64_t), stream);
|
||||
*d_lwe_output_indexes = (uint64_t *)cuda_malloc_async(
|
||||
number_of_inputs * sizeof(uint64_t), stream);
|
||||
for (int i = 0; i < number_of_inputs; i++)
|
||||
h_lwe_indexes[i] = i;
|
||||
cuda_memcpy_async_to_gpu(*d_lwe_input_indexes, h_lwe_indexes,
|
||||
number_of_inputs * sizeof(uint64_t), stream);
|
||||
cuda_memcpy_async_to_gpu(*d_lwe_output_indexes, h_lwe_indexes,
|
||||
number_of_inputs * sizeof(uint64_t), stream);
|
||||
|
||||
scratch_cuda_multi_bit_pbs_64(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
pbs_level, grouping_factor, number_of_inputs,
|
||||
cuda_get_max_shared_memory(stream->gpu_index), true, lwe_chunk_size);
|
||||
|
||||
stream->synchronize();
|
||||
|
||||
free(h_lwe_indexes);
|
||||
free(lut_pbs_identity);
|
||||
free(lwe_ct_in_array);
|
||||
}
|
||||
|
||||
// Counterpart of bootstrap_multibit_setup: waits for the stream, frees the
// host buffers, drops the device buffers on the stream, then synchronizes
// and releases the stream. The PBS scratch buffer is not a parameter here.
void bootstrap_multibit_teardown(
    cuda_stream_t *stream, uint64_t *lwe_sk_in_array,
    uint64_t *lwe_sk_out_array, uint64_t *d_bsk_array, uint64_t *plaintexts,
    uint64_t *d_lut_pbs_identity, uint64_t *d_lut_pbs_indexes,
    uint64_t *d_lwe_ct_in_array, uint64_t *d_lwe_input_indexes,
    uint64_t *d_lwe_ct_out_array, uint64_t *d_lwe_output_indexes) {
  // Make sure no pending work still uses the buffers
  cuda_synchronize_stream(stream);

  // Host side
  uint64_t *host_buffers[] = {lwe_sk_in_array, lwe_sk_out_array, plaintexts};
  for (uint64_t *buffer : host_buffers)
    free(buffer);

  // Device side, dropped in the same order as before
  uint64_t *device_buffers[] = {
      d_bsk_array,       d_lut_pbs_identity, d_lut_pbs_indexes,
      d_lwe_ct_in_array, d_lwe_ct_out_array, d_lwe_input_indexes,
      d_lwe_output_indexes};
  for (uint64_t *buffer : device_buffers)
    cuda_drop_async(buffer, stream);

  stream->synchronize();
  stream->release();
}
|
||||
|
||||
// Prepares everything an LWE keyswitch test/benchmark needs.
//
// Host outputs (released with free() by keyswitch_teardown):
//   *lwe_sk_in_array, *lwe_sk_out_array, *plaintexts.
// Device outputs (released with cuda_drop_async() by the teardown):
//   *d_ksk_array, *d_lwe_ct_in_array, *d_lwe_ct_out_array,
//   *d_lwe_input_indexes, *d_lwe_output_indexes.
// Scalar outputs: *payload_modulus = message_modulus * carry_modulus and
//   *delta = 2^63 / *payload_modulus.
//
// One input ciphertext is encrypted per (repetition, sample, input) triple;
// the output buffer only holds `number_of_inputs` ciphertexts.
void keyswitch_setup(
    cuda_stream_t *stream, Seed *seed, uint64_t **lwe_sk_in_array,
    uint64_t **lwe_sk_out_array, uint64_t **d_ksk_array, uint64_t **plaintexts,
    uint64_t **d_lwe_ct_in_array, uint64_t **d_lwe_input_indexes,
    uint64_t **d_lwe_ct_out_array, uint64_t **d_lwe_output_indexes,
    int input_lwe_dimension, int output_lwe_dimension,
    DynamicDistribution lwe_noise_distribution, int ksk_base_log, int ksk_level,
    int message_modulus, int carry_modulus, int *payload_modulus,
    uint64_t *delta, int number_of_inputs, int repetitions, int samples) {

  *payload_modulus = message_modulus * carry_modulus;
  // Value of the shift we multiply our messages by
  *delta = ((uint64_t)(1) << 63) / (uint64_t)(*payload_modulus);

  // All element counts and byte sizes are computed in size_t so that large
  // parameter sets cannot overflow `int` before the widening happens.
  const size_t input_lwe_size = (size_t)input_lwe_dimension + 1;
  const size_t ct_in_count =
      (size_t)number_of_inputs * repetitions * samples;
  const size_t ct_in_bytes = input_lwe_size * ct_in_count * sizeof(uint64_t);
  const size_t ct_out_bytes = ((size_t)output_lwe_dimension + 1) *
                              number_of_inputs * sizeof(uint64_t);
  const size_t indexes_bytes = (size_t)number_of_inputs * sizeof(uint64_t);

  // Generate the keys. The seed is shuffled before every consumer, so the
  // order of these calls must not change.
  shuffle_seed(seed);
  generate_lwe_secret_keys(lwe_sk_in_array, input_lwe_dimension, seed,
                           repetitions);
  shuffle_seed(seed);
  generate_lwe_secret_keys(lwe_sk_out_array, output_lwe_dimension, seed,
                           repetitions);
  shuffle_seed(seed);
  generate_lwe_keyswitch_keys(stream, d_ksk_array, *lwe_sk_in_array,
                              *lwe_sk_out_array, input_lwe_dimension,
                              output_lwe_dimension, ksk_level, ksk_base_log,
                              seed, lwe_noise_distribution, repetitions);
  shuffle_seed(seed);
  *plaintexts = generate_plaintexts(*payload_modulus, *delta, number_of_inputs,
                                    repetitions, samples);

  // Input and output LWEs on the device
  *d_lwe_ct_out_array = (uint64_t *)cuda_malloc_async(ct_out_bytes, stream);
  *d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(ct_in_bytes, stream);

  // Create the input ciphertexts on the host
  uint64_t *lwe_ct_in_array = (uint64_t *)malloc(ct_in_bytes);
  for (int r = 0; r < repetitions; r++) {
    // Secret key used by repetition r
    uint64_t *lwe_sk_in =
        *lwe_sk_in_array + (ptrdiff_t)r * input_lwe_dimension;
    for (int s = 0; s < samples; s++) {
      for (int i = 0; i < number_of_inputs; i++) {
        // Flat index of the (r, s, i) plaintext / ciphertext
        const size_t ct_index =
            ((size_t)r * samples + s) * number_of_inputs + i;
        uint64_t plaintext = (*plaintexts)[ct_index];
        uint64_t *lwe_ct_in = lwe_ct_in_array + ct_index * input_lwe_size;
        core_crypto_lwe_encrypt(lwe_ct_in, plaintext, lwe_sk_in,
                                input_lwe_dimension, lwe_noise_distribution,
                                seed->lo, seed->hi);
        shuffle_seed(seed);
      }
    }
  }
  cuda_memcpy_async_to_gpu(*d_lwe_ct_in_array, lwe_ct_in_array, ct_in_bytes,
                           stream);
  stream->synchronize();

  // Input and output index tables are both the identity mapping 0..n-1
  uint64_t *h_lwe_indexes = (uint64_t *)malloc(indexes_bytes);
  *d_lwe_input_indexes = (uint64_t *)cuda_malloc_async(indexes_bytes, stream);
  *d_lwe_output_indexes = (uint64_t *)cuda_malloc_async(indexes_bytes, stream);
  for (int i = 0; i < number_of_inputs; i++)
    h_lwe_indexes[i] = i;
  cuda_memcpy_async_to_gpu(*d_lwe_input_indexes, h_lwe_indexes, indexes_bytes,
                           stream);
  cuda_memcpy_async_to_gpu(*d_lwe_output_indexes, h_lwe_indexes, indexes_bytes,
                           stream);

  // The async copies read the host staging buffers: wait before freeing them
  cuda_synchronize_stream(stream);
  free(h_lwe_indexes);
  free(lwe_ct_in_array);
}
|
||||
|
||||
// Counterpart of keyswitch_setup: waits for the stream, frees the host
// buffers, drops the device buffers on the stream, then synchronizes and
// releases the stream.
void keyswitch_teardown(cuda_stream_t *stream, uint64_t *lwe_sk_in_array,
                        uint64_t *lwe_sk_out_array, uint64_t *d_ksk_array,
                        uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array,
                        uint64_t *d_lwe_input_indexes,
                        uint64_t *d_lwe_ct_out_array,
                        uint64_t *d_lwe_output_indexes) {
  // Make sure no pending work still uses the buffers
  cuda_synchronize_stream(stream);

  // Host side
  uint64_t *host_buffers[] = {lwe_sk_in_array, lwe_sk_out_array, plaintexts};
  for (uint64_t *buffer : host_buffers)
    free(buffer);

  // Device side, dropped in the same order as before
  uint64_t *device_buffers[] = {d_ksk_array, d_lwe_ct_in_array,
                                d_lwe_ct_out_array, d_lwe_input_indexes,
                                d_lwe_output_indexes};
  for (uint64_t *buffer : device_buffers)
    cuda_drop_async(buffer, stream);

  stream->synchronize();
  stream->release();
}
|
||||
|
||||
// Builds the data set used by the FFT tests/benchmarks.
//
// Outputs:
//   *_poly1, *_poly2       host arrays of polynomial_size * samples doubles,
//                          filled with uniform random values in [-1, 1);
//   *_h_cpoly1, *_h_cpoly2 host arrays of double2 in which coefficient i of
//                          each polynomial is stored in .x and coefficient
//                          i + polynomial_size / 2 in .y;
//   *_d_cpoly1, *_d_cpoly2 device copies of the packed arrays.
// The random engine is default-constructed, so the generated sequence is the
// same on every call. The stream is synchronized before returning.
void fft_setup(cuda_stream_t *stream, double **_poly1, double **_poly2,
               double2 **_h_cpoly1, double2 **_h_cpoly2, double2 **_d_cpoly1,
               double2 **_d_cpoly2, size_t polynomial_size, int samples) {

  const size_t real_bytes = polynomial_size * samples * sizeof(double);
  const size_t packed_bytes = polynomial_size / 2 * samples * sizeof(double2);

  // Allocate and publish every buffer through the output parameters
  *_poly1 = (double *)malloc(real_bytes);
  *_poly2 = (double *)malloc(real_bytes);
  *_h_cpoly1 = (double2 *)malloc(packed_bytes);
  *_h_cpoly2 = (double2 *)malloc(packed_bytes);
  *_d_cpoly1 = (double2 *)cuda_malloc_async(packed_bytes, stream);
  *_d_cpoly2 = (double2 *)cuda_malloc_async(packed_bytes, stream);

  double *reals_a = *_poly1;
  double *reals_b = *_poly2;
  double2 *packed_a = *_h_cpoly1;
  double2 *packed_b = *_h_cpoly2;

  // Fill test data with random values; the two arrays are drawn alternately
  std::uniform_real_distribution<double> unif(-1.0, 1.0);
  std::default_random_engine re;
  for (size_t idx = 0; idx < polynomial_size * samples; idx++) {
    reals_a[idx] = unif(re);
    reals_b[idx] = unif(re);
  }

  // Prepare data for the device: pack both halves of each real polynomial
  // into the .x / .y members of one double2 array
  const size_t sample_count = (size_t)samples;
  for (size_t p = 0; p < sample_count; p++) {
    double2 *dst_a = packed_a + p * polynomial_size / 2;
    double2 *dst_b = packed_b + p * polynomial_size / 2;
    const double *src_a = reals_a + p * polynomial_size;
    const double *src_b = reals_b + p * polynomial_size;
    for (std::size_t i = 0; i < polynomial_size / 2; ++i) {
      dst_a[i].x = src_a[i];
      dst_a[i].y = src_a[i + polynomial_size / 2];

      dst_b[i].x = src_b[i];
      dst_b[i].y = src_b[i + polynomial_size / 2];
    }
  }

  // Copy memory cpu->gpu
  cuda_memcpy_async_to_gpu(*_d_cpoly1, packed_a, packed_bytes, stream);
  cuda_memcpy_async_to_gpu(*_d_cpoly2, packed_b, packed_bytes, stream);
  stream->synchronize();
}
|
||||
|
||||
// Counterpart of fft_setup: waits for the stream, frees the four host
// arrays, drops the two device arrays, then synchronizes and releases the
// stream.
void fft_teardown(cuda_stream_t *stream, double *poly1, double *poly2,
                  double2 *h_cpoly1, double2 *h_cpoly2, double2 *d_cpoly1,
                  double2 *d_cpoly2) {
  // Make sure no pending work still uses the buffers
  stream->synchronize();

  // Host side
  void *host_buffers[] = {poly1, poly2, h_cpoly1, h_cpoly2};
  for (void *buffer : host_buffers)
    free(buffer);

  // Device side
  double2 *device_buffers[] = {d_cpoly1, d_cpoly2};
  for (double2 *buffer : device_buffers)
    cuda_drop_async(buffer, stream);

  stream->synchronize();
  stream->release();
}
|
||||
@@ -1,81 +0,0 @@
|
||||
project(test_tfhe_cuda_backend LANGUAGES CXX)
|
||||
|
||||
# Minimum CUDA version targeted by these tests. There is no documentation-only fallback: configuration aborts below if no CUDA compiler is found.
|
||||
set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
|
||||
include(CheckLanguage)
|
||||
# See if CUDA is available
|
||||
check_language(CUDA)
|
||||
# If so, enable CUDA to check the version.
|
||||
if(CMAKE_CUDA_COMPILER)
|
||||
enable_language(CUDA)
|
||||
endif()
|
||||
# If no CUDA compiler is available, abort the configuration
|
||||
if(NOT CMAKE_CUDA_COMPILER)
|
||||
message(FATAL_ERROR "Cuda compiler not found.")
|
||||
endif()
|
||||
|
||||
include(FetchContent)
|
||||
FetchContent_Declare(googletest
|
||||
URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip)
|
||||
|
||||
# For Windows: Prevent overriding the parent project's compiler/linker settings
|
||||
set(gtest_force_shared_crt
|
||||
ON
|
||||
CACHE BOOL "" FORCE)
|
||||
FetchContent_MakeAvailable(googletest)
|
||||
|
||||
set(CONCRETE_CUDA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../")
|
||||
|
||||
# Enable ExternalProject CMake module
|
||||
include(ExternalProject)
|
||||
|
||||
set(TFHE_RS_SOURCE_DIR "${CMAKE_BINARY_DIR}/../../../../")
|
||||
set(TFHE_RS_BINARY_DIR "${TFHE_RS_SOURCE_DIR}/target/release")
|
||||
|
||||
if(NOT TARGET tfhe-rs)
|
||||
ExternalProject_Add(
|
||||
tfhe-rs
|
||||
SOURCE_DIR ${TFHE_RS_SOURCE_DIR}
|
||||
BUILD_IN_SOURCE 1
|
||||
BUILD_ALWAYS 1
|
||||
UPDATE_COMMAND ""
|
||||
CONFIGURE_COMMAND ""
|
||||
DOWNLOAD_COMMAND ""
|
||||
BUILD_COMMAND make build_c_api
|
||||
INSTALL_COMMAND ""
|
||||
LOG_BUILD ON)
|
||||
endif()
|
||||
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)
|
||||
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/include)
|
||||
include_directories(${TFHE_RS_BINARY_DIR})
|
||||
include_directories(${TFHE_RS_BINARY_DIR}/deps)
|
||||
include_directories("${CMAKE_CURRENT_SOURCE_DIR}")
|
||||
include_directories("${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}")
|
||||
|
||||
add_library(tfhe_rs_lib STATIC IMPORTED)
|
||||
add_dependencies(tfhe_rs_lib tfhe-rs)
|
||||
set_target_properties(tfhe_rs_lib PROPERTIES IMPORTED_LOCATION ${TFHE_RS_BINARY_DIR}/libtfhe.a)
|
||||
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-as-needed -ldl")
|
||||
|
||||
set(BINARY test_tfhe_cuda_backend)
|
||||
|
||||
file(
|
||||
GLOB_RECURSE TEST_SOURCES
|
||||
LIST_DIRECTORIES false
|
||||
test_*.cpp)
|
||||
|
||||
add_executable(${BINARY} ${TEST_SOURCES} ../utils.cpp ../setup_and_teardown.cpp)
|
||||
|
||||
add_test(NAME ${BINARY} COMMAND ${BINARY})
|
||||
|
||||
set_target_properties(
|
||||
${BINARY}
|
||||
PROPERTIES CUDA_SEPARABLE_COMPILATION ON
|
||||
CUDA_RESOLVE_DEVICE_SYMBOLS ON
|
||||
CUDA_ARCHITECTURES native)
|
||||
target_link_libraries(${BINARY} PUBLIC GTest::gtest_main tfhe_rs_lib tfhe_cuda_backend cudart)
|
||||
|
||||
include(GoogleTest)
|
||||
gtest_discover_tests(${BINARY})
|
||||
@@ -1,61 +0,0 @@
|
||||
# test_tfhe_cuda_backend
|
||||
|
||||
This test tool is built on top of the GoogleTest library. It tests the correctness of basic
|
||||
cryptographic primitives accelerated using CUDA and helps identify arithmetic flaws.
|
||||
The output format can be adjusted according to the user's interest.
|
||||
|
||||
A particular function will be executed for each test case, and the result will be verified considering the expected behavior. This will be repeated for multiple encryption keys and samples per key. These can be modified by changing `REPETITIONS` and `SAMPLES` variables at the beginning of each test file.
|
||||
|
||||
## How to Compile
|
||||
|
||||
The first step in compiling code with CMake is to create a build directory. This directory will
|
||||
contain all the files generated during the build process, such as object files and executables.
|
||||
We recommend creating this directory outside of the source directory, but inside the
|
||||
implementation folder, to keep the source directory clean.
|
||||
|
||||
```bash
|
||||
$ cd tfhe-rs/backends/tfhe-cuda-backend/cuda
|
||||
$ mkdir build
|
||||
$ cd build
|
||||
```
|
||||
|
||||
Run CMake to generate the build files and then use make to compile the project.
|
||||
|
||||
```bash
|
||||
$ cmake ..
|
||||
$ make
|
||||
```
|
||||
|
||||
The binary will be found in
|
||||
`tfhe-rs/backends/tfhe-cuda-backend/cuda/build/tests/src`.
|
||||
|
||||
## How to Run Tests
|
||||
|
||||
To run tests, you can simply execute the `test_tfhe_cuda_backend` executable with no arguments:
|
||||
|
||||
```bash
|
||||
$ tests/src/test_tfhe_cuda_backend
|
||||
```
|
||||
|
||||
This will run all the available tests.
|
||||
|
||||
## How to Filter Tests
|
||||
|
||||
You can select a subset of tests by specifying a filter for the name of the tests of interest as
|
||||
an argument. Only tests whose full name matches the filter will be executed.
|
||||
|
||||
For example, to run only tests whose name starts with the word "Bootstrap", you can execute:
|
||||
|
||||
```bash
|
||||
$ tests/src/test_tfhe_cuda_backend --gtest_filter=Bootstrap*
|
||||
```
|
||||
|
||||
The parameter `--gtest_list_tests` can be used to list all the available tests, and a more detailed
|
||||
description of how to select a subset of tests can be found in
|
||||
[GoogleTest documentation](http://google.github.io/googletest/advanced.html#running-a-subset-of-the-tests).
|
||||
|
||||
## Conclusion
|
||||
|
||||
With these options, you can easily verify the correctness of the tfhe-cuda-backend implementations. If
|
||||
you have any questions or issues, please feel free to contact us.
|
||||
To learn more about the GoogleTest library, please refer to the [official user guide](http://google.github.io/googletest/).
|
||||
@@ -1,387 +0,0 @@
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <functional>
|
||||
#include <gtest/gtest.h>
|
||||
#include <setup_and_teardown.h>
|
||||
#include <utils.h>
|
||||
|
||||
typedef struct {
|
||||
int lwe_dimension;
|
||||
int glwe_dimension;
|
||||
int polynomial_size;
|
||||
DynamicDistribution lwe_noise_distribution;
|
||||
DynamicDistribution glwe_noise_distribution;
|
||||
int pbs_base_log;
|
||||
int pbs_level;
|
||||
int message_modulus;
|
||||
int carry_modulus;
|
||||
int number_of_inputs;
|
||||
int repetitions;
|
||||
int samples;
|
||||
} ClassicalBootstrapTestParams;
|
||||
|
||||
class ClassicalBootstrapTestPrimitives_u64
|
||||
: public ::testing::TestWithParam<ClassicalBootstrapTestParams> {
|
||||
protected:
|
||||
int lwe_dimension;
|
||||
int glwe_dimension;
|
||||
int polynomial_size;
|
||||
DynamicDistribution lwe_noise_distribution;
|
||||
DynamicDistribution glwe_noise_distribution;
|
||||
int pbs_base_log;
|
||||
int pbs_level;
|
||||
int message_modulus;
|
||||
int carry_modulus;
|
||||
int payload_modulus;
|
||||
int number_of_inputs;
|
||||
int repetitions;
|
||||
int samples;
|
||||
uint64_t delta;
|
||||
cuda_stream_t *stream;
|
||||
int gpu_index = 0;
|
||||
uint64_t *lwe_sk_in_array;
|
||||
uint64_t *lwe_sk_out_array;
|
||||
uint64_t *plaintexts;
|
||||
double *d_fourier_bsk_array;
|
||||
uint64_t *d_lut_pbs_identity;
|
||||
uint64_t *d_lut_pbs_indexes;
|
||||
uint64_t *d_lwe_ct_in_array;
|
||||
uint64_t *d_lwe_ct_out_array;
|
||||
uint64_t *d_lwe_input_indexes;
|
||||
uint64_t *d_lwe_output_indexes;
|
||||
uint64_t *lwe_ct_out_array;
|
||||
|
||||
public:
|
||||
// Test arithmetic functions
|
||||
void SetUp() {
|
||||
stream = cuda_create_stream(gpu_index);
|
||||
|
||||
// TestParams
|
||||
lwe_dimension = (int)GetParam().lwe_dimension;
|
||||
glwe_dimension = (int)GetParam().glwe_dimension;
|
||||
polynomial_size = (int)GetParam().polynomial_size;
|
||||
lwe_noise_distribution =
|
||||
(DynamicDistribution)GetParam().lwe_noise_distribution;
|
||||
glwe_noise_distribution =
|
||||
(DynamicDistribution)GetParam().glwe_noise_distribution;
|
||||
pbs_base_log = (int)GetParam().pbs_base_log;
|
||||
pbs_level = (int)GetParam().pbs_level;
|
||||
message_modulus = (int)GetParam().message_modulus;
|
||||
carry_modulus = (int)GetParam().carry_modulus;
|
||||
number_of_inputs = (int)GetParam().number_of_inputs;
|
||||
repetitions = (int)GetParam().repetitions;
|
||||
samples = (int)GetParam().samples;
|
||||
|
||||
Seed seed;
|
||||
init_seed(&seed);
|
||||
|
||||
bootstrap_classical_setup(
|
||||
stream, &seed, &lwe_sk_in_array, &lwe_sk_out_array,
|
||||
&d_fourier_bsk_array, &plaintexts, &d_lut_pbs_identity,
|
||||
&d_lut_pbs_indexes, &d_lwe_ct_in_array, &d_lwe_input_indexes,
|
||||
&d_lwe_ct_out_array, &d_lwe_output_indexes, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, lwe_noise_distribution,
|
||||
glwe_noise_distribution, pbs_base_log, pbs_level, message_modulus,
|
||||
carry_modulus, &payload_modulus, &delta, number_of_inputs, repetitions,
|
||||
samples);
|
||||
|
||||
lwe_ct_out_array =
|
||||
(uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
|
||||
number_of_inputs * sizeof(uint64_t));
|
||||
}
|
||||
|
||||
void TearDown() {
|
||||
free(lwe_ct_out_array);
|
||||
bootstrap_classical_teardown(
|
||||
stream, lwe_sk_in_array, lwe_sk_out_array, d_fourier_bsk_array,
|
||||
plaintexts, d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
|
||||
d_lwe_input_indexes, d_lwe_ct_out_array, d_lwe_output_indexes);
|
||||
}
|
||||
};
|
||||
|
||||
// Runs the amortized PBS on every (repetition, sample) batch prepared by the
// fixture, copies the outputs to the host, and checks that each one decrypts
// and decodes to the expected message.
TEST_P(ClassicalBootstrapTestPrimitives_u64, amortized_bootstrap) {
  int8_t *pbs_buffer;
  scratch_cuda_bootstrap_amortized_64(
      stream, &pbs_buffer, glwe_dimension, polynomial_size, number_of_inputs,
      cuda_get_max_shared_memory(gpu_index), true);

  // Loop-invariant sizes. Offsets are computed in ptrdiff_t so that the
  // products cannot overflow int for large parameter sets.
  const int output_lwe_dimension = glwe_dimension * polynomial_size;
  const ptrdiff_t output_ct_len = (ptrdiff_t)output_lwe_dimension + 1;
  const ptrdiff_t input_ct_len = (ptrdiff_t)lwe_dimension + 1;
  const ptrdiff_t bsk_size = (ptrdiff_t)(glwe_dimension + 1) *
                             (glwe_dimension + 1) * pbs_level *
                             polynomial_size * input_ct_len;
  const size_t output_bytes =
      (size_t)output_ct_len * number_of_inputs * sizeof(uint64_t);
  // The bit before the message, used to round while decoding
  const uint64_t rounding_bit = delta >> 1;

  // Here execute the PBS
  for (int r = 0; r < repetitions; r++) {
    double *d_fourier_bsk = d_fourier_bsk_array + bsk_size * r;
    uint64_t *lwe_sk_out =
        lwe_sk_out_array + (ptrdiff_t)r * output_lwe_dimension;
    for (int s = 0; s < samples; s++) {
      // Index of the first input belonging to batch (r, s)
      const ptrdiff_t batch_start =
          ((ptrdiff_t)r * samples + s) * number_of_inputs;
      uint64_t *d_lwe_ct_in = d_lwe_ct_in_array + batch_start * input_ct_len;
      // Execute PBS
      cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
          stream, (void *)d_lwe_ct_out_array, (void *)d_lwe_output_indexes,
          (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
          (void *)d_lwe_ct_in, (void *)d_lwe_input_indexes,
          (void *)d_fourier_bsk, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, pbs_base_log, pbs_level, number_of_inputs, 1, 0,
          cuda_get_max_shared_memory(gpu_index));
      // Copy result back; the copy is asynchronous, so wait for the stream
      // before the host reads the buffer
      cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                               output_bytes, stream);
      cuda_synchronize_stream(stream);

      for (int j = 0; j < number_of_inputs; j++) {
        uint64_t *result = lwe_ct_out_array + j * output_ct_len;
        uint64_t plaintext = plaintexts[batch_start + j];
        uint64_t decrypted = 0;
        core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
                                output_lwe_dimension);
        // The raw decryption is expected to differ from the exact encoded
        // plaintext; only the rounded/decoded value has to match
        EXPECT_NE(decrypted, plaintext);

        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, plaintext / delta)
            << "Repetition: " << r << ", sample: " << s;
      }
    }
  }
  cleanup_cuda_bootstrap_amortized(stream, &pbs_buffer);
}
|
||||
|
||||
// Runs the low latency PBS on every (repetition, sample) batch prepared by the
// fixture, copies the outputs to the host, and checks that each one decrypts
// and decodes to the expected message.
TEST_P(ClassicalBootstrapTestPrimitives_u64, low_latency_bootstrap) {
  int8_t *pbs_buffer;
  scratch_cuda_bootstrap_low_latency_64(
      stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level,
      number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);

  // Loop-invariant sizes. Offsets are computed in ptrdiff_t so that the
  // products cannot overflow int for large parameter sets.
  const int output_lwe_dimension = glwe_dimension * polynomial_size;
  const ptrdiff_t output_ct_len = (ptrdiff_t)output_lwe_dimension + 1;
  const ptrdiff_t input_ct_len = (ptrdiff_t)lwe_dimension + 1;
  const ptrdiff_t bsk_size = (ptrdiff_t)(glwe_dimension + 1) *
                             (glwe_dimension + 1) * pbs_level *
                             polynomial_size * input_ct_len;
  const size_t output_bytes =
      (size_t)output_ct_len * number_of_inputs * sizeof(uint64_t);
  // The bit before the message, used to round while decoding
  const uint64_t rounding_bit = delta >> 1;

  // Here execute the PBS
  for (int r = 0; r < repetitions; r++) {
    double *d_fourier_bsk = d_fourier_bsk_array + bsk_size * r;
    uint64_t *lwe_sk_out =
        lwe_sk_out_array + (ptrdiff_t)r * output_lwe_dimension;
    for (int s = 0; s < samples; s++) {
      // Index of the first input belonging to batch (r, s)
      const ptrdiff_t batch_start =
          ((ptrdiff_t)r * samples + s) * number_of_inputs;
      uint64_t *d_lwe_ct_in = d_lwe_ct_in_array + batch_start * input_ct_len;
      // Execute PBS
      cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
          stream, (void *)d_lwe_ct_out_array, (void *)d_lwe_output_indexes,
          (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
          (void *)d_lwe_ct_in, (void *)d_lwe_input_indexes,
          (void *)d_fourier_bsk, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, pbs_base_log, pbs_level, number_of_inputs, 1, 0,
          cuda_get_max_shared_memory(gpu_index));
      // Copy result back; the copy is asynchronous, so wait for the stream
      // before the host reads the buffer
      cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                               output_bytes, stream);
      cuda_synchronize_stream(stream);

      for (int j = 0; j < number_of_inputs; j++) {
        uint64_t *result = lwe_ct_out_array + j * output_ct_len;
        uint64_t plaintext = plaintexts[batch_start + j];
        uint64_t decrypted = 0;
        core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
                                output_lwe_dimension);
        // The raw decryption is expected to differ from the exact encoded
        // plaintext; only the rounded/decoded value has to match
        EXPECT_NE(decrypted, plaintext);

        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, plaintext / delta)
            << "Repetition: " << r << ", sample: " << s;
      }
    }
  }
  cleanup_cuda_bootstrap_low_latency_64(stream, &pbs_buffer);
}
|
||||
|
||||
// Defines for which parameters set the PBS will be tested.
|
||||
// It executes each src for all pairs on phis X qs (Cartesian product)
|
||||
::testing::internal::ParamGenerator<ClassicalBootstrapTestParams>
|
||||
pbs_params_u64 = ::testing::Values(
|
||||
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
|
||||
// message_modulus, carry_modulus, number_of_inputs, repetitions,
|
||||
// samples
|
||||
// BOOLEAN_DEFAULT_PARAMETERS
|
||||
(ClassicalBootstrapTestParams){
|
||||
777, 3, 512, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
|
||||
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
|
||||
2, 2, 2, 40},
|
||||
// BOOLEAN_TFHE_LIB_PARAMETERS
|
||||
(ClassicalBootstrapTestParams){
|
||||
830, 2, 1024,
|
||||
new_gaussian_from_std_dev(sqrt(1.994564705573226e-12)),
|
||||
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 2,
|
||||
2, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_1_CARRY_0
|
||||
(ClassicalBootstrapTestParams){
|
||||
678, 5, 256, new_gaussian_from_std_dev(sqrt(5.203010004723453e-10)),
|
||||
new_gaussian_from_std_dev(sqrt(1.3996292326131784e-19)), 15, 1, 2,
|
||||
1, 2, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_1_CARRY_1
|
||||
(ClassicalBootstrapTestParams){
|
||||
684, 3, 512, new_gaussian_from_std_dev(sqrt(4.177054989616946e-10)),
|
||||
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
|
||||
2, 2, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_2_CARRY_0
|
||||
(ClassicalBootstrapTestParams){
|
||||
656, 2, 512,
|
||||
new_gaussian_from_std_dev(sqrt(1.1641198952558192e-09)),
|
||||
new_gaussian_from_std_dev(sqrt(1.6434266310406663e-15)), 8, 2, 4, 1,
|
||||
2, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_1_CARRY_2
|
||||
// SHORTINT_PARAM_MESSAGE_2_CARRY_1
|
||||
// SHORTINT_PARAM_MESSAGE_3_CARRY_0
|
||||
(ClassicalBootstrapTestParams){
|
||||
742, 2, 1024,
|
||||
new_gaussian_from_std_dev(sqrt(4.998277131225527e-11)),
|
||||
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 4,
|
||||
2, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_1_CARRY_3
|
||||
// SHORTINT_PARAM_MESSAGE_2_CARRY_2
|
||||
// SHORTINT_PARAM_MESSAGE_3_CARRY_1
|
||||
// SHORTINT_PARAM_MESSAGE_4_CARRY_0
|
||||
(ClassicalBootstrapTestParams){
|
||||
745, 1, 2048,
|
||||
new_gaussian_from_std_dev(sqrt(4.478453795193731e-11)),
|
||||
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 8,
|
||||
2, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_5_CARRY_0
|
||||
// SHORTINT_PARAM_MESSAGE_3_CARRY_2
|
||||
(ClassicalBootstrapTestParams){
|
||||
807, 1, 4096,
|
||||
new_gaussian_from_std_dev(sqrt(4.629015039118823e-12)),
|
||||
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 32, 1,
|
||||
2, 1, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_6_CARRY_0
|
||||
(ClassicalBootstrapTestParams){
|
||||
915, 1, 8192,
|
||||
new_gaussian_from_std_dev(sqrt(8.883173851180252e-14)),
|
||||
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 64, 1,
|
||||
2, 1, 5},
|
||||
// SHORTINT_PARAM_MESSAGE_3_CARRY_3
|
||||
(ClassicalBootstrapTestParams){
|
||||
864, 1, 8192,
|
||||
new_gaussian_from_std_dev(sqrt(1.5843564961097632e-15)),
|
||||
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 8, 8,
|
||||
2, 1, 5},
|
||||
// SHORTINT_PARAM_MESSAGE_4_CARRY_3
|
||||
// SHORTINT_PARAM_MESSAGE_7_CARRY_0
|
||||
(ClassicalBootstrapTestParams){
|
||||
930, 1, 16384,
|
||||
new_gaussian_from_std_dev(sqrt(5.129877458078009e-14)),
|
||||
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 128,
|
||||
1, 2, 1, 5},
|
||||
|
||||
// BOOLEAN_DEFAULT_PARAMETERS
|
||||
(ClassicalBootstrapTestParams){
|
||||
777, 3, 512, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
|
||||
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
|
||||
2, 100, 2, 40},
|
||||
// BOOLEAN_TFHE_LIB_PARAMETERS
|
||||
(ClassicalBootstrapTestParams){
|
||||
830, 2, 1024,
|
||||
new_gaussian_from_std_dev(sqrt(1.994564705573226e-12)),
|
||||
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 2,
|
||||
100, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_1_CARRY_0
|
||||
(ClassicalBootstrapTestParams){
|
||||
678, 5, 256, new_gaussian_from_std_dev(sqrt(5.203010004723453e-10)),
|
||||
new_gaussian_from_std_dev(sqrt(1.3996292326131784e-19)), 15, 1, 2,
|
||||
1, 100, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_1_CARRY_1
|
||||
(ClassicalBootstrapTestParams){
|
||||
684, 3, 512, new_gaussian_from_std_dev(sqrt(4.177054989616946e-10)),
|
||||
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
|
||||
2, 100, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_2_CARRY_0
|
||||
(ClassicalBootstrapTestParams){
|
||||
656, 2, 512,
|
||||
new_gaussian_from_std_dev(sqrt(1.1641198952558192e-09)),
|
||||
new_gaussian_from_std_dev(sqrt(1.6434266310406663e-15)), 8, 2, 4, 1,
|
||||
100, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_1_CARRY_2
|
||||
// SHORTINT_PARAM_MESSAGE_2_CARRY_1
|
||||
// SHORTINT_PARAM_MESSAGE_3_CARRY_0
|
||||
(ClassicalBootstrapTestParams){
|
||||
742, 2, 1024,
|
||||
new_gaussian_from_std_dev(sqrt(4.998277131225527e-11)),
|
||||
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 4,
|
||||
100, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_1_CARRY_3
|
||||
// SHORTINT_PARAM_MESSAGE_2_CARRY_2
|
||||
// SHORTINT_PARAM_MESSAGE_3_CARRY_1
|
||||
// SHORTINT_PARAM_MESSAGE_4_CARRY_0
|
||||
(ClassicalBootstrapTestParams){
|
||||
745, 1, 2048,
|
||||
new_gaussian_from_std_dev(sqrt(4.478453795193731e-11)),
|
||||
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 8,
|
||||
100, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_5_CARRY_0
|
||||
// SHORTINT_PARAM_MESSAGE_3_CARRY_2
|
||||
(ClassicalBootstrapTestParams){
|
||||
807, 1, 4096,
|
||||
new_gaussian_from_std_dev(sqrt(4.629015039118823e-12)),
|
||||
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 32, 1,
|
||||
100, 1, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_6_CARRY_0
|
||||
(ClassicalBootstrapTestParams){
|
||||
915, 1, 8192,
|
||||
new_gaussian_from_std_dev(sqrt(8.883173851180252e-14)),
|
||||
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 64, 1,
|
||||
100, 1, 5},
|
||||
// SHORTINT_PARAM_MESSAGE_3_CARRY_3
|
||||
(ClassicalBootstrapTestParams){
|
||||
864, 1, 8192,
|
||||
new_gaussian_from_std_dev(sqrt(1.5843564961097632e-15)),
|
||||
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 8, 8,
|
||||
100, 1, 5},
|
||||
// SHORTINT_PARAM_MESSAGE_4_CARRY_3
|
||||
// SHORTINT_PARAM_MESSAGE_7_CARRY_0
|
||||
(ClassicalBootstrapTestParams){
|
||||
930, 1, 16384,
|
||||
new_gaussian_from_std_dev(sqrt(5.129877458078009e-14)),
|
||||
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 128,
|
||||
1, 100, 1, 5});
|
||||
std::string
|
||||
printParamName(::testing::TestParamInfo<ClassicalBootstrapTestParams> p) {
|
||||
ClassicalBootstrapTestParams params = p.param;
|
||||
|
||||
return "n_" + std::to_string(params.lwe_dimension) + "_k_" +
|
||||
std::to_string(params.glwe_dimension) + "_N_" +
|
||||
std::to_string(params.polynomial_size) + "_pbs_base_log_" +
|
||||
std::to_string(params.pbs_base_log) + "_pbs_level_" +
|
||||
std::to_string(params.pbs_level) + "_number_of_inputs_" +
|
||||
std::to_string(params.number_of_inputs);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(ClassicalBootstrapInstantiation,
|
||||
ClassicalBootstrapTestPrimitives_u64, pbs_params_u64,
|
||||
printParamName);
|
||||
@@ -1,127 +0,0 @@
|
||||
#include "utils.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <bootstrap.h>
|
||||
#include <cstdint>
|
||||
#include <device.h>
|
||||
#include <functional>
|
||||
#include <random>
|
||||
#include <setup_and_teardown.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// One parameter set for the FFT multiplication test. Instances are built with
// positional aggregate initialization, so the field order must not change.
struct FourierTransformTestParams {
  size_t polynomial_size; // number of coefficients per polynomial
  int samples;            // number of polynomial pairs multiplied per run
};
|
||||
|
||||
class FourierTransformTestPrimitives_u64
|
||||
: public ::testing::TestWithParam<FourierTransformTestParams> {
|
||||
protected:
|
||||
size_t polynomial_size;
|
||||
int samples;
|
||||
cuda_stream_t *stream;
|
||||
int gpu_index = 0;
|
||||
|
||||
double *poly1;
|
||||
double *poly2; // will be used as extracted result for cuda mult
|
||||
double *poly_exp_result;
|
||||
double2 *h_cpoly1;
|
||||
double2 *h_cpoly2; // will be used as a result poly
|
||||
double2 *d_cpoly1;
|
||||
double2 *d_cpoly2; // will be used as a result poly
|
||||
|
||||
public:
|
||||
void SetUp() {
|
||||
stream = cuda_create_stream(gpu_index);
|
||||
|
||||
// get src params
|
||||
polynomial_size = (int)GetParam().polynomial_size;
|
||||
samples = (int)GetParam().samples;
|
||||
|
||||
fft_setup(stream, &poly1, &poly2, &h_cpoly1, &h_cpoly2, &d_cpoly1,
|
||||
&d_cpoly2, polynomial_size, samples);
|
||||
|
||||
// allocate memory
|
||||
poly_exp_result =
|
||||
(double *)malloc(polynomial_size * 2 * samples * sizeof(double));
|
||||
memset(poly_exp_result, 0., polynomial_size * 2 * samples * sizeof(double));
|
||||
|
||||
// execute school book multiplication
|
||||
for (size_t p = 0; p < (size_t)samples; p++) {
|
||||
auto left = &poly1[p * polynomial_size];
|
||||
auto right = &poly2[p * polynomial_size];
|
||||
auto res = &poly_exp_result[p * polynomial_size * 2];
|
||||
|
||||
// multiplication
|
||||
for (std::size_t i = 0; i < polynomial_size; ++i) {
|
||||
for (std::size_t j = 0; j < polynomial_size; ++j) {
|
||||
res[i + j] += left[i] * right[j];
|
||||
}
|
||||
}
|
||||
|
||||
// make result negacyclic
|
||||
for (size_t i = 0; i < polynomial_size; i++) {
|
||||
res[i] = res[i] - res[i + polynomial_size];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TearDown() {
|
||||
fft_teardown(stream, poly1, poly2, h_cpoly1, h_cpoly2, d_cpoly1, d_cpoly2);
|
||||
free(poly_exp_result);
|
||||
}
|
||||
};
|
||||
|
||||
// Multiplies the prepared polynomial pairs with cuda_fourier_polynomial_mul
// and compares the result with the schoolbook products from the fixture.
TEST_P(FourierTransformTestPrimitives_u64, cuda_fft_mult) {
  const size_t half_size = polynomial_size / 2;
  const size_t sample_count = (size_t)samples;

  // d_cpoly2 is both the second operand and the output buffer
  cuda_fourier_polynomial_mul(d_cpoly1, d_cpoly2, d_cpoly2, stream,
                              polynomial_size, samples);

  // Bring the complex result back to the host and wait for the copy
  cuda_memcpy_async_to_cpu(h_cpoly2, d_cpoly2,
                           half_size * samples * sizeof(double2), stream);
  cuda_synchronize_stream(stream);

  // Unpack each complex polynomial into poly2: x components fill the first
  // half of the real polynomial, y components the second half
  for (size_t p = 0; p < sample_count; p++) {
    double *real_poly = &poly2[p * polynomial_size];
    const double2 *complex_poly = &h_cpoly2[p * half_size];
    for (size_t i = 0; i < half_size; i++) {
      real_poly[i] = complex_poly[i].x;
      real_poly[i + half_size] = complex_poly[i].y;
    }
  }

  // Expected values are stored with a stride of 2 * polynomial_size
  for (size_t p = 0; p < sample_count; p++) {
    const double *actual = &poly2[p * polynomial_size];
    const double *expected = &poly_exp_result[p * 2 * polynomial_size];
    for (size_t i = 0; i < polynomial_size; i++) {
      EXPECT_NEAR(actual[i], expected[i], 1e-9);
    }
  }
}
|
||||
|
||||
::testing::internal::ParamGenerator<FourierTransformTestParams> fft_params_u64 =
|
||||
::testing::Values((FourierTransformTestParams){256, 100},
|
||||
(FourierTransformTestParams){512, 100},
|
||||
(FourierTransformTestParams){1024, 100},
|
||||
(FourierTransformTestParams){2048, 100},
|
||||
(FourierTransformTestParams){4096, 100},
|
||||
(FourierTransformTestParams){8192, 50},
|
||||
(FourierTransformTestParams){16384, 10});
|
||||
|
||||
std::string
|
||||
printParamName(::testing::TestParamInfo<FourierTransformTestParams> p) {
|
||||
FourierTransformTestParams params = p.param;
|
||||
|
||||
return "N_" + std::to_string(params.polynomial_size) + "_samples_" +
|
||||
std::to_string(params.samples);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(fftInstantiation, FourierTransformTestPrimitives_u64,
|
||||
fft_params_u64, printParamName);
|
||||
@@ -1,162 +0,0 @@
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <gtest/gtest.h>
|
||||
#include <setup_and_teardown.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Number of repetitions the keyswitch test iterates over
constexpr unsigned REPETITIONS = 2;
// Number of samples processed per repetition
constexpr unsigned SAMPLES = 50;
|
||||
|
||||
typedef struct {
|
||||
int input_lwe_dimension;
|
||||
int output_lwe_dimension;
|
||||
DynamicDistribution noise_distribution;
|
||||
int ksk_base_log;
|
||||
int ksk_level;
|
||||
int message_modulus;
|
||||
int carry_modulus;
|
||||
int number_of_inputs;
|
||||
} KeyswitchTestParams;
|
||||
|
||||
class KeyswitchTestPrimitives_u64
|
||||
: public ::testing::TestWithParam<KeyswitchTestParams> {
|
||||
protected:
|
||||
int input_lwe_dimension;
|
||||
int output_lwe_dimension;
|
||||
DynamicDistribution noise_distribution;
|
||||
int ksk_base_log;
|
||||
int ksk_level;
|
||||
int message_modulus;
|
||||
int carry_modulus;
|
||||
int number_of_inputs;
|
||||
int payload_modulus;
|
||||
uint64_t delta;
|
||||
cuda_stream_t *stream;
|
||||
int gpu_index = 0;
|
||||
uint64_t *lwe_sk_in_array;
|
||||
uint64_t *lwe_sk_out_array;
|
||||
uint64_t *plaintexts;
|
||||
uint64_t *d_ksk_array;
|
||||
uint64_t *d_lwe_ct_out_array;
|
||||
uint64_t *d_lwe_ct_in_array;
|
||||
uint64_t *lwe_in_ct;
|
||||
uint64_t *lwe_out_ct;
|
||||
uint64_t *lwe_input_indexes;
|
||||
uint64_t *lwe_output_indexes;
|
||||
|
||||
public:
|
||||
// Test arithmetic functions
|
||||
void SetUp() {
|
||||
stream = cuda_create_stream(gpu_index);
|
||||
|
||||
// TestParams
|
||||
input_lwe_dimension = (int)GetParam().input_lwe_dimension;
|
||||
output_lwe_dimension = (int)GetParam().output_lwe_dimension;
|
||||
noise_distribution = (DynamicDistribution)GetParam().noise_distribution;
|
||||
ksk_base_log = (int)GetParam().ksk_base_log;
|
||||
ksk_level = (int)GetParam().ksk_level;
|
||||
message_modulus = (int)GetParam().message_modulus;
|
||||
carry_modulus = (int)GetParam().carry_modulus;
|
||||
number_of_inputs = (int)GetParam().number_of_inputs;
|
||||
|
||||
Seed seed;
|
||||
init_seed(&seed);
|
||||
|
||||
keyswitch_setup(stream, &seed, &lwe_sk_in_array, &lwe_sk_out_array,
|
||||
&d_ksk_array, &plaintexts, &d_lwe_ct_in_array,
|
||||
&lwe_input_indexes, &d_lwe_ct_out_array,
|
||||
&lwe_output_indexes, input_lwe_dimension,
|
||||
output_lwe_dimension, noise_distribution, ksk_base_log,
|
||||
ksk_level, message_modulus, carry_modulus, &payload_modulus,
|
||||
&delta, number_of_inputs, REPETITIONS, SAMPLES);
|
||||
}
|
||||
|
||||
void TearDown() {
|
||||
keyswitch_teardown(stream, lwe_sk_in_array, lwe_sk_out_array, d_ksk_array,
|
||||
plaintexts, d_lwe_ct_in_array, lwe_input_indexes,
|
||||
d_lwe_ct_out_array, lwe_output_indexes);
|
||||
}
|
||||
};
|
||||
|
||||
// Runs the keyswitch on every (repetition, sample) batch prepared by the
// fixture, copies the outputs to the host, and checks that each one decrypts
// and decodes to the expected message.
TEST_P(KeyswitchTestPrimitives_u64, keyswitch) {
  // Loop-invariant sizes. Offsets are computed in ptrdiff_t to avoid mixing
  // signed/unsigned int arithmetic and to rule out int overflow.
  const ptrdiff_t input_ct_len = (ptrdiff_t)input_lwe_dimension + 1;
  const ptrdiff_t output_ct_len = (ptrdiff_t)output_lwe_dimension + 1;
  const ptrdiff_t ksk_size =
      (ptrdiff_t)ksk_level * output_ct_len * input_lwe_dimension;
  const size_t output_bytes =
      (size_t)output_ct_len * number_of_inputs * sizeof(uint64_t);
  // The bit before the message, used to round while decoding
  const uint64_t rounding_bit = delta >> 1;

  // Host buffer for one batch of output ciphertexts. Named differently from
  // the fixture member lwe_out_ct so the member is not shadowed.
  uint64_t *host_lwe_out = (uint64_t *)malloc(output_bytes);
  ASSERT_TRUE(host_lwe_out != nullptr)
      << "Could not allocate the host output ciphertext buffer";

  for (unsigned r = 0; r < REPETITIONS; r++) {
    uint64_t *lwe_out_sk =
        lwe_sk_out_array + (ptrdiff_t)r * output_lwe_dimension;
    uint64_t *d_ksk = d_ksk_array + ksk_size * (ptrdiff_t)r;
    for (unsigned s = 0; s < SAMPLES; s++) {
      // Index of the first input belonging to batch (r, s)
      const ptrdiff_t batch_start =
          ((ptrdiff_t)r * SAMPLES + s) * number_of_inputs;
      uint64_t *d_lwe_ct_in = d_lwe_ct_in_array + batch_start * input_ct_len;
      // Execute keyswitch
      cuda_keyswitch_lwe_ciphertext_vector_64(
          stream, (void *)d_lwe_ct_out_array, (void *)lwe_output_indexes,
          (void *)d_lwe_ct_in, (void *)lwe_input_indexes, (void *)d_ksk,
          input_lwe_dimension, output_lwe_dimension, ksk_base_log, ksk_level,
          number_of_inputs);

      // Copy result back; the copy is asynchronous, so wait for the stream
      // before the host reads the buffer
      cuda_memcpy_async_to_cpu(host_lwe_out, d_lwe_ct_out_array, output_bytes,
                               stream);
      cuda_synchronize_stream(stream);

      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext = plaintexts[batch_start + i];
        uint64_t decrypted = 0;
        core_crypto_lwe_decrypt(&decrypted, host_lwe_out + i * output_ct_len,
                                lwe_out_sk, output_lwe_dimension);
        // The raw decryption is expected to differ from the exact encoded
        // plaintext; only the rounded/decoded value has to match
        EXPECT_NE(decrypted, plaintext);
        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, plaintext / delta)
            << "Repetition: " << r << ", sample: " << s << ", input: " << i;
      }
    }
  }
  free(host_lwe_out);
}
|
||||
|
||||
// Defines for which parameters set the PBS will be tested.
|
||||
// It executes each src for all pairs on phis X qs (Cartesian product)
|
||||
::testing::internal::ParamGenerator<KeyswitchTestParams> ksk_params_u64 =
|
||||
::testing::Values(
|
||||
// n, k*N, noise_distribution, ks_base_log, ks_level,
|
||||
// message_modulus, carry_modulus, number_of_inputs
|
||||
(KeyswitchTestParams){
|
||||
567, 1280, new_gaussian_from_std_dev(sqrt(2.9802322387695312e-18)),
|
||||
3, 3, 2, 1, 10},
|
||||
(KeyswitchTestParams){
|
||||
694, 1536, new_gaussian_from_std_dev(sqrt(2.9802322387695312e-18)),
|
||||
4, 3, 2, 1, 10},
|
||||
(KeyswitchTestParams){
|
||||
769, 2048, new_gaussian_from_std_dev(sqrt(2.9802322387695312e-18)),
|
||||
4, 3, 2, 1, 10},
|
||||
(KeyswitchTestParams){
|
||||
754, 2048, new_gaussian_from_std_dev(sqrt(2.9802322387695312e-18)),
|
||||
3, 5, 2, 1, 10},
|
||||
(KeyswitchTestParams){742, 2048,
|
||||
new_gaussian_from_std_dev(sqrt(4.9982771e-11)), 3,
|
||||
5, 4, 1, 10},
|
||||
(KeyswitchTestParams){
|
||||
847, 4096, new_gaussian_from_std_dev(sqrt(2.9802322387695312e-18)),
|
||||
4, 4, 2, 1, 10});
|
||||
|
||||
std::string printParamName(::testing::TestParamInfo<KeyswitchTestParams> p) {
|
||||
KeyswitchTestParams params = p.param;
|
||||
|
||||
return "na_" + std::to_string(params.input_lwe_dimension) + "_nb_" +
|
||||
std::to_string(params.output_lwe_dimension) + "_baselog_" +
|
||||
std::to_string(params.ksk_base_log) + "_ksk_level_" +
|
||||
std::to_string(params.ksk_level);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(KeyswitchInstantiation, KeyswitchTestPrimitives_u64,
|
||||
ksk_params_u64, printParamName);
|
||||
@@ -1,215 +0,0 @@
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <functional>
|
||||
#include <gtest/gtest.h>
|
||||
#include <setup_and_teardown.h>
|
||||
#include <utils.h>
|
||||
|
||||
typedef struct {
|
||||
int lwe_dimension;
|
||||
int glwe_dimension;
|
||||
int polynomial_size;
|
||||
DynamicDistribution lwe_noise_distribution;
|
||||
DynamicDistribution glwe_noise_distribution;
|
||||
int pbs_base_log;
|
||||
int pbs_level;
|
||||
int message_modulus;
|
||||
int carry_modulus;
|
||||
int number_of_inputs;
|
||||
int grouping_factor;
|
||||
int repetitions;
|
||||
int samples;
|
||||
} MultiBitBootstrapTestParams;
|
||||
|
||||
class MultiBitBootstrapTestPrimitives_u64
|
||||
: public ::testing::TestWithParam<MultiBitBootstrapTestParams> {
|
||||
protected:
|
||||
int lwe_dimension;
|
||||
int glwe_dimension;
|
||||
int polynomial_size;
|
||||
DynamicDistribution lwe_noise_distribution;
|
||||
DynamicDistribution glwe_noise_distribution;
|
||||
int pbs_base_log;
|
||||
int pbs_level;
|
||||
int message_modulus;
|
||||
int carry_modulus;
|
||||
int payload_modulus;
|
||||
int number_of_inputs;
|
||||
int grouping_factor;
|
||||
uint64_t delta;
|
||||
cuda_stream_t *stream;
|
||||
int gpu_index = 0;
|
||||
uint64_t *lwe_sk_in_array;
|
||||
uint64_t *lwe_sk_out_array;
|
||||
uint64_t *plaintexts;
|
||||
uint64_t *d_bsk_array;
|
||||
uint64_t *d_lut_pbs_identity;
|
||||
uint64_t *d_lut_pbs_indexes;
|
||||
uint64_t *d_lwe_ct_in_array;
|
||||
uint64_t *d_lwe_ct_out_array;
|
||||
uint64_t *lwe_ct_out_array;
|
||||
uint64_t *d_lwe_input_indexes;
|
||||
uint64_t *d_lwe_output_indexes;
|
||||
int8_t *pbs_buffer;
|
||||
|
||||
int repetitions;
|
||||
int samples;
|
||||
|
||||
public:
|
||||
void SetUp() {
|
||||
stream = cuda_create_stream(gpu_index);
|
||||
|
||||
// TestParams
|
||||
lwe_dimension = (int)GetParam().lwe_dimension;
|
||||
glwe_dimension = (int)GetParam().glwe_dimension;
|
||||
polynomial_size = (int)GetParam().polynomial_size;
|
||||
grouping_factor = (int)GetParam().grouping_factor;
|
||||
lwe_noise_distribution =
|
||||
(DynamicDistribution)GetParam().lwe_noise_distribution;
|
||||
glwe_noise_distribution =
|
||||
(DynamicDistribution)GetParam().glwe_noise_distribution;
|
||||
pbs_base_log = (int)GetParam().pbs_base_log;
|
||||
pbs_level = (int)GetParam().pbs_level;
|
||||
message_modulus = (int)GetParam().message_modulus;
|
||||
carry_modulus = (int)GetParam().carry_modulus;
|
||||
number_of_inputs = (int)GetParam().number_of_inputs;
|
||||
|
||||
Seed seed;
|
||||
init_seed(&seed);
|
||||
|
||||
repetitions = (int)GetParam().repetitions;
|
||||
samples = (int)GetParam().samples;
|
||||
|
||||
bootstrap_multibit_setup(
|
||||
stream, &seed, &lwe_sk_in_array, &lwe_sk_out_array, &d_bsk_array,
|
||||
&plaintexts, &d_lut_pbs_identity, &d_lut_pbs_indexes,
|
||||
&d_lwe_ct_in_array, &d_lwe_input_indexes, &d_lwe_ct_out_array,
|
||||
&d_lwe_output_indexes, &pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, grouping_factor, lwe_noise_distribution,
|
||||
glwe_noise_distribution, pbs_base_log, pbs_level, message_modulus,
|
||||
carry_modulus, &payload_modulus, &delta, number_of_inputs, repetitions,
|
||||
samples);
|
||||
|
||||
lwe_ct_out_array =
|
||||
(uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
|
||||
number_of_inputs * sizeof(uint64_t));
|
||||
}
|
||||
|
||||
void TearDown() {
|
||||
free(lwe_ct_out_array);
|
||||
|
||||
cleanup_cuda_multi_bit_pbs_64(stream, &pbs_buffer);
|
||||
bootstrap_multibit_teardown(
|
||||
stream, lwe_sk_in_array, lwe_sk_out_array, d_bsk_array, plaintexts,
|
||||
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
|
||||
d_lwe_input_indexes, d_lwe_ct_out_array, d_lwe_output_indexes);
|
||||
}
|
||||
};
|
||||
|
||||
// Runs the multi-bit PBS on every (repetition, sample) batch prepared by the
// fixture, copies the outputs to the host, and checks that each one decrypts
// and decodes to the expected message.
TEST_P(MultiBitBootstrapTestPrimitives_u64, multi_bit_pbs) {
  // Loop-invariant sizes. Offsets are computed in ptrdiff_t so that the
  // products cannot overflow int for large parameter sets.
  const int output_lwe_dimension = glwe_dimension * polynomial_size;
  const ptrdiff_t output_ct_len = (ptrdiff_t)output_lwe_dimension + 1;
  const ptrdiff_t input_ct_len = (ptrdiff_t)lwe_dimension + 1;
  const ptrdiff_t bsk_size = (ptrdiff_t)(lwe_dimension / grouping_factor) *
                             pbs_level * (glwe_dimension + 1) *
                             (glwe_dimension + 1) * polynomial_size *
                             (1 << grouping_factor);
  const size_t output_bytes =
      (size_t)output_ct_len * number_of_inputs * sizeof(uint64_t);
  // The bit before the message, used to round while decoding
  const uint64_t rounding_bit = delta >> 1;

  for (int r = 0; r < repetitions; r++) {
    uint64_t *d_bsk = d_bsk_array + bsk_size * r;
    uint64_t *lwe_sk_out =
        lwe_sk_out_array + (ptrdiff_t)r * output_lwe_dimension;
    for (int s = 0; s < samples; s++) {
      // Index of the first input belonging to batch (r, s)
      const ptrdiff_t batch_start =
          ((ptrdiff_t)r * samples + s) * number_of_inputs;
      uint64_t *d_lwe_ct_in = d_lwe_ct_in_array + batch_start * input_ct_len;
      // Execute PBS
      cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
          stream, (void *)d_lwe_ct_out_array, (void *)d_lwe_output_indexes,
          (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
          (void *)d_lwe_ct_in, (void *)d_lwe_input_indexes, (void *)d_bsk,
          pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
          grouping_factor, pbs_base_log, pbs_level, number_of_inputs, 1, 0,
          cuda_get_max_shared_memory(gpu_index));

      // Copy result to the host memory; the copy is asynchronous, so wait
      // for the stream before the host reads the buffer
      cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                               output_bytes, stream);
      cuda_synchronize_stream(stream);

      for (int j = 0; j < number_of_inputs; j++) {
        uint64_t *result = lwe_ct_out_array + j * output_ct_len;
        uint64_t plaintext = plaintexts[batch_start + j];
        uint64_t decrypted = 0;
        core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
                                output_lwe_dimension);

        // The raw decryption is expected to differ from the exact encoded
        // plaintext; only the rounded/decoded value has to match
        EXPECT_NE(decrypted, plaintext)
            << "Repetition: " << r << ", sample: " << s << ", input: " << j;

        // Compute the rounding bit
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, plaintext / delta)
            << "Repetition: " << r << ", sample: " << s << ", input: " << j;
      }
    }
  }
}
|
||||
|
||||
// Defines for which parameters set the PBS will be tested.
|
||||
// It executes each src for all pairs on phis X qs (Cartesian product)
|
||||
::testing::internal::ParamGenerator<MultiBitBootstrapTestParams>
|
||||
multipbs_params_u64 = ::testing::Values(
|
||||
// fast src
|
||||
(MultiBitBootstrapTestParams){
|
||||
16, 1, 256, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
|
||||
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 23, 1, 2,
|
||||
2, 1, 2, 1, 10},
|
||||
(MultiBitBootstrapTestParams){
|
||||
16, 1, 256, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
|
||||
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 23, 1, 2,
|
||||
2, 128, 2, 1, 10},
|
||||
// 4_bits_multi_bit_group_2
|
||||
(MultiBitBootstrapTestParams){
|
||||
818, 1, 2048, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
|
||||
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 22, 1, 2,
|
||||
2, 1, 2, 1, 10},
|
||||
(MultiBitBootstrapTestParams){
|
||||
818, 1, 2048, new_gaussian_from_std_dev(sqrt(1.3880686109937e-15)),
|
||||
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-24)), 22, 1, 2,
|
||||
2, 128, 2, 1, 10},
|
||||
// 4_bits_multi_bit_group_3
|
||||
(MultiBitBootstrapTestParams){
|
||||
888, 1, 2048,
|
||||
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
|
||||
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
|
||||
1, 3, 1, 10},
|
||||
(MultiBitBootstrapTestParams){
|
||||
888, 1, 2048,
|
||||
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
|
||||
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
|
||||
128, 3, 1, 10});
|
||||
std::string
|
||||
printParamName(::testing::TestParamInfo<MultiBitBootstrapTestParams> p) {
|
||||
MultiBitBootstrapTestParams params = p.param;
|
||||
|
||||
return "n_" + std::to_string(params.lwe_dimension) + "_k_" +
|
||||
std::to_string(params.glwe_dimension) + "_N_" +
|
||||
std::to_string(params.polynomial_size) + "_pbs_base_log_" +
|
||||
std::to_string(params.pbs_base_log) + "_pbs_level_" +
|
||||
std::to_string(params.pbs_level) + "_grouping_factor_" +
|
||||
std::to_string(params.grouping_factor) + "_number_of_inputs_" +
|
||||
std::to_string(params.number_of_inputs);
|
||||
}
|
||||
|
||||
// Instantiates the MultiBitBootstrapTestPrimitives_u64 suite under the prefix
// "MultiBitBootstrapInstantiation", once per entry of multipbs_params_u64;
// printParamName supplies the per-instance name suffix.
INSTANTIATE_TEST_CASE_P(MultiBitBootstrapInstantiation,
|
||||
MultiBitBootstrapTestPrimitives_u64,
|
||||
multipbs_params_u64, printParamName);
|
||||
@@ -1,249 +0,0 @@
|
||||
#include <algorithm>
|
||||
#include <bootstrap.h>
|
||||
#include <bootstrap_multibit.h>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <device.h>
|
||||
#include <functional>
|
||||
#include <random>
|
||||
#include <utils.h>
|
||||
|
||||
// Puts the seed into its initial state: both halves are set to zero.
void init_seed(Seed *seed) {
  Seed &target = *seed;
  target.hi = 0;
  target.lo = 0;
}
|
||||
|
||||
// Advances the seed deterministically by adding one to each half.
//
// A random variant was left disabled in the original code; it is kept here
// for reference:
//   std::random_device rd;
//   std::mt19937 gen(rd());
//   std::uniform_int_distribution<unsigned long long> dis(
//       std::numeric_limits<std::uint64_t>::min(),
//       std::numeric_limits<std::uint64_t>::max());
//   seed.lo += dis(gen);
//   seed.hi += dis(gen);
// The fixed increment is more convenient for testing.
void shuffle_seed(Seed *seed) {
  Seed &target = *seed;
  target.lo = target.lo + 1;
  target.hi = target.hi + 1;
}
|
||||
|
||||
// Creates one random plaintext per (repetition, sample, input) triple.
//
// Each plaintext is a uniformly drawn message in [0, payload_modulus)
// multiplied by `delta`. The payload_modulus is the message modulus times the
// carry modulus (so the total message modulus).
//
// Layout: the plaintext of (r, s, i) is stored at index
//   r * samples * number_of_inputs + s * number_of_inputs + i.
//
// Returns a malloc'ed array of repetitions * samples * number_of_inputs
// values that the caller must free(), or nullptr if the allocation failed.
// Precondition: payload_modulus != 0 (it is used as a modulo divisor).
uint64_t *generate_plaintexts(uint64_t payload_modulus, uint64_t delta,
                              int number_of_inputs, const unsigned repetitions,
                              const unsigned samples) {
  // Do the size computation in size_t: the original multiplied 32-bit
  // operands first and could wrap before the widening to size_t.
  const size_t inputs =
      number_of_inputs > 0 ? static_cast<size_t>(number_of_inputs) : 0;
  const size_t count =
      static_cast<size_t>(repetitions) * static_cast<size_t>(samples) * inputs;

  uint64_t *plaintext_array = (uint64_t *)malloc(count * sizeof(uint64_t));
  if (plaintext_array == nullptr) {
    // Allocation failed (or count == 0 and malloc returned NULL): there is
    // nothing to fill in.
    return nullptr;
  }

  std::random_device rd;
  std::mt19937 gen(rd());
  // Default-constructed distribution covers [0, max(uint64_t)].
  std::uniform_int_distribution<uint64_t> dis;

  // The (r, s, i) nesting enumerates the indices 0..count-1 in order, so a
  // single flat loop fills the array in exactly the same sequence.
  for (size_t idx = 0; idx < count; idx++) {
    plaintext_array[idx] = (dis(gen) % payload_modulus) * delta;
  }
  return plaintext_array;
}
|
||||
|
||||
// Builds a GLWE look-up table (accumulator) that evaluates `func` during a
// PBS. Despite the name, the table is the identity only when `func` is.
//
// Construction:
//   1. modulus_sup = message_modulus * carry_modulus boxes of
//      box_size = polynomial_size / modulus_sup coefficients each; box i is
//      filled with func(i) * delta, where delta = 2^63 / modulus_sup.
//   2. The first half box is negated and the whole polynomial is rotated
//      left by half a box.
//   3. The result is stored as the last polynomial of a
//      (glwe_dimension + 1) * polynomial_size array whose first
//      glwe_dimension polynomials are zero.
//
// Coefficients not covered by a full box (polynomial_size not a multiple of
// modulus_sup) are zero; the original read them from uninitialized memory.
//
// Returns a malloc-family array the caller must free(), or nullptr if an
// allocation failed. Precondition: message_modulus * carry_modulus != 0.
uint64_t *generate_identity_lut_pbs(int polynomial_size, int glwe_dimension,
                                    int message_modulus, int carry_modulus,
                                    std::function<uint64_t(uint64_t)> func) {
  // Modulus of the msg contained in the msg bits and operations buffer
  uint64_t modulus_sup = message_modulus * carry_modulus;

  // N / p = number of coefficients in each box
  uint64_t box_size = polynomial_size / modulus_sup;

  // Value of the shift we multiply our messages by
  uint64_t delta = ((uint64_t)1 << 63) / (uint64_t)(modulus_sup);

  // Zero-initialized so that any tail past the last full box is defined.
  uint64_t *plaintext_lut_pbs =
      (uint64_t *)calloc(polynomial_size, sizeof(uint64_t));
  if (plaintext_lut_pbs == nullptr) {
    return nullptr;
  }

  // Box i holds func(i) encoded in the most significant bits.
  for (uint64_t i = 0; i < modulus_sup; i++) {
    uint64_t index = i * box_size;
    for (uint64_t j = index; j < index + box_size; j++) {
      plaintext_lut_pbs[j] = func(i) * delta;
    }
  }

  uint64_t half_box_size = box_size / 2;

  // Negate the first half_box_size coefficients: after the rotation below
  // they wrap around to the end of the polynomial.
  for (uint64_t i = 0; i < half_box_size; i++) {
    plaintext_lut_pbs[i] = -plaintext_lut_pbs[i];
  }

  // Rotate left by half a box
  std::rotate(plaintext_lut_pbs, plaintext_lut_pbs + half_box_size,
              plaintext_lut_pbs + polynomial_size);

  // Create the GLWE lut_pbs: zeroed mask polynomials followed by the body.
  const size_t mask_len =
      static_cast<size_t>(polynomial_size) * static_cast<size_t>(glwe_dimension);
  uint64_t *lut_pbs = (uint64_t *)calloc(
      mask_len + static_cast<size_t>(polynomial_size), sizeof(uint64_t));
  if (lut_pbs == nullptr) {
    free(plaintext_lut_pbs);
    return nullptr;
  }
  std::copy(plaintext_lut_pbs, plaintext_lut_pbs + polynomial_size,
            lut_pbs + mask_len);

  free(plaintext_lut_pbs);
  return lut_pbs;
}
|
||||
|
||||
// Generate repetitions LWE secret keys
|
||||
void generate_lwe_secret_keys(uint64_t **lwe_sk_array, int lwe_dimension,
|
||||
Seed *seed, const unsigned repetitions) {
|
||||
*lwe_sk_array =
|
||||
(uint64_t *)malloc(lwe_dimension * repetitions * sizeof(uint64_t));
|
||||
int shift = 0;
|
||||
for (uint r = 0; r < repetitions; r++) {
|
||||
// Generate the lwe secret key for each repetition
|
||||
core_crypto_lwe_secret_key(*lwe_sk_array + (ptrdiff_t)(shift),
|
||||
lwe_dimension, seed->lo, seed->hi);
|
||||
shift += lwe_dimension;
|
||||
}
|
||||
}
|
||||
|
||||
// Generate repetitions GLWE secret keys
|
||||
void generate_glwe_secret_keys(uint64_t **glwe_sk_array, int glwe_dimension,
|
||||
int polynomial_size, Seed *seed,
|
||||
const unsigned repetitions) {
|
||||
int glwe_sk_array_size = glwe_dimension * polynomial_size * repetitions;
|
||||
*glwe_sk_array = (uint64_t *)malloc(glwe_sk_array_size * sizeof(uint64_t));
|
||||
int shift = 0;
|
||||
for (uint r = 0; r < repetitions; r++) {
|
||||
// Generate the lwe secret key for each repetition
|
||||
core_crypto_lwe_secret_key(*glwe_sk_array + (ptrdiff_t)(shift),
|
||||
glwe_dimension * polynomial_size, seed->lo,
|
||||
seed->hi);
|
||||
shift += glwe_dimension * polynomial_size;
|
||||
}
|
||||
}
|
||||
|
||||
// Generate repetitions LWE bootstrap keys
|
||||
void generate_lwe_bootstrap_keys(cuda_stream_t *stream,
|
||||
double **d_fourier_bsk_array,
|
||||
uint64_t *lwe_sk_in_array,
|
||||
uint64_t *lwe_sk_out_array, int lwe_dimension,
|
||||
int glwe_dimension, int polynomial_size,
|
||||
int pbs_level, int pbs_base_log, Seed *seed,
|
||||
DynamicDistribution noise_distribution,
|
||||
const unsigned repetitions) {
|
||||
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
|
||||
polynomial_size * (lwe_dimension + 1);
|
||||
int bsk_array_size = bsk_size * repetitions;
|
||||
|
||||
uint64_t *bsk_array = (uint64_t *)malloc(bsk_array_size * sizeof(uint64_t));
|
||||
*d_fourier_bsk_array =
|
||||
(double *)cuda_malloc_async(bsk_array_size * sizeof(double), stream);
|
||||
int shift_in = 0;
|
||||
int shift_out = 0;
|
||||
int shift_bsk = 0;
|
||||
|
||||
for (uint r = 0; r < repetitions; r++) {
|
||||
// Generate the bootstrap key for each repetition
|
||||
core_crypto_par_generate_lwe_bootstrapping_key(
|
||||
bsk_array + (ptrdiff_t)(shift_bsk), pbs_base_log, pbs_level,
|
||||
lwe_sk_in_array + (ptrdiff_t)(shift_in), lwe_dimension,
|
||||
lwe_sk_out_array + (ptrdiff_t)(shift_out), glwe_dimension,
|
||||
polynomial_size, noise_distribution, seed->lo, seed->hi);
|
||||
double *d_fourier_bsk = *d_fourier_bsk_array + (ptrdiff_t)(shift_bsk);
|
||||
uint64_t *bsk = bsk_array + (ptrdiff_t)(shift_bsk);
|
||||
cuda_synchronize_stream(stream);
|
||||
cuda_convert_lwe_bootstrap_key_64((void *)(d_fourier_bsk), (void *)(bsk),
|
||||
stream, lwe_dimension, glwe_dimension,
|
||||
pbs_level, polynomial_size);
|
||||
shift_in += lwe_dimension;
|
||||
shift_out += glwe_dimension * polynomial_size;
|
||||
shift_bsk += bsk_size;
|
||||
}
|
||||
cuda_synchronize_stream(stream);
|
||||
free(bsk_array);
|
||||
}
|
||||
|
||||
// Generates `repetitions` multi-bit bootstrapping keys on the host, converts
// each one into *d_bsk_array (allocated here with cuda_malloc_async), and
// frees the host staging buffer before returning.
//
// Repetition r uses the r-th input key (lwe_dimension coefficients) and the
// r-th output key (glwe_dimension * polynomial_size coefficients), and writes
// the r-th slot of bsk_size elements in both the host and device arrays.
//
// Fix: the three offsets used to be declared inside the loop, so they were
// reset to 0 on every iteration; every repetition then regenerated the key
// for the first secret-key pair into slot 0 and the remaining slots of
// *d_bsk_array were never written. They are now carried across iterations,
// matching the other generate_* helpers in this file.
//
// NOTE(review): `seed` is not used -- the generator is called with the
// constant seed (0, 0), unlike the sibling helpers which pass
// seed->lo / seed->hi. Left unchanged; confirm whether this is intended.
void generate_lwe_multi_bit_pbs_keys(
    cuda_stream_t *stream, uint64_t **d_bsk_array, uint64_t *lwe_sk_in_array,
    uint64_t *lwe_sk_out_array, int lwe_dimension, int glwe_dimension,
    int polynomial_size, int grouping_factor, int pbs_level, int pbs_base_log,
    Seed *seed, DynamicDistribution noise_distribution,
    const unsigned repetitions) {

  // Number of elements of a single multi-bit bootstrapping key.
  int bsk_size = lwe_dimension * pbs_level * (glwe_dimension + 1) *
                 (glwe_dimension + 1) * polynomial_size *
                 (1 << grouping_factor) / grouping_factor;
  int bsk_array_size = bsk_size * repetitions;
  uint64_t *bsk_array = (uint64_t *)malloc(bsk_array_size * sizeof(uint64_t));

  *d_bsk_array =
      (uint64_t *)cuda_malloc_async(bsk_array_size * sizeof(uint64_t), stream);

  // Offsets of the current repetition; they must persist across iterations.
  int shift_in = 0;
  int shift_out = 0;
  int shift_bsk = 0;

  for (unsigned r = 0; r < repetitions; r++) {
    // Generate the multi-bit bootstrap key for this repetition on the host
    core_crypto_par_generate_lwe_multi_bit_bootstrapping_key(
        lwe_sk_in_array + (ptrdiff_t)(shift_in), lwe_dimension,
        lwe_sk_out_array + (ptrdiff_t)(shift_out), glwe_dimension,
        polynomial_size, bsk_array + (ptrdiff_t)(shift_bsk), pbs_base_log,
        pbs_level, grouping_factor, noise_distribution, 0, 0);

    uint64_t *d_bsk = *d_bsk_array + (ptrdiff_t)(shift_bsk);
    uint64_t *bsk = bsk_array + (ptrdiff_t)(shift_bsk);
    cuda_convert_lwe_multi_bit_bootstrap_key_64(
        d_bsk, bsk, stream, lwe_dimension, glwe_dimension, pbs_level,
        polynomial_size, grouping_factor);

    shift_in += lwe_dimension;
    shift_out += glwe_dimension * polynomial_size;
    shift_bsk += bsk_size;
  }

  // Wait for the stream before releasing the host buffer handed to the
  // conversion calls.
  cuda_synchronize_stream(stream);
  free(bsk_array);
}
|
||||
|
||||
// Generate repetitions keyswitch keys
|
||||
void generate_lwe_keyswitch_keys(
|
||||
cuda_stream_t *stream, uint64_t **d_ksk_array, uint64_t *lwe_sk_in_array,
|
||||
uint64_t *lwe_sk_out_array, int input_lwe_dimension,
|
||||
int output_lwe_dimension, int ksk_level, int ksk_base_log, Seed *seed,
|
||||
DynamicDistribution noise_distribution, const unsigned repetitions) {
|
||||
|
||||
int ksk_size = ksk_level * (output_lwe_dimension + 1) * input_lwe_dimension;
|
||||
int ksk_array_size = ksk_size * repetitions;
|
||||
|
||||
uint64_t *ksk_array = (uint64_t *)malloc(ksk_array_size * sizeof(uint64_t));
|
||||
*d_ksk_array =
|
||||
(uint64_t *)cuda_malloc_async(ksk_array_size * sizeof(uint64_t), stream);
|
||||
int shift_in = 0;
|
||||
int shift_out = 0;
|
||||
int shift_ksk = 0;
|
||||
|
||||
for (uint r = 0; r < repetitions; r++) {
|
||||
// Generate the keyswitch key for each repetition
|
||||
core_crypto_par_generate_lwe_keyswitch_key(
|
||||
ksk_array + (ptrdiff_t)(shift_ksk), ksk_base_log, ksk_level,
|
||||
lwe_sk_in_array + (ptrdiff_t)(shift_in), input_lwe_dimension,
|
||||
lwe_sk_out_array + (ptrdiff_t)(shift_out), output_lwe_dimension,
|
||||
noise_distribution, seed->lo, seed->hi);
|
||||
uint64_t *d_ksk = *d_ksk_array + (ptrdiff_t)(shift_ksk);
|
||||
uint64_t *ksk = ksk_array + (ptrdiff_t)(shift_ksk);
|
||||
cuda_memcpy_async_to_gpu(d_ksk, ksk, ksk_size * sizeof(uint64_t), stream);
|
||||
|
||||
shift_in += input_lwe_dimension;
|
||||
shift_out += output_lwe_dimension;
|
||||
shift_ksk += ksk_size;
|
||||
}
|
||||
cuda_synchronize_stream(stream);
|
||||
free(ksk_array);
|
||||
}
|
||||
@@ -6,8 +6,8 @@ extern "C" {
|
||||
/// Create a new Cuda stream on GPU `gpu_index`
|
||||
pub fn cuda_create_stream(gpu_index: u32) -> *mut c_void;
|
||||
|
||||
/// Destroy the Cuda stream `v_stream`
|
||||
pub fn cuda_destroy_stream(v_stream: *mut c_void);
|
||||
/// Destroy the Cuda stream `v_stream` on GPU `gpu_index`
|
||||
pub fn cuda_destroy_stream(v_stream: *mut c_void) -> i32;
|
||||
|
||||
/// Allocate `size` memory on GPU `gpu_index` asynchronously
|
||||
pub fn cuda_malloc_async(size: u64, v_stream: *const c_void) -> *mut c_void;
|
||||
@@ -19,7 +19,7 @@ extern "C" {
|
||||
src: *const c_void,
|
||||
size: u64,
|
||||
v_stream: *const c_void,
|
||||
);
|
||||
) -> i32;
|
||||
|
||||
/// Copy `size` memory asynchronously from `src` on CPU to `dest` on GPU `gpu_index` using
|
||||
/// the Cuda stream `v_stream`.
|
||||
@@ -28,7 +28,7 @@ extern "C" {
|
||||
src: *const c_void,
|
||||
size: u64,
|
||||
v_stream: *const c_void,
|
||||
);
|
||||
) -> i32;
|
||||
|
||||
/// Copy `size` memory asynchronously from `src` to `dest` on the same GPU `gpu_index` using
|
||||
/// the Cuda stream `v_stream`.
|
||||
@@ -37,26 +37,31 @@ extern "C" {
|
||||
src: *const c_void,
|
||||
size: u64,
|
||||
v_stream: *const c_void,
|
||||
);
|
||||
) -> i32;
|
||||
|
||||
/// Copy `size` memory asynchronously from `src` on CPU to `dest` on GPU `gpu_index` using
|
||||
/// the Cuda stream `v_stream`.
|
||||
pub fn cuda_memset_async(dest: *mut c_void, value: u64, size: u64, v_stream: *const c_void);
|
||||
pub fn cuda_memset_async(
|
||||
dest: *mut c_void,
|
||||
value: u64,
|
||||
size: u64,
|
||||
v_stream: *const c_void,
|
||||
) -> i32;
|
||||
|
||||
/// Get the total number of Nvidia GPUs detected on the platform
|
||||
pub fn cuda_get_number_of_gpus() -> i32;
|
||||
|
||||
/// Synchronize all streams on GPU `gpu_index`
|
||||
pub fn cuda_synchronize_device(gpu_index: u32);
|
||||
pub fn cuda_synchronize_device(gpu_index: u32) -> i32;
|
||||
|
||||
/// Synchronize Cuda stream
|
||||
pub fn cuda_synchronize_stream(v_stream: *const c_void);
|
||||
pub fn cuda_synchronize_stream(v_stream: *const c_void) -> i32;
|
||||
|
||||
/// Free memory for pointer `ptr` on GPU `gpu_index` asynchronously, using stream `v_stream`
|
||||
pub fn cuda_drop_async(ptr: *mut c_void, v_stream: *const c_void);
|
||||
pub fn cuda_drop_async(ptr: *mut c_void, v_stream: *const c_void) -> i32;
|
||||
|
||||
/// Free memory for pointer `ptr` on GPU `gpu_index` synchronously
|
||||
pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32);
|
||||
pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32) -> i32;
|
||||
|
||||
/// Get the maximum amount of shared memory on GPU `gpu_index`
|
||||
pub fn cuda_get_max_shared_memory(gpu_index: u32) -> i32;
|
||||
@@ -215,7 +220,7 @@ extern "C" {
|
||||
|
||||
/// This cleanup function frees the data for the low latency PBS on GPU
|
||||
/// contained in pbs_buffer for 32 or 64-bit inputs.
|
||||
pub fn cleanup_cuda_bootstrap_low_latency_64(v_stream: *const c_void, pbs_buffer: *mut *mut i8);
|
||||
pub fn cleanup_cuda_bootstrap_low_latency(v_stream: *const c_void, pbs_buffer: *mut *mut i8);
|
||||
|
||||
/// This scratch function allocates the necessary amount of data on the GPU for
|
||||
/// the multi-bit PBS on 64-bit inputs into `pbs_buffer`.
|
||||
@@ -297,7 +302,7 @@ extern "C" {
|
||||
|
||||
/// This cleanup function frees the data for the multi-bit PBS on GPU
|
||||
/// contained in pbs_buffer for 64-bit inputs.
|
||||
pub fn cleanup_cuda_multi_bit_pbs_64(v_stream: *const c_void, pbs_buffer: *mut *mut i8);
|
||||
pub fn cleanup_cuda_multi_bit_pbs(v_stream: *const c_void, pbs_buffer: *mut *mut i8);
|
||||
|
||||
/// Perform keyswitch on a batch of 64 bits input LWE ciphertexts.
|
||||
///
|
||||
|
||||
@@ -4,7 +4,6 @@ benchmark_parser
|
||||
|
||||
Parse criterion benchmark or keys size results.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import pathlib
|
||||
@@ -12,97 +11,45 @@ import json
|
||||
import sys
|
||||
|
||||
|
||||
ONE_HOUR_IN_NANOSECONDS = 3600e9
|
||||
ONE_HOUR_IN_NANOSECONDS = 3600E9
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"results",
|
||||
help="Location of criterion benchmark results directory."
|
||||
"If the --key-size option is used, then the value would have to point to"
|
||||
"a CSV file.",
|
||||
)
|
||||
parser.add_argument("output_file", help="File storing parsed results")
|
||||
parser.add_argument(
|
||||
"-d",
|
||||
"--database",
|
||||
dest="database",
|
||||
help="Name of the database used to store results",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-w",
|
||||
"--hardware",
|
||||
dest="hardware",
|
||||
help="Hardware reference used to perform benchmark",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-V", "--project-version", dest="project_version", help="Commit hash reference"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-b",
|
||||
"--branch",
|
||||
dest="branch",
|
||||
help="Git branch name on which benchmark was performed",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--commit-date",
|
||||
dest="commit_date",
|
||||
help="Timestamp of commit hash used in project_version",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bench-date", dest="bench_date", help="Timestamp when benchmark was run"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--name-suffix",
|
||||
dest="name_suffix",
|
||||
default="",
|
||||
help="Suffix to append to each of the result test names",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--append-results",
|
||||
dest="append_results",
|
||||
action="store_true",
|
||||
help="Append parsed results to an existing file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--walk-subdirs",
|
||||
dest="walk_subdirs",
|
||||
action="store_true",
|
||||
help="Check for results in subdirectories",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--key-sizes",
|
||||
dest="key_sizes",
|
||||
action="store_true",
|
||||
help="Parse only the results regarding keys size measurements",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--key-gen",
|
||||
dest="key_gen",
|
||||
action="store_true",
|
||||
help="Parse only the results regarding keys generation time measurements",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--throughput",
|
||||
dest="throughput",
|
||||
action="store_true",
|
||||
help="Compute and append number of operations per second and"
|
||||
"operations per dollar",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--backend",
|
||||
dest="backend",
|
||||
default="cpu",
|
||||
help="Backend on which benchmarks have run",
|
||||
)
|
||||
parser.add_argument('results',
|
||||
help='Location of criterion benchmark results directory.'
|
||||
'If the --key-size option is used, then the value would have to point to'
|
||||
'a CSV file.')
|
||||
parser.add_argument('output_file', help='File storing parsed results')
|
||||
parser.add_argument('-d', '--database', dest='database',
|
||||
help='Name of the database used to store results')
|
||||
parser.add_argument('-w', '--hardware', dest='hardware',
|
||||
help='Hardware reference used to perform benchmark')
|
||||
parser.add_argument('-V', '--project-version', dest='project_version',
|
||||
help='Commit hash reference')
|
||||
parser.add_argument('-b', '--branch', dest='branch',
|
||||
help='Git branch name on which benchmark was performed')
|
||||
parser.add_argument('--commit-date', dest='commit_date',
|
||||
help='Timestamp of commit hash used in project_version')
|
||||
parser.add_argument('--bench-date', dest='bench_date',
|
||||
help='Timestamp when benchmark was run')
|
||||
parser.add_argument('--name-suffix', dest='name_suffix', default='',
|
||||
help='Suffix to append to each of the result test names')
|
||||
parser.add_argument('--append-results', dest='append_results', action='store_true',
|
||||
help='Append parsed results to an existing file')
|
||||
parser.add_argument('--walk-subdirs', dest='walk_subdirs', action='store_true',
|
||||
help='Check for results in subdirectories')
|
||||
parser.add_argument('--key-sizes', dest='key_sizes', action='store_true',
|
||||
help='Parse only the results regarding keys size measurements')
|
||||
parser.add_argument('--key-gen', dest='key_gen', action='store_true',
|
||||
help='Parse only the results regarding keys generation time measurements')
|
||||
parser.add_argument('--throughput', dest='throughput', action='store_true',
|
||||
help='Compute and append number of operations per second and'
|
||||
'operations per dollar')
|
||||
parser.add_argument('--backend', dest='backend', default='cpu',
|
||||
help='Backend on which benchmarks have run')
|
||||
|
||||
|
||||
def recursive_parse(
|
||||
directory,
|
||||
walk_subdirs=False,
|
||||
name_suffix="",
|
||||
compute_throughput=False,
|
||||
hardware_hourly_cost=None,
|
||||
):
|
||||
def recursive_parse(directory, walk_subdirs=False, name_suffix="", compute_throughput=False,
|
||||
hardware_hourly_cost=None):
|
||||
"""
|
||||
Parse all the benchmark results in a directory. It will attempt to parse all the files having a
|
||||
.json extension at the top-level of this directory.
|
||||
@@ -137,9 +84,7 @@ def recursive_parse(
|
||||
|
||||
full_name, test_name = parse_benchmark_file(subdir)
|
||||
if test_name is None:
|
||||
parsing_failures.append(
|
||||
(full_name, "'function_id' field is null in report")
|
||||
)
|
||||
parsing_failures.append((full_name, "'function_id' field is null in report"))
|
||||
continue
|
||||
|
||||
try:
|
||||
@@ -149,9 +94,7 @@ def recursive_parse(
|
||||
continue
|
||||
|
||||
for stat_name, value in parse_estimate_file(subdir).items():
|
||||
test_name_parts = list(
|
||||
filter(None, [test_name, stat_name, name_suffix])
|
||||
)
|
||||
test_name_parts = list(filter(None, [test_name, stat_name, name_suffix]))
|
||||
|
||||
result_values.append(
|
||||
_create_point(
|
||||
@@ -161,26 +104,19 @@ def recursive_parse(
|
||||
"latency",
|
||||
operator,
|
||||
params,
|
||||
display_name=display_name,
|
||||
display_name=display_name
|
||||
)
|
||||
)
|
||||
|
||||
lowercase_test_name = test_name.lower()
|
||||
# This is a special case where PBS are blasted as vector LWE ciphertext with
|
||||
# variable length to saturate the machine. To get the actual throughput we need to
|
||||
# multiply by the length of the vector.
|
||||
if (
|
||||
"pbs_throughput" in lowercase_test_name
|
||||
and lowercase_test_name.endswith("chunk")
|
||||
):
|
||||
if "PBS_throughput" in test_name and "chunk" in test_name:
|
||||
try:
|
||||
multiplier = int(
|
||||
lowercase_test_name.strip("chunk").split("::")[-1]
|
||||
)
|
||||
multiplier = int(test_name.split("chunk")[0].split("_")[-1])
|
||||
except ValueError:
|
||||
parsing_failures.append(
|
||||
(full_name, "failed to extract throughput multiplier")
|
||||
)
|
||||
parsing_failures.append((full_name,
|
||||
"failed to extract throughput multiplier"))
|
||||
continue
|
||||
else:
|
||||
multiplier = 1
|
||||
@@ -196,7 +132,7 @@ def recursive_parse(
|
||||
"throughput",
|
||||
operator,
|
||||
params,
|
||||
display_name="_".join([display_name, test_suffix]),
|
||||
display_name="_".join([display_name, test_suffix])
|
||||
)
|
||||
)
|
||||
test_name_parts.pop()
|
||||
@@ -206,23 +142,20 @@ def recursive_parse(
|
||||
test_name_parts.append(test_suffix)
|
||||
result_values.append(
|
||||
_create_point(
|
||||
multiplier
|
||||
* compute_ops_per_dollar(value, hardware_hourly_cost),
|
||||
multiplier * compute_ops_per_dollar(value, hardware_hourly_cost),
|
||||
"_".join(test_name_parts),
|
||||
bench_class,
|
||||
"throughput",
|
||||
operator,
|
||||
params,
|
||||
display_name="_".join([display_name, test_suffix]),
|
||||
display_name="_".join([display_name, test_suffix])
|
||||
)
|
||||
)
|
||||
|
||||
return result_values, parsing_failures
|
||||
|
||||
|
||||
def _create_point(
|
||||
value, test_name, bench_class, bench_type, operator, params, display_name=None
|
||||
):
|
||||
def _create_point(value, test_name, bench_class, bench_type, operator, params, display_name=None):
|
||||
return {
|
||||
"value": value,
|
||||
"test": test_name,
|
||||
@@ -230,8 +163,7 @@ def _create_point(
|
||||
"class": bench_class,
|
||||
"type": bench_type,
|
||||
"operator": operator,
|
||||
"params": params,
|
||||
}
|
||||
"params": params}
|
||||
|
||||
|
||||
def parse_benchmark_file(directory):
|
||||
@@ -274,24 +206,21 @@ def _parse_key_results(result_file, bench_type):
|
||||
|
||||
with result_file.open() as csv_file:
|
||||
reader = csv.reader(csv_file)
|
||||
for test_name, value in reader:
|
||||
for (test_name, value) in reader:
|
||||
try:
|
||||
params, display_name, operator = get_parameters(test_name)
|
||||
except Exception as err:
|
||||
parsing_failures.append((test_name, f"failed to get parameters: {err}"))
|
||||
continue
|
||||
|
||||
result_values.append(
|
||||
{
|
||||
"value": int(value),
|
||||
"test": test_name,
|
||||
"name": display_name,
|
||||
"class": "keygen",
|
||||
"type": bench_type,
|
||||
"operator": operator,
|
||||
"params": params,
|
||||
}
|
||||
)
|
||||
result_values.append({
|
||||
"value": int(value),
|
||||
"test": test_name,
|
||||
"name": display_name,
|
||||
"class": "keygen",
|
||||
"type": bench_type,
|
||||
"operator": operator,
|
||||
"params": params})
|
||||
|
||||
return result_values, parsing_failures
|
||||
|
||||
@@ -359,7 +288,7 @@ def compute_ops_per_second(data_point):
|
||||
|
||||
:return: number of operations per second
|
||||
"""
|
||||
return 1e9 / data_point
|
||||
return 1E9 / data_point
|
||||
|
||||
|
||||
def _parse_file_to_json(directory, filename):
|
||||
@@ -408,16 +337,9 @@ def check_mandatory_args(input_args):
|
||||
|
||||
missing_args = []
|
||||
for arg_name in vars(input_args):
|
||||
if arg_name in [
|
||||
"results_dir",
|
||||
"output_file",
|
||||
"name_suffix",
|
||||
"append_results",
|
||||
"walk_subdirs",
|
||||
"key_sizes",
|
||||
"key_gen",
|
||||
"throughput",
|
||||
]:
|
||||
if arg_name in ["results_dir", "output_file", "name_suffix",
|
||||
"append_results", "walk_subdirs", "key_sizes",
|
||||
"key_gen", "throughput"]:
|
||||
continue
|
||||
if not getattr(input_args, arg_name):
|
||||
missing_args.append(arg_name)
|
||||
@@ -432,7 +354,7 @@ if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
check_mandatory_args(args)
|
||||
|
||||
# failures = []
|
||||
#failures = []
|
||||
raw_results = pathlib.Path(args.results)
|
||||
if args.key_sizes or args.key_gen:
|
||||
if args.key_sizes:
|
||||
@@ -448,8 +370,7 @@ if __name__ == "__main__":
|
||||
if args.throughput:
|
||||
print("Throughput computation enabled")
|
||||
ec2_costs = json.loads(
|
||||
pathlib.Path("ci/ec2_products_cost.json").read_text(encoding="utf-8")
|
||||
)
|
||||
pathlib.Path("ci/ec2_products_cost.json").read_text(encoding="utf-8"))
|
||||
try:
|
||||
hardware_cost = abs(ec2_costs[args.hardware])
|
||||
print(f"Hardware hourly cost: {hardware_cost} $/h")
|
||||
@@ -457,13 +378,8 @@ if __name__ == "__main__":
|
||||
print(f"Cannot find hardware hourly cost for '{args.hardware}'")
|
||||
sys.exit(1)
|
||||
|
||||
results, failures = recursive_parse(
|
||||
raw_results,
|
||||
args.walk_subdirs,
|
||||
args.name_suffix,
|
||||
args.throughput,
|
||||
hardware_cost,
|
||||
)
|
||||
results, failures = recursive_parse(raw_results, args.walk_subdirs, args.name_suffix,
|
||||
args.throughput, hardware_cost)
|
||||
|
||||
print("Parsing results done")
|
||||
|
||||
|
||||
@@ -3,6 +3,5 @@
|
||||
"hpc7a.96xlarge": 7.7252,
|
||||
"p3.2xlarge": 3.06,
|
||||
"p4d.24xlarge": 32.7726,
|
||||
"p5.48xlarge": 98.32,
|
||||
"rtx4090": 0.04
|
||||
"p5.48xlarge": 98.32
|
||||
}
|
||||
|
||||
@@ -33,18 +33,6 @@ def check_security(filename):
|
||||
|
||||
print(f"\t{param.tag}...\t", end= "")
|
||||
|
||||
is_n_size_too_low = param.n <= 450
|
||||
is_noise_level_too_low = param.Xe.stddev < 4.0
|
||||
if is_n_size_too_low :
|
||||
reason = f"n size is too low {param.n} minimum is 450"
|
||||
elif is_noise_level_too_low:
|
||||
reason = f"noise level is too low {round(param.Xe.stddev,3)} minimum is 4.0"
|
||||
|
||||
if is_n_size_too_low or is_noise_level_too_low:
|
||||
print(f"FAIL\t{reason}")
|
||||
to_update.append((param, reason))
|
||||
continue
|
||||
|
||||
try:
|
||||
# The lattice estimator is not able to manage such large dimension.
|
||||
# If we have the security for smaller `n` then we have security for larger ones.
|
||||
@@ -82,7 +70,7 @@ if __name__ == "__main__":
|
||||
print("Some parameters need update")
|
||||
print("----------------------------")
|
||||
for param, reason in params_to_update:
|
||||
print(f"[{param.tag}] reason: {reason} (param: {param})")
|
||||
print(f"[{param.tag}] reason: {reason} (param)")
|
||||
sys.exit(int(1)) # Explicit conversion is needed to make this call work
|
||||
else:
|
||||
print("All parameters passed the security check")
|
||||
|
||||
25
ci/slab.toml
25
ci/slab.toml
@@ -20,7 +20,7 @@ instance_type = "hpc7a.96xlarge"
|
||||
|
||||
[profile.gpu-test]
|
||||
region = "us-east-1"
|
||||
image_id = "ami-0c0bf195ca4c175b6"
|
||||
image_id = "ami-05b4b37bcbb24dc48"
|
||||
instance_type = "p3.2xlarge"
|
||||
# One spawn attempt every 30 seconds for 1 hour
|
||||
spawn_retry_attempts = 120
|
||||
@@ -28,7 +28,7 @@ spawn_retry_duration = 60
|
||||
|
||||
[profile.gpu-bench]
|
||||
region = "us-east-1"
|
||||
image_id = "ami-0c0bf195ca4c175b6"
|
||||
image_id = "ami-05b4b37bcbb24dc48"
|
||||
instance_type = "p4d.24xlarge"
|
||||
# One spawn attempt every 30 seconds for 6 hours
|
||||
spawn_retry_attempts = 720
|
||||
@@ -37,7 +37,7 @@ max_spot_hourly_price = "100.0"
|
||||
|
||||
[profile.gpu-bench-big]
|
||||
region = "us-east-1"
|
||||
image_id = "ami-0c0bf195ca4c175b6"
|
||||
image_id = "ami-05b4b37bcbb24dc48"
|
||||
instance_type = "p5.48xlarge"
|
||||
spawn_retry_attempts = 720
|
||||
spawn_retry_duration = 360
|
||||
@@ -63,6 +63,11 @@ workflow = "aws_tfhe_wasm_tests.yml"
|
||||
profile = "cpu-small"
|
||||
check_run_name = "CPU AWS WASM Tests"
|
||||
|
||||
[command.cpu_fast_test]
|
||||
workflow = "aws_tfhe_fast_tests.yml"
|
||||
profile = "cpu-big"
|
||||
check_run_name = "CPU AWS Fast Tests"
|
||||
|
||||
[command.gpu_test]
|
||||
workflow = "aws_tfhe_gpu_tests.yml"
|
||||
profile = "gpu-test"
|
||||
@@ -128,15 +133,15 @@ workflow = "boolean_benchmark.yml"
|
||||
profile = "bench"
|
||||
check_run_name = "Boolean CPU AWS Benchmarks"
|
||||
|
||||
[command.core_crypto_bench]
|
||||
workflow = "core_crypto_benchmark.yml"
|
||||
[command.pbs_bench]
|
||||
workflow = "pbs_benchmark.yml"
|
||||
profile = "bench"
|
||||
check_run_name = "Core crypto CPU AWS Benchmarks"
|
||||
check_run_name = "PBS CPU AWS Benchmarks"
|
||||
|
||||
[command.core_crypto_gpu_bench]
|
||||
workflow = "core_crypto_gpu_benchmark.yml"
|
||||
profile = "gpu-test"
|
||||
check_run_name = "Core crypto GPU AWS Benchmarks"
|
||||
[command.pbs_gpu_bench]
|
||||
workflow = "pbs_gpu_benchmark.yml"
|
||||
profile = "gpu-bench"
|
||||
check_run_name = "PBS GPU AWS Benchmarks"
|
||||
|
||||
[command.wasm_client_bench]
|
||||
workflow = "wasm_client_benchmark.yml"
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Concrete CSPRNG
|
||||
|
||||
This crate contains a fast *Cryptographically Secure Pseudorandom Number Generator*, used in the
|
||||
[TFHE-rs](https://crates.io/crates/tfhe) library, you can find it [here](../tfhe/) in this repo.
|
||||
['concrete-core'](https://crates.io/crates/concrete-core) library, you can find it [here](../concrete-core/) in this repo.
|
||||
|
||||
The implementation is based on the AES blockcipher used in CTR mode, as described in the ISO/IEC
|
||||
18033-4 standard.
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user