mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-11 07:38:08 -05:00
Compare commits
418 Commits
am/refacto
...
jb/tmp
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d0937aae20 | ||
|
|
e81152a630 | ||
|
|
8c4675dc3e | ||
|
|
29fb4fbe77 | ||
|
|
f84c34c903 | ||
|
|
cc905a04c7 | ||
|
|
3fc791e813 | ||
|
|
d4f8fb8f57 | ||
|
|
68ce43d2f0 | ||
|
|
c5b9e5400a | ||
|
|
8167c85764 | ||
|
|
98bd45503c | ||
|
|
ed50042719 | ||
|
|
053d56a3d6 | ||
|
|
e5b117ca29 | ||
|
|
9de486f33c | ||
|
|
ccf879c9ae | ||
|
|
d3c1f91948 | ||
|
|
273dbe1b85 | ||
|
|
7ac061266f | ||
|
|
c1c56ab770 | ||
|
|
00dad37812 | ||
|
|
f94533d70d | ||
|
|
b7d7e68d0c | ||
|
|
e8135c207d | ||
|
|
601b200351 | ||
|
|
a0d5bf2fc2 | ||
|
|
58223dea09 | ||
|
|
1f3096b743 | ||
|
|
4a3d5d432a | ||
|
|
c6bfcd75a6 | ||
|
|
85dfd70c6b | ||
|
|
c720656340 | ||
|
|
1c209403a6 | ||
|
|
347fc9aaa7 | ||
|
|
198485b5fb | ||
|
|
bd7547c93d | ||
|
|
955495d714 | ||
|
|
902755c33c | ||
|
|
89f845fa4f | ||
|
|
9f89d2c09d | ||
|
|
ea0d146ed0 | ||
|
|
943ccdf450 | ||
|
|
f39896ac63 | ||
|
|
46a87c6f89 | ||
|
|
a5579532be | ||
|
|
41e1781226 | ||
|
|
697ce94ee2 | ||
|
|
a667b654ef | ||
|
|
1bff07b6eb | ||
|
|
59664e84c8 | ||
|
|
79dc101728 | ||
|
|
6828438898 | ||
|
|
a8f4cf7c29 | ||
|
|
30d2f5f66d | ||
|
|
112cc6f6c9 | ||
|
|
93581f7ee1 | ||
|
|
6e08e91109 | ||
|
|
75f0ad1d4b | ||
|
|
618758bd95 | ||
|
|
d770a271b3 | ||
|
|
80468494b2 | ||
|
|
26e5af542f | ||
|
|
f23b4f21dc | ||
|
|
b394da3dbb | ||
|
|
6007cd2c81 | ||
|
|
a6fdc46794 | ||
|
|
0134a4a0f2 | ||
|
|
68dfd96993 | ||
|
|
6811177178 | ||
|
|
753c7aa0d2 | ||
|
|
f38a9a9b4c | ||
|
|
c7f6eb0119 | ||
|
|
85da12c00f | ||
|
|
20b1427f72 | ||
|
|
716677f383 | ||
|
|
d09e5ab066 | ||
|
|
f8bfeb8927 | ||
|
|
67b543b6e7 | ||
|
|
b2cfe2765c | ||
|
|
8397637b24 | ||
|
|
42b7c2f403 | ||
|
|
b708abb10b | ||
|
|
e62808b2b4 | ||
|
|
62135791bf | ||
|
|
41c38d127b | ||
|
|
d55d68ec52 | ||
|
|
9faab7b9a6 | ||
|
|
ff539aab6b | ||
|
|
799829eab4 | ||
|
|
c30395daef | ||
|
|
ebce4fcfd4 | ||
|
|
85a428bb43 | ||
|
|
c4266bd610 | ||
|
|
76a7cd9b24 | ||
|
|
9baa54b636 | ||
|
|
863e0c275b | ||
|
|
cd13b40dbb | ||
|
|
1c8e88ebfd | ||
|
|
02bac34f1b | ||
|
|
4576508ccb | ||
|
|
7190dad1e3 | ||
|
|
18b9458401 | ||
|
|
747ade0a54 | ||
|
|
ada460b429 | ||
|
|
56f9b221eb | ||
|
|
52f3babde5 | ||
|
|
3ff5d551a9 | ||
|
|
0b1ea3b7dc | ||
|
|
e0fddc8ea7 | ||
|
|
5354cffd8e | ||
|
|
d258d1fcf4 | ||
|
|
7cecbb30b2 | ||
|
|
ece82c51a5 | ||
|
|
8c54c8200b | ||
|
|
b6bfe30065 | ||
|
|
d5c0c0242c | ||
|
|
6826b6b638 | ||
|
|
f0b4749aca | ||
|
|
eb4785001d | ||
|
|
16d6b2f75d | ||
|
|
9bdeb697ad | ||
|
|
b5615bb3ad | ||
|
|
37b94780b2 | ||
|
|
035a70d81f | ||
|
|
f5c971652d | ||
|
|
a0b75d9a37 | ||
|
|
90da50dc53 | ||
|
|
473a6a0f40 | ||
|
|
8f1a1da4e1 | ||
|
|
a3c07dedad | ||
|
|
5513d3a894 | ||
|
|
6ef0a2b4ef | ||
|
|
bba4bcee88 | ||
|
|
45befcaf40 | ||
|
|
97feefe2ed | ||
|
|
4ef8045a67 | ||
|
|
48f67fb427 | ||
|
|
bce3bf1733 | ||
|
|
253062c5aa | ||
|
|
b44ed91519 | ||
|
|
ddb010d8f1 | ||
|
|
8a9559c4d1 | ||
|
|
02265705fc | ||
|
|
7b4bb6ad55 | ||
|
|
fd084d50c5 | ||
|
|
f59cb6c632 | ||
|
|
c594734fcf | ||
|
|
f9669c3294 | ||
|
|
76665ab478 | ||
|
|
9b454abe2a | ||
|
|
da08115c10 | ||
|
|
8aec783dd9 | ||
|
|
4bf28b836a | ||
|
|
9df529bc59 | ||
|
|
71bff0963c | ||
|
|
eeaf45dbc7 | ||
|
|
353f279a9e | ||
|
|
8355ed5c10 | ||
|
|
84844bb4eb | ||
|
|
9b6e861f9b | ||
|
|
b73f24057d | ||
|
|
43c0799655 | ||
|
|
f94a63eedc | ||
|
|
35d65bcde7 | ||
|
|
0bfe59a656 | ||
|
|
ffe4c7135a | ||
|
|
a8f329fc75 | ||
|
|
11db96d394 | ||
|
|
ae8d48138c | ||
|
|
e912394b52 | ||
|
|
8958b6df98 | ||
|
|
aeb36ee14f | ||
|
|
b3976f2963 | ||
|
|
0d6e0c7224 | ||
|
|
bd26d0ecd6 | ||
|
|
16f457b57c | ||
|
|
6060882a7a | ||
|
|
3e2833ac64 | ||
|
|
bc85163c23 | ||
|
|
45b2548b17 | ||
|
|
0476ee0c3c | ||
|
|
8d77ea0a57 | ||
|
|
f10fa3f13c | ||
|
|
fd4e2059f4 | ||
|
|
e7352eee8b | ||
|
|
33a7e9f3e4 | ||
|
|
96da25ce90 | ||
|
|
548f2e5d05 | ||
|
|
f313b58c8e | ||
|
|
fd038346b7 | ||
|
|
f0fcfd517b | ||
|
|
0d2448e9e9 | ||
|
|
68ef237ae6 | ||
|
|
fb39864f05 | ||
|
|
69a5562aba | ||
|
|
d49ffdd26f | ||
|
|
c87c362d42 | ||
|
|
905ef4ea78 | ||
|
|
957dd47295 | ||
|
|
0a550ac803 | ||
|
|
8c62155429 | ||
|
|
3e23631bdc | ||
|
|
02b2fcf78d | ||
|
|
d9d222c1b5 | ||
|
|
f2011cd30d | ||
|
|
1b3d41ec44 | ||
|
|
65816a175a | ||
|
|
2d1cf95900 | ||
|
|
247072a81a | ||
|
|
c65a58c14f | ||
|
|
3a4859553e | ||
|
|
9c3a159ca1 | ||
|
|
e76ddd5a49 | ||
|
|
fa35b6ef8f | ||
|
|
a6e835b3f1 | ||
|
|
c586d64fab | ||
|
|
6108f180bf | ||
|
|
8f0e4f6c99 | ||
|
|
473b7e0c6a | ||
|
|
e9c92bc9a3 | ||
|
|
e4c0c4c15f | ||
|
|
ea7c579efc | ||
|
|
c9d19fca19 | ||
|
|
f3e6074480 | ||
|
|
17c110f536 | ||
|
|
45e27d8836 | ||
|
|
ee10508c99 | ||
|
|
9213436b93 | ||
|
|
aa2a8e31fe | ||
|
|
d49b8235bf | ||
|
|
cec4a5b60b | ||
|
|
15147b4359 | ||
|
|
f2ee360a47 | ||
|
|
6631aae069 | ||
|
|
fa9cd866e4 | ||
|
|
c632ac1b9a | ||
|
|
f0e6b4c395 | ||
|
|
2cd51ed36d | ||
|
|
dc04a5138e | ||
|
|
eda338aa29 | ||
|
|
df6fa86481 | ||
|
|
6742e150b0 | ||
|
|
93fac32755 | ||
|
|
9ac57e75c9 | ||
|
|
a7abee0491 | ||
|
|
0228a58cfc | ||
|
|
f98c680e95 | ||
|
|
dcc3d267e4 | ||
|
|
2067092e0a | ||
|
|
7c50216f7a | ||
|
|
77c0532793 | ||
|
|
00ddfdec8b | ||
|
|
ef9ec13999 | ||
|
|
b53c8aac3f | ||
|
|
f052c1f8ba | ||
|
|
fef6d18605 | ||
|
|
e5505ab686 | ||
|
|
ab2c5f09a8 | ||
|
|
40ae841a15 | ||
|
|
0a317c5f0e | ||
|
|
415a8a2de5 | ||
|
|
935da25360 | ||
|
|
dbeff4e4b4 | ||
|
|
1e50d0cdd2 | ||
|
|
7c5551bf45 | ||
|
|
95c36d54cb | ||
|
|
384e15ca5a | ||
|
|
526a53f3d4 | ||
|
|
7d17b71740 | ||
|
|
8ecb85e4dd | ||
|
|
cf30db7a30 | ||
|
|
2599f7d5ea | ||
|
|
ae88bb3264 | ||
|
|
6fb898db66 | ||
|
|
e750d2cd92 | ||
|
|
d8586080da | ||
|
|
4cc2e85556 | ||
|
|
303a65c88d | ||
|
|
18c01e74d6 | ||
|
|
a8b6c72910 | ||
|
|
c2b21ed709 | ||
|
|
ad41fdf5a5 | ||
|
|
eeae19f35f | ||
|
|
798572e58c | ||
|
|
36d375943c | ||
|
|
a1488b10d5 | ||
|
|
b153641280 | ||
|
|
48405959a4 | ||
|
|
d39e73be91 | ||
|
|
71447d845f | ||
|
|
837df59b44 | ||
|
|
303cac2092 | ||
|
|
bb1a969c34 | ||
|
|
9dac9242be | ||
|
|
e97ac815eb | ||
|
|
62feb59722 | ||
|
|
ac8916a30f | ||
|
|
069ea98ad6 | ||
|
|
e587d1835e | ||
|
|
16121e7487 | ||
|
|
3ab566de7b | ||
|
|
2309b07703 | ||
|
|
8755094c38 | ||
|
|
4da10e9dd5 | ||
|
|
cdda260063 | ||
|
|
be413fff50 | ||
|
|
3ed960d255 | ||
|
|
bdadd39a34 | ||
|
|
f03f2f9c6d | ||
|
|
f03ec9bbed | ||
|
|
5137751dd2 | ||
|
|
edc3449dbf | ||
|
|
6068c509de | ||
|
|
30a4348e3a | ||
|
|
3013e02d90 | ||
|
|
c029917c5c | ||
|
|
d23d04021b | ||
|
|
5a5e9e0ac1 | ||
|
|
cc0a3bad8d | ||
|
|
1d12f60849 | ||
|
|
a0db39c86e | ||
|
|
ef4558ac13 | ||
|
|
bfb22b4531 | ||
|
|
88025010e1 | ||
|
|
b1f4f3b330 | ||
|
|
7575a426ab | ||
|
|
1ac57218b1 | ||
|
|
b7c3f16e24 | ||
|
|
bf4f9198fb | ||
|
|
e618e1d05d | ||
|
|
000428d688 | ||
|
|
937c90666b | ||
|
|
b6a6f1b098 | ||
|
|
c2d7f1748c | ||
|
|
e8cd55dee6 | ||
|
|
95aea9dbe8 | ||
|
|
89f701d307 | ||
|
|
224146686f | ||
|
|
b6b5f92220 | ||
|
|
0fec9e252b | ||
|
|
53c9b82824 | ||
|
|
f670a950d6 | ||
|
|
a44970a9a3 | ||
|
|
55775b8e02 | ||
|
|
523d561de6 | ||
|
|
61a50d0bcc | ||
|
|
ee57f5658b | ||
|
|
9362965f50 | ||
|
|
00fb60451d | ||
|
|
18b9fd4464 | ||
|
|
eace0bfb85 | ||
|
|
af1be5ebca | ||
|
|
916bd8a09f | ||
|
|
20cb0642ce | ||
|
|
151f9f6d82 | ||
|
|
8db8cb49e4 | ||
|
|
b4583976a2 | ||
|
|
b450375da1 | ||
|
|
f02f1fb297 | ||
|
|
17642fa703 | ||
|
|
23fa9b24bd | ||
|
|
0453b9bd60 | ||
|
|
9b2cf67911 | ||
|
|
36a7656048 | ||
|
|
61c8eadd58 | ||
|
|
fdd4d9d1cc | ||
|
|
62700ab853 | ||
|
|
27445645e7 | ||
|
|
ea0cd26c0b | ||
|
|
ff48582679 | ||
|
|
a77c87ff12 | ||
|
|
6d143f1edc | ||
|
|
216e6b443a | ||
|
|
1400ae946c | ||
|
|
c332902a05 | ||
|
|
cf7a7f132d | ||
|
|
6e0a3b9ad7 | ||
|
|
1f825dde08 | ||
|
|
f9222de47c | ||
|
|
5732e8dd7a | ||
|
|
9db35c5474 | ||
|
|
b69f73e8e6 | ||
|
|
90bdf75147 | ||
|
|
233ea17adf | ||
|
|
df6ee79841 | ||
|
|
6497fb9a15 | ||
|
|
d8894e3b69 | ||
|
|
42636bab13 | ||
|
|
ec27d3dc6f | ||
|
|
5272c95de4 | ||
|
|
27d7ace3ef | ||
|
|
d80ab231a8 | ||
|
|
fe3fa531f9 | ||
|
|
5c1573c266 | ||
|
|
7772e8112d | ||
|
|
5e92cb1475 | ||
|
|
f51e19b071 | ||
|
|
aeb00ae584 | ||
|
|
ce5e9c1bdb | ||
|
|
4d4e124e94 | ||
|
|
ca6d37e06f | ||
|
|
e3143315f3 | ||
|
|
f8636fe814 | ||
|
|
7e72400321 | ||
|
|
728b409256 | ||
|
|
d91404e567 | ||
|
|
e11c3d7b7c | ||
|
|
6f8eeb043c | ||
|
|
00d55182b4 | ||
|
|
6f6ce106c3 | ||
|
|
68fcbb5280 | ||
|
|
3f46389cc8 | ||
|
|
9e8dd01cb9 | ||
|
|
0085ceb97b | ||
|
|
be9a4d2d9c | ||
|
|
87421e8307 | ||
|
|
0c3919628f |
2
.github/ISSUE_TEMPLATE/bug_report.md
vendored
2
.github/ISSUE_TEMPLATE/bug_report.md
vendored
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: Bug report
|
||||
about: Report a problem with concrete
|
||||
about: Report a problem with TFHE-rs
|
||||
title: ''
|
||||
labels: triage_required
|
||||
assignees: ''
|
||||
|
||||
2
.github/ISSUE_TEMPLATE/feature_request.md
vendored
2
.github/ISSUE_TEMPLATE/feature_request.md
vendored
@@ -1,6 +1,6 @@
|
||||
---
|
||||
name: Feature request
|
||||
about: Suggest an idea for concrete
|
||||
about: Suggest an idea for TFHE-rs
|
||||
title: ''
|
||||
labels: feature_request
|
||||
assignees: ''
|
||||
|
||||
9
.github/actionlint.yaml
vendored
Normal file
9
.github/actionlint.yaml
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
self-hosted-runner:
|
||||
# Labels of self-hosted runner in array of strings.
|
||||
labels:
|
||||
- m1mac
|
||||
- 4090-desktop
|
||||
# Configuration variables in array of strings defined in your repository or
|
||||
# organization. `null` means disabling configuration variables check.
|
||||
# Empty array means no configuration variable is allowed.
|
||||
config-variables: null
|
||||
34
.github/workflows/approve_label.yml
vendored
Normal file
34
.github/workflows/approve_label.yml
vendored
Normal file
@@ -0,0 +1,34 @@
|
||||
# Manage approved label in pull request
|
||||
name: PR approved label manager
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
pull_request_review:
|
||||
types: [submitted]
|
||||
|
||||
jobs:
|
||||
trigger-tests:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: write
|
||||
steps:
|
||||
- name: Get current labels
|
||||
uses: snnaplab/get-labels-action@f426df40304808ace3b5282d4f036515f7609576
|
||||
|
||||
# Remove label if a push is performed after an approval
|
||||
- name: Remove approved label
|
||||
if: ${{ github.event_name == 'pull_request' && contains(fromJSON(env.LABELS), 'approved') }}
|
||||
uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
|
||||
with:
|
||||
# We use a PAT to have the same user (zama-bot) for label deletion as for creation.
|
||||
github_token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
|
||||
labels: approved
|
||||
|
||||
# Add label only if the review is approved and if the label doesn't already exist
|
||||
- name: Add approved label
|
||||
uses: actions-ecosystem/action-add-labels@18f1af5e3544586314bbe15c0273249c770b2daf
|
||||
if: ${{ github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && !contains(fromJSON(env.LABELS), 'approved') }}
|
||||
with:
|
||||
# We need to use a PAT to be able to trigger `labeled` event for the other workflow.
|
||||
github_token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
|
||||
labels: approved
|
||||
107
.github/workflows/aws_tfhe_fast_tests.yml
vendored
107
.github/workflows/aws_tfhe_fast_tests.yml
vendored
@@ -5,66 +5,56 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
# All the inputs are provided by Slab
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "AWS instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "AWS instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "AWS instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: 'Slab request ID'
|
||||
type: string
|
||||
fork_repo:
|
||||
description: 'Name of forked repo as user/repo'
|
||||
type: string
|
||||
fork_git_sha:
|
||||
description: 'Git SHA to checkout from fork'
|
||||
type: string
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
fast-tests:
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ inputs.runner_name }}
|
||||
setup-ec2:
|
||||
name: Setup EC2 instance (fast-tests)
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
|
||||
aws-region: ${{ steps.start-instance.outputs.aws-region }}
|
||||
steps:
|
||||
# Step used for log purpose.
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "ID: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
echo "Fork repo: ${{ inputs.fork_repo }}"
|
||||
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
repository: ${{ inputs.fork_repo }}
|
||||
ref: ${{ inputs.fork_git_sha }}
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
profile: cpu-big
|
||||
|
||||
fast-tests:
|
||||
name: Fast CPU tests
|
||||
needs: setup-ec2
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
default: true
|
||||
|
||||
- name: Run concrete-csprng tests
|
||||
run: |
|
||||
@@ -120,8 +110,29 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Fast AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
teardown-ec2:
|
||||
name: Teardown EC2 instance (fast-tests)
|
||||
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
|
||||
needs: [ setup-ec2, fast-tests ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
region: ${{ needs.setup-ec2.outputs.aws-region }}
|
||||
label: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "EC2 teardown (fast-tests) failed. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
75
.github/workflows/aws_tfhe_gpu_4090_tests.yml
vendored
Normal file
75
.github/workflows/aws_tfhe_gpu_4090_tests.yml
vendored
Normal file
@@ -0,0 +1,75 @@
|
||||
# Compile and test tfhe-cuda-backend on an RTX 4090 machine
|
||||
name: TFHE Cuda Backend - 4090 full tests
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [labeled]
|
||||
|
||||
jobs:
|
||||
cuda-tests-linux:
|
||||
name: CUDA tests (RTX 4090)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, '4090_test') }}
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ["self-hosted", "4090-desktop"]
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Run fmt checks
|
||||
run: |
|
||||
make check_fmt_gpu
|
||||
|
||||
- name: Run clippy checks
|
||||
run: |
|
||||
make pcc_gpu
|
||||
|
||||
- name: Run core crypto, integer and internal CUDA backend tests
|
||||
run: |
|
||||
make test_gpu
|
||||
|
||||
- name: Run user docs tests
|
||||
run: |
|
||||
make test_user_doc_gpu
|
||||
|
||||
- name: Test C API
|
||||
run: |
|
||||
make test_c_api_gpu
|
||||
|
||||
- name: Run High Level API Tests
|
||||
run: |
|
||||
make test_high_level_api_gpu
|
||||
|
||||
- uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
|
||||
if: ${{ github.event_name == 'pull_request' }}
|
||||
with:
|
||||
labels: 4090_test
|
||||
github_token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "CUDA RTX 4090 tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
144
.github/workflows/aws_tfhe_gpu_tests.yml
vendored
Normal file
144
.github/workflows/aws_tfhe_gpu_tests.yml
vendored
Normal file
@@ -0,0 +1,144 @@
|
||||
# Compile and test tfhe-cuda-backend on an AWS instance
|
||||
name: TFHE Cuda Backend - Full tests
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
setup-ec2:
|
||||
name: Setup EC2 instance (cuda-tests)
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
|
||||
aws-region: ${{ steps.start-instance.outputs.aws-region }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
profile: gpu-test
|
||||
|
||||
cuda-tests-linux:
|
||||
name: CUDA tests
|
||||
needs: setup-ec2
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run fmt checks
|
||||
run: |
|
||||
make check_fmt_gpu
|
||||
|
||||
- name: Run clippy checks
|
||||
run: |
|
||||
make pcc_gpu
|
||||
|
||||
- name: Run core crypto, integer and internal CUDA backend tests
|
||||
run: |
|
||||
make test_gpu
|
||||
|
||||
- name: Run user docs tests
|
||||
run: |
|
||||
make test_user_doc_gpu
|
||||
|
||||
- name: Test C API
|
||||
run: |
|
||||
make test_c_api_gpu
|
||||
|
||||
- name: Run High Level API Tests
|
||||
run: |
|
||||
make test_high_level_api_gpu
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "CUDA AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-ec2:
|
||||
name: Teardown EC2 instance (cuda-tests)
|
||||
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
|
||||
needs: [ setup-ec2, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
region: ${{ needs.setup-ec2.outputs.aws-region }}
|
||||
label: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "EC2 teardown (cuda-tests) failed. (${{ env.ACTION_RUN_URL }})"
|
||||
125
.github/workflows/aws_tfhe_integer_tests.yml
vendored
125
.github/workflows/aws_tfhe_integer_tests.yml
vendored
@@ -1,77 +1,77 @@
|
||||
name: AWS Integer Tests on CPU
|
||||
name: AWS Unsigned Integer Tests on CPU
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
# All the inputs are provided by Slab
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "AWS instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "AWS instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "AWS instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: 'Slab request ID'
|
||||
type: string
|
||||
fork_repo:
|
||||
description: 'Name of forked repo as user/repo'
|
||||
type: string
|
||||
fork_git_sha:
|
||||
description: 'Git SHA to checkout from fork'
|
||||
type: string
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
|
||||
jobs:
|
||||
integer-tests:
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ inputs.runner_name }}
|
||||
setup-ec2:
|
||||
name: Setup EC2 instance (unsigned-integer-tests)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
|
||||
aws-region: ${{ steps.start-instance.outputs.aws-region }}
|
||||
steps:
|
||||
# Step used for log purpose.
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "ID: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
echo "Fork repo: ${{ inputs.fork_repo }}"
|
||||
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
repository: ${{ inputs.fork_repo }}
|
||||
ref: ${{ inputs.fork_git_sha }}
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
profile: cpu-big
|
||||
|
||||
unsigned-integer-tests:
|
||||
name: Unsigned integer tests
|
||||
needs: setup-ec2
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
default: true
|
||||
|
||||
- name: Gen Keys if required
|
||||
run: |
|
||||
make GEN_KEY_CACHE_MULTI_BIT_ONLY=TRUE gen_key_cache
|
||||
|
||||
- name: Run unsigned integer multi-bit tests
|
||||
run: |
|
||||
AVX512_SUPPORT=ON make test_unsigned_integer_multi_bit_ci
|
||||
|
||||
- name: Gen Keys if required
|
||||
run: |
|
||||
make gen_key_cache
|
||||
|
||||
- name: Run integer tests
|
||||
- name: Run unsigned integer tests
|
||||
run: |
|
||||
AVX512_SUPPORT=ON BIG_TESTS_INSTANCE=TRUE make test_integer_ci
|
||||
AVX512_SUPPORT=ON BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_ci
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
@@ -79,8 +79,29 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
SLACK_MESSAGE: "Unsigned Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-ec2:
|
||||
name: Teardown EC2 instance (unsigned-integer-tests)
|
||||
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
|
||||
needs: [ setup-ec2, unsigned-integer-tests ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
region: ${{ needs.setup-ec2.outputs.aws-region }}
|
||||
label: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "EC2 teardown (unsigned-integer-tests) failed. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
90
.github/workflows/aws_tfhe_multi_bit_tests.yml
vendored
90
.github/workflows/aws_tfhe_multi_bit_tests.yml
vendored
@@ -1,90 +0,0 @@
|
||||
name: AWS Multi Bit Tests on CPU
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
# All the inputs are provided by Slab
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "AWS instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "AWS instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "AWS instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: 'Slab request ID'
|
||||
type: string
|
||||
fork_repo:
|
||||
description: 'Name of forked repo as user/repo'
|
||||
type: string
|
||||
fork_git_sha:
|
||||
description: 'Git SHA to checkout from fork'
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
multi-bit-tests:
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ inputs.runner_name }}
|
||||
steps:
|
||||
# Step used for log purpose.
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "ID: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
echo "Fork repo: ${{ inputs.fork_repo }}"
|
||||
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
with:
|
||||
repository: ${{ inputs.fork_repo }}
|
||||
ref: ${{ inputs.fork_git_sha }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
with:
|
||||
toolchain: stable
|
||||
default: true
|
||||
|
||||
- name: Gen Keys if required
|
||||
run: |
|
||||
make GEN_KEY_CACHE_MULTI_BIT_ONLY=TRUE gen_key_cache
|
||||
|
||||
- name: Run shortint multi-bit tests
|
||||
run: |
|
||||
make test_shortint_multi_bit_ci
|
||||
|
||||
- name: Run integer multi-bit tests
|
||||
run: |
|
||||
AVX512_SUPPORT=ON make test_integer_multi_bit_ci
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Shortint tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
111
.github/workflows/aws_tfhe_signed_integer_tests.yml
vendored
Normal file
111
.github/workflows/aws_tfhe_signed_integer_tests.yml
vendored
Normal file
@@ -0,0 +1,111 @@
|
||||
name: AWS Signed Integer Tests on CPU
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
|
||||
jobs:
|
||||
setup-ec2:
|
||||
name: Setup EC2 instance (signed-integer-tests)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
|
||||
aws-region: ${{ steps.start-instance.outputs.aws-region }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
profile: cpu-big
|
||||
|
||||
signed-integer-tests:
|
||||
name: Signed integer tests
|
||||
needs: setup-ec2
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Gen Keys if required
|
||||
run: |
|
||||
make GEN_KEY_CACHE_MULTI_BIT_ONLY=TRUE gen_key_cache
|
||||
|
||||
- name: Run shortint multi-bit tests
|
||||
run: |
|
||||
make test_shortint_multi_bit_ci
|
||||
|
||||
- name: Run signed integer multi-bit tests
|
||||
run: |
|
||||
AVX512_SUPPORT=ON make test_signed_integer_multi_bit_ci
|
||||
|
||||
- name: Gen Keys if required
|
||||
run: |
|
||||
make gen_key_cache
|
||||
|
||||
- name: Run signed integer tests
|
||||
run: |
|
||||
AVX512_SUPPORT=ON BIG_TESTS_INSTANCE=TRUE make test_signed_integer_ci
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Signed Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-ec2:
|
||||
name: Teardown EC2 instance (signed-integer-tests)
|
||||
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
|
||||
needs: [ setup-ec2, signed-integer-tests ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
region: ${{ needs.setup-ec2.outputs.aws-region }}
|
||||
label: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "EC2 teardown (signed-integer-tests) failed. (${{ env.ACTION_RUN_URL }})"
|
||||
117
.github/workflows/aws_tfhe_tests.yml
vendored
117
.github/workflows/aws_tfhe_tests.yml
vendored
@@ -4,66 +4,58 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
# All the inputs are provided by Slab
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "AWS instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "AWS instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "AWS instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: 'Slab request ID'
|
||||
type: string
|
||||
fork_repo:
|
||||
description: 'Name of forked repo as user/repo'
|
||||
type: string
|
||||
fork_git_sha:
|
||||
description: 'Git SHA to checkout from fork'
|
||||
type: string
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
|
||||
jobs:
|
||||
shortint-tests:
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ inputs.runner_name }}
|
||||
setup-ec2:
|
||||
name: Setup EC2 instance (cpu-tests)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
|
||||
aws-region: ${{ steps.start-instance.outputs.aws-region }}
|
||||
steps:
|
||||
# Step used for log purpose.
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "ID: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
echo "Fork repo: ${{ inputs.fork_repo }}"
|
||||
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
repository: ${{ inputs.fork_repo }}
|
||||
ref: ${{ inputs.fork_git_sha }}
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
profile: cpu-big
|
||||
|
||||
cpu-tests:
|
||||
name: CPU tests
|
||||
needs: setup-ec2
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
default: true
|
||||
|
||||
- name: Run concrete-csprng tests
|
||||
run: |
|
||||
@@ -100,6 +92,12 @@ jobs:
|
||||
- name: Run example tests
|
||||
run: |
|
||||
make test_examples
|
||||
make dark_market
|
||||
|
||||
- name: Run apps tests
|
||||
run: |
|
||||
make test_trivium
|
||||
make test_kreyvium
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
@@ -107,8 +105,29 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Shortint tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
SLACK_MESSAGE: "CPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-ec2:
|
||||
name: Teardown EC2 instance (cpu-tests)
|
||||
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
|
||||
needs: [ setup-ec2, cpu-tests ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
region: ${{ needs.setup-ec2.outputs.aws-region }}
|
||||
label: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "EC2 teardown (cpu-tests) failed. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
109
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
109
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
@@ -4,66 +4,58 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
# All the inputs are provided by Slab
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "AWS instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "AWS instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "AWS instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: 'Slab request ID'
|
||||
type: string
|
||||
fork_repo:
|
||||
description: 'Name of forked repo as user/repo'
|
||||
type: string
|
||||
fork_git_sha:
|
||||
description: 'Git SHA to checkout from fork'
|
||||
type: string
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
|
||||
jobs:
|
||||
wasm-tests:
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ inputs.runner_name }}
|
||||
setup-ec2:
|
||||
name: Setup EC2 instance (wasm-tests)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
|
||||
aws-region: ${{ steps.start-instance.outputs.aws-region }}
|
||||
steps:
|
||||
# Step used for log purpose.
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "ID: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
echo "Fork repo: ${{ inputs.fork_repo }}"
|
||||
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
repository: ${{ inputs.fork_repo }}
|
||||
ref: ${{ inputs.fork_git_sha }}
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
profile: cpu-small
|
||||
|
||||
wasm-tests:
|
||||
name: WASM tests
|
||||
needs: setup-ec2
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
default: true
|
||||
|
||||
- name: Run js on wasm API tests
|
||||
run: |
|
||||
@@ -80,8 +72,29 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "WASM tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
teardown-ec2:
|
||||
name: Teardown EC2 instance (wasm-tests)
|
||||
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
|
||||
needs: [ setup-ec2, wasm-tests ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
region: ${{ needs.setup-ec2.outputs.aws-region }}
|
||||
label: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "EC2 teardown (wasm-tests) failed. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
13
.github/workflows/boolean_benchmark.yml
vendored
13
.github/workflows/boolean_benchmark.yml
vendored
@@ -32,6 +32,8 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-boolean-benchmarks:
|
||||
@@ -51,7 +53,7 @@ jobs:
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -61,14 +63,13 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make AVX512_SUPPORT=ON bench_boolean
|
||||
make bench_boolean
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
@@ -96,13 +97,13 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_boolean
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
10
.github/workflows/cargo_build.yml
vendored
10
.github/workflows/cargo_build.yml
vendored
@@ -6,6 +6,8 @@ on:
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref }}
|
||||
@@ -17,11 +19,11 @@ jobs:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-latest, windows-latest]
|
||||
os: [ubuntu-latest, macos-latest-large, windows-latest]
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
|
||||
- name: Install and run newline linter checks
|
||||
if: matrix.os == 'ubuntu-latest'
|
||||
@@ -66,5 +68,9 @@ jobs:
|
||||
run: |
|
||||
make build_c_api
|
||||
|
||||
- name: Build coverage tests
|
||||
run: |
|
||||
make build_tfhe_coverage
|
||||
|
||||
# The wasm build check is a bit annoying to set-up here and is done during the tests in
|
||||
# aws_tfhe_tests.yml
|
||||
|
||||
27
.github/workflows/ci_lint.yml
vendored
Normal file
27
.github/workflows/ci_lint.yml
vendored
Normal file
@@ -0,0 +1,27 @@
|
||||
# Lint and check CI
|
||||
name: CI Lint and Checks
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
|
||||
env:
|
||||
ACTIONLINT_VERSION: 1.6.27
|
||||
|
||||
jobs:
|
||||
lint-check:
|
||||
name: Lint and checks
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
|
||||
- name: Get actionlint
|
||||
run: |
|
||||
bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash) ${{ env.ACTIONLINT_VERSION }}
|
||||
echo "f2ee6d561ce00fa93aab62a7791c1a0396ec7e8876b2a8f2057475816c550782 actionlint" > checksum
|
||||
sha256sum -c checksum
|
||||
ln -s "$(pwd)/actionlint" /usr/local/bin/
|
||||
|
||||
- name: Lint workflows
|
||||
run: |
|
||||
make lint_workflow
|
||||
34
.github/workflows/code_coverage.yml
vendored
34
.github/workflows/code_coverage.yml
vendored
@@ -4,6 +4,8 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
@@ -38,6 +40,7 @@ jobs:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ inputs.runner_name }}
|
||||
timeout-minutes: 11520 # 8 days
|
||||
steps:
|
||||
# Step used for log purpose.
|
||||
- name: Instance configuration used
|
||||
@@ -50,7 +53,7 @@ jobs:
|
||||
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: ${{ inputs.fork_repo }}
|
||||
ref: ${{ inputs.fork_git_sha }}
|
||||
@@ -60,14 +63,13 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
default: true
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@408093d9ff9c134c33b974e0722ce06b9d6e8263
|
||||
uses: tj-actions/changed-files@aa08304bd477b800d468db44fe10f6c61f7f7b11
|
||||
with:
|
||||
files_yaml: |
|
||||
tfhe:
|
||||
@@ -79,6 +81,12 @@ jobs:
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
run: |
|
||||
make GEN_KEY_CACHE_COVERAGE_ONLY=TRUE gen_key_cache
|
||||
make gen_key_cache_core_crypto
|
||||
|
||||
- name: Run coverage for core_crypto
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
run: |
|
||||
make test_core_crypto_cov AVX512_SUPPORT=ON
|
||||
|
||||
- name: Run coverage for boolean
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
@@ -91,13 +99,27 @@ jobs:
|
||||
make test_shortint_cov
|
||||
|
||||
- name: Upload tfhe coverage to Codecov
|
||||
uses: codecov/codecov-action@eaaf4bedf32dbdc6b720b63067d99c4d77d6047d
|
||||
uses: codecov/codecov-action@54bcd8715eee62d40e33596ef5e8f0f48dbbccab
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
directory: ./coverage/
|
||||
fail_ci_if_error: true
|
||||
files: shortint/cobertura.xml,boolean/cobertura.xml
|
||||
files: shortint/cobertura.xml,boolean/cobertura.xml,core_crypto/cobertura.xml,core_crypto_avx512/cobertura.xml
|
||||
|
||||
- name: Run integer coverage
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
run: |
|
||||
make test_integer_cov
|
||||
|
||||
- name: Upload tfhe coverage to Codecov
|
||||
uses: codecov/codecov-action@54bcd8715eee62d40e33596ef5e8f0f48dbbccab
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
directory: ./coverage/
|
||||
fail_ci_if_error: true
|
||||
files: integer/cobertura.xml
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# Run PBS benchmarks on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: PBS benchmarks
|
||||
# Run core crypto benchmarks on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: Core crypto benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
@@ -32,10 +32,12 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-pbs-benchmarks:
|
||||
name: Execute PBS benchmarks in EC2
|
||||
run-core-crypto-benchmarks:
|
||||
name: Execute core crypto benchmarks in EC2
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
steps:
|
||||
@@ -51,7 +53,7 @@ jobs:
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -61,14 +63,14 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make AVX512_SUPPORT=ON bench_pbs
|
||||
make bench_pbs
|
||||
make bench_ks
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
@@ -86,13 +88,13 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_pbs
|
||||
name: ${{ github.sha }}_core_crypto
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
157
.github/workflows/core_crypto_gpu_benchmark.yml
vendored
Normal file
157
.github/workflows/core_crypto_gpu_benchmark.yml
vendored
Normal file
@@ -0,0 +1,157 @@
|
||||
# Run core crypto benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
|
||||
name: Core crypto GPU benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
# This input is not used in this workflow but still mandatory since a calling workflow could
|
||||
# use it. If a triggering command include a user_inputs field, then the triggered workflow
|
||||
# must include this very input, otherwise the workflow won't be called.
|
||||
# See start_full_benchmarks.yml as example.
|
||||
user_inputs:
|
||||
description: "Type of benchmarks to run"
|
||||
type: string
|
||||
default: "weekly_benchmarks"
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
|
||||
jobs:
|
||||
run-core-crypto-benchmarks:
|
||||
name: Execute GPU core crypto benchmarks in EC2
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Get benchmark date
|
||||
run: |
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CUDA_PATH=$CUDA_PATH";
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "HOME=/home/ubuntu";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make bench_pbs_gpu
|
||||
make bench_ks_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
|
||||
COMMIT_HASH="$(git describe --tags --dirty)"
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--backend gpu \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--name-suffix avx512 \
|
||||
--walk-subdirs \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_core_crypto
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on downloaded artifact"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "PBS GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
74
.github/workflows/csprng_randomness_testing.yml
vendored
74
.github/workflows/csprng_randomness_testing.yml
vendored
@@ -1,74 +0,0 @@
|
||||
name: CSPRNG randomness testing Workflow
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
# All the inputs are provided by Slab
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "AWS instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "AWS instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "AWS instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: 'Slab request ID'
|
||||
type: string
|
||||
fork_repo:
|
||||
description: 'Name of forked repo as user/repo'
|
||||
type: string
|
||||
fork_git_sha:
|
||||
description: 'Git SHA to checkout from fork'
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
csprng-randomness-teting:
|
||||
name: CSPRNG randomness testing
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ inputs.runner_name }}
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
with:
|
||||
repository: ${{ inputs.fork_repo }}
|
||||
ref: ${{ inputs.fork_git_sha }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
with:
|
||||
toolchain: stable
|
||||
default: true
|
||||
|
||||
- name: Dieharder randomness test suite
|
||||
run: |
|
||||
make dieharder_csprng
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "concrete-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
96
.github/workflows/csprng_randomness_tests.yml
vendored
Normal file
96
.github/workflows/csprng_randomness_tests.yml
vendored
Normal file
@@ -0,0 +1,96 @@
|
||||
name: CSPRNG randomness testing Workflow
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
|
||||
|
||||
jobs:
|
||||
setup-ec2:
|
||||
name: Setup EC2 instance (csprng-randomness-tests)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
|
||||
aws-region: ${{ steps.start-instance.outputs.aws-region }}
|
||||
steps:
|
||||
- name: Start instance
|
||||
id: start-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
profile: cpu-small
|
||||
|
||||
csprng-randomness-tests:
|
||||
name: CSPRNG randomness tests
|
||||
needs: setup-ec2
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Dieharder randomness test suite
|
||||
run: |
|
||||
make dieharder_csprng
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "concrete-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
teardown-ec2:
|
||||
name: Teardown EC2 instance (csprng-randomness-tests)
|
||||
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
|
||||
needs: [ setup-ec2, csprng-randomness-tests ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
id: stop-instance
|
||||
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
region: ${{ needs.setup-ec2.outputs.aws-region }}
|
||||
label: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "EC2 teardown (csprng-randomness-tests) failed. (${{ env.ACTION_RUN_URL }})"
|
||||
202
.github/workflows/gpu_4090_full_benchmark.yml
vendored
Normal file
202
.github/workflows/gpu_4090_full_benchmark.yml
vendored
Normal file
@@ -0,0 +1,202 @@
|
||||
# Run all benchmarks on an RTX 4090 machine and return parsed results to Slab CI bot.
|
||||
name: TFHE Cuda Backend - 4090 full benchmarks
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [labeled]
|
||||
schedule:
|
||||
# Weekly benchmarks are triggered every Friday at 9 p.m. (UTC).
|
||||
- cron: "0 21 * * 5"
|
||||
|
||||
jobs:
|
||||
cuda-integer-benchmarks:
|
||||
name: Cuda integer benchmarks for all operations flavor (RTX 4090)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_cuda_integer_bench
|
||||
cancel-in-progress: true
|
||||
runs-on: ["self-hosted", "4090-desktop"]
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
command: [integer, integer_multi_bit]
|
||||
op_flavor: [default, unchecked]
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Run integer benchmarks
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware "rtx4090" \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ always() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Integer RTX 4090 full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
cuda-core-crypto-benchmarks:
|
||||
name: Cuda core crypto benchmarks (RTX 4090)
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
|
||||
needs: cuda-integer-benchmarks
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{ github.ref }}_cuda_core_crypto_bench
|
||||
cancel-in-progress: true
|
||||
runs-on: ["self-hosted", "4090-desktop"]
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Run integer benchmarks
|
||||
run: |
|
||||
make bench_pbs_gpu
|
||||
make bench_ks_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware "rtx4090" \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_core_crypto
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ !success() && !cancelled() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Core crypto RTX 4090 full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
remove_github_label:
|
||||
name: Remove 4090 bench label
|
||||
if: ${{ github.event_name == 'pull_request' }}
|
||||
needs: [cuda-integer-benchmarks, cuda-core-crypto-benchmarks]
|
||||
runs-on: ["self-hosted", "4090-desktop"]
|
||||
steps:
|
||||
- uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
|
||||
with:
|
||||
labels: 4090_bench
|
||||
github_token: ${{ secrets.GITHUB_TOKEN }}
|
||||
15
.github/workflows/integer_benchmark.yml
vendored
15
.github/workflows/integer_benchmark.yml
vendored
@@ -25,6 +25,8 @@ env:
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-integer-benchmarks:
|
||||
@@ -44,7 +46,7 @@ jobs:
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -54,14 +56,13 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make AVX512_SUPPORT=ON FAST_BENCH=TRUE bench_integer
|
||||
make FAST_BENCH=TRUE bench_integer
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
@@ -69,7 +70,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -90,13 +91,13 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
31
.github/workflows/integer_full_benchmark.yml
vendored
31
.github/workflows/integer_full_benchmark.yml
vendored
@@ -28,6 +28,8 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
prepare-matrix:
|
||||
@@ -39,20 +41,17 @@ jobs:
|
||||
- name: Weekly benchmarks
|
||||
if: ${{ github.event.inputs.user_inputs == 'weekly_benchmarks' }}
|
||||
run: |
|
||||
echo "OP_FLAVOR=[\"default\", \"default_comp\", \"default_scalar\", \"default_scalar_comp\"]" >> ${GITHUB_ENV}
|
||||
echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Quarterly benchmarks
|
||||
if: ${{ github.event.inputs.user_inputs == 'quarterly_benchmarks' }}
|
||||
run: |
|
||||
echo "OP_FLAVOR=[\"default\", \"default_comp\", \"default_scalar\", \"default_scalar_comp\", \
|
||||
\"smart\", \"smart_comp\", \"smart_scalar\", \"smart_parallelized\", \"smart_parallelized_comp\", \"smart_scalar_parallelized\", \"smart_scalar_parallelized_comp\", \
|
||||
\"unchecked\", \"unchecked_comp\", \"unchecked_scalar\", \"unchecked_scalar_comp\", \
|
||||
\"misc\"]" >> ${GITHUB_ENV}
|
||||
echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set operation flavor output
|
||||
id: set_op_flavor
|
||||
run: |
|
||||
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> ${GITHUB_OUTPUT}
|
||||
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
integer-benchmarks:
|
||||
name: Execute integer benchmarks for all operations flavor
|
||||
@@ -60,6 +59,7 @@ jobs:
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
continue-on-error: true
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
strategy:
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
@@ -74,15 +74,17 @@ jobs:
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# The "Install rust" step requires the root user to have a HOME directory, which is not set.
|
||||
@@ -90,13 +92,12 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -104,7 +105,7 @@ jobs:
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
@@ -120,7 +121,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
161
.github/workflows/integer_gpu_benchmark.yml
vendored
Normal file
161
.github/workflows/integer_gpu_benchmark.yml
vendored
Normal file
@@ -0,0 +1,161 @@
|
||||
# Run integer benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
|
||||
name: Integer GPU benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-integer-benchmarks:
|
||||
name: Execute integer benchmarks in EC2
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Get benchmark date
|
||||
run: |
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up home
|
||||
# The "Install rust" step requires the root user to have a HOME directory, which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CUDA_PATH=$CUDA_PATH";
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_gpu
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
|
||||
COMMIT_HASH="$(git describe --tags --dirty)"
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--backend gpu \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ !success() && !cancelled() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Integer GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
169
.github/workflows/integer_gpu_full_benchmark.yml
vendored
Normal file
169
.github/workflows/integer_gpu_full_benchmark.yml
vendored
Normal file
@@ -0,0 +1,169 @@
|
||||
# Run all integer benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
|
||||
name: Integer GPU full benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
# This input is not used in this workflow but still mandatory since a calling workflow could
|
||||
# use it. If a triggering command includes a user_inputs field, then the triggered workflow
|
||||
# must include this very input, otherwise the workflow won't be called.
|
||||
# See start_full_benchmarks.yml as an example.
|
||||
user_inputs:
|
||||
description: "Type of benchmarks to run"
|
||||
type: string
|
||||
default: "weekly_benchmarks"
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
integer-benchmarks:
|
||||
name: Execute integer benchmarks for all operations flavor
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
if: ${{ !cancelled() }}
|
||||
continue-on-error: true
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
command: [integer, integer_multi_bit]
|
||||
op_flavor: [default, unchecked]
|
||||
# explicit include-based build matrix, of known valid options
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# The "Install rust" step requires the root user to have a HOME directory, which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CUDA_PATH=$CUDA_PATH";
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
slack-notification:
|
||||
name: Slack Notification
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !success() && !cancelled() }}
|
||||
needs: integer-benchmarks
|
||||
steps:
|
||||
- name: Notify
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Integer GPU full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
@@ -25,6 +25,8 @@ env:
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-integer-benchmarks:
|
||||
@@ -44,7 +46,7 @@ jobs:
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -54,14 +56,13 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Run multi-bit benchmarks with AVX512
|
||||
run: |
|
||||
make AVX512_SUPPORT=ON FAST_BENCH=TRUE bench_integer_multi_bit
|
||||
make FAST_BENCH=TRUE bench_integer_multi_bit
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
@@ -69,7 +70,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -90,13 +91,13 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
162
.github/workflows/integer_multi_bit_gpu_benchmark.yml
vendored
Normal file
162
.github/workflows/integer_multi_bit_gpu_benchmark.yml
vendored
Normal file
@@ -0,0 +1,162 @@
|
||||
# Run integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: Integer GPU Multi-bit benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
cuda-integer-benchmarks:
|
||||
name: Execute integer multi-bit benchmarks in EC2
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
if: ${{ !cancelled() }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Get benchmark date
|
||||
run: |
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up home
|
||||
# The "Install rust" step requires the root user to have a HOME directory, which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CUDA_PATH=$CUDA_PATH";
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
|
||||
} >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
{
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run multi-bit benchmarks with AVX512
|
||||
run: |
|
||||
make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
|
||||
COMMIT_HASH="$(git describe --tags --dirty)"
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--backend gpu \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ !success() && !cancelled() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Integer GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
17
.github/workflows/m1_tests.yml
vendored
17
.github/workflows/m1_tests.yml
vendored
@@ -14,8 +14,9 @@ on:
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
CARGO_PROFILE: release_lto_off
|
||||
FAST_TESTS: "TRUE"
|
||||
|
||||
concurrency:
|
||||
@@ -26,15 +27,16 @@ jobs:
|
||||
cargo-builds:
|
||||
if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'm1_test') }}
|
||||
runs-on: ["self-hosted", "m1mac"]
|
||||
# 12 hours, default is 6 hours, hopefully this is more than enough
|
||||
timeout-minutes: 720
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
|
||||
- name: Install latest stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
default: true
|
||||
|
||||
- name: Run pcc checks
|
||||
run: |
|
||||
@@ -111,10 +113,9 @@ jobs:
|
||||
run: |
|
||||
make test_shortint_multi_bit_ci
|
||||
|
||||
# # These multi bit integer tests are too slow on M1 with low core count and low RAM
|
||||
# - name: Run integer multi bit tests
|
||||
# run: |
|
||||
# make test_integer_multi_bit_ci
|
||||
- name: Run integer multi bit tests
|
||||
run: |
|
||||
make test_integer_multi_bit_ci
|
||||
|
||||
remove_label:
|
||||
name: Remove m1_test label
|
||||
|
||||
6
.github/workflows/make_release.yml
vendored
6
.github/workflows/make_release.yml
vendored
@@ -30,7 +30,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -49,7 +49,7 @@ jobs:
|
||||
|
||||
- name: Publish web package
|
||||
if: ${{ inputs.push_web_package }}
|
||||
uses: JS-DevTools/npm-publish@fe72237be0920f7a0cafd6a966c9b929c9466e9b
|
||||
uses: JS-DevTools/npm-publish@4b07b26a2f6e0a51846e1870223e545bae91c552
|
||||
with:
|
||||
token: ${{ secrets.NPM_TOKEN }}
|
||||
package: tfhe/pkg/package.json
|
||||
@@ -65,7 +65,7 @@ jobs:
|
||||
|
||||
- name: Publish Node package
|
||||
if: ${{ inputs.push_node_package }}
|
||||
uses: JS-DevTools/npm-publish@fe72237be0920f7a0cafd6a966c9b929c9466e9b
|
||||
uses: JS-DevTools/npm-publish@4b07b26a2f6e0a51846e1870223e545bae91c552
|
||||
with:
|
||||
token: ${{ secrets.NPM_TOKEN }}
|
||||
package: tfhe/pkg/package.json
|
||||
|
||||
@@ -18,7 +18,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
|
||||
5
.github/workflows/parameters_check.yml
vendored
5
.github/workflows/parameters_check.yml
vendored
@@ -17,13 +17,14 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
|
||||
- name: Checkout lattice-estimator
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: malb/lattice-estimator
|
||||
path: lattice_estimator
|
||||
ref: '53508253629d3b5d31a2ad110e85dc69391ccb95'
|
||||
|
||||
- name: Install Sage
|
||||
run: |
|
||||
|
||||
13
.github/workflows/shortint_benchmark.yml
vendored
13
.github/workflows/shortint_benchmark.yml
vendored
@@ -24,6 +24,8 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-shortint-benchmarks:
|
||||
@@ -43,7 +45,7 @@ jobs:
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -53,14 +55,13 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make AVX512_SUPPORT=ON bench_shortint
|
||||
make bench_shortint
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
@@ -88,13 +89,13 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_shortint
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
21
.github/workflows/shortint_full_benchmark.yml
vendored
21
.github/workflows/shortint_full_benchmark.yml
vendored
@@ -32,6 +32,8 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
shortint-benchmarks:
|
||||
@@ -51,15 +53,17 @@ jobs:
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# The "Install rust" step requires the root user to have a HOME directory, which is not set.
|
||||
@@ -67,13 +71,12 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -81,7 +84,7 @@ jobs:
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_shortint
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_shortint
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
@@ -112,7 +115,7 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
130
.github/workflows/signed_integer_benchmark.yml
vendored
Normal file
130
.github/workflows/signed_integer_benchmark.yml
vendored
Normal file
@@ -0,0 +1,130 @@
|
||||
# Run signed integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: Signed Integer benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-integer-benchmarks:
|
||||
name: Execute signed integer benchmarks in EC2
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Get benchmark date
|
||||
run: |
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up home
|
||||
# The "Install rust" step requires the root user to have a HOME directory, which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make FAST_BENCH=TRUE bench_signed_integer
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
|
||||
COMMIT_HASH="$(git describe --tags --dirty)"
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Signed integer benchmarks failed. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
136
.github/workflows/signed_integer_full_benchmark.yml
vendored
Normal file
136
.github/workflows/signed_integer_full_benchmark.yml
vendored
Normal file
@@ -0,0 +1,136 @@
|
||||
# Run all signed integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: Signed Integer full benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
user_inputs:
|
||||
description: "Type of benchmarks to run"
|
||||
type: string
|
||||
default: "weekly_benchmarks"
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
integer-benchmarks:
|
||||
name: Execute signed integer benchmarks for all operations flavor
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
continue-on-error: true
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
strategy:
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
command: [ integer, integer_multi_bit ]
|
||||
op_flavor: [ default, unchecked ]
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# The "Install rust" step requires the root user to have a HOME directory, which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_signed_${{ matrix.command }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
slack-notification:
|
||||
name: Slack Notification
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ failure() }}
|
||||
needs: integer-benchmarks
|
||||
steps:
|
||||
- name: Notify
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Signed integer full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
130
.github/workflows/signed_integer_multi_bit_benchmark.yml
vendored
Normal file
130
.github/workflows/signed_integer_multi_bit_benchmark.yml
vendored
Normal file
@@ -0,0 +1,130 @@
|
||||
# Run signed integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: Signed Integer Multi-bit benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-integer-benchmarks:
|
||||
name: Execute signed integer multi-bit benchmarks in EC2
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Get benchmark date
|
||||
run: |
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up home
|
||||
# The "Install rust" step requires the root user to have a HOME directory, which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Run multi-bit benchmarks with AVX512
|
||||
run: |
|
||||
make FAST_BENCH=TRUE bench_signed_integer_multi_bit
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
|
||||
COMMIT_HASH="$(git describe --tags --dirty)"
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Signed integer benchmarks failed. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
48
.github/workflows/start_benchmarks.yml
vendored
48
.github/workflows/start_benchmarks.yml
vendored
@@ -20,12 +20,24 @@ on:
|
||||
description: "Run integer benches"
|
||||
type: boolean
|
||||
default: true
|
||||
signed_integer_bench:
|
||||
description: "Run signed integer benches"
|
||||
type: boolean
|
||||
default: true
|
||||
integer_multi_bit_bench:
|
||||
description: "Run integer multi bit benches"
|
||||
type: boolean
|
||||
default: true
|
||||
pbs_bench:
|
||||
description: "Run PBS benches"
|
||||
signed_integer_multi_bit_bench:
|
||||
description: "Run signed integer multi bit benches"
|
||||
type: boolean
|
||||
default: true
|
||||
core_crypto_bench:
|
||||
description: "Run core crypto benches"
|
||||
type: boolean
|
||||
default: true
|
||||
core_crypto_gpu_bench:
|
||||
description: "Run core crypto benches on GPU"
|
||||
type: boolean
|
||||
default: true
|
||||
wasm_client_bench:
|
||||
@@ -38,17 +50,21 @@ jobs:
|
||||
if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
|
||||
strategy:
|
||||
matrix:
|
||||
command: [boolean_bench, shortint_bench, integer_bench, integer_multi_bit_bench, pbs_bench, wasm_client_bench]
|
||||
command: [ boolean_bench, shortint_bench,
|
||||
integer_bench, integer_multi_bit_bench,
|
||||
signed_integer_bench, signed_integer_multi_bit_bench,
|
||||
integer_gpu_bench, integer_multi_bit_gpu_bench,
|
||||
core_crypto_bench, core_crypto_gpu_bench, wasm_client_bench ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@408093d9ff9c134c33b974e0722ce06b9d6e8263
|
||||
uses: tj-actions/changed-files@aa08304bd477b800d468db44fe10f6c61f7f7b11
|
||||
with:
|
||||
files_yaml: |
|
||||
common_benches:
|
||||
@@ -69,23 +85,33 @@ jobs:
|
||||
integer_bench:
|
||||
- tfhe/src/shortint/**
|
||||
- tfhe/src/integer/**
|
||||
- tfhe/benches/integer/**
|
||||
- tfhe/benches/integer/bench.rs
|
||||
- .github/workflows/integer_benchmark.yml
|
||||
integer_multi_bit_bench:
|
||||
- tfhe/src/shortint/**
|
||||
- tfhe/src/integer/**
|
||||
- tfhe/benches/integer/**
|
||||
- .github/workflows/integer_benchmark.yml
|
||||
pbs_bench:
|
||||
- tfhe/benches/integer/bench.rs
|
||||
- .github/workflows/integer_multi_bit_benchmark.yml
|
||||
signed_integer_bench:
|
||||
- tfhe/src/shortint/**
|
||||
- tfhe/src/integer/**
|
||||
- tfhe/benches/integer/signed_bench.rs
|
||||
- .github/workflows/signed_integer_benchmark.yml
|
||||
signed_integer_multi_bit_bench:
|
||||
- tfhe/src/shortint/**
|
||||
- tfhe/src/integer/**
|
||||
- tfhe/benches/integer/signed_bench.rs
|
||||
- .github/workflows/signed_integer_multi_bit_benchmark.yml
|
||||
core_crypto_bench:
|
||||
- tfhe/src/core_crypto/**
|
||||
- tfhe/benches/core_crypto/**
|
||||
- .github/workflows/pbs_benchmark.yml
|
||||
- .github/workflows/core_crypto_benchmark.yml
|
||||
wasm_client_bench:
|
||||
- tfhe/web_wasm_parallel_tests/**
|
||||
- .github/workflows/wasm_client_benchmark.yml
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
8
.github/workflows/start_full_benchmarks.yml
vendored
8
.github/workflows/start_full_benchmarks.yml
vendored
@@ -24,16 +24,18 @@ jobs:
|
||||
if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
|
||||
strategy:
|
||||
matrix:
|
||||
command: [ boolean_bench, shortint_full_bench, integer_full_bench, pbs_bench, wasm_client_bench ]
|
||||
command: [ boolean_bench, shortint_full_bench,
|
||||
integer_full_bench, signed_integer_full_bench, integer_gpu_full_bench,
|
||||
core_crypto_bench, core_crypto_gpu_bench, wasm_client_bench ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
4
.github/workflows/sync_on_push.yml
vendored
4
.github/workflows/sync_on_push.yml
vendored
@@ -13,11 +13,11 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repo
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Save repo
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: repo-archive
|
||||
path: '.'
|
||||
|
||||
54
.github/workflows/trigger_aws_tests_on_pr.yml
vendored
54
.github/workflows/trigger_aws_tests_on_pr.yml
vendored
@@ -1,54 +0,0 @@
|
||||
# Trigger an AWS build each time commits are pushed to a pull request.
|
||||
name: PR AWS build trigger
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
pull_request_review:
|
||||
types: [submitted]
|
||||
|
||||
jobs:
|
||||
trigger-tests:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: write
|
||||
steps:
|
||||
- name: Get current labels
|
||||
uses: snnaplab/get-labels-action@f426df40304808ace3b5282d4f036515f7609576
|
||||
|
||||
- name: Remove approved label
|
||||
if: ${{ github.event_name == 'pull_request' && contains(fromJSON(env.LABELS), 'approved') }}
|
||||
uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
|
||||
with:
|
||||
github_token: ${{ secrets.GITHUB_TOKEN }}
|
||||
labels: approved
|
||||
|
||||
- name: Launch fast tests
|
||||
if: ${{ github.event_name == 'pull_request' }}
|
||||
uses: mshick/add-pr-comment@a65df5f64fc741e91c59b8359a4bc56e57aaf5b1
|
||||
with:
|
||||
allow-repeats: true
|
||||
message: |
|
||||
@slab-ci cpu_fast_test
|
||||
|
||||
- name: Add approved label
|
||||
uses: actions-ecosystem/action-add-labels@18f1af5e3544586314bbe15c0273249c770b2daf
|
||||
if: ${{ github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && !contains(fromJSON(env.LABELS), 'approved') }}
|
||||
with:
|
||||
github_token: ${{ secrets.GITHUB_TOKEN }}
|
||||
labels: approved
|
||||
|
||||
# PR label 'approved' presence is checked to avoid running the full test suite several times
|
||||
# in case of multiple approvals without new commits in between.
|
||||
- name: Launch full tests suite
|
||||
if: ${{ github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && !contains(fromJSON(env.LABELS), 'approved') }}
|
||||
uses: mshick/add-pr-comment@a65df5f64fc741e91c59b8359a4bc56e57aaf5b1
|
||||
with:
|
||||
allow-repeats: true
|
||||
message: |
|
||||
Pull Request has been approved :tada:
|
||||
Launching full test suite...
|
||||
@slab-ci cpu_test
|
||||
@slab-ci cpu_integer_test
|
||||
@slab-ci cpu_multi_bit_test
|
||||
@slab-ci cpu_wasm_test
|
||||
@slab-ci csprng_randomness_testing
|
||||
11
.github/workflows/wasm_client_benchmark.yml
vendored
11
.github/workflows/wasm_client_benchmark.yml
vendored
@@ -32,6 +32,8 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
|
||||
jobs:
|
||||
run-wasm-client-benchmarks:
|
||||
@@ -51,7 +53,7 @@ jobs:
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -61,10 +63,9 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Run benchmarks
|
||||
run: |
|
||||
@@ -97,13 +98,13 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
|
||||
with:
|
||||
name: ${{ github.sha }}_wasm
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
9
.gitignore
vendored
9
.gitignore
vendored
@@ -3,9 +3,9 @@ target/
|
||||
.vscode/
|
||||
|
||||
# Path we use for internal-keycache during tests
|
||||
./keys/
|
||||
/keys/
|
||||
# In case of symlinked keys
|
||||
./keys
|
||||
/keys
|
||||
|
||||
**/Cargo.lock
|
||||
**/*.bin
|
||||
@@ -18,4 +18,7 @@ target/
|
||||
dieharder_run.log
|
||||
|
||||
# Coverage reports
|
||||
./coverage/
|
||||
/coverage/
|
||||
|
||||
# Cuda local build
|
||||
backends/tfhe-cuda-backend/cuda/cmake-build-debug/
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[workspace]
|
||||
resolver = "2"
|
||||
members = ["tfhe", "tasks", "apps/trivium", "concrete-csprng"]
|
||||
members = ["tfhe", "tasks", "apps/trivium", "concrete-csprng", "backends/tfhe-cuda-backend"]
|
||||
|
||||
[profile.bench]
|
||||
lto = "fat"
|
||||
|
||||
2
LICENSE
2
LICENSE
@@ -1,6 +1,6 @@
|
||||
BSD 3-Clause Clear License
|
||||
|
||||
Copyright © 2023 ZAMA.
|
||||
Copyright © 2024 ZAMA.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
|
||||
399
Makefile
399
Makefile
@@ -6,7 +6,7 @@ TARGET_ARCH_FEATURE:=$(shell ./scripts/get_arch_feature.sh)
|
||||
RS_BUILD_TOOLCHAIN:=stable
|
||||
CARGO_RS_BUILD_TOOLCHAIN:=+$(RS_BUILD_TOOLCHAIN)
|
||||
CARGO_PROFILE?=release
|
||||
MIN_RUST_VERSION:=$(shell grep rust-version tfhe/Cargo.toml | cut -d '=' -f 2 | xargs)
|
||||
MIN_RUST_VERSION:=$(shell grep '^rust-version[[:space:]]*=' tfhe/Cargo.toml | cut -d '=' -f 2 | xargs)
|
||||
AVX512_SUPPORT?=OFF
|
||||
WASM_RUSTFLAGS:=
|
||||
BIG_TESTS_INSTANCE?=FALSE
|
||||
@@ -16,6 +16,18 @@ PARSE_INTEGER_BENCH_CSV_FILE?=tfhe_rs_integer_benches.csv
|
||||
FAST_TESTS?=FALSE
|
||||
FAST_BENCH?=FALSE
|
||||
BENCH_OP_FLAVOR?=DEFAULT
|
||||
NODE_VERSION=20
|
||||
FORWARD_COMPAT?=OFF
|
||||
# sed: -n, do not print input stream, -e means a script/expression
|
||||
# 1,/version/ indicates from the first line, to the line matching version at the start of the line
|
||||
# p indicates to print, so we keep only the start of the Cargo.toml until we hit the first version
|
||||
# entry which should be the version of tfhe
|
||||
TFHE_CURRENT_VERSION:=\
|
||||
$(shell sed -n -e '1,/^version/p' tfhe/Cargo.toml | \
|
||||
grep '^version[[:space:]]*=' | cut -d '=' -f 2 | xargs)
|
||||
# Cargo has a hard time distinguishing between our package from the workspace and a package that
|
||||
# could be a dependency, so we build an unambiguous spec here
|
||||
TFHE_SPEC:=tfhe@$(TFHE_CURRENT_VERSION)
|
||||
# This is done to avoid forgetting it, we still precise the RUSTFLAGS in the commands to be able to
|
||||
# copy paste the command in the terminal and change them if required without forgetting the flags
|
||||
export RUSTFLAGS?=-C target-cpu=native
|
||||
@@ -38,10 +50,20 @@ else
|
||||
COVERAGE_ONLY=
|
||||
endif
|
||||
|
||||
ifeq ($(FORWARD_COMPAT),ON)
|
||||
FORWARD_COMPAT_FEATURE=forward_compatibility
|
||||
else
|
||||
FORWARD_COMPAT_FEATURE=
|
||||
endif
|
||||
|
||||
# Variables used only for regex_engine example
|
||||
REGEX_STRING?=''
|
||||
REGEX_PATTERN?=''
|
||||
|
||||
# tfhe-cuda-backend
|
||||
TFHECUDA_SRC=backends/tfhe-cuda-backend/cuda
|
||||
TFHECUDA_BUILD=$(TFHECUDA_SRC)/build
|
||||
|
||||
# Exclude these files from coverage reports
|
||||
define COVERAGE_EXCLUDED_FILES
|
||||
--exclude-files apps/trivium/src/trivium/* \
|
||||
@@ -99,7 +121,7 @@ install_wasm_pack: install_rs_build_toolchain
|
||||
install_node:
|
||||
curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.3/install.sh | $(SHELL)
|
||||
source ~/.bashrc
|
||||
$(SHELL) -i -c 'nvm install node' || \
|
||||
$(SHELL) -i -c 'nvm install $(NODE_VERSION)' || \
|
||||
( echo "Unable to install node, unknown error." && exit 1 )
|
||||
|
||||
.PHONY: install_dieharder # Install dieharder for apt distributions or macOS
|
||||
@@ -120,16 +142,38 @@ install_tarpaulin: install_rs_build_toolchain
|
||||
.PHONY: check_linelint_installed # Check if linelint newline linter is installed
|
||||
check_linelint_installed:
|
||||
@printf "\n" | linelint - > /dev/null 2>&1 || \
|
||||
( echo "Unable to locate linelint. Try installing it: https://github.com/fernandrone/linelint/releases" && exit 1 )
|
||||
( echo "Unable to locate linelint. Try installing it: https://github.com/fernandrone/linelint/releases" && exit 1 )
|
||||
|
||||
.PHONY: check_actionlint_installed # Check if actionlint workflow linter is installed
|
||||
check_actionlint_installed:
|
||||
@actionlint --version > /dev/null 2>&1 || \
|
||||
( echo "Unable to locate actionlint. Try installing it: https://github.com/rhysd/actionlint/releases" && exit 1 )
|
||||
|
||||
.PHONY: fmt # Format rust code
|
||||
fmt: install_rs_check_toolchain
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
|
||||
|
||||
.PHONY: fmt_gpu # Format rust and cuda code
|
||||
fmt_gpu: install_rs_check_toolchain
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
|
||||
cd "$(TFHECUDA_SRC)" && ./format_tfhe_cuda_backend.sh
|
||||
|
||||
.PHONY: check_fmt # Check rust code format
|
||||
check_fmt: install_rs_check_toolchain
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
|
||||
|
||||
.PHONY: check_fmt_gpu # Check rust and cuda code format
|
||||
check_fmt_gpu: install_rs_check_toolchain
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
|
||||
cd "$(TFHECUDA_SRC)" && ./format_tfhe_cuda_backend.sh -c
|
||||
|
||||
.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
|
||||
clippy_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu \
|
||||
--all-targets \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
|
||||
fix_newline: check_linelint_installed
|
||||
linelint -a .
|
||||
@@ -138,50 +182,60 @@ fix_newline: check_linelint_installed
|
||||
check_newline: check_linelint_installed
|
||||
linelint .
|
||||
|
||||
.PHONY: lint_workflow # Run static linter on GitHub workflows
|
||||
lint_workflow: check_actionlint_installed
|
||||
actionlint
|
||||
|
||||
.PHONY: clippy_core # Run clippy lints on core_crypto with and without experimental features
|
||||
clippy_core: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE) \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),nightly-avx512 \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,nightly-avx512 \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_boolean # Run clippy lints enabling the boolean features
|
||||
clippy_boolean: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_shortint # Run clippy lints enabling the shortint features
|
||||
clippy_shortint: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_integer # Run clippy lints enabling the integer features
|
||||
clippy_integer: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy # Run clippy lints enabling the boolean, shortint, integer
|
||||
clippy: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
|
||||
clippy_c_api: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
|
||||
clippy_js_wasm_api: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_tasks # Run clippy lints on helper tasks crate.
|
||||
clippy_tasks:
|
||||
@@ -190,15 +244,14 @@ clippy_tasks:
|
||||
|
||||
.PHONY: clippy_trivium # Run clippy lints on Trivium app
|
||||
clippy_trivium: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy -p tfhe-trivium \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
-p tfhe-trivium -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
|
||||
clippy_all_targets:
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,safe-deserialization \
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
|
||||
clippy_concrete_csprng:
|
||||
@@ -214,62 +267,80 @@ clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_triviu
|
||||
clippy_fast: clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core \
|
||||
clippy_concrete_csprng
|
||||
|
||||
.PHONY: gen_key_cache # Run the script to generate keys and cache them for shortint tests
|
||||
gen_key_cache: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
|
||||
--example generates_test_keys \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache -p tfhe -- \
|
||||
$(MULTI_BIT_ONLY) $(COVERAGE_ONLY)
|
||||
.PHONY: clippy_cuda_backend # Run clippy lints on the tfhe-cuda-backend
|
||||
clippy_cuda_backend: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
-p tfhe-cuda-backend -- --no-deps -D warnings
|
||||
|
||||
.PHONY: build_core # Build core_crypto without experimental features
|
||||
build_core: install_rs_build_toolchain install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE) -p tfhe
|
||||
--features=$(TARGET_ARCH_FEATURE) -p $(TFHE_SPEC)
|
||||
@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),$(AVX512_FEATURE) -p tfhe; \
|
||||
--features=$(TARGET_ARCH_FEATURE),$(AVX512_FEATURE) -p $(TFHE_SPEC); \
|
||||
fi
|
||||
|
||||
.PHONY: build_core_experimental # Build core_crypto with experimental features
|
||||
build_core_experimental: install_rs_build_toolchain install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental -p tfhe
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental -p $(TFHE_SPEC)
|
||||
@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,$(AVX512_FEATURE) -p tfhe; \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,$(AVX512_FEATURE) -p $(TFHE_SPEC); \
|
||||
fi
|
||||
|
||||
.PHONY: build_boolean # Build with boolean enabled
|
||||
build_boolean: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean -p tfhe --all-targets
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean -p $(TFHE_SPEC) --all-targets
|
||||
|
||||
.PHONY: build_shortint # Build with shortint enabled
|
||||
build_shortint: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint -p tfhe --all-targets
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint -p $(TFHE_SPEC) --all-targets
|
||||
|
||||
.PHONY: build_integer # Build with integer enabled
|
||||
build_integer: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer -p tfhe --all-targets
|
||||
--features=$(TARGET_ARCH_FEATURE),integer -p $(TFHE_SPEC) --all-targets
|
||||
|
||||
.PHONY: build_tfhe_full # Build with boolean, shortint and integer enabled
|
||||
build_tfhe_full: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p tfhe --all-targets
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p $(TFHE_SPEC) --all-targets
|
||||
|
||||
.PHONY: build_tfhe_coverage # Build with test coverage enabled
|
||||
build_tfhe_coverage: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) --tests
|
||||
|
||||
.PHONY: symlink_c_libs_without_fingerprint # Link the .a and .so files without the changing hash part in target
|
||||
symlink_c_libs_without_fingerprint:
|
||||
@./scripts/symlink_c_libs_without_fingerprint.sh \
|
||||
--cargo-profile "$(CARGO_PROFILE)" \
|
||||
--lib-name tfhe-c-api-dynamic-buffer
|
||||
|
||||
.PHONY: build_c_api # Build the C API for boolean, shortint and integer
|
||||
build_c_api: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,safe-deserialization \
|
||||
-p tfhe
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,$(FORWARD_COMPAT_FEATURE) \
|
||||
-p $(TFHE_SPEC)
|
||||
@"$(MAKE)" symlink_c_libs_without_fingerprint
|
||||
|
||||
.PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
|
||||
build_c_api_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,gpu \
|
||||
-p $(TFHE_SPEC)
|
||||
@"$(MAKE)" symlink_c_libs_without_fingerprint
|
||||
|
||||
.PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
|
||||
build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,safe-deserialization,experimental-force_fft_algo_dif4 \
|
||||
-p tfhe
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
|
||||
-p $(TFHE_SPEC)
|
||||
@"$(MAKE)" symlink_c_libs_without_fingerprint
|
||||
|
||||
.PHONY: build_web_js_api # Build the js API targeting the web browser
|
||||
build_web_js_api: install_rs_build_toolchain install_wasm_pack
|
||||
@@ -302,30 +373,70 @@ build_concrete_csprng: install_rs_build_toolchain
|
||||
.PHONY: test_core_crypto # Run the tests of the core_crypto module including experimental ones
|
||||
test_core_crypto: install_rs_build_toolchain install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental -p tfhe -- core_crypto::
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental -p $(TFHE_SPEC) -- core_crypto::
|
||||
@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,$(AVX512_FEATURE) -p tfhe -- core_crypto::; \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,$(AVX512_FEATURE) -p $(TFHE_SPEC) -- core_crypto::; \
|
||||
fi
|
||||
|
||||
.PHONY: test_core_crypto_cov # Run the tests of the core_crypto module with code coverage
|
||||
test_core_crypto_cov: install_rs_build_toolchain install_rs_check_toolchain install_tarpaulin
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
|
||||
--out xml --output-dir coverage/core_crypto --line --engine llvm --timeout 500 \
|
||||
--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache \
|
||||
-p $(TFHE_SPEC) -- core_crypto::
|
||||
@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
|
||||
--out xml --output-dir coverage/core_crypto_avx512 --line --engine llvm --timeout 500 \
|
||||
--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,$(AVX512_FEATURE) \
|
||||
-p $(TFHE_SPEC) -- -Z unstable-options --report-time core_crypto::; \
|
||||
fi
|
||||
|
||||
.PHONY: test_cuda_backend # Run the internal tests of the CUDA backend
|
||||
test_cuda_backend:
|
||||
mkdir -p "$(TFHECUDA_BUILD)" && \
|
||||
cd "$(TFHECUDA_BUILD)" && \
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Release -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON && \
|
||||
make -j && \
|
||||
make test
|
||||
|
||||
.PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
|
||||
test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
|
||||
|
||||
.PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
|
||||
test_core_crypto_gpu: install_rs_build_toolchain install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
|
||||
|
||||
.PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
|
||||
test_integer_gpu: install_rs_build_toolchain install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
|
||||
|
||||
.PHONY: test_boolean # Run the tests of the boolean module
|
||||
test_boolean: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean -p tfhe -- boolean::
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean -p $(TFHE_SPEC) -- boolean::
|
||||
|
||||
.PHONY: test_boolean_cov # Run the tests of the boolean module with code coverage
|
||||
test_boolean_cov: install_rs_check_toolchain install_tarpaulin
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
|
||||
--out xml --output-dir coverage/boolean --line --engine llvm --timeout 500 \
|
||||
$(COVERAGE_EXCLUDED_FILES) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,__coverage \
|
||||
-p tfhe -- boolean::
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache \
|
||||
-p $(TFHE_SPEC) -- -Z unstable-options --report-time boolean::
|
||||
|
||||
.PHONY: test_c_api_rs # Run the rust tests for the C API
|
||||
test_c_api_rs: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,safe-deserialization \
|
||||
-p tfhe \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api \
|
||||
-p $(TFHE_SPEC) \
|
||||
c_api
|
||||
|
||||
.PHONY: test_c_api_c # Run the C tests for the C API
|
||||
@@ -335,69 +446,133 @@ test_c_api_c: build_c_api
|
||||
.PHONY: test_c_api # Run all the tests for the C API
|
||||
test_c_api: test_c_api_rs test_c_api_c
|
||||
|
||||
.PHONY: test_c_api_gpu # Run the C tests for the C API with the GPU backend
|
||||
test_c_api_gpu: build_c_api_gpu
|
||||
./scripts/c_api_tests.sh --gpu
|
||||
|
||||
.PHONY: test_shortint_ci # Run the tests for shortint ci
|
||||
test_shortint_ci: install_rs_build_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
./scripts/shortint-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)"
|
||||
--cargo-profile "$(CARGO_PROFILE)" --tfhe-package "$(TFHE_SPEC)"
|
||||
|
||||
.PHONY: test_shortint_multi_bit_ci # Run the tests for shortint ci running only multibit tests
|
||||
test_shortint_multi_bit_ci: install_rs_build_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
./scripts/shortint-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --tfhe-package "$(TFHE_SPEC)"
|
||||
|
||||
.PHONY: test_shortint # Run all the tests for shortint
|
||||
test_shortint: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache -p tfhe -- shortint::
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache -p $(TFHE_SPEC) -- shortint::
|
||||
|
||||
.PHONY: test_shortint_cov # Run the tests of the shortint module with code coverage
|
||||
test_shortint_cov: install_rs_check_toolchain install_tarpaulin
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
|
||||
--out xml --output-dir coverage/shortint --line --engine llvm --timeout 500 \
|
||||
$(COVERAGE_EXCLUDED_FILES) \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,__coverage \
|
||||
-p tfhe -- shortint::
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache \
|
||||
-p $(TFHE_SPEC) -- -Z unstable-options --report-time shortint::
|
||||
|
||||
.PHONY: test_integer_ci # Run the tests for integer ci
|
||||
test_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)"
|
||||
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--tfhe-package "$(TFHE_SPEC)"
|
||||
|
||||
.PHONY: test_unsigned_integer_ci # Run the tests for unsigned integer ci
|
||||
test_unsigned_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--unsigned-only --tfhe-package "$(TFHE_SPEC)"
|
||||
|
||||
.PHONY: test_signed_integer_ci # Run the tests for signed integer ci
|
||||
test_signed_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--signed-only --tfhe-package "$(TFHE_SPEC)"
|
||||
|
||||
.PHONY: test_integer_multi_bit_ci # Run the tests for integer ci running only multibit tests
|
||||
test_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)"
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--tfhe-package "$(TFHE_SPEC)"
|
||||
|
||||
.PHONY: test_unsigned_integer_multi_bit_ci # Run the tests for unsigned integer ci running only multibit tests
|
||||
test_unsigned_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--unsigned-only --tfhe-package "$(TFHE_SPEC)"
|
||||
|
||||
.PHONY: test_signed_integer_multi_bit_ci # Run the tests for signed integer ci running only multibit tests
|
||||
test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--signed-only --tfhe-package "$(TFHE_SPEC)"
|
||||
|
||||
.PHONY: test_safe_deserialization # Run the tests for safe deserialization
|
||||
test_safe_deserialization: install_rs_build_toolchain install_cargo_nextest
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,safe-deserialization -p tfhe -- safe_deserialization::
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) -- safe_deserialization::
|
||||
|
||||
.PHONY: test_integer # Run all the tests for integer
|
||||
test_integer: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache -p tfhe -- integer::
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache -p $(TFHE_SPEC) -- integer::
|
||||
|
||||
.PHONY: test_integer_cov # Run the tests of the integer module with code coverage
|
||||
test_integer_cov: install_rs_check_toolchain install_tarpaulin
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
|
||||
--out xml --output-dir coverage/integer --line --engine llvm --timeout 500 \
|
||||
--implicit-test-threads \
|
||||
--exclude-files $(COVERAGE_EXCLUDED_FILES) \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache \
|
||||
-p $(TFHE_SPEC) -- -Z unstable-options --report-time integer::
|
||||
|
||||
.PHONY: test_high_level_api # Run all the tests for high_level_api
|
||||
test_high_level_api: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p tfhe \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) \
|
||||
-- high_level_api::
|
||||
|
||||
test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) \
|
||||
-E "test(/high_level_api::.*gpu.*/)"
|
||||
|
||||
.PHONY: test_user_doc # Run tests from the .md documentation
|
||||
test_user_doc: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p tfhe \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) \
|
||||
-- test_user_docs::
|
||||
|
||||
.PHONY: test_user_doc_gpu # Run tests for GPU from the .md documentation
|
||||
test_user_doc_gpu: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu -p $(TFHE_SPEC) \
|
||||
-- test_user_docs::
|
||||
|
||||
.PHONY: test_fhe_strings # Run tests for fhe_strings example
|
||||
test_fhe_strings: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--example fhe_strings \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer
|
||||
|
||||
.PHONY: test_regex_engine # Run tests for regex_engine example
|
||||
test_regex_engine: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
@@ -432,7 +607,7 @@ test_concrete_csprng:
|
||||
doc: install_rs_check_toolchain
|
||||
RUSTDOCFLAGS="--html-in-header katex-header.html" \
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer --no-deps
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer --no-deps -p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: docs # Build rust doc alias for doc
|
||||
docs: doc
|
||||
@@ -441,7 +616,7 @@ docs: doc
|
||||
lint_doc: install_rs_check_toolchain
|
||||
RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer --no-deps
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p $(TFHE_SPEC) --no-deps
|
||||
|
||||
.PHONY: lint_docs # Build rust doc with linting enabled alias for lint_doc
|
||||
lint_docs: lint_doc
|
||||
@@ -458,18 +633,28 @@ format_doc_latex:
|
||||
.PHONY: check_compile_tests # Build tests in debug without running them
|
||||
check_compile_tests:
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache,safe-deserialization \
|
||||
-p tfhe
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache \
|
||||
-p $(TFHE_SPEC)
|
||||
|
||||
@if [[ "$(OS)" == "Linux" || "$(OS)" == "Darwin" ]]; then \
|
||||
"$(MAKE)" build_c_api; \
|
||||
"$(MAKE)" build_c_api && \
|
||||
./scripts/c_api_tests.sh --build-only; \
|
||||
fi
|
||||
|
||||
.PHONY: check_compile_tests_benches_gpu # Build GPU tests and benchmarks in debug without running them
|
||||
check_compile_tests_benches_gpu: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache,gpu \
|
||||
-p $(TFHE_SPEC)
|
||||
mkdir -p "$(TFHECUDA_BUILD)" && \
|
||||
cd "$(TFHECUDA_BUILD)" && \
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Debug -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON -DTFHE_CUDA_BACKEND_BUILD_BENCHMARKS=ON && \
|
||||
make -j
|
||||
|
||||
.PHONY: build_nodejs_test_docker # Build a docker image with tools to run nodejs tests for wasm API
|
||||
build_nodejs_test_docker:
|
||||
DOCKER_BUILDKIT=1 docker build --build-arg RUST_TOOLCHAIN="$(RS_BUILD_TOOLCHAIN)" \
|
||||
-f docker/Dockerfile.wasm_tests -t tfhe-wasm-tests .
|
||||
-f docker/Dockerfile.wasm_tests --build-arg NODE_VERSION=$(NODE_VERSION) -t tfhe-wasm-tests .
|
||||
|
||||
.PHONY: test_nodejs_wasm_api_in_docker # Run tests for the nodejs on wasm API in a docker container
|
||||
test_nodejs_wasm_api_in_docker: build_nodejs_test_docker
|
||||
@@ -493,8 +678,8 @@ test_web_js_api_parallel: build_web_js_api_parallel
|
||||
.PHONY: ci_test_web_js_api_parallel # Run tests for the web wasm api
|
||||
ci_test_web_js_api_parallel: build_web_js_api_parallel
|
||||
source ~/.nvm/nvm.sh && \
|
||||
nvm install 20 && \
|
||||
nvm use 20 && \
|
||||
nvm install $(NODE_VERSION) && \
|
||||
nvm use $(NODE_VERSION) && \
|
||||
$(MAKE) -C tfhe/web_wasm_parallel_tests test-ci
|
||||
|
||||
.PHONY: no_tfhe_typo # Check we did not invert the h and f in tfhe
|
||||
@@ -513,27 +698,70 @@ dieharder_csprng: install_dieharder build_concrete_csprng
|
||||
# Benchmarks
|
||||
#
|
||||
|
||||
.PHONY: bench_integer # Run benchmarks for integer
|
||||
.PHONY: bench_integer # Run benchmarks for unsigned integer
|
||||
bench_integer: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p tfhe --
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_multi_bit # Run benchmarks for integer using multi-bit parameters
|
||||
.PHONY: bench_signed_integer # Run benchmarks for signed integer
|
||||
bench_signed_integer: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-signed-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
|
||||
bench_integer_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
|
||||
bench_integer_multi_bit: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p tfhe --
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters
|
||||
bench_signed_integer_multi_bit: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-signed-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters
|
||||
bench_integer_multi_bit_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_shortint # Run benchmarks for shortint
|
||||
bench_shortint: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench shortint-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,$(AVX512_FEATURE) -p tfhe
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: bench_oprf # Run benchmarks for OPRF (shortint and integer)
|
||||
bench_oprf: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench oprf-shortint-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
RUSTFLAGS="$(RUSTFLAGS)" \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench oprf-integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
|
||||
|
||||
|
||||
.PHONY: bench_shortint_multi_bit # Run benchmarks for shortint using multi-bit parameters
|
||||
bench_shortint_multi_bit: install_rs_check_toolchain
|
||||
@@ -541,20 +769,38 @@ bench_shortint_multi_bit: install_rs_check_toolchain
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench shortint-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,$(AVX512_FEATURE) -p tfhe --
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
|
||||
|
||||
.PHONY: bench_boolean # Run benchmarks for boolean
|
||||
bench_boolean: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench boolean-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,$(AVX512_FEATURE) -p tfhe
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: bench_pbs # Run benchmarks for PBS
|
||||
bench_pbs: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench pbs-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,$(AVX512_FEATURE) -p tfhe
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: bench_pbs_gpu # Run benchmarks for PBS on GPU backend
|
||||
bench_pbs_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench pbs-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: bench_ks # Run benchmarks for keyswitch
|
||||
bench_ks: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench ks-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: bench_ks_gpu # Run benchmarks for keyswitch on GPU backend
|
||||
bench_ks_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench ks-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: bench_web_js_api_parallel # Run benchmarks for the web wasm api
|
||||
bench_web_js_api_parallel: build_web_js_api_parallel
|
||||
@@ -569,6 +815,18 @@ ci_bench_web_js_api_parallel: build_web_js_api_parallel
|
||||
#
|
||||
# Utility tools
|
||||
#
|
||||
.PHONY: gen_key_cache # Run the script to generate keys and cache them for shortint tests
|
||||
gen_key_cache: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
|
||||
--example generates_test_keys \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache -- \
|
||||
$(MULTI_BIT_ONLY) $(COVERAGE_ONLY)
|
||||
|
||||
.PHONY: gen_key_cache_core_crypto # Run function to generate keys and cache them for core_crypto tests
|
||||
gen_key_cache_core_crypto: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --tests --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache -p $(TFHE_SPEC) -- --nocapture \
|
||||
core_crypto::keycache::generate_keys
|
||||
|
||||
.PHONY: measure_hlapi_compact_pk_ct_sizes # Measure sizes of public keys and ciphertext for high-level API
|
||||
measure_hlapi_compact_pk_ct_sizes: install_rs_check_toolchain
|
||||
@@ -631,9 +889,12 @@ sha256_bool: install_rs_check_toolchain
|
||||
--example sha256_bool \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean
|
||||
|
||||
.PHONY: pcc # pcc stands for pre commit checks
|
||||
.PHONY: pcc # pcc stands for pre commit checks (except GPU)
|
||||
pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_all check_compile_tests
|
||||
|
||||
.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
|
||||
pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu
|
||||
|
||||
.PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
|
||||
fpcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_fast check_compile_tests
|
||||
|
||||
|
||||
191
README.md
191
README.md
@@ -2,36 +2,66 @@
|
||||
<!-- product name logo -->
|
||||
<img width=600 src="https://user-images.githubusercontent.com/5758427/231206749-8f146b97-3c5a-4201-8388-3ffa88580415.png">
|
||||
</p>
|
||||
<hr/>
|
||||
<p align="center">
|
||||
<a href="https://docs.zama.ai/tfhe-rs"> 📒 Read documentation</a> | <a href="https://zama.ai/community"> 💛 Community support</a>
|
||||
</p>
|
||||
<p align="center">
|
||||
<!-- Version badge using shields.io -->
|
||||
<a href="https://github.com/zama-ai/tfhe-rs/releases">
|
||||
<img src="https://img.shields.io/github/v/release/zama-ai/tfhe-rs?style=flat-square">
|
||||
</a>
|
||||
<!-- Zama Bounty Program -->
|
||||
<a href="https://github.com/zama-ai/bounty-program">
|
||||
<img src="https://img.shields.io/badge/Contribute-Zama%20Bounty%20Program-yellow?style=flat-square">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
<hr/>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://docs.zama.ai/tfhe-rs"> 📒 Documentation</a> | <a href="https://zama.ai/community"> 💛 Community support</a> | <a href="https://github.com/zama-ai/awesome-zama"> 📚 FHE resources by Zama</a>
|
||||
</p>
|
||||
|
||||
**TFHE-rs** is a pure Rust implementation of TFHE for boolean and integer
|
||||
arithmetics over encrypted data. It includes:
|
||||
- a **Rust** API
|
||||
- a **C** API
|
||||
- and a **client-side WASM** API
|
||||
|
||||
**TFHE-rs** is meant for developers and researchers who want full control over
|
||||
what they can do with TFHE, while not having to worry about the low level
|
||||
<p align="center">
|
||||
<a href="https://github.com/zama-ai/tfhe-rs/releases"><img src="https://img.shields.io/github/v/release/zama-ai/tfhe-rs?style=flat-square"></a>
|
||||
<a href="LICENSE"><img src="https://img.shields.io/badge/License-BSD--3--Clause--Clear-%23ffb243?style=flat-square"></a>
|
||||
<a href="https://github.com/zama-ai/bounty-program"><img src="https://img.shields.io/badge/Contribute-Zama%20Bounty%20Program-%23ffd208?style=flat-square"></a>
|
||||
</p>
|
||||
|
||||
## About
|
||||
|
||||
### What is TFHE-rs
|
||||
|
||||
**TFHE-rs** is a pure Rust implementation of TFHE for boolean and integer arithmetic over encrypted data.
|
||||
|
||||
It includes:
|
||||
- a **Rust** API
|
||||
- a **C** API
|
||||
- and a **client-side WASM** API
|
||||
|
||||
TFHE-rs is designed for developers and researchers who want full control over
|
||||
what they can do with TFHE, while not having to worry about the low-level
|
||||
implementation. The goal is to have a stable, simple, high-performance, and
|
||||
production-ready library for all the advanced features of TFHE.
|
||||
<br></br>
|
||||
|
||||
### Main features
|
||||
|
||||
- **Low-level cryptographic library** that implements Zama’s variant of TFHE, including programmable bootstrapping
|
||||
- **Implementation of the original TFHE boolean API** that can be used as a drop-in replacement for other TFHE libraries
|
||||
- **Short integer API** that enables exact, unbounded FHE integer arithmetic with up to 8 bits of message space
|
||||
- **Size-efficient public key encryption**
|
||||
- **Ciphertext and server key compression** for efficient data transfer
|
||||
- **Full Rust API, C bindings to the Rust High-Level API, and client-side JavaScript API using WASM**.
|
||||
|
||||
*Learn more about TFHE-rs features in the [documentation](https://docs.zama.ai/tfhe-rs/readme).*
|
||||
<br></br>
|
||||
|
||||
## Table of Contents
|
||||
- **[Getting Started](#getting-started)**
|
||||
- [Cargo.toml configuration](#cargotoml-configuration)
|
||||
- [A simple example](#a-simple-example)
|
||||
- **[Resources](#resources)**
|
||||
- [TFHE deep dive](#tfhe-deep-dive)
|
||||
- [Tutorials](#tutorials)
|
||||
- [Documentation](#documentation)
|
||||
- **[Working with TFHE-rs](#working-with-tfhe-rs)**
|
||||
- [Disclaimers](#disclaimers)
|
||||
- [Citations](#citations)
|
||||
- [Contributing](#contributing)
|
||||
- [License](#license)
|
||||
- **[Support](#support)**
|
||||
<br></br>
|
||||
|
||||
## Getting Started
|
||||
The steps to run a first example are described below.
|
||||
|
||||
### Cargo.toml configuration
|
||||
To use the latest version of `TFHE-rs` in your project, you first need to add it as a dependency in your `Cargo.toml`:
|
||||
@@ -47,20 +77,24 @@ tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64-un
|
||||
```toml
|
||||
tfhe = { version = "*", features = ["boolean", "shortint", "integer", "aarch64-unix"] }
|
||||
```
|
||||
Note: users with ARM devices must compile `TFHE-rs` using a stable toolchain with version >= 1.72.
|
||||
|
||||
|
||||
+ For x86_64-based machines with the [`rdseed instruction`](https://en.wikipedia.org/wiki/RDRAND)
|
||||
running Windows:
|
||||
+ For x86_64-based machines with the [`rdseed instruction`](https://en.wikipedia.org/wiki/RDRAND) running Windows:
|
||||
|
||||
```toml
|
||||
tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64"] }
|
||||
```
|
||||
|
||||
Note: aarch64-based machines are not yet supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs
|
||||
> [!Note]
|
||||
> You need to use a Rust version >= 1.73 to compile TFHE-rs.
|
||||
|
||||
> [!Note]
|
||||
> aarch64-based machines are not yet supported on Windows, as that platform is currently missing an entropy source to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.
|
||||
|
||||
## A simple example
|
||||
<p align="right">
|
||||
<a href="#about" > ↑ Back to top </a>
|
||||
</p>
|
||||
|
||||
### A simple example
|
||||
|
||||
Here is a full example:
|
||||
|
||||
@@ -70,9 +104,7 @@ use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheUint32, FheUint8};
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
// Basic configuration to use homomorphic integers
|
||||
let config = ConfigBuilder::all_disabled()
|
||||
.enable_default_integers()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
|
||||
// Key generation
|
||||
let (client_key, server_keys) = generate_keys(config);
|
||||
@@ -119,32 +151,64 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
To run this code, use the following command:
|
||||
<p align="center"> <code> cargo run --release </code> </p>
|
||||
|
||||
Note that when running code that uses `tfhe-rs`, it is highly recommended
|
||||
to run in release mode with cargo's `--release` flag to have the best performances possible,
|
||||
> [!Note]
|
||||
> Note that when running code that uses `TFHE-rs`, it is highly recommended
|
||||
> to run in release mode with cargo's `--release` flag to have the best possible performance.
|
||||
|
||||
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/getting-started/quick_start)*
|
||||
|
||||
<p align="right">
|
||||
<a href="#about" > ↑ Back to top </a>
|
||||
</p>
|
||||
|
||||
|
||||
## Contributing
|
||||
|
||||
There are two ways to contribute to TFHE-rs:
|
||||
## Resources
|
||||
|
||||
- you can open issues to report bugs or typos, or to suggest new ideas
|
||||
- you can ask to become an official contributor by emailing [hello@zama.ai](mailto:hello@zama.ai).
|
||||
(becoming an approved contributor involves signing our Contributor License Agreement (CLA))
|
||||
### TFHE deep dive
|
||||
- [TFHE Deep Dive - Part I - Ciphertext types](https://www.zama.ai/post/tfhe-deep-dive-part-1)
|
||||
- [TFHE Deep Dive - Part II - Encodings and linear leveled operations](https://www.zama.ai/post/tfhe-deep-dive-part-2)
|
||||
- [TFHE Deep Dive - Part III - Key switching and leveled multiplications](https://www.zama.ai/post/tfhe-deep-dive-part-3)
|
||||
- [TFHE Deep Dive - Part IV - Programmable Bootstrapping](https://www.zama.ai/post/tfhe-deep-dive-part-4)
|
||||
<br></br>
|
||||
|
||||
Only approved contributors can send pull requests, so please make sure to get in touch before you do!
|
||||
### Tutorials
|
||||
- [Homomorphic Parity Bit](https://docs.zama.ai/tfhe-rs/tutorials/parity_bit)
|
||||
- [Homomorphic Case Changing on Ascii String](https://docs.zama.ai/tfhe-rs/tutorials/ascii_fhe_string)
|
||||
- [Boolean SHA256 with TFHE-rs](https://www.zama.ai/post/boolean-sha256-tfhe-rs)
|
||||
- [Dark Market with TFHE-rs](https://www.zama.ai/post/dark-market-tfhe-rs)
|
||||
- [Regular Expression Engine with TFHE-rs](https://www.zama.ai/post/regex-engine-tfhe-rs)
|
||||
|
||||
## Credits
|
||||
|
||||
This library uses several dependencies and we would like to thank the contributors of those
|
||||
libraries.
|
||||
*Explore more useful resources in [TFHE-rs tutorials](https://docs.zama.ai/tfhe-rs/tutorials) and [Awesome Zama repo](https://github.com/zama-ai/awesome-zama)*
|
||||
<br></br>
|
||||
### Documentation
|
||||
|
||||
## Need support?
|
||||
<a target="_blank" href="https://community.zama.ai">
|
||||
<img src="https://user-images.githubusercontent.com/5758427/231115030-21195b55-2629-4c01-9809-be5059243999.png">
|
||||
</a>
|
||||
Full, comprehensive documentation is available here: [https://docs.zama.ai/tfhe-rs](https://docs.zama.ai/tfhe-rs).
|
||||
<p align="right">
|
||||
<a href="#about" > ↑ Back to top </a>
|
||||
</p>
|
||||
|
||||
## Citing TFHE-rs
|
||||
|
||||
## Working with TFHE-rs
|
||||
|
||||
### Disclaimers
|
||||
|
||||
#### Security Estimation
|
||||
|
||||
Security estimations are done using the
|
||||
[Lattice Estimator](https://github.com/malb/lattice-estimator)
|
||||
with `red_cost_model = reduction.RC.BDGL16`.
|
||||
|
||||
When a new update is published in the Lattice Estimator, we update parameters accordingly.
|
||||
|
||||
#### Side-Channel Attacks
|
||||
|
||||
Mitigation for side-channel attacks has not yet been implemented in TFHE-rs,
|
||||
and will be released in upcoming versions.
|
||||
<br></br>
|
||||
|
||||
### Citations
|
||||
To cite TFHE-rs in academic papers, please use the following entry:
|
||||
|
||||
```text
|
||||
@@ -156,22 +220,31 @@ To cite TFHE-rs in academic papers, please use the following entry:
|
||||
}
|
||||
```
|
||||
|
||||
## License
|
||||
### Contributing
|
||||
|
||||
This software is distributed under the BSD-3-Clause-Clear license. If you have any questions,
|
||||
please contact us at `hello@zama.ai`.
|
||||
There are two ways to contribute to TFHE-rs:
|
||||
|
||||
## Disclaimers
|
||||
- [Open issues](https://github.com/zama-ai/tfhe-rs/issues/new/choose) to report bugs and typos, or to suggest new ideas
|
||||
- Request to become an official contributor by emailing [hello@zama.ai](mailto:hello@zama.ai).
|
||||
|
||||
### Security Estimation
|
||||
Becoming an approved contributor involves signing our Contributor License Agreement (CLA). Only approved contributors can send pull requests, so please make sure to get in touch before you do!
|
||||
<br></br>
|
||||
|
||||
Security estimations are done using the
|
||||
[Lattice Estimator](https://github.com/malb/lattice-estimator)
|
||||
with `red_cost_model = reduction.RC.BDGL16`.
|
||||
### License
|
||||
This software is distributed under the **BSD-3-Clause-Clear** license. If you have any questions, please contact us at hello@zama.ai.
|
||||
<p align="right">
|
||||
<a href="#about" > ↑ Back to top </a>
|
||||
</p>
|
||||
|
||||
When a new update is published in the Lattice Estimator, we update parameters accordingly.
|
||||
|
||||
### Side-Channel Attacks
|
||||
## Support
|
||||
|
||||
Mitigation for side channel attacks have not yet been implemented in TFHE-rs,
|
||||
and will be released in upcoming versions.
|
||||
<a target="_blank" href="https://community.zama.ai">
|
||||
<img src="https://github.com/zama-ai/tfhe-rs/assets/157474013/8da6cf5b-51a0-4c86-9e75-fd0e4a4c64a4">
|
||||
</a>
|
||||
|
||||
🌟 If you find this project helpful or interesting, please consider giving it a star on GitHub! Your support helps to grow the community and motivates further development.
|
||||
|
||||
<p align="right">
|
||||
<a href="#about" > ↑ Back to top </a>
|
||||
</p>
|
||||
|
||||
@@ -6,7 +6,7 @@ use tfhe_trivium::KreyviumStream;
|
||||
use criterion::Criterion;
|
||||
|
||||
pub fn kreyvium_bool_gen(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::all_disabled().enable_default_bool().build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
|
||||
@@ -41,7 +41,7 @@ pub fn kreyvium_bool_gen(c: &mut Criterion) {
|
||||
}
|
||||
|
||||
pub fn kreyvium_bool_warmup(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::all_disabled().enable_default_bool().build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
|
||||
|
||||
@@ -6,9 +6,8 @@ use tfhe_trivium::{KreyviumStreamByte, TransCiphering};
|
||||
use criterion::Criterion;
|
||||
|
||||
pub fn kreyvium_byte_gen(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::all_disabled()
|
||||
.enable_default_integers()
|
||||
.enable_function_evaluation_integers()
|
||||
let config = ConfigBuilder::default()
|
||||
.enable_function_evaluation()
|
||||
.build();
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
@@ -36,9 +35,8 @@ pub fn kreyvium_byte_gen(c: &mut Criterion) {
|
||||
}
|
||||
|
||||
pub fn kreyvium_byte_trans(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::all_disabled()
|
||||
.enable_default_integers()
|
||||
.enable_function_evaluation_integers()
|
||||
let config = ConfigBuilder::default()
|
||||
.enable_function_evaluation()
|
||||
.build();
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
@@ -67,9 +65,8 @@ pub fn kreyvium_byte_trans(c: &mut Criterion) {
|
||||
}
|
||||
|
||||
pub fn kreyvium_byte_warmup(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::all_disabled()
|
||||
.enable_default_integers()
|
||||
.enable_function_evaluation_integers()
|
||||
let config = ConfigBuilder::default()
|
||||
.enable_function_evaluation()
|
||||
.build();
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
|
||||
@@ -8,9 +8,7 @@ use tfhe_trivium::{KreyviumStreamShortint, TransCiphering};
|
||||
use criterion::Criterion;
|
||||
|
||||
pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::all_disabled()
|
||||
.enable_default_integers()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
@@ -60,9 +58,7 @@ pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
|
||||
}
|
||||
|
||||
pub fn kreyvium_shortint_gen(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::all_disabled()
|
||||
.enable_default_integers()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
@@ -107,9 +103,7 @@ pub fn kreyvium_shortint_gen(c: &mut Criterion) {
|
||||
}
|
||||
|
||||
pub fn kreyvium_shortint_trans(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::all_disabled()
|
||||
.enable_default_integers()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
|
||||
@@ -6,7 +6,7 @@ use tfhe_trivium::TriviumStream;
|
||||
use criterion::Criterion;
|
||||
|
||||
pub fn trivium_bool_gen(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::all_disabled().enable_default_bool().build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB".to_string();
|
||||
@@ -41,7 +41,7 @@ pub fn trivium_bool_gen(c: &mut Criterion) {
|
||||
}
|
||||
|
||||
pub fn trivium_bool_warmup(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::all_disabled().enable_default_bool().build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB".to_string();
|
||||
|
||||
@@ -6,9 +6,7 @@ use tfhe_trivium::{TransCiphering, TriviumStreamByte};
|
||||
use criterion::Criterion;
|
||||
|
||||
pub fn trivium_byte_gen(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::all_disabled()
|
||||
.enable_default_integers()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB".to_string();
|
||||
@@ -35,9 +33,7 @@ pub fn trivium_byte_gen(c: &mut Criterion) {
|
||||
}
|
||||
|
||||
pub fn trivium_byte_trans(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::all_disabled()
|
||||
.enable_default_integers()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB".to_string();
|
||||
@@ -65,9 +61,7 @@ pub fn trivium_byte_trans(c: &mut Criterion) {
|
||||
}
|
||||
|
||||
pub fn trivium_byte_warmup(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::all_disabled()
|
||||
.enable_default_integers()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB".to_string();
|
||||
|
||||
@@ -8,9 +8,7 @@ use tfhe_trivium::{TransCiphering, TriviumStreamShortint};
|
||||
use criterion::Criterion;
|
||||
|
||||
pub fn trivium_shortint_warmup(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::all_disabled()
|
||||
.enable_default_integers()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
@@ -60,9 +58,7 @@ pub fn trivium_shortint_warmup(c: &mut Criterion) {
|
||||
}
|
||||
|
||||
pub fn trivium_shortint_gen(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::all_disabled()
|
||||
.enable_default_integers()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
@@ -107,9 +103,7 @@ pub fn trivium_shortint_gen(c: &mut Criterion) {
|
||||
}
|
||||
|
||||
pub fn trivium_shortint_trans(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::all_disabled()
|
||||
.enable_default_integers()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
|
||||
@@ -170,7 +170,7 @@ fn kreyvium_test_4() {
|
||||
|
||||
#[test]
|
||||
fn kreyvium_test_fhe_long() {
|
||||
let config = ConfigBuilder::all_disabled().enable_default_bool().build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
|
||||
@@ -217,9 +217,7 @@ use tfhe::shortint::prelude::*;
|
||||
|
||||
#[test]
|
||||
fn kreyvium_test_shortint_long() {
|
||||
let config = ConfigBuilder::all_disabled()
|
||||
.enable_default_integers()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
@@ -302,9 +300,8 @@ fn kreyvium_test_clear_byte() {
|
||||
|
||||
#[test]
|
||||
fn kreyvium_test_byte_long() {
|
||||
let config = ConfigBuilder::all_disabled()
|
||||
.enable_default_integers()
|
||||
.enable_function_evaluation_integers()
|
||||
let config = ConfigBuilder::default()
|
||||
.enable_function_evaluation()
|
||||
.build();
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
@@ -342,9 +339,8 @@ fn kreyvium_test_byte_long() {
|
||||
|
||||
#[test]
|
||||
fn kreyvium_test_fhe_byte_transciphering_long() {
|
||||
let config = ConfigBuilder::all_disabled()
|
||||
.enable_default_integers()
|
||||
.enable_function_evaluation_integers()
|
||||
let config = ConfigBuilder::default()
|
||||
.enable_function_evaluation()
|
||||
.build();
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
use crate::{KreyviumStreamByte, KreyviumStreamShortint, TriviumStreamByte, TriviumStreamShortint};
|
||||
use tfhe::shortint::Ciphertext;
|
||||
|
||||
use tfhe::prelude::*;
|
||||
use tfhe::{set_server_key, unset_server_key, FheUint64, FheUint8, ServerKey};
|
||||
|
||||
use rayon::prelude::*;
|
||||
|
||||
@@ -232,7 +232,7 @@ fn trivium_test_clear_byte() {
|
||||
|
||||
#[test]
|
||||
fn trivium_test_fhe_long() {
|
||||
let config = ConfigBuilder::all_disabled().enable_default_bool().build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB".to_string();
|
||||
@@ -277,9 +277,7 @@ fn trivium_test_fhe_long() {
|
||||
|
||||
#[test]
|
||||
fn trivium_test_fhe_byte_long() {
|
||||
let config = ConfigBuilder::all_disabled()
|
||||
.enable_default_integers()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB".to_string();
|
||||
@@ -316,9 +314,7 @@ fn trivium_test_fhe_byte_long() {
|
||||
|
||||
#[test]
|
||||
fn trivium_test_fhe_byte_transciphering_long() {
|
||||
let config = ConfigBuilder::all_disabled()
|
||||
.enable_default_integers()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (client_key, server_key) = generate_keys(config);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB".to_string();
|
||||
@@ -357,9 +353,7 @@ use tfhe::shortint::prelude::*;
|
||||
|
||||
#[test]
|
||||
fn trivium_test_shortint_long() {
|
||||
let config = ConfigBuilder::all_disabled()
|
||||
.enable_default_integers()
|
||||
.build();
|
||||
let config = ConfigBuilder::default().build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
|
||||
18
backends/tfhe-cuda-backend/Cargo.toml
Normal file
18
backends/tfhe-cuda-backend/Cargo.toml
Normal file
@@ -0,0 +1,18 @@
|
||||
[package]
|
||||
name = "tfhe-cuda-backend"
|
||||
version = "0.2.0"
|
||||
edition = "2021"
|
||||
authors = ["Zama team"]
|
||||
license = "BSD-3-Clause-Clear"
|
||||
description = "Cuda implementation of TFHE-rs primitives."
|
||||
homepage = "https://www.zama.ai/"
|
||||
documentation = "https://docs.zama.ai/tfhe-rs"
|
||||
repository = "https://github.com/zama-ai/tfhe-rs"
|
||||
readme = "README.md"
|
||||
keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
|
||||
|
||||
[build-dependencies]
|
||||
cmake = { version = "0.1" }
|
||||
|
||||
[dependencies]
|
||||
thiserror = "1.0"
|
||||
28
backends/tfhe-cuda-backend/LICENSE
Normal file
28
backends/tfhe-cuda-backend/LICENSE
Normal file
@@ -0,0 +1,28 @@
|
||||
BSD 3-Clause Clear License
|
||||
|
||||
Copyright © 2024 ZAMA.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright notice, this
|
||||
list of conditions and the following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
3. Neither the name of ZAMA nor the names of its contributors may be used to endorse
|
||||
or promote products derived from this software without specific prior written permission.
|
||||
|
||||
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE.
|
||||
THIS SOFTWARE IS PROVIDED BY THE ZAMA AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
||||
ZAMA OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
|
||||
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
||||
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
52
backends/tfhe-cuda-backend/README.md
Normal file
52
backends/tfhe-cuda-backend/README.md
Normal file
@@ -0,0 +1,52 @@
|
||||
# TFHE Cuda backend
|
||||
|
||||
## Introduction
|
||||
|
||||
The `tfhe-cuda-backend` holds the code for GPU acceleration of Zama's variant of TFHE.
|
||||
It implements CUDA/C++ functions to perform homomorphic operations on LWE ciphertexts.
|
||||
|
||||
It provides functions to allocate memory on the GPU, to copy data back
|
||||
and forth between the CPU and the GPU, to create and destroy Cuda streams, etc.:
|
||||
- `cuda_create_stream`, `cuda_destroy_stream`
|
||||
- `cuda_malloc`, `cuda_check_valid_malloc`
|
||||
- `cuda_memcpy_async_to_cpu`, `cuda_memcpy_async_to_gpu`
|
||||
- `cuda_get_number_of_gpus`
|
||||
- `cuda_synchronize_device`
|
||||
The cryptographic operations it provides are:
|
||||
- an amortized implementation of the TFHE programmable bootstrap: `cuda_bootstrap_amortized_lwe_ciphertext_vector_32` and `cuda_bootstrap_amortized_lwe_ciphertext_vector_64`
|
||||
- a low latency implementation of the TFHE programmable bootstrap: `cuda_bootstrap_low_latency_lwe_ciphertext_vector_32` and `cuda_bootstrap_low_latency_lwe_ciphertext_vector_64`
|
||||
- the keyswitch: `cuda_keyswitch_lwe_ciphertext_vector_32` and `cuda_keyswitch_lwe_ciphertext_vector_64`
|
||||
- the larger precision programmable bootstrap (wop PBS, which supports up to 16 bits of message while the classical PBS only supports up to 8 bits of message) and its sub-components: `cuda_wop_pbs_64`, `cuda_extract_bits_64`, `cuda_circuit_bootstrap_64`, `cuda_cmux_tree_64`, `cuda_blind_rotation_sample_extraction_64`
|
||||
- acceleration for leveled operations: `cuda_negate_lwe_ciphertext_vector_64`, `cuda_add_lwe_ciphertext_vector_64`, `cuda_add_lwe_ciphertext_vector_plaintext_vector_64`, `cuda_mult_lwe_ciphertext_vector_cleartext_vector`.
|
||||
|
||||
## Dependencies
|
||||
|
||||
**Disclaimer**: Compilation on Windows/Mac is not supported yet. Only Nvidia GPUs are supported.
|
||||
|
||||
- nvidia driver - for example, if you're running Ubuntu 20.04 check this [page](https://linuxconfig.org/how-to-install-the-nvidia-drivers-on-ubuntu-20-04-focal-fossa-linux) for installation
|
||||
- [nvcc](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) >= 10.0
|
||||
- [gcc](https://gcc.gnu.org/) >= 8.0 - check this [page](https://gist.github.com/ax3l/9489132) for more details about nvcc/gcc compatible versions
|
||||
- [cmake](https://cmake.org/) >= 3.24
|
||||
|
||||
## Build
|
||||
|
||||
The Cuda project held in `tfhe-cuda-backend` can be compiled independently from TFHE-rs in the following way:
|
||||
```
|
||||
git clone git@github.com:zama-ai/tfhe-rs
|
||||
cd tfhe-rs/backends/tfhe-cuda-backend/cuda
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
make
|
||||
```
|
||||
The compute capability is detected automatically (with the first GPU information) and set accordingly.
|
||||
If your machine does not have an available Nvidia GPU, the compilation will work if you have the nvcc compiler installed. The generated executable will target a 7.0 compute capability (sm_70).
|
||||
|
||||
## Links
|
||||
|
||||
- [TFHE](https://eprint.iacr.org/2018/421.pdf)
|
||||
|
||||
## License
|
||||
|
||||
This software is distributed under the BSD-3-Clause-Clear license. If you have any questions,
|
||||
please contact us at `hello@zama.ai`.
|
||||
28
backends/tfhe-cuda-backend/build.rs
Normal file
28
backends/tfhe-cuda-backend/build.rs
Normal file
@@ -0,0 +1,28 @@
|
||||
use std::env;
|
||||
use std::process::Command;
|
||||
|
||||
fn main() {
|
||||
println!("Build tfhe-cuda-backend");
|
||||
if env::consts::OS == "linux" {
|
||||
let output = Command::new("./get_os_name.sh").output().unwrap();
|
||||
let distribution = String::from_utf8(output.stdout).unwrap();
|
||||
if distribution != "Ubuntu\n" {
|
||||
println!(
|
||||
"cargo:warning=This Linux distribution is not officially supported. \
|
||||
Only Ubuntu is supported by tfhe-cuda-backend at this time. Build may fail\n"
|
||||
);
|
||||
}
|
||||
let dest = cmake::build("cuda");
|
||||
println!("cargo:rustc-link-search=native={}", dest.display());
|
||||
println!("cargo:rustc-link-lib=static=tfhe_cuda_backend");
|
||||
println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
|
||||
println!("cargo:rustc-link-lib=gomp");
|
||||
println!("cargo:rustc-link-lib=cudart");
|
||||
println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
|
||||
println!("cargo:rustc-link-lib=stdc++");
|
||||
} else {
|
||||
panic!(
|
||||
"Error: platform not supported, tfhe-cuda-backend not built (only Linux is supported)"
|
||||
);
|
||||
}
|
||||
}
|
||||
10
backends/tfhe-cuda-backend/cuda/.cmake-format-config.py
Normal file
10
backends/tfhe-cuda-backend/cuda/.cmake-format-config.py
Normal file
@@ -0,0 +1,10 @@
|
||||
# -----------------------------
|
||||
# Options affecting formatting.
|
||||
# -----------------------------
|
||||
with section("format"):
|
||||
|
||||
# How wide to allow formatted cmake files
|
||||
line_width = 120
|
||||
|
||||
# How many spaces to tab for indent
|
||||
tab_size = 2
|
||||
1
backends/tfhe-cuda-backend/cuda/.gitignore
vendored
Normal file
1
backends/tfhe-cuda-backend/cuda/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
/build/
|
||||
91
backends/tfhe-cuda-backend/cuda/CMakeLists.txt
Normal file
91
backends/tfhe-cuda-backend/cuda/CMakeLists.txt
Normal file
@@ -0,0 +1,91 @@
|
||||
cmake_minimum_required(VERSION 3.24 FATAL_ERROR)
|
||||
project(tfhe_cuda_backend LANGUAGES CXX)
|
||||
|
||||
# Minimum supported CUDA version. Configuration fails below if no CUDA compiler is found or if it is older than this.
|
||||
set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
|
||||
include(CheckLanguage)
|
||||
# See if CUDA is available
|
||||
check_language(CUDA)
|
||||
# If so, enable CUDA to check the version.
|
||||
if(CMAKE_CUDA_COMPILER)
|
||||
enable_language(CUDA)
|
||||
endif()
|
||||
# If CUDA is not available, or the minimum version is too low do not build
|
||||
if(NOT CMAKE_CUDA_COMPILER)
|
||||
message(FATAL_ERROR "Cuda compiler not found.")
|
||||
endif()
|
||||
|
||||
if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS ${MINIMUM_SUPPORTED_CUDA_VERSION})
|
||||
message(FATAL_ERROR "CUDA ${MINIMUM_SUPPORTED_CUDA_VERSION} or greater is required for compilation.")
|
||||
endif()
|
||||
# Get CUDA compute capability
|
||||
set(OUTPUTFILE ${CMAKE_CURRENT_SOURCE_DIR}/cuda_script) # No suffix required
|
||||
set(CUDAFILE ${CMAKE_CURRENT_SOURCE_DIR}/check_cuda.cu)
|
||||
execute_process(COMMAND nvcc -lcuda ${CUDAFILE} -o ${OUTPUTFILE})
|
||||
execute_process(
|
||||
COMMAND ${OUTPUTFILE}
|
||||
RESULT_VARIABLE CUDA_RETURN_CODE
|
||||
OUTPUT_VARIABLE ARCH)
|
||||
file(REMOVE ${OUTPUTFILE})
|
||||
|
||||
if(${CUDA_RETURN_CODE} EQUAL 0)
|
||||
set(CUDA_SUCCESS "TRUE")
|
||||
else()
|
||||
set(CUDA_SUCCESS "FALSE")
|
||||
endif()
|
||||
|
||||
if(${CUDA_SUCCESS})
|
||||
message(STATUS "CUDA Architecture: ${ARCH}")
|
||||
message(STATUS "CUDA Version: ${CUDA_VERSION_STRING}")
|
||||
message(STATUS "CUDA Path: ${CUDA_TOOLKIT_ROOT_DIR}")
|
||||
message(STATUS "CUDA Libraries: ${CUDA_LIBRARIES}")
|
||||
message(STATUS "CUDA Performance Primitives: ${CUDA_npp_LIBRARY}")
|
||||
else()
|
||||
message(WARNING ${ARCH})
|
||||
endif()
|
||||
|
||||
if(NOT CMAKE_BUILD_TYPE)
|
||||
set(CMAKE_BUILD_TYPE Release)
|
||||
endif()
|
||||
|
||||
# Add OpenMP support
|
||||
find_package(OpenMP REQUIRED)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler ${OpenMP_CXX_FLAGS}")
|
||||
if(${CUDA_SUCCESS})
|
||||
set(CMAKE_CUDA_ARCHITECTURES native)
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES 70)
|
||||
endif()
|
||||
|
||||
# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
|
||||
set(CMAKE_CUDA_FLAGS
|
||||
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 \
|
||||
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
|
||||
--use_fast_math -Xcompiler -fPIC")
|
||||
|
||||
set(INCLUDE_DIR include)
|
||||
|
||||
add_subdirectory(src)
|
||||
enable_testing()
|
||||
add_subdirectory(tests_and_benchmarks)
|
||||
target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})
|
||||
|
||||
# This is required for rust cargo build
|
||||
install(TARGETS tfhe_cuda_backend DESTINATION .)
|
||||
|
||||
install(TARGETS tfhe_cuda_backend DESTINATION lib)
|
||||
|
||||
# Define a function to add a lint target.
|
||||
find_file(CPPLINT NAMES cpplint cpplint.exe)
|
||||
if(CPPLINT)
|
||||
# Add a custom target to lint all child projects. Dependencies are specified in child projects.
|
||||
add_custom_target(all_lint)
|
||||
# Don't trigger this target on ALL_BUILD or Visual Studio 'Rebuild Solution'
|
||||
set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_ALL TRUE)
|
||||
# set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE)
|
||||
endif()
|
||||
3
backends/tfhe-cuda-backend/cuda/CPPLINT.cfg
Normal file
3
backends/tfhe-cuda-backend/cuda/CPPLINT.cfg
Normal file
@@ -0,0 +1,3 @@
|
||||
set noparent
|
||||
linelength=240
|
||||
filter=-legal/copyright,-readability/todo,-runtime/references,-build/c++17
|
||||
22
backends/tfhe-cuda-backend/cuda/check_cuda.cu
Normal file
22
backends/tfhe-cuda-backend/cuda/check_cuda.cu
Normal file
@@ -0,0 +1,22 @@
|
||||
#include <stdio.h>
|
||||
|
||||
/*
 * Queries the properties of CUDA device 0 and prints, on stdout, either the
 * nvcc architecture flag matching its compute capability ("-arch=sm_XY") or a
 * diagnostic message.
 *
 * Returns 0 when the device meets the minimum compute capability, the CUDA
 * error code when the device properties cannot be read, and 1 when the device
 * is too old.
 */
int main(int argc, char **argv) {
  (void)argc;
  (void)argv;

  cudaDeviceProp dP;
  const float min_cc = 3.0f;

  /* Keep the status returned by the call itself and report exactly that one. */
  cudaError_t rc = cudaGetDeviceProperties(&dP, 0);
  if (rc != cudaSuccess) {
    printf("CUDA error: %s", cudaGetErrorString(rc));
    return static_cast<int>(rc); /* Failure */
  }

  /* Floating-point division: with integer division the minor revision was
   * always truncated to 0 and never took part in the comparison. */
  const float device_cc = dP.major + dP.minor / 10.0f;
  if (device_cc < min_cc) {
    printf("Min Compute Capability of %2.1f required: %d.%d found\n Not "
           "Building CUDA Code",
           min_cc, dP.major, dP.minor);
    return 1; /* Failure */
  }

  printf("-arch=sm_%d%d", dP.major, dP.minor);
  return 0; /* Success */
}
|
||||
19
backends/tfhe-cuda-backend/cuda/format_tfhe_cuda_backend.sh
Executable file
19
backends/tfhe-cuda-backend/cuda/format_tfhe_cuda_backend.sh
Executable file
@@ -0,0 +1,19 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
while getopts ":c" option; do
|
||||
case $option in
|
||||
c)
|
||||
# -c (check mode): fail if any source or CMake file is not already formatted
|
||||
find ./{include,src,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file' --dry-run --Werror
|
||||
cmake-format -i CMakeLists.txt -c .cmake-format-config.py
|
||||
find ./{include,src,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
|
||||
git diff --exit-code
|
||||
exit
|
||||
;;
|
||||
esac
|
||||
done
|
||||
find ./{include,src,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file'
|
||||
cmake-format -i CMakeLists.txt -c .cmake-format-config.py
|
||||
find ./{include,src,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
|
||||
328
backends/tfhe-cuda-backend/cuda/include/bootstrap.h
Normal file
328
backends/tfhe-cuda-backend/cuda/include/bootstrap.h
Normal file
@@ -0,0 +1,328 @@
|
||||
#ifndef CUDA_BOOTSTRAP_H
|
||||
#define CUDA_BOOTSTRAP_H
|
||||
|
||||
#include "device.h"
|
||||
#include <cstdint>
|
||||
|
||||
enum PBS_TYPE { MULTI_BIT = 0, LOW_LAT = 1, AMORTIZED = 2 };
|
||||
enum PBS_VARIANT { DEFAULT = 0, FAST = 1 };
|
||||
|
||||
extern "C" {
|
||||
void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
|
||||
cuda_stream_t *stream,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t total_polynomials);
|
||||
|
||||
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src,
|
||||
cuda_stream_t *stream,
|
||||
uint32_t input_lwe_dim,
|
||||
uint32_t glwe_dim, uint32_t level_count,
|
||||
uint32_t polynomial_size);
|
||||
|
||||
void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src,
|
||||
cuda_stream_t *stream,
|
||||
uint32_t input_lwe_dim,
|
||||
uint32_t glwe_dim, uint32_t level_count,
|
||||
uint32_t polynomial_size);
|
||||
|
||||
void scratch_cuda_bootstrap_amortized_32(
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory);
|
||||
|
||||
void scratch_cuda_bootstrap_amortized_64(
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
|
||||
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
|
||||
void cleanup_cuda_bootstrap_amortized(cuda_stream_t *stream,
|
||||
int8_t **pbs_buffer);
|
||||
|
||||
void scratch_cuda_bootstrap_low_latency_32(
|
||||
cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void scratch_cuda_bootstrap_low_latency_64(
|
||||
cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
|
||||
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
|
||||
void cleanup_cuda_bootstrap_low_latency_32(cuda_stream_t *stream,
|
||||
int8_t **pbs_buffer);
|
||||
|
||||
void cleanup_cuda_bootstrap_low_latency_64(cuda_stream_t *stream,
|
||||
int8_t **pbs_buffer);
|
||||
|
||||
uint64_t get_buffer_size_bootstrap_amortized_64(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
|
||||
|
||||
uint64_t get_buffer_size_bootstrap_low_latency_64(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
|
||||
}
|
||||
|
||||
// Byte size of the "full shared memory" working set of step one of the low
// latency bootstrap: one Torus polynomial (rotated accumulator) followed by
// polynomial_size / 2 double2 values (accumulator FFT).
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_one(
    uint32_t polynomial_size) {
  const uint64_t accumulator_rotated_bytes = sizeof(Torus) * polynomial_size;
  const uint64_t accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return accumulator_rotated_bytes + accumulator_fft_bytes;
}
|
||||
// Byte size of the "full shared memory" working set of step two of the low
// latency bootstrap: one Torus polynomial (accumulator) followed by
// polynomial_size / 2 double2 values (accumulator FFT).
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_two(
    uint32_t polynomial_size) {
  const uint64_t accumulator_bytes = sizeof(Torus) * polynomial_size;
  const uint64_t accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return accumulator_bytes + accumulator_fft_bytes;
}
|
||||
|
||||
// Byte size of the "partial shared memory" working set of the low latency
// bootstrap: only the accumulator FFT, i.e. polynomial_size / 2 double2
// values. The Torus template parameter does not influence the result.
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_low_latency(uint32_t polynomial_size) {
  const uint64_t accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return accumulator_fft_bytes;
}
|
||||
|
||||
// Byte size of the "full shared memory" working set of the fast low latency
// bootstrap: two Torus polynomials (rotated accumulator and accumulator)
// followed by polynomial_size / 2 double2 values (accumulator FFT).
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_fast_low_latency(uint32_t polynomial_size) {
  const uint64_t accumulator_rotated_bytes = sizeof(Torus) * polynomial_size;
  const uint64_t accumulator_bytes = sizeof(Torus) * polynomial_size;
  const uint64_t accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return accumulator_rotated_bytes + accumulator_bytes + accumulator_fft_bytes;
}
|
||||
|
||||
// Byte size of the "partial shared memory" working set of the fast low
// latency bootstrap: the accumulator FFT (mask & body), i.e.
// polynomial_size / 2 double2 values. The Torus template parameter does not
// influence the result.
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_fast_low_latency(
    uint32_t polynomial_size) {
  const uint64_t accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return accumulator_fft_bytes;
}
|
||||
|
||||
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;
|
||||
|
||||
// Device scratch buffers used by the low latency PBS.
//
// The constructor sizes and allocates the buffers on `stream` according to
// the requested implementation variant and to the amount of shared memory
// reported for the stream's GPU; release() frees them on a stream.
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::LOW_LAT> {
  // All device pointers start out null so that release() is well defined even
  // when the constructor is called with allocate_gpu_memory == false, or for
  // buffers a given variant never allocates.
  int8_t *d_mem = nullptr;

  Torus *global_accumulator = nullptr;
  double2 *global_accumulator_fft = nullptr;

  PBS_VARIANT pbs_variant;

  // stream:                     stream (and GPU) the allocations are made on.
  // glwe_dimension, polynomial_size, level_count,
  // input_lwe_ciphertext_count: dimensions the buffer sizes are derived from.
  // pbs_variant:                DEFAULT or FAST; anything else panics.
  // allocate_gpu_memory:        when false nothing is allocated and the
  //                             pointers stay null.
  pbs_buffer(cuda_stream_t *stream, uint32_t glwe_dimension,
             uint32_t polynomial_size, uint32_t level_count,
             uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
             bool allocate_gpu_memory) {
    this->pbs_variant = pbs_variant;

    auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);

    if (allocate_gpu_memory) {
      // Widen every dimension to 64 bits before multiplying: the products
      // below are byte counts and would otherwise be evaluated (and could
      // wrap) in 32-bit arithmetic.
      const uint64_t glwe_polys = static_cast<uint64_t>(glwe_dimension) + 1;
      const uint64_t levels = level_count;
      const uint64_t samples = input_lwe_ciphertext_count;
      const uint64_t poly_size = polynomial_size;

      switch (pbs_variant) {
      case PBS_VARIANT::DEFAULT: {
        uint64_t full_sm_step_one =
            get_buffer_size_full_sm_bootstrap_low_latency_step_one<Torus>(
                polynomial_size);
        uint64_t full_sm_step_two =
            get_buffer_size_full_sm_bootstrap_low_latency_step_two<Torus>(
                polynomial_size);
        uint64_t partial_sm =
            get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(
                polynomial_size);

        uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
        uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
        uint64_t full_dm = full_sm_step_one;

        // Whatever does not fit in shared memory is backed by d_mem.
        uint64_t device_mem = 0;
        if (max_shared_memory < partial_sm) {
          device_mem = full_dm * samples * levels * glwe_polys;
        } else if (max_shared_memory < full_sm_step_two) {
          device_mem =
              (partial_dm_step_two + partial_dm_step_one * levels) * samples *
              glwe_polys;
        } else if (max_shared_memory < full_sm_step_one) {
          device_mem = partial_dm_step_one * samples * levels * glwe_polys;
        }
        // Otherwise, both kernels run all in shared memory
        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream);

        global_accumulator_fft = (double2 *)cuda_malloc_async(
            glwe_polys * levels * samples * (poly_size / 2) * sizeof(double2),
            stream);

        global_accumulator = (Torus *)cuda_malloc_async(
            glwe_polys * samples * poly_size * sizeof(Torus), stream);
      } break;
      case PBS_VARIANT::FAST: {
        uint64_t full_sm =
            get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
                polynomial_size);
        uint64_t partial_sm =
            get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
                polynomial_size);

        uint64_t partial_dm = full_sm - partial_sm;
        uint64_t full_dm = full_sm;
        uint64_t device_mem = 0;

        if (max_shared_memory < partial_sm) {
          device_mem = full_dm * samples * levels * glwe_polys;
        } else if (max_shared_memory < full_sm) {
          device_mem = partial_dm * samples * levels * glwe_polys;
        }

        // Otherwise, both kernels run all in shared memory
        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream);

        // This variant does not allocate global_accumulator.
        global_accumulator_fft = (double2 *)cuda_malloc_async(
            glwe_polys * levels * samples * poly_size / 2 * sizeof(double2),
            stream);
      } break;
      default:
        PANIC("Cuda error (PBS): unsupported implementation variant.")
      }
    }
  }

  // Frees, on `stream`, every buffer this object holds. Pointers that were
  // never allocated are skipped, and freed pointers are reset to null.
  void release(cuda_stream_t *stream) {
    if (d_mem != nullptr) {
      cuda_drop_async(d_mem, stream);
      d_mem = nullptr;
    }
    if (global_accumulator_fft != nullptr) {
      cuda_drop_async(global_accumulator_fft, stream);
      global_accumulator_fft = nullptr;
    }

    // Only the DEFAULT variant allocates global_accumulator.
    if (pbs_variant == PBS_VARIANT::DEFAULT && global_accumulator != nullptr) {
      cuda_drop_async(global_accumulator, stream);
      global_accumulator = nullptr;
    }
  }
};
|
||||
|
||||
// Returns the number of bytes of device memory needed by the fast low latency
// bootstrap for the given dimensions: the part of the per-block working set
// that does not fit in `max_shared_memory`, plus the global accumulator FFT.
// The result is rounded up to a multiple of sizeof(double2).
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_fast_low_latency(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {

  uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
      polynomial_size);
  uint64_t partial_sm =
      get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
          polynomial_size);
  uint64_t partial_dm = full_sm - partial_sm;
  uint64_t full_dm = full_sm;

  // Widen the dimensions first so that every product below is evaluated in
  // 64-bit arithmetic instead of wrapping in 32 bits.
  const uint64_t glwe_polys = static_cast<uint64_t>(glwe_dimension) + 1;
  const uint64_t levels = level_count;
  const uint64_t samples = input_lwe_ciphertext_count;

  uint64_t device_mem = 0;
  if (max_shared_memory < partial_sm) {
    device_mem = full_dm * samples * levels * glwe_polys;
  } else if (max_shared_memory < full_sm) {
    device_mem = partial_dm * samples * levels * glwe_polys;
  }

  const uint64_t accumulator_fft_bytes =
      glwe_polys * levels * samples * polynomial_size / 2 * sizeof(double2);
  const uint64_t buffer_size = device_mem + accumulator_fft_bytes;

  // Round up to the next multiple of sizeof(double2). The previous
  // `size + size % alignment` only produced an aligned value when the
  // remainder was 0 or alignment / 2; for those inputs the result is
  // unchanged.
  const uint64_t alignment = sizeof(double2);
  return (buffer_size + alignment - 1) / alignment * alignment;
}
|
||||
|
||||
template <typename Torus>
|
||||
bool has_support_to_cuda_bootstrap_fast_low_latency(uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t level_count,
|
||||
uint32_t num_samples,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_bootstrap_fast_low_latency_lwe_ciphertext_vector(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, LOW_LAT> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_bootstrap_low_latency_lwe_ciphertext_vector(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, double2 *bootstrapping_key,
|
||||
pbs_buffer<Torus, LOW_LAT> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
void scratch_cuda_fast_bootstrap_low_latency(
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, LOW_LAT> **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
void scratch_cuda_bootstrap_low_latency(
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, LOW_LAT> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
#ifdef __CUDACC__
|
||||
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
|
||||
template <typename T>
|
||||
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
int glwe_dimension, uint32_t level_count);
|
||||
|
||||
template <typename T>
|
||||
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
int glwe_dimension, uint32_t level_count);
|
||||
|
||||
template <typename T>
|
||||
__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
|
||||
|
||||
#endif
|
||||
|
||||
#endif // CUDA_BOOTSTRAP_H
|
||||
155
backends/tfhe-cuda-backend/cuda/include/bootstrap_multibit.h
Normal file
155
backends/tfhe-cuda-backend/cuda/include/bootstrap_multibit.h
Normal file
@@ -0,0 +1,155 @@
|
||||
#ifndef CUDA_MULTI_BIT_H
|
||||
#define CUDA_MULTI_BIT_H
|
||||
|
||||
#include "bootstrap.h"
|
||||
#include <cstdint>
|
||||
|
||||
extern "C" {
|
||||
|
||||
bool has_support_to_cuda_bootstrap_fast_multi_bit(uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t level_count,
|
||||
uint32_t num_samples,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
void cuda_convert_lwe_multi_bit_bootstrap_key_64(
|
||||
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
|
||||
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor);
|
||||
|
||||
void scratch_cuda_multi_bit_pbs_64(
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory,
|
||||
uint32_t chunk_size = 0);
|
||||
|
||||
void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
|
||||
|
||||
void scratch_cuda_generic_multi_bit_pbs_64(
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size = 0);
|
||||
|
||||
void cuda_generic_multi_bit_pbs_lwe_ciphertext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
|
||||
|
||||
void cleanup_cuda_multi_bit_pbs_32(cuda_stream_t *stream, int8_t **pbs_buffer);
|
||||
void cleanup_cuda_multi_bit_pbs_64(cuda_stream_t *stream, int8_t **pbs_buffer);
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
void scratch_cuda_fast_multi_bit_pbs(
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_fast_multi_bit_pbs_lwe_ciphertext_vector(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t lwe_chunk_size = 0);
|
||||
|
||||
template <typename Torus, typename STorus>
|
||||
void scratch_cuda_multi_bit_pbs(
|
||||
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_multi_bit_pbs_lwe_ciphertext_vector(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
|
||||
uint32_t lwe_chunk_size = 0);
|
||||
|
||||
// Device scratch buffers used by the multi-bit PBS.
//
// The constructor allocates the three buffers on `stream`; release() frees
// them on a stream.
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  // All device pointers start out null so that release() is well defined even
  // when the constructor is called with allocate_gpu_memory == false.
  double2 *keybundle_fft = nullptr;
  Torus *global_accumulator = nullptr;
  double2 *global_accumulator_fft = nullptr;

  PBS_VARIANT pbs_variant;

  // stream:                     stream (and GPU) the allocations are made on.
  // glwe_dimension, polynomial_size, level_count,
  // input_lwe_ciphertext_count,
  // lwe_chunk_size:             dimensions the buffer sizes are derived from.
  // pbs_variant:                DEFAULT or FAST (same allocations for both);
  //                             anything else panics.
  // allocate_gpu_memory:        when false nothing is allocated and the
  //                             pointers stay null.
  pbs_buffer(cuda_stream_t *stream, uint32_t glwe_dimension,
             uint32_t polynomial_size, uint32_t level_count,
             uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
             PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
    this->pbs_variant = pbs_variant;
    // NOTE(review): the value is not used for sizing here; the call is kept
    // in case cuda_get_max_shared_memory has side effects callers rely on --
    // confirm and drop if it does not.
    auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
    (void)max_shared_memory;

    if (allocate_gpu_memory) {
      // Widen every dimension to 64 bits before multiplying: the products
      // below are byte counts and would otherwise be evaluated (and could
      // wrap) in 32-bit arithmetic.
      const uint64_t glwe_polys = static_cast<uint64_t>(glwe_dimension) + 1;
      const uint64_t levels = level_count;
      const uint64_t samples = input_lwe_ciphertext_count;
      const uint64_t chunk = lwe_chunk_size;
      const uint64_t poly_size = polynomial_size;

      switch (pbs_variant) {
      case DEFAULT:
      case FAST:
        keybundle_fft = (double2 *)cuda_malloc_async(
            samples * chunk * levels * glwe_polys * glwe_polys *
                (poly_size / 2) * sizeof(double2),
            stream);
        global_accumulator = (Torus *)cuda_malloc_async(
            samples * glwe_polys * poly_size * sizeof(Torus), stream);
        global_accumulator_fft = (double2 *)cuda_malloc_async(
            samples * glwe_polys * levels * (poly_size / 2) * sizeof(double2),
            stream);
        break;
      default:
        PANIC("Cuda error (PBS): unsupported implementation variant.")
      }
    }
  }

  // Frees, on `stream`, every buffer this object holds. Pointers that were
  // never allocated are skipped, and freed pointers are reset to null.
  void release(cuda_stream_t *stream) {
    if (keybundle_fft != nullptr) {
      cuda_drop_async(keybundle_fft, stream);
      keybundle_fft = nullptr;
    }
    if (global_accumulator != nullptr) {
      cuda_drop_async(global_accumulator, stream);
      global_accumulator = nullptr;
    }
    if (global_accumulator_fft != nullptr) {
      cuda_drop_async(global_accumulator_fft, stream);
      global_accumulator_fft = nullptr;
    }
  }
};
|
||||
|
||||
#ifdef __CUDACC__
|
||||
__host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
|
||||
uint32_t level_count,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t num_samples);
|
||||
|
||||
__host__ uint32_t get_average_lwe_chunk_size(uint32_t lwe_dimension,
|
||||
uint32_t level_count,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t ct_count);
|
||||
|
||||
__host__ uint64_t get_max_buffer_size_multibit_bootstrap(
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_input_lwe_ciphertext_count);
|
||||
#endif
|
||||
|
||||
#endif // CUDA_MULTI_BIT_H
|
||||
18
backends/tfhe-cuda-backend/cuda/include/ciphertext.h
Normal file
18
backends/tfhe-cuda-backend/cuda/include/ciphertext.h
Normal file
@@ -0,0 +1,18 @@
|
||||
#ifndef CUDA_CIPHERTEXT_H
|
||||
#define CUDA_CIPHERTEXT_H
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
extern "C" {
// Forward declaration; the full definition lives in device.h.
struct cuda_stream_t;

// The prototypes below match the definitions in src/crypto/ciphertext.cuh,
// which take the stream wrapper (carrying the GPU index) instead of a raw
// stream pointer plus a separate gpu_index. With mismatching parameter lists
// the definitions would be separate C++ overloads and the C symbols declared
// here would never be defined.

// Copies number_of_cts LWE ciphertexts of (lwe_dimension + 1) 64-bit words
// from src to dest (host to GPU) on the given stream.
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
                                                  cuda_stream_t *stream,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension);

// Copies number_of_cts LWE ciphertexts of (lwe_dimension + 1) 64-bit words
// from src to dest (GPU to host) on the given stream.
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
                                                  cuda_stream_t *stream,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension);
}
|
||||
#endif
|
||||
94
backends/tfhe-cuda-backend/cuda/include/device.h
Normal file
94
backends/tfhe-cuda-backend/cuda/include/device.h
Normal file
@@ -0,0 +1,94 @@
|
||||
#ifndef DEVICE_H
|
||||
#define DEVICE_H
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#define synchronize_threads_in_block() __syncthreads()
|
||||
|
||||
extern "C" {
|
||||
|
||||
#define check_cuda_error(ans) \
|
||||
{ cuda_error((ans), __FILE__, __LINE__); }
|
||||
// Reports a failed CUDA runtime call and aborts the process.
// `code` is the status to check; `file` and `line` identify the call site and
// are echoed in the message. Does nothing when `code` is cudaSuccess.
inline void cuda_error(cudaError_t code, const char *file, int line) {
  if (code == cudaSuccess)
    return;

  const char *description = cudaGetErrorString(code);
  std::fprintf(stderr, "Cuda error: %s %s %d\n", description, file, line);
  std::abort();
}
|
||||
#define PANIC(format, ...) \
|
||||
{ \
|
||||
std::fprintf(stderr, "%s::%d::%s: panic.\n" format "\n", __FILE__, \
|
||||
__LINE__, __func__, ##__VA_ARGS__); \
|
||||
std::abort(); \
|
||||
}
|
||||
|
||||
// Pairs a CUDA stream with the index of the GPU it belongs to.
struct cuda_stream_t {
  cudaStream_t stream;
  uint32_t gpu_index;

  // Creates a new stream on GPU `gpu_index`. Aborts on any CUDA error.
  cuda_stream_t(uint32_t gpu_index) {
    this->gpu_index = gpu_index;

    // cudaStreamCreate creates the stream on the calling thread's current
    // device, so select the requested GPU first (release() already does the
    // same before destroying the stream).
    check_cuda_error(cudaSetDevice(gpu_index));
    check_cuda_error(cudaStreamCreate(&stream));
  }

  // Destroys the stream after selecting its GPU. Aborts on any CUDA error.
  void release() {
    check_cuda_error(cudaSetDevice(gpu_index));
    check_cuda_error(cudaStreamDestroy(stream));
  }

  // Blocks the host until all work queued on the stream has completed.
  void synchronize() { check_cuda_error(cudaStreamSynchronize(stream)); }
};
|
||||
|
||||
cuda_stream_t *cuda_create_stream(uint32_t gpu_index);
|
||||
|
||||
void cuda_destroy_stream(cuda_stream_t *stream);
|
||||
|
||||
void *cuda_malloc(uint64_t size, uint32_t gpu_index);
|
||||
|
||||
void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream);
|
||||
|
||||
void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
|
||||
|
||||
bool cuda_check_support_cooperative_groups();
|
||||
|
||||
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
|
||||
cuda_stream_t *stream);
|
||||
|
||||
void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
|
||||
cuda_stream_t *stream);
|
||||
|
||||
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
|
||||
cuda_stream_t *stream);
|
||||
|
||||
void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
|
||||
cuda_stream_t *stream);
|
||||
|
||||
int cuda_get_number_of_gpus();
|
||||
|
||||
void cuda_synchronize_device(uint32_t gpu_index);
|
||||
|
||||
void cuda_drop(void *ptr, uint32_t gpu_index);
|
||||
|
||||
void cuda_drop_async(void *ptr, cuda_stream_t *stream);
|
||||
|
||||
int cuda_get_max_shared_memory(uint32_t gpu_index);
|
||||
|
||||
void cuda_synchronize_stream(cuda_stream_t *stream);
|
||||
|
||||
void cuda_stream_add_callback(cuda_stream_t *stream,
|
||||
cudaStreamCallback_t callback, void *user_data);
|
||||
|
||||
void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
|
||||
void *host_pointer);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
|
||||
Torus n);
|
||||
#endif
|
||||
100
backends/tfhe-cuda-backend/cuda/include/helper_debug.cuh
Normal file
100
backends/tfhe-cuda-backend/cuda/include/helper_debug.cuh
Normal file
@@ -0,0 +1,100 @@
|
||||
#include "cuComplex.h"
|
||||
#include "thrust/complex.h"
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
|
||||
#define PRINT_VARS
|
||||
#ifdef PRINT_VARS
|
||||
#define PRINT_DEBUG_5(var, begin, end, step, cond) \
|
||||
_print_debug(var, #var, begin, end, step, cond, "", false)
|
||||
#define PRINT_DEBUG_6(var, begin, end, step, cond, text) \
|
||||
_print_debug(var, #var, begin, end, step, cond, text, true)
|
||||
#define CAT(A, B) A##B
|
||||
#define PRINT_SELECT(NAME, NUM) CAT(NAME##_, NUM)
|
||||
#define GET_COUNT(_1, _2, _3, _4, _5, _6, COUNT, ...) COUNT
|
||||
#define VA_SIZE(...) GET_COUNT(__VA_ARGS__, 6, 5, 4, 3, 2, 1)
|
||||
#define PRINT_DEBUG(...) \
|
||||
PRINT_SELECT(PRINT_DEBUG, VA_SIZE(__VA_ARGS__))(__VA_ARGS__)
|
||||
#else
|
||||
#define PRINT_DEBUG(...)
|
||||
#endif
|
||||
|
||||
// Debug helper for unsigned element types: prints var[start], var[start +
// step], ... (indices below `end`) from device code, optionally preceded by
// `text`. Only threads for which `cond` is true print; every thread of the
// block must call this function because it executes __syncthreads() before
// and after printing.
template <typename T>
__device__ typename std::enable_if<std::is_unsigned<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
             bool cond, const char *text, bool has_text) {
  __syncthreads();
  if (cond) {
    if (has_text)
      printf("%s\n", text);
    for (int i = start; i < end; i += step) {
      // Widen to unsigned long long so 64-bit elements are printed in full;
      // "%u" only consumed a 32-bit argument.
      printf("%s[%u]: %llu\n", var_name, i,
             static_cast<unsigned long long>(var[i]));
    }
  }
  __syncthreads();
}
|
||||
|
||||
// Debug helper for signed integer element types: prints var[start],
// var[start + step], ... (indices below `end`) from device code, optionally
// preceded by `text`. Only threads for which `cond` is true print; every
// thread of the block must call this function because it executes
// __syncthreads() before and after printing.
//
// The overload is restricted to integral types: std::is_signed is also true
// for floating-point types, which would make calls with float/double
// ambiguous with the floating-point overload.
template <typename T>
__device__ typename std::enable_if<
    std::is_integral<T>::value && std::is_signed<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
             bool cond, const char *text, bool has_text) {
  __syncthreads();
  if (cond) {
    if (has_text)
      printf("%s\n", text);
    for (int i = start; i < end; i += step) {
      // Widen to long long so 64-bit elements are printed in full; "%d" only
      // consumed a 32-bit argument.
      printf("%s[%u]: %lld\n", var_name, i, static_cast<long long>(var[i]));
    }
  }
  __syncthreads();
}
|
||||
|
||||
// Debug helper for floating-point element types: prints var[start],
// var[start + step], ... (indices below `end`) with 15 decimals, optionally
// preceded by `text`. Only threads for which `cond` is true print; the
// function executes __syncthreads() before and after printing.
template <typename T>
__device__ typename std::enable_if<std::is_floating_point<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
             bool cond, const char *text, bool has_text) {
  __syncthreads();
  if (cond) {
    if (has_text) {
      printf("%s\n", text);
    }
    int idx = start;
    while (idx < end) {
      printf("%s[%u]: %.15f\n", var_name, idx, var[idx]);
      idx += step;
    }
  }
  __syncthreads();
}
|
||||
|
||||
// Debug helper for thrust::complex<double> elements: prints the real and
// imaginary parts of var[start], var[start + step], ... (indices below `end`)
// with 15 decimals, optionally preceded by `text`. Only threads for which
// `cond` is true print; the function executes __syncthreads() before and
// after printing.
template <typename T>
__device__
    typename std::enable_if<std::is_same<T, thrust::complex<double>>::value,
                            void>::type
    _print_debug(T *var, const char *var_name, int start, int end, int step,
                 bool cond, const char *text, bool has_text) {
  __syncthreads();
  if (cond) {
    if (has_text) {
      printf("%s\n", text);
    }
    int idx = start;
    while (idx < end) {
      printf("%s[%u]: %.15f , %.15f\n", var_name, idx, var[idx].real(),
             var[idx].imag());
      idx += step;
    }
  }
  __syncthreads();
}
|
||||
|
||||
// Debug helper for cuDoubleComplex elements: prints the x and y members of
// var[start], var[start + step], ... (indices below `end`) with 15 decimals,
// optionally preceded by `text`. Only threads for which `cond` is true print;
// the function executes __syncthreads() before and after printing.
template <typename T>
__device__
    typename std::enable_if<std::is_same<T, cuDoubleComplex>::value, void>::type
    _print_debug(T *var, const char *var_name, int start, int end, int step,
                 bool cond, const char *text, bool has_text) {
  __syncthreads();
  if (cond) {
    if (has_text) {
      printf("%s\n", text);
    }
    int idx = start;
    while (idx < end) {
      printf("%s[%u]: %.15f , %.15f\n", var_name, idx, var[idx].x, var[idx].y);
      idx += step;
    }
  }
  __syncthreads();
}
|
||||
1341
backends/tfhe-cuda-backend/cuda/include/integer.h
Normal file
1341
backends/tfhe-cuda-backend/cuda/include/integer.h
Normal file
File diff suppressed because it is too large
Load Diff
21
backends/tfhe-cuda-backend/cuda/include/keyswitch.h
Normal file
21
backends/tfhe-cuda-backend/cuda/include/keyswitch.h
Normal file
@@ -0,0 +1,21 @@
|
||||
#ifndef CNCRT_KS_H_
|
||||
#define CNCRT_KS_H_
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
extern "C" {

// Forward declaration so this header can be included on its own; the full
// definition of cuda_stream_t lives in device.h.
struct cuda_stream_t;

// Keyswitch entry point for 32-bit ciphertexts. The ciphertext, index and key
// buffers are passed as untyped pointers.
void cuda_keyswitch_lwe_ciphertext_vector_32(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples);

// Keyswitch entry point for 64-bit ciphertexts. The ciphertext, index and key
// buffers are passed as untyped pointers.
void cuda_keyswitch_lwe_ciphertext_vector_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples);
}
|
||||
|
||||
#endif // CNCRT_KS_H_
|
||||
50
backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
Normal file
50
backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
Normal file
@@ -0,0 +1,50 @@
|
||||
#ifndef CUDA_LINALG_H_
|
||||
#define CUDA_LINALG_H_
|
||||
|
||||
#include "bootstrap.h"
|
||||
#include <cstdint>
|
||||
#include <device.h>
|
||||
|
||||
extern "C" {
|
||||
|
||||
void cuda_negate_lwe_ciphertext_vector_32(cuda_stream_t *stream,
|
||||
void *lwe_array_out,
|
||||
void *lwe_array_in,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count);
|
||||
void cuda_negate_lwe_ciphertext_vector_64(cuda_stream_t *stream,
|
||||
void *lwe_array_out,
|
||||
void *lwe_array_in,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count);
|
||||
void cuda_add_lwe_ciphertext_vector_32(cuda_stream_t *stream,
|
||||
void *lwe_array_out,
|
||||
void *lwe_array_in_1,
|
||||
void *lwe_array_in_2,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count);
|
||||
void cuda_add_lwe_ciphertext_vector_64(cuda_stream_t *stream,
|
||||
void *lwe_array_out,
|
||||
void *lwe_array_in_1,
|
||||
void *lwe_array_in_2,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count);
|
||||
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
|
||||
void *plaintext_array_in, uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count);
|
||||
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
|
||||
void *plaintext_array_in, uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count);
|
||||
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
|
||||
void *cleartext_array_in, uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count);
|
||||
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
|
||||
void *cleartext_array_in, uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count);
|
||||
}
|
||||
|
||||
#endif // CUDA_LINALG_H_
|
||||
18
backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
Normal file
18
backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
Normal file
@@ -0,0 +1,18 @@
|
||||
# Public headers of the backend, kept for reference only. This list used to be
# stored in SOURCES, where it was immediately overwritten by the
# file(GLOB_RECURSE SOURCES ...) call below and therefore never reached
# add_library(). It now has its own variable so the overwrite is no longer
# hidden; the set of files compiled into the library is unchanged.
set(HEADERS
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h)
# Every .cu file below this directory is compiled into the static library.
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
target_include_directories(tfhe_cuda_backend PRIVATE .)
|
||||
1
backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
Normal file
1
backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
Normal file
@@ -0,0 +1 @@
|
||||
#include "ciphertext.cuh"
|
||||
44
backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
Normal file
44
backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
Normal file
@@ -0,0 +1,44 @@
|
||||
#ifndef CUDA_CIPHERTEXT_CUH
|
||||
#define CUDA_CIPHERTEXT_CUH
|
||||
|
||||
#include "ciphertext.h"
|
||||
#include "device.h"
|
||||
#include <cstdint>
|
||||
|
||||
// Queues an asynchronous host-to-GPU copy of `number_of_cts` LWE ciphertexts,
// each made of (lwe_dimension + 1) elements of type T, from `src` to `dest`
// on `stream`, after selecting the stream's GPU.
template <typename T>
void cuda_convert_lwe_ciphertext_vector_to_gpu(T *dest, T *src,
                                               cuda_stream_t *stream,
                                               uint32_t number_of_cts,
                                               uint32_t lwe_dimension) {
  // Abort on failure like the rest of the backend instead of silently
  // continuing on the wrong device.
  check_cuda_error(cudaSetDevice(stream->gpu_index));
  // Compute the byte count in 64 bits; the 32-bit product
  // number_of_cts * (lwe_dimension + 1) could wrap for large batches.
  const uint64_t lwe_size = static_cast<uint64_t>(lwe_dimension) + 1;
  const uint64_t size =
      static_cast<uint64_t>(number_of_cts) * lwe_size * sizeof(T);
  cuda_memcpy_async_to_gpu(dest, src, size, stream);
}
|
||||
|
||||
// 64-bit entry point: reinterprets the untyped buffers as uint64_t arrays and
// forwards to the templated host-to-GPU conversion.
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
                                                  cuda_stream_t *stream,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension) {
  auto *dest_u64 = static_cast<uint64_t *>(dest);
  auto *src_u64 = static_cast<uint64_t *>(src);
  cuda_convert_lwe_ciphertext_vector_to_gpu<uint64_t>(
      dest_u64, src_u64, stream, number_of_cts, lwe_dimension);
}
|
||||
|
||||
// Queues an asynchronous GPU-to-host copy of `number_of_cts` LWE ciphertexts,
// each made of (lwe_dimension + 1) elements of type T, from `src` to `dest`
// on `stream`, after selecting the stream's GPU.
template <typename T>
void cuda_convert_lwe_ciphertext_vector_to_cpu(T *dest, T *src,
                                               cuda_stream_t *stream,
                                               uint32_t number_of_cts,
                                               uint32_t lwe_dimension) {
  // Abort on failure like the rest of the backend instead of silently
  // continuing on the wrong device.
  check_cuda_error(cudaSetDevice(stream->gpu_index));
  // Compute the byte count in 64 bits; the 32-bit product
  // number_of_cts * (lwe_dimension + 1) could wrap for large batches.
  const uint64_t lwe_size = static_cast<uint64_t>(lwe_dimension) + 1;
  const uint64_t size =
      static_cast<uint64_t>(number_of_cts) * lwe_size * sizeof(T);
  cuda_memcpy_async_to_cpu(dest, src, size, stream);
}
|
||||
|
||||
// 64-bit entry point: reinterprets the untyped buffers as uint64_t arrays and
// forwards to the templated GPU-to-host conversion.
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
                                                  cuda_stream_t *stream,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension) {
  auto *dest_u64 = static_cast<uint64_t *>(dest);
  auto *src_u64 = static_cast<uint64_t *>(src);
  cuda_convert_lwe_ciphertext_vector_to_cpu<uint64_t>(
      dest_u64, src_u64, stream, number_of_cts, lwe_dimension);
}
|
||||
|
||||
#endif
|
||||
162
backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
Normal file
162
backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
Normal file
@@ -0,0 +1,162 @@
|
||||
// Include guard: the macro tested and the macro defined must be the same name,
// otherwise the guard never takes effect.
#ifndef CNCRT_CRYPTO_CUH
#define CNCRT_CRYPTO_CUH
|
||||
|
||||
#include "device.h"
|
||||
#include <cstdint>
|
||||
|
||||
/**
|
||||
* GadgetMatrix implements the iterator design pattern to decompose a set of
|
||||
* num_poly consecutive polynomials with degree params::degree. A total of
|
||||
* level_count levels is expected and each call to decompose_and_compress_next()
|
||||
* writes to the result the next level. It is also possible to advance an
|
||||
* arbitrary amount of levels by using decompose_and_compress_level().
|
||||
*
|
||||
* This class always decomposes the entire set of num_poly polynomials.
|
||||
* By default, it works on a single polynomial.
|
||||
*/
|
||||
#pragma once
|
||||
// Decomposes, level by level and in place, the `num_poly` polynomials stored
// contiguously in `state` (params::degree coefficients each). The buffer
// passed as `state` is overwritten: the constructor shifts every coefficient
// and each decomposition step consumes base_log more bits of it.
template <typename T, class params> class GadgetMatrix {
private:
  uint32_t level_count;
  uint32_t base_log;
  uint32_t num_poly;
  int current_level;
  T mask_mod_b;
  T *state;

  // Extracts the next digit from `element`: takes the low base_log bits,
  // shifts them out, and propagates a carry into `element` so the returned
  // digit is balanced (it may be negative when read as a signed value).
  __device__ __forceinline__ T decompose_element(T &element) {
    T digit = element & mask_mod_b;
    element >>= base_log;
    T carry = ((digit - 1ll) | element) & digit;
    carry >>= (base_log - 1);
    element += carry;
    digit -= carry << base_log;
    return digit;
  }

public:
  // Keeps only the base_log * level_count most significant bits of every
  // coefficient of `state`, then synchronizes the block. Each thread handles
  // num_poly * params::opt coefficients with stride params::degree /
  // params::opt. Initializers are listed in declaration order.
  __device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
                          uint32_t num_poly = 1)
      : level_count(level_count), base_log(base_log), num_poly(num_poly),
        state(state) {

    mask_mod_b = (1ll << base_log) - 1ll;
    current_level = level_count;
    int tid = threadIdx.x;
    for (int i = 0; i < num_poly * params::opt; i++) {
      state[tid] >>= (sizeof(T) * 8 - base_log * level_count);
      tid += params::degree / params::opt;
    }
    synchronize_threads_in_block();
  }

  // Decomposes all polynomials at once: polynomial j is written to
  // result + j * params::degree / 2.
  __device__ void decompose_and_compress_next(double2 *result) {
    for (int j = 0; j < num_poly; j++) {
      auto result_slice = result + j * params::degree / 2;
      decompose_and_compress_next_polynomial(result_slice, j);
    }
  }

  // Decomposes a single polynomial (index j). Coefficient tid becomes the
  // real part and coefficient tid + params::degree / 2 the imaginary part of
  // result[tid]. Ends with a block-wide synchronization.
  __device__ void decompose_and_compress_next_polynomial(double2 *result,
                                                         int j) {
    if (j == 0)
      current_level -= 1;

    int tid = threadIdx.x;
    auto state_slice = state + j * params::degree;
    for (int i = 0; i < params::opt / 2; i++) {
      T res_re = decompose_element(state_slice[tid]);
      T res_im = decompose_element(state_slice[tid + params::degree / 2]);

      result[tid].x = (int32_t)res_re;
      result[tid].y = (int32_t)res_im;

      tid += params::degree / params::opt;
    }
    synchronize_threads_in_block();
  }

  // Same as decompose_and_compress_next_polynomial, except that the digits
  // are written to result[i] (the thread's own iteration index) instead of
  // result[tid].
  __device__ void
  decompose_and_compress_next_polynomial_elements(double2 *result, int j) {
    if (j == 0)
      current_level -= 1;

    int tid = threadIdx.x;
    auto state_slice = state + j * params::degree;
    for (int i = 0; i < params::opt / 2; i++) {
      T res_re = decompose_element(state_slice[tid]);
      T res_im = decompose_element(state_slice[tid + params::degree / 2]);

      result[i].x = (int32_t)res_re;
      result[i].y = (int32_t)res_im;

      tid += params::degree / params::opt;
    }
    synchronize_threads_in_block();
  }

  // Runs (level_count - level) decomposition steps. The count is computed in
  // signed arithmetic so that level > level_count performs no step instead of
  // wrapping around to a huge unsigned trip count.
  __device__ void decompose_and_compress_level(double2 *result, int level) {
    int remaining = static_cast<int>(level_count) - level;
    for (int i = 0; i < remaining; i++)
      decompose_and_compress_next(result);
  }
};
|
||||
|
||||
// Stateless decomposer for a single torus element: precomputes the rounding
// offset once and extracts any requested level on demand.
template <typename T> class GadgetMatrixSingle {
private:
  uint32_t level_count;
  uint32_t base_log;
  uint32_t mask;
  uint32_t halfbg;
  T offset;

public:
  // Precomputes mask = 2^base_log - 1, halfbg = 2^base_log / 2 and
  // offset = halfbg * sum over levels of 2^(bits - (level + 1) * base_log).
  __device__ GadgetMatrixSingle(uint32_t base_log, uint32_t level_count)
      : base_log(base_log), level_count(level_count) {
    const uint32_t base = 1 << base_log;
    mask = base - 1;
    halfbg = base / 2;

    T level_weights = 0;
    for (int lvl = 0; lvl < this->level_count; ++lvl)
      level_weights += 1ULL << (sizeof(T) * 8 - (lvl + 1) * this->base_log);

    offset = level_weights * halfbg;
  }

  // Returns the digit of `element` at `level` (0 = most significant),
  // recentred by subtracting halfbg.
  __device__ T decompose_one_level_single(T element, uint32_t level) {
    const uint32_t shift = (sizeof(T) * 8 - (level + 1) * this->base_log);
    const T shifted = element + this->offset;
    const T digit = (shifted >> shift) & this->mask;
    return (T)(digit - this->halfbg);
  }
};
|
||||
|
||||
// Extracts one digit from `state`, which is updated in place.
// - state: remaining value; its low base_log bits are consumed and a carry
//   may be added back
// - mask_mod_b: bit mask selecting the digit (callers pass 2^base_log - 1)
// - base_log: number of bits per digit
// Returns the digit; it wraps to a large unsigned value when the balanced
// digit is negative.
template <typename Torus>
__device__ Torus decompose_one(Torus &state, Torus mask_mod_b, int base_log) {
  Torus digit = state & mask_mod_b;
  state >>= base_log;

  // Carry is 1 when the digit must be recentred, 0 otherwise.
  Torus carry = ((digit - 1ll) | state) & digit;
  carry >>= base_log - 1;

  state += carry;
  digit -= carry << base_log;
  return digit;
}
|
||||
|
||||
#endif // CNCRT_CRYPTO_CUH
|
||||
74
backends/tfhe-cuda-backend/cuda/src/crypto/ggsw.cuh
Normal file
74
backends/tfhe-cuda-backend/cuda/src/crypto/ggsw.cuh
Normal file
@@ -0,0 +1,74 @@
|
||||
#ifndef CNCRT_GGSW_CUH
|
||||
#define CNCRT_GGSW_CUH
|
||||
|
||||
#include "device.h"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
|
||||
/**
 * Converts one polynomial per block from torus coefficients to its
 * half-size FFT representation.
 *
 * - dest: output, params::degree / 2 complex values per block
 * - src: input, params::degree torus coefficients per block
 * - device_mem: global-memory scratch, only used when SMD != FULLSM
 *
 * Each thread handles params::opt / 2 complex values with a stride of
 * params::degree / params::opt. With SMD == FULLSM the working buffer is the
 * dynamic shared memory, which must hold params::degree / 2 double2 values.
 */
template <typename T, typename ST, class params, sharedMemDegree SMD>
__global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
                                             int8_t *device_mem) {

  extern __shared__ int8_t sharedmem[];
  double2 *selected_memory;

  if constexpr (SMD == FULLSM)
    selected_memory = (double2 *)sharedmem;
  else
    // Take the ADDRESS of this block's scratch slice (the previous code cast
    // the byte value itself to a pointer). A block needs params::degree / 2
    // double2 values, i.e. params::degree * sizeof(double) bytes.
    // NOTE(review): assumes the caller sizes device_mem as
    // gridDim.x * params::degree * sizeof(double) bytes - confirm.
    selected_memory =
        (double2 *)&device_mem[(size_t)blockIdx.x * params::degree *
                               sizeof(double)];

  // Compression
  int offset = blockIdx.x * blockDim.x;

  int tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // Coefficient tid feeds the real part, coefficient tid + degree / 2 the
    // imaginary part; both are read as the signed type ST and scaled.
    ST x = src[(tid) + params::opt * offset];
    ST y = src[(tid + params::degree / 2) + params::opt * offset];
    selected_memory[tid].x = x / (double)std::numeric_limits<T>::max();
    selected_memory[tid].y = y / (double)std::numeric_limits<T>::max();
    tid += params::degree / params::opt;
  }
  synchronize_threads_in_block();

  // Switch to the FFT space
  NSMFFT_direct<HalfDegree<params>>(selected_memory);
  synchronize_threads_in_block();

  // Write the output to global memory
  tid = threadIdx.x;
#pragma unroll
  for (int j = 0; j < params::opt / 2; j++) {
    dest[tid + (params::opt >> 1) * offset] = selected_memory[tid];
    tid += params::degree / params::opt;
  }
}
|
||||
|
||||
/**
 * Applies the FFT transform on sequence of GGSW ciphertexts already in the
 * global memory
 *
 * - stream: stream wrapper the kernel is launched on; its gpu_index selects
 *   the device
 * - dest: output buffer for the transformed polynomials
 * - src: input torus polynomials
 * - d_mem: global-memory scratch used when shared memory is too small
 * - r, glwe_dim, level_count: the grid holds
 *   r * (glwe_dim + 1)^2 * level_count blocks, one per polynomial
 * - polynomial_size: number of coefficients per polynomial
 * - gpu_index: unused here, stream->gpu_index is used instead
 * - max_shared_memory: shared-memory budget in bytes; selects the variant
 */
template <typename T, typename ST, class params>
void batch_fft_ggsw_vector(cuda_stream_t *stream, double2 *dest, T *src,
                           int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
                           uint32_t polynomial_size, uint32_t level_count,
                           uint32_t gpu_index, uint32_t max_shared_memory) {
  check_cuda_error(cudaSetDevice(stream->gpu_index));

  int shared_memory_size = sizeof(double) * polynomial_size;

  int gridSize = r * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
  int blockSize = polynomial_size / params::opt;

  if (max_shared_memory < shared_memory_size) {
    device_batch_fft_ggsw_vector<T, ST, params, NOSM>
        <<<gridSize, blockSize, 0, stream->stream>>>(dest, src, d_mem);
  } else {
    // Dynamic shared memory above the default 48 KB per-block limit must be
    // requested explicitly, otherwise the launch is rejected.
    check_cuda_error(cudaFuncSetAttribute(
        device_batch_fft_ggsw_vector<T, ST, params, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
    device_batch_fft_ggsw_vector<T, ST, params, FULLSM>
        <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(dest, src,
                                                                      d_mem);
  }
  check_cuda_error(cudaGetLastError());
}
|
||||
|
||||
#endif // CNCRT_GGSW_CUH
|
||||
48
backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
Normal file
48
backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
Normal file
@@ -0,0 +1,48 @@
|
||||
#include "keyswitch.cuh"
|
||||
#include "keyswitch.h"
|
||||
#include <cstdint>
|
||||
|
||||
/* Keyswitch for a batch of 32-bit LWE ciphertexts.
 * Every untyped buffer is reinterpreted as a uint32_t array and the call is
 * forwarded to the templated implementation. See the 64-bit variant for the
 * meaning of each argument.
 */
void cuda_keyswitch_lwe_ciphertext_vector_32(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples) {
  auto *out = static_cast<uint32_t *>(lwe_array_out);
  auto *out_indexes = static_cast<uint32_t *>(lwe_output_indexes);
  auto *in = static_cast<uint32_t *>(lwe_array_in);
  auto *in_indexes = static_cast<uint32_t *>(lwe_input_indexes);
  auto *key = static_cast<uint32_t *>(ksk);

  cuda_keyswitch_lwe_ciphertext_vector(
      stream, out, out_indexes, in, in_indexes, key, lwe_dimension_in,
      lwe_dimension_out, base_log, level_count, num_samples);
}
|
||||
|
||||
/* Keyswitch for a batch of 64-bit LWE ciphertexts.
 *
 * - stream: stream wrapper used for the kernel launch
 * - lwe_array_out: output batch of num_samples keyswitched ciphertexts c =
 * (a0,..an-1,b) where n is the output LWE dimension (lwe_dimension_out)
 * - lwe_output_indexes: for each sample, the index of its output ciphertext
 * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing
 * lwe_dimension_in mask values + 1 body value
 * - lwe_input_indexes: for each sample, the index of its input ciphertext
 * - ksk: the keyswitch key to be used in the operation
 * - base_log: the log of the base used in the decomposition (should be the
 * one used to create the ksk)
 * - level_count: number of decomposition levels
 * - num_samples: number of ciphertexts in the batch
 *
 * Every untyped buffer is reinterpreted as a uint64_t array and the call is
 * forwarded to the templated host wrapper, which launches the keyswitch
 * kernel.
 */
void cuda_keyswitch_lwe_ciphertext_vector_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples) {
  auto *out = static_cast<uint64_t *>(lwe_array_out);
  auto *out_indexes = static_cast<uint64_t *>(lwe_output_indexes);
  auto *in = static_cast<uint64_t *>(lwe_array_in);
  auto *in_indexes = static_cast<uint64_t *>(lwe_input_indexes);
  auto *key = static_cast<uint64_t *>(ksk);

  cuda_keyswitch_lwe_ciphertext_vector(
      stream, out, out_indexes, in, in_indexes, key, lwe_dimension_in,
      lwe_dimension_out, base_log, level_count, num_samples);
}
|
||||
140
backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
Normal file
140
backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
Normal file
@@ -0,0 +1,140 @@
|
||||
#ifndef CNCRT_KS_CUH
|
||||
#define CNCRT_KS_CUH
|
||||
|
||||
#include "device.h"
|
||||
#include "gadget.cuh"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "torus.cuh"
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
// Returns a pointer to the keyswitch-key block for input coefficient `i` and
// decomposition level `level`. Blocks are lwe_dimension_out + 1 elements long
// and laid out level-major within each input coefficient.
template <typename Torus>
__device__ Torus *get_ith_block(Torus *ksk, int i, int level,
                                uint32_t lwe_dimension_out,
                                uint32_t level_count) {
  const uint32_t block_len = lwe_dimension_out + 1;
  // Same value as i * level_count * block_len + level * block_len.
  int pos = (i * level_count + level) * block_len;
  return ksk + pos;
}
|
||||
|
||||
/*
 * keyswitch kernel
 * Each thread handles a piece of the following equation:
 * $$GLWE_s2(\Delta.m+e) = (0,0,..,0,b) - \sum_{i=0,k-1} <Dec(a_i),
 * (GLWE_s2(s1_i q/beta),..,GLWE(s1_i q/beta^l)>$$ where k is the dimension of
 * the GLWE ciphertext. If the polynomial dimension in GLWE is > 1, this
 * equation is solved for each polynomial coefficient. where Dec denotes the
 * decomposition with base beta and l levels and the inner product is done
 * between the decomposition of a_i and l GLWE encryptions of s1_i q/\beta^j,
 * with j in [1,l] We obtain a GLWE encryption of Delta.m (with Delta the
 * scaling factor) under key s2 instead of s1, with an increased noise
 *
 * Launch layout: block blockIdx.x processes the input ciphertext selected by
 * lwe_input_indexes[blockIdx.x] and writes the output ciphertext selected by
 * lwe_output_indexes[blockIdx.x]. Dynamic shared memory must hold
 * lwe_dimension_out + 1 Torus values. Thread tid owns the output
 * coefficients tid + k * blockDim.x for k < lwe_upper (if tid < cutoff) or
 * k < lwe_lower (otherwise).
 */
template <typename Torus>
__global__ void
keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lwe_array_in,
          Torus *lwe_input_indexes, Torus *ksk, uint32_t lwe_dimension_in,
          uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
          int lwe_lower, int lwe_upper, int cutoff) {
  int tid = threadIdx.x;

  extern __shared__ int8_t sharedmem[];

  // Per-block accumulator for the output ciphertext.
  Torus *local_lwe_array_out = (Torus *)sharedmem;

  auto block_lwe_array_in = get_chunk(
      lwe_array_in, lwe_input_indexes[blockIdx.x], lwe_dimension_in + 1);
  auto block_lwe_array_out = get_chunk(
      lwe_array_out, lwe_output_indexes[blockIdx.x], lwe_dimension_out + 1);

  // Number of output coefficients owned by this thread.
  const int lwe_part_per_thd = (tid < cutoff) ? lwe_upper : lwe_lower;

  for (int k = 0; k < lwe_part_per_thd; k++) {
    int idx = tid + k * blockDim.x;
    local_lwe_array_out[idx] = 0;
  }
  // The body slot may be owned by a thread other than thread 0: it must be
  // zeroed before thread 0 stores the input body into it.
  __syncthreads();

  if (tid == 0) {
    local_lwe_array_out[lwe_dimension_out] =
        block_lwe_array_in[lwe_dimension_in];
  }

  // Loop invariants of the decomposition.
  const Torus mask_mod_b = (1ll << base_log) - 1ll;
  const auto state_shift = sizeof(Torus) * 8 - base_log * level_count;

  for (int i = 0; i < lwe_dimension_in; i++) {

    // On the first iteration this also publishes thread 0's body write to
    // the thread that owns that slot.
    __syncthreads();

    Torus a_i =
        round_to_closest_multiple(block_lwe_array_in[i], base_log, level_count);

    Torus state = a_i >> state_shift;

    for (int j = 0; j < level_count; j++) {
      auto ksk_block = get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
      Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
      for (int k = 0; k < lwe_part_per_thd; k++) {
        int idx = tid + k * blockDim.x;
        local_lwe_array_out[idx] -= (Torus)ksk_block[idx] * decomposed;
      }
    }
  }

  // Each thread copies the coefficients it owns back to global memory.
  for (int k = 0; k < lwe_part_per_thd; k++) {
    int idx = tid + k * blockDim.x;
    block_lwe_array_out[idx] = local_lwe_array_out[idx];
  }
}
|
||||
|
||||
/// Host wrapper launching the keyswitch kernel on `stream`.
/// Assumes lwe_array_in is in the GPU.
///
/// One block of 128 threads is launched per sample, with
/// (lwe_dimension_out + 1) * sizeof(Torus) bytes of dynamic shared memory.
/// The lwe_dimension_out + 1 output coefficients are split between the
/// threads: the first `cutoff` threads handle `lwe_upper` coefficients, the
/// others `lwe_lower`.
template <typename Torus>
__host__ void cuda_keyswitch_lwe_ciphertext_vector(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *ksk,
    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples) {

  check_cuda_error(cudaSetDevice(stream->gpu_index));
  constexpr int ideal_threads = 128;

  int lwe_size = lwe_dimension_out + 1;

  // Integer split of lwe_size over ideal_threads: `cutoff` threads get one
  // extra coefficient when the division is not exact.
  int lwe_lower = lwe_size / ideal_threads;
  int cutoff = lwe_size % ideal_threads;
  int lwe_upper = (cutoff == 0) ? lwe_lower : lwe_lower + 1;

  // 64-bit byte count: lwe_size * num_samples may not fit in 32 bits.
  uint64_t out_bytes = sizeof(Torus) * static_cast<uint64_t>(lwe_size) *
                       static_cast<uint64_t>(num_samples);

  int shared_mem = sizeof(Torus) * lwe_size;

  cuda_memset_async(lwe_array_out, 0, out_bytes, stream);
  check_cuda_error(cudaGetLastError());

  dim3 grid(num_samples, 1, 1);
  dim3 threads(ideal_threads, 1, 1);

  keyswitch<Torus><<<grid, threads, shared_mem, stream->stream>>>(
      lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
      lwe_dimension_in, lwe_dimension_out, base_log, level_count, lwe_lower,
      lwe_upper, cutoff);
  check_cuda_error(cudaGetLastError());
}
|
||||
|
||||
#endif
|
||||
74
backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
Normal file
74
backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
Normal file
@@ -0,0 +1,74 @@
|
||||
#ifndef CNCRT_TORUS_CUH
|
||||
#define CNCRT_TORUS_CUH
|
||||
|
||||
#include "types/int128.cuh"
|
||||
#include <limits>
|
||||
|
||||
// Generic conversion of a double to a torus type: plain value conversion,
// written to `r`.
template <typename T>
__device__ inline void typecast_double_to_torus(double x, T &r) {
  r = static_cast<T>(x);
}
|
||||
|
||||
// 32-bit specialization: converts through the __double2uint_rn intrinsic.
template <>
__device__ inline void typecast_double_to_torus<uint32_t>(double x,
                                                          uint32_t &r) {
  const unsigned int converted = __double2uint_rn(x);
  r = converted;
}
|
||||
|
||||
// 64-bit specialization: goes through a 128-bit conversion and keeps the low
// 64 bits.
template <>
__device__ inline void typecast_double_to_torus<uint64_t>(double x,
                                                          uint64_t &r) {
  // The ull intrinsic does not behave in the same way on all architectures and
  // on some platforms this causes the cmux tree test to fail
  // Hence the intrinsic is not used here
  const uint128 wide = make_uint128_from_float(x);
  r = wide.lo_;
}
|
||||
|
||||
// Rounds `x` to the closest multiple of 2^shift, where
// shift = bits(T) - level_count * base_log, i.e. keeps the
// level_count * base_log most significant bits with rounding. The result
// wraps around when the rounding carries out of the top bit.
template <typename T>
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
                                              uint32_t level_count) {
  T shift = sizeof(T) * 8 - level_count * base_log;
  // Every bit is kept: nothing to round. Without this guard `shift - 1`
  // wraps around and the shifts below are undefined.
  if (shift == 0)
    return x;
  // Built from T(1) so that selecting the top bit is not a signed overflow.
  T mask = T(1) << (shift - 1);
  // Most significant dropped bit decides the rounding direction.
  T b = (x & mask) >> (shift - 1);
  T res = x >> shift;
  res += b;
  res <<= shift;
  return res;
}
|
||||
|
||||
// Rescales a torus element: divides it by 2^bits(T), multiplies by the VALUE
// of `log_shift` (used as a plain factor, not as an exponent), rounds to
// nearest and stores the result in `output`.
template <typename T>
__device__ __forceinline__ void rescale_torus_element(T element, T &output,
                                                      uint32_t log_shift) {
  const double modulus = double(std::numeric_limits<T>::max()) + 1.0;
  const double scaled = (double)element / modulus * (double)log_shift;
  output = round(scaled);
}
|
||||
|
||||
// Value-returning variant: divides the element by 2^bits(T), multiplies by
// the VALUE of `log_shift` (used as a plain factor, not as an exponent) and
// rounds to nearest.
template <typename T>
__device__ __forceinline__ T rescale_torus_element(T element,
                                                   uint32_t log_shift) {
  const double modulus = double(std::numeric_limits<T>::max()) + 1.0;
  const double scaled = (double)element / modulus * (double)log_shift;
  return round(scaled);
}
|
||||
|
||||
// 32-bit specialization of the in/out rescaling, using the explicit
// unsigned-to-double conversion intrinsics.
template <>
__device__ __forceinline__ void
rescale_torus_element<uint32_t>(uint32_t element, uint32_t &output,
                                uint32_t log_shift) {
  const double modulus =
      __uint2double_rn(std::numeric_limits<uint32_t>::max()) + 1.0;
  const double scaled =
      __uint2double_rn(element) / modulus * __uint2double_rn(log_shift);
  output = round(scaled);
}
|
||||
|
||||
// 64-bit specialization of the in/out rescaling, using the explicit
// unsigned-to-double conversion intrinsics.
template <>
__device__ __forceinline__ void
rescale_torus_element<uint64_t>(uint64_t element, uint64_t &output,
                                uint32_t log_shift) {
  const double modulus =
      __ull2double_rn(std::numeric_limits<uint64_t>::max()) + 1.0;
  const double scaled =
      __ull2double_rn(element) / modulus * __uint2double_rn(log_shift);
  output = round(scaled);
}
|
||||
#endif // CNCRT_TORUS_CUH
|
||||
242
backends/tfhe-cuda-backend/cuda/src/device.cu
Normal file
242
backends/tfhe-cuda-backend/cuda/src/device.cu
Normal file
@@ -0,0 +1,242 @@
|
||||
#include "device.h"
|
||||
#include <cstdint>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
/// Unsafe function to create a CUDA stream, must check first that GPU exists.
/// Selects `gpu_index` as the current device and returns a heap-allocated
/// stream wrapper bound to it.
cuda_stream_t *cuda_create_stream(uint32_t gpu_index) {
  check_cuda_error(cudaSetDevice(gpu_index));
  return new cuda_stream_t(gpu_index);
}
|
||||
|
||||
/// Unsafe function to destroy CUDA stream, must check first the GPU exists.
/// Delegates to the wrapper's release() method.
void cuda_destroy_stream(cuda_stream_t *stream) {
  stream->release();
}
|
||||
|
||||
/// Unsafe function that will try to allocate even if gpu_index is invalid
/// or if there's not enough memory. A safe wrapper around it must call
/// cuda_check_valid_malloc() first.
/// Returns a pointer to `size` bytes allocated on device `gpu_index`.
void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
  check_cuda_error(cudaSetDevice(gpu_index));

  void *device_ptr;
  check_cuda_error(cudaMalloc(&device_ptr, size));
  return device_ptr;
}
|
||||
|
||||
/// Allocates a size-byte array at the device memory. Tries to do it
/// asynchronously: with CUDA runtime >= 11.2 and a device that reports memory
/// pool support, the allocation is stream-ordered on `stream`; in every other
/// case it falls back to cudaMalloc.
void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) {
  check_cuda_error(cudaSetDevice(stream->gpu_index));
  void *device_ptr;

#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
  int pools_supported;
  check_cuda_error(cudaDeviceGetAttribute(
      &pools_supported, cudaDevAttrMemoryPoolsSupported, stream->gpu_index));

  if (!pools_supported) {
    check_cuda_error(cudaMalloc(&device_ptr, size));
  } else {
    check_cuda_error(cudaMallocAsync(&device_ptr, size, stream->stream));
  }
#else
  check_cuda_error(cudaMalloc(&device_ptr, size));
#endif
  return device_ptr;
}
|
||||
|
||||
/// Check that allocation is valid: panics when `size` bytes exceed the free
/// memory currently reported for device `gpu_index`.
void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
  check_cuda_error(cudaSetDevice(gpu_index));
  size_t total_mem, free_mem;
  check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
  if (size > free_mem) {
    // uint64_t is not `unsigned long` on every platform: print it through an
    // explicit unsigned long long conversion so the format always matches.
    PANIC("Cuda error: not enough memory on device. "
          "Available: %zu vs Requested: %llu",
          free_mem, (unsigned long long)size)
  }
}
|
||||
|
||||
/// Returns
///  false if Cooperative Groups is not supported.
///  true otherwise
/// The attribute is queried on device 0.
bool cuda_check_support_cooperative_groups() {
  int supported = 0;
  check_cuda_error(
      cudaDeviceGetAttribute(&supported, cudaDevAttrCooperativeLaunch, 0));
  return supported > 0;
}
|
||||
|
||||
/// Copy memory to the GPU asynchronously: enqueues a host-to-device copy of
/// `size` bytes on `stream`. A zero-sized copy is a no-op.
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
                              cuda_stream_t *stream) {
  if (size == 0)
    return;

  cudaPointerAttributes dest_attr;
  check_cuda_error(cudaPointerGetAttributes(&dest_attr, dest));
  const bool on_stream_gpu = dest_attr.device == stream->gpu_index;
  const bool is_device_memory = dest_attr.type == cudaMemoryTypeDevice;
  // Rejected only when BOTH checks fail.
  // NOTE(review): a device pointer living on another GPU is accepted -
  // confirm this is intended.
  if (!on_stream_gpu && !is_device_memory) {
    PANIC("Cuda error: invalid device pointer in async copy to GPU.")
  }

  check_cuda_error(cudaSetDevice(stream->gpu_index));
  check_cuda_error(
      cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream->stream));
}
|
||||
|
||||
/// Copy memory within a GPU asynchronously: enqueues a device-to-device copy
/// of `size` bytes on `stream`. A zero-sized copy is a no-op. Panics when the
/// two pointers are reported on different devices.
void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
                                  cuda_stream_t *stream) {
  if (size == 0)
    return;

  // Each pointer is rejected only when it is BOTH on another device and not
  // device memory.
  // NOTE(review): confirm that accepting either condition alone is intended.
  cudaPointerAttributes dest_attr;
  check_cuda_error(cudaPointerGetAttributes(&dest_attr, dest));
  const bool dest_acceptable = dest_attr.device == stream->gpu_index ||
                               dest_attr.type == cudaMemoryTypeDevice;
  if (!dest_acceptable) {
    PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
  }

  cudaPointerAttributes src_attr;
  check_cuda_error(cudaPointerGetAttributes(&src_attr, src));
  const bool src_acceptable = src_attr.device == stream->gpu_index ||
                              src_attr.type == cudaMemoryTypeDevice;
  if (!src_acceptable) {
    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
  }

  if (src_attr.device != dest_attr.device) {
    PANIC("Cuda error: different devices specified in copy from GPU to GPU.")
  }

  check_cuda_error(cudaSetDevice(stream->gpu_index));
  check_cuda_error(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice,
                                   stream->stream));
}
|
||||
|
||||
/// Synchronizes device: selects `gpu_index` as the current device, then
/// blocks the host until cudaDeviceSynchronize() returns.
void cuda_synchronize_device(uint32_t gpu_index) {
  check_cuda_error(cudaSetDevice(gpu_index));

  check_cuda_error(cudaDeviceSynchronize());
}
|
||||
|
||||
/// Enqueues a memset of `size` bytes at `dest` on `stream`; `val` is
/// forwarded unchanged to cudaMemsetAsync. A zero-sized request is a no-op.
void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                       cuda_stream_t *stream) {
  if (size == 0)
    return;

  cudaPointerAttributes dest_attr;
  check_cuda_error(cudaPointerGetAttributes(&dest_attr, dest));
  const bool on_stream_gpu = dest_attr.device == stream->gpu_index;
  const bool is_device_memory = dest_attr.type == cudaMemoryTypeDevice;
  // Rejected only when BOTH checks fail.
  if (!on_stream_gpu && !is_device_memory) {
    PANIC("Cuda error: invalid dest device pointer in cuda memset.")
  }

  check_cuda_error(cudaSetDevice(stream->gpu_index));
  check_cuda_error(cudaMemsetAsync(dest, val, size, stream->stream));
}
|
||||
|
||||
/// Fill kernel: each thread writes `value` into array[index] when its global
/// 1D index is below `n`.
template <typename Torus>
__global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
  // 64-bit global index: a 32-bit signed index overflows once
  // blockIdx.x * blockDim.x exceeds INT_MAX and would then be compared
  // against the unsigned `n` after conversion.
  uint64_t index =
      static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (index < n)
    array[index] = value;
}
|
||||
|
||||
/// Enqueues on `*stream` a kernel that sets the first `n` elements of
/// `d_array` to `value`. A request with n == 0 is a no-op. Panics when
/// `d_array` is not reported as device memory.
template <typename Torus>
void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
                          Torus n) {
  // Nothing to write; a grid of zero blocks is not a valid launch
  // configuration.
  if (n == 0)
    return;

  cudaPointerAttributes attr;
  check_cuda_error(cudaPointerGetAttributes(&attr, d_array));
  if (attr.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid dest device pointer in cuda set value.")
  }
  int block_size = 256;
  // Ceil-division carried out in 64 bits so that n + block_size - 1 cannot
  // wrap around in Torus arithmetic.
  int num_blocks = static_cast<int>(
      (static_cast<uint64_t>(n) + block_size - 1) / block_size);

  // Launch the kernel
  cuda_set_value_kernel<<<num_blocks, block_size, 0, *stream>>>(d_array, value,
                                                                n);
  check_cuda_error(cudaGetLastError());
}

/// Explicitly instantiate cuda_set_value_async for 32 and 64 bits
template void cuda_set_value_async(cudaStream_t *stream, uint64_t *d_array,
                                   uint64_t value, uint64_t n);
template void cuda_set_value_async(cudaStream_t *stream, uint32_t *d_array,
                                   uint32_t value, uint32_t n);
|
||||
|
||||
/// Copy memory to the CPU asynchronously: enqueues a device-to-host copy of
/// `size` bytes on `stream`. A zero-sized copy is a no-op.
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                              cuda_stream_t *stream) {
  if (size == 0)
    return;

  cudaPointerAttributes src_attr;
  check_cuda_error(cudaPointerGetAttributes(&src_attr, src));
  const bool on_stream_gpu = src_attr.device == stream->gpu_index;
  const bool is_device_memory = src_attr.type == cudaMemoryTypeDevice;
  // Rejected only when BOTH checks fail.
  // NOTE(review): a device pointer living on another GPU is accepted -
  // confirm this is intended.
  if (!on_stream_gpu && !is_device_memory) {
    PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
  }

  check_cuda_error(cudaSetDevice(stream->gpu_index));
  check_cuda_error(
      cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream->stream));
}
|
||||
|
||||
/// Return number of GPUs available, as reported by cudaGetDeviceCount.
int cuda_get_number_of_gpus() {
  int device_count;
  check_cuda_error(cudaGetDeviceCount(&device_count));

  return device_count;
}
|
||||
|
||||
/// Drop a cuda array: selects `gpu_index` as the current device and frees
/// `ptr` with cudaFree.
void cuda_drop(void *ptr, uint32_t gpu_index) {
  check_cuda_error(cudaSetDevice(gpu_index));

  check_cuda_error(cudaFree(ptr));
}
|
||||
|
||||
/// Drop a cuda array asynchronously, if supported on the device: with CUDA
/// runtime >= 11.2 and a device that reports memory pool support, the free is
/// stream-ordered on `stream`; in every other case it falls back to cudaFree.
void cuda_drop_async(void *ptr, cuda_stream_t *stream) {

  check_cuda_error(cudaSetDevice(stream->gpu_index));
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
  int pools_supported;
  check_cuda_error(cudaDeviceGetAttribute(
      &pools_supported, cudaDevAttrMemoryPoolsSupported, stream->gpu_index));

  if (!pools_supported) {
    check_cuda_error(cudaFree(ptr));
  } else {
    check_cuda_error(cudaFreeAsync(ptr, stream->stream));
  }
#else
  check_cuda_error(cudaFree(ptr));
#endif
}
|
||||
|
||||
/// Get the maximum size for the shared memory: returns
/// sharedMemPerMultiprocessor for compute capability major >= 6 and
/// sharedMemPerBlock otherwise.
int cuda_get_max_shared_memory(uint32_t gpu_index) {
  check_cuda_error(cudaSetDevice(gpu_index));

  cudaDeviceProp prop;
  check_cuda_error(cudaGetDeviceProperties(&prop, gpu_index));

  int max_shared_memory = (prop.major >= 6) ? prop.sharedMemPerMultiprocessor
                                            : prop.sharedMemPerBlock;
  return max_shared_memory;
}
|
||||
|
||||
/// Delegates to the stream wrapper's synchronize() method.
void cuda_synchronize_stream(cuda_stream_t *stream) {
  stream->synchronize();
}
|
||||
|
||||
/// Registers `callback` on the underlying CUDA stream through
/// cudaStreamAddCallback (flags = 0); `user_data` is handed to the callback
/// unchanged.
void cuda_stream_add_callback(cuda_stream_t *stream,
                              cudaStreamCallback_t callback, void *user_data) {
  const unsigned int flags = 0;
  check_cuda_error(
      cudaStreamAddCallback(stream->stream, callback, user_data, flags));
}
|
||||
|
||||
/// Stream callback that releases a host allocation with free(). `stream` and
/// `status` are part of the callback signature and are not used.
void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
                                  void *host_pointer) {
  (void)stream;
  (void)status;
  free(host_pointer);
}
|
||||
725
backends/tfhe-cuda-backend/cuda/src/fft/bnsmfft.cuh
Normal file
725
backends/tfhe-cuda-backend/cuda/src/fft/bnsmfft.cuh
Normal file
@@ -0,0 +1,725 @@
|
||||
#ifndef GPU_BOOTSTRAP_FFT_CUH
|
||||
#define GPU_BOOTSTRAP_FFT_CUH
|
||||
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "twiddles.cuh"
|
||||
#include "types/complex/operations.cuh"
|
||||
|
||||
/*
|
||||
* Direct negacyclic FFT:
|
||||
* - before the FFT the N real coefficients are stored into a
|
||||
* N/2 sized complex with the even coefficients in the real part
|
||||
* and the odd coefficients in the imaginary part. This is referred to
|
||||
* as the half-size FFT
|
||||
 *  - when calling NSMFFT_direct for the forward negacyclic FFT of PBS,
|
||||
* opt is divided by 2 because the butterfly pattern is always applied
|
||||
* between pairs of coefficients
|
||||
* - instead of twisting each coefficient A_j before the FFT by
|
||||
* multiplying by the w^j roots of unity (aka twiddles, w=exp(-i pi /N)),
|
||||
* the FFT is modified, and for each level k of the FFT the twiddle:
|
||||
* w_j,k = exp(-i pi j/2^k)
|
||||
* is replaced with:
|
||||
* \zeta_j,k = exp(-i pi (2j-1)/2^k)
|
||||
*/
|
||||
template <class params> __device__ void NSMFFT_direct(double2 *A) {
|
||||
|
||||
/* We don't make bit reverse here, since twiddles are already reversed
|
||||
* Each thread is always in charge of "opt/2" pairs of coefficients,
|
||||
* which is why we always loop through N/2 by N/opt strides
|
||||
* The pragma unroll instruction tells the compiler to unroll the
|
||||
* full loop, which should increase performance
|
||||
*/
|
||||
|
||||
size_t tid = threadIdx.x;
|
||||
size_t twid_id;
|
||||
size_t i1, i2;
|
||||
double2 u, v, w;
|
||||
// level 1
|
||||
// we don't make actual complex multiplication on level1 since we have only
|
||||
  // one twiddle, its real and imaginary parts are equal, so we can multiply
|
||||
// it with simpler operations
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
i1 = tid;
|
||||
i2 = tid + params::degree / 2;
|
||||
|
||||
u = A[i1];
|
||||
v = A[i2] * (double2){0.707106781186547461715008466854,
|
||||
0.707106781186547461715008466854};
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 2
|
||||
// from this level there are more than one twiddles and none of them has equal
|
||||
// real and imag parts, so complete complex multiplication is needed
|
||||
// for each level params::degree / 2^level represents number of coefficients
|
||||
// inside divided chunk of specific level
|
||||
//
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 4);
|
||||
i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
|
||||
i2 = i1 + params::degree / 4;
|
||||
|
||||
w = negtwiddles[twid_id + 2];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 3
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 8);
|
||||
i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
|
||||
i2 = i1 + params::degree / 8;
|
||||
|
||||
w = negtwiddles[twid_id + 4];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 4
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 16);
|
||||
i1 =
|
||||
2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
|
||||
i2 = i1 + params::degree / 16;
|
||||
|
||||
w = negtwiddles[twid_id + 8];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 5
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 32);
|
||||
i1 =
|
||||
2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
|
||||
i2 = i1 + params::degree / 32;
|
||||
|
||||
w = negtwiddles[twid_id + 16];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 6
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 64);
|
||||
i1 =
|
||||
2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
|
||||
i2 = i1 + params::degree / 64;
|
||||
|
||||
w = negtwiddles[twid_id + 32];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 7
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 128);
|
||||
i1 = 2 * (params::degree / 128) * twid_id +
|
||||
(tid & (params::degree / 128 - 1));
|
||||
i2 = i1 + params::degree / 128;
|
||||
|
||||
w = negtwiddles[twid_id + 64];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// from level 8, we need to check size of params degree, because we support
|
||||
// minimum actual polynomial size = 256, when compressed size is halfed and
|
||||
// minimum supported compressed size is 128, so we always need first 7
|
||||
// levels of butterfy operation, since butterfly levels are hardcoded
|
||||
// we need to check if polynomial size is big enough to require specific level
|
||||
// of butterfly.
|
||||
if constexpr (params::degree >= 256) {
|
||||
// level 8
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 256);
|
||||
i1 = 2 * (params::degree / 256) * twid_id +
|
||||
(tid & (params::degree / 256 - 1));
|
||||
i2 = i1 + params::degree / 256;
|
||||
|
||||
w = negtwiddles[twid_id + 128];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 512) {
|
||||
// level 9
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 512);
|
||||
i1 = 2 * (params::degree / 512) * twid_id +
|
||||
(tid & (params::degree / 512 - 1));
|
||||
i2 = i1 + params::degree / 512;
|
||||
|
||||
w = negtwiddles[twid_id + 256];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 1024) {
|
||||
// level 10
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 1024);
|
||||
i1 = 2 * (params::degree / 1024) * twid_id +
|
||||
(tid & (params::degree / 1024 - 1));
|
||||
i2 = i1 + params::degree / 1024;
|
||||
|
||||
w = negtwiddles[twid_id + 512];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 2048) {
|
||||
// level 11
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 2048);
|
||||
i1 = 2 * (params::degree / 2048) * twid_id +
|
||||
(tid & (params::degree / 2048 - 1));
|
||||
i2 = i1 + params::degree / 2048;
|
||||
|
||||
w = negtwiddles[twid_id + 1024];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 4096) {
|
||||
// level 12
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 4096);
|
||||
i1 = 2 * (params::degree / 4096) * twid_id +
|
||||
(tid & (params::degree / 4096 - 1));
|
||||
i2 = i1 + params::degree / 4096;
|
||||
|
||||
w = negtwiddles[twid_id + 2048];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// compressed size = 8192 is actual polynomial size = 16384.
|
||||
// from this size, twiddles can't fit in constant memory,
|
||||
// so from here, butterfly operation access device memory.
|
||||
if constexpr (params::degree >= 8192) {
|
||||
// level 13
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 8192);
|
||||
i1 = 2 * (params::degree / 8192) * twid_id +
|
||||
(tid & (params::degree / 8192 - 1));
|
||||
i2 = i1 + params::degree / 8192;
|
||||
|
||||
w = negtwiddles13[twid_id];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * One butterfly level of the inverse negacyclic FFT.
 * Private helper of NSMFFT_inverse.
 *
 * LEVEL_SIZE is 2^level. At this level the array is processed in chunks of
 * 2 * (params::degree / LEVEL_SIZE) coefficients; every pair inside a chunk
 * shares one twiddle, which is conjugated before use:
 *   - LEVEL_SIZE == 8192 reads negtwiddles13[twid_id]
 *   - otherwise          reads negtwiddles[twid_id + LEVEL_SIZE / 2]
 *
 * Each thread handles params::opt / 2 pairs, stepping its index by
 * params::degree / params::opt. The helper ends with a __syncthreads(), so it
 * must be reached by every thread of the block.
 */
template <class params, size_t LEVEL_SIZE>
__device__ __forceinline__ void nsmfft_inverse_level(double2 *A) {
  // distance between the two coefficients of a butterfly pair at this level
  constexpr size_t half_chunk = params::degree / LEVEL_SIZE;

  size_t tid = threadIdx.x;
#pragma unroll
  for (size_t i = 0; i < params::opt / 2; ++i) {
    // chunk index; also selects the twiddle
    size_t twid_id = tid / half_chunk;
    // half_chunk is a power of two for the supported sizes, so the mask acts
    // as "tid modulo half_chunk"
    size_t i1 = 2 * half_chunk * twid_id + (tid & (half_chunk - 1));
    size_t i2 = i1 + half_chunk;

    double2 w;
    if constexpr (LEVEL_SIZE == 8192) {
      // twiddles of the last level live in the separate negtwiddles13 table
      w = negtwiddles13[twid_id];
    } else {
      w = negtwiddles[twid_id + LEVEL_SIZE / 2];
    }

    // the difference is taken before A[i1] is overwritten
    double2 u = A[i1] - A[i2];

    A[i1] += A[i2];
    A[i2] = u * conjugate(w);

    tid += params::degree / params::opt;
  }
  __syncthreads();
}

/*
 * negacyclic inverse fft, performed in place on A.
 *
 * A must hold params::degree complex coefficients (params::degree is the
 * compressed size, i.e. half of the actual polynomial size).
 *
 * No bit reversal is done here, since the twiddles are already stored in
 * reversed order. Each thread is in charge of "opt/2" pairs of coefficients,
 * which is why every level loops through N/2 pairs by N/opt strides.
 * Presumably launched with params::degree / params::opt threads per block --
 * confirm against the callers.
 *
 * Every step ends with a __syncthreads(), so all threads of the block have
 * to call this function.
 */
template <class params> __device__ void NSMFFT_inverse(double2 *A) {
  // divide input by compressed polynomial size
  // (each thread scales params::opt coefficients here, not opt / 2)
  size_t tid = threadIdx.x;
#pragma unroll
  for (size_t i = 0; i < params::opt; ++i) {
    A[tid] /= params::degree;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // the mapping in the backward fft is reversed: butterfly operations start
  // from the last level and every level needs a complete complex
  // multiplication. Levels above 7 are only needed when the polynomial is
  // big enough; the checks are resolved at compile time.

  // compressed size = 8192 is actual polynomial size = 16384.
  // twiddles for this size can't fit in constant memory, so this level
  // accesses device memory (negtwiddles13) to fetch its twiddles
  if constexpr (params::degree >= 8192) {
    // level 13
    nsmfft_inverse_level<params, 8192>(A);
  }

  if constexpr (params::degree >= 4096) {
    // level 12
    nsmfft_inverse_level<params, 4096>(A);
  }

  if constexpr (params::degree >= 2048) {
    // level 11
    nsmfft_inverse_level<params, 2048>(A);
  }

  if constexpr (params::degree >= 1024) {
    // level 10
    nsmfft_inverse_level<params, 1024>(A);
  }

  if constexpr (params::degree >= 512) {
    // level 9
    nsmfft_inverse_level<params, 512>(A);
  }

  if constexpr (params::degree >= 256) {
    // level 8
    nsmfft_inverse_level<params, 256>(A);
  }

  // levels 7..1
  // the minimum supported compressed size is 128 (actual polynomial size
  // 256), so the last 7 levels are always required and need no size check
  nsmfft_inverse_level<params, 128>(A);
  nsmfft_inverse_level<params, 64>(A);
  nsmfft_inverse_level<params, 32>(A);
  nsmfft_inverse_level<params, 16>(A);
  nsmfft_inverse_level<params, 8>(A);
  nsmfft_inverse_level<params, 4>(A);
  // level 1: a single chunk, always using negtwiddles[1]
  nsmfft_inverse_level<params, 2>(A);
}
|
||||
|
||||
/*
 * Batched forward FFT kernel.
 *
 * params describes the actual polynomial degree; the transform itself runs
 * on the half-size (already compressed) representation, so every block reads
 * and writes params::degree / 2 complex values.
 *
 * Block blockIdx.x processes the polynomial stored at offset
 * blockIdx.x * (params::degree / 2) of d_input and writes its transform at
 * the same offset of d_output.
 *
 * Working storage: dynamic shared memory, or, when SMD == NOSM, the slice of
 * `buffer` belonging to this block.
 * Presumably launched with params::degree / params::opt threads per block --
 * confirm against the host-side launcher.
 */
template <class params, sharedMemDegree SMD>
__global__ void batch_NSMFFT(double2 *d_input, double2 *d_output,
                             double2 *buffer) {
  extern __shared__ double2 sharedMemoryFFT[];
  double2 *fft = (SMD == NOSM) ? &buffer[blockIdx.x * params::degree / 2]
                               : sharedMemoryFFT;

  // first element of this block's polynomial in the global arrays
  const auto block_offset = blockIdx.x * (params::degree / 2);
  // distance between two consecutive elements handled by the same thread
  const int stride = params::degree / params::opt;

  // load the polynomial into the working buffer
  int idx = threadIdx.x;
#pragma unroll
  for (int k = 0; k < params::opt / 2; k++) {
    fft[idx] = d_input[block_offset + idx];
    idx += stride;
  }
  __syncthreads();

  NSMFFT_direct<HalfDegree<params>>(fft);
  __syncthreads();

  // store the transformed polynomial
  idx = threadIdx.x;
#pragma unroll
  for (int k = 0; k < params::opt / 2; k++) {
    d_output[block_offset + idx] = fft[idx];
    idx += stride;
  }
}
|
||||
|
||||
/*
 * Batched polynomial multiplication kernel, only used for fft tests.
 *
 * For every block: forward-transforms both inputs, multiplies them pointwise
 * and inverse-transforms the product into d_output.
 *
 * d_input1 and d_output must not have the same pointer.
 * d_input1 is overwritten: it is used to park the transform of the first
 * polynomial while the second one is being processed.
 *
 * Working storage: dynamic shared memory, or, when SMD == NOSM, the slice of
 * `buffer` belonging to this block.
 * Presumably launched with params::degree / params::opt threads per block --
 * confirm against the host-side launcher.
 */
template <class params, sharedMemDegree SMD>
__global__ void batch_polynomial_mul(double2 *d_input1, double2 *d_input2,
                                     double2 *d_output, double2 *buffer) {
  extern __shared__ double2 sharedMemoryFFT[];
  double2 *fft = (SMD == NOSM) ? &buffer[blockIdx.x * params::degree / 2]
                               : sharedMemoryFFT;

  // first element of this block's polynomials in the global arrays
  const auto block_offset = blockIdx.x * (params::degree / 2);
  // distance between two consecutive elements handled by the same thread
  const int stride = params::degree / params::opt;

  // Load the first polynomial into the working buffer (shared memory if
  // possible, device buffer otherwise)
  int idx = threadIdx.x;
#pragma unroll
  for (int k = 0; k < params::opt / 2; k++) {
    fft[idx] = d_input1[block_offset + idx];
    idx += stride;
  }

  // Forward negacyclic transform of the first polynomial
  __syncthreads();
  NSMFFT_direct<HalfDegree<params>>(fft);
  __syncthreads();

  // Park the transformed first polynomial in d_input1
  idx = threadIdx.x;
#pragma unroll
  for (int k = 0; k < params::opt / 2; k++) {
    d_input1[block_offset + idx] = fft[idx];
    idx += stride;
  }
  __syncthreads();

  // Load the second polynomial into the working buffer
  idx = threadIdx.x;
#pragma unroll
  for (int k = 0; k < params::opt / 2; k++) {
    fft[idx] = d_input2[block_offset + idx];
    idx += stride;
  }

  // Forward negacyclic transform of the second polynomial
  __syncthreads();
  NSMFFT_direct<HalfDegree<params>>(fft);
  __syncthreads();

  // Pointwise product, accumulated in the working buffer. Every thread reads
  // back exactly the d_input1 entries it wrote itself above.
  idx = threadIdx.x;
#pragma unroll
  for (int k = 0; k < params::opt / 2; k++) {
    fft[idx] *= d_input1[block_offset + idx];
    idx += stride;
  }

  // Backward negacyclic transform of the product
  __syncthreads();
  NSMFFT_inverse<HalfDegree<params>>(fft);
  __syncthreads();

  // Copy the result to the output buffer
  idx = threadIdx.x;
#pragma unroll
  for (int k = 0; k < params::opt / 2; k++) {
    d_output[block_offset + idx] = fft[idx];
    idx += stride;
  }
}
|
||||
|
||||
#endif // GPU_BOOTSTRAP_FFT_CUH
|
||||
8197
backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu
Normal file
8197
backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu
Normal file
File diff suppressed because it is too large
Load Diff
13
backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cuh
Normal file
13
backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cuh
Normal file
@@ -0,0 +1,13 @@
|
||||
#ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
|
||||
#define GPU_BOOTSTRAP_TWIDDLES_CUH
|
||||
|
||||
/*
|
||||
* 'negtwiddles' are stored in constant memory for faster access times
|
||||
* because of its limited size, only twiddles for up to 2^12 polynomial size
|
||||
* can be stored there, twiddles for 2^13 are stored in device memory
|
||||
* 'negtwiddles13'
|
||||
*/
|
||||
|
||||
extern __constant__ double2 negtwiddles[4096];
|
||||
extern __device__ double2 negtwiddles13[4096];
|
||||
#endif
|
||||
51
backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
Normal file
51
backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
Normal file
@@ -0,0 +1,51 @@
|
||||
#include "integer/bitwise_ops.cuh"
|
||||
|
||||
/*
 * 64-bit entry point that creates the scratch buffer used by the radix
 * bitwise operations.
 *
 * Packs the individual crypto parameters into an int_radix_params and
 * forwards to scratch_cuda_integer_radix_bitop_kb<uint64_t>, which stores the
 * newly created buffer through mem_ptr (exposed to callers as an opaque
 * int8_t pointer).
 */
void scratch_cuda_integer_radix_bitop_kb_64(
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
    bool allocate_gpu_memory) {

  // the opaque handle really designates a typed buffer pointer
  auto typed_mem_ptr =
      reinterpret_cast<int_bitop_buffer<uint64_t> **>(mem_ptr);

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus);

  scratch_cuda_integer_radix_bitop_kb<uint64_t>(
      stream, typed_mem_ptr, lwe_ciphertext_count, radix_params, op_type,
      allocate_gpu_memory);
}
|
||||
|
||||
/*
 * 64-bit entry point of the two-operand radix bitwise operation.
 *
 * Restores the typed views of the untyped arguments and forwards to
 * host_integer_radix_bitop_kb<uint64_t>. mem_ptr is the opaque buffer handle
 * (presumably the one produced by scratch_cuda_integer_radix_bitop_kb_64 --
 * confirm with callers).
 */
void cuda_bitop_integer_radix_ciphertext_kb_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
    void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
    uint32_t lwe_ciphertext_count) {

  auto out = static_cast<uint64_t *>(lwe_array_out);
  auto lhs = static_cast<uint64_t *>(lwe_array_1);
  auto rhs = static_cast<uint64_t *>(lwe_array_2);
  auto buffer = reinterpret_cast<int_bitop_buffer<uint64_t> *>(mem_ptr);
  auto typed_ksk = static_cast<uint64_t *>(ksk);

  host_integer_radix_bitop_kb<uint64_t>(stream, out, lhs, rhs, buffer, bsk,
                                        typed_ksk, lwe_ciphertext_count);
}
|
||||
|
||||
/*
 * 64-bit entry point of the radix bitwise NOT.
 *
 * Restores the typed views of the untyped arguments and forwards to
 * host_integer_radix_bitnot_kb<uint64_t>. mem_ptr is the opaque buffer handle
 * (presumably the one produced by scratch_cuda_integer_radix_bitop_kb_64 --
 * confirm with callers).
 */
void cuda_bitnot_integer_radix_ciphertext_kb_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
    int8_t *mem_ptr, void *bsk, void *ksk, uint32_t lwe_ciphertext_count) {

  auto out = static_cast<uint64_t *>(lwe_array_out);
  auto in = static_cast<uint64_t *>(lwe_array_in);
  auto buffer = reinterpret_cast<int_bitop_buffer<uint64_t> *>(mem_ptr);
  auto typed_ksk = static_cast<uint64_t *>(ksk);

  host_integer_radix_bitnot_kb<uint64_t>(stream, out, in, buffer, bsk,
                                         typed_ksk, lwe_ciphertext_count);
}
|
||||
|
||||
/*
 * Releases the resources held by a bitop scratch buffer.
 *
 * mem_ptr_void points to the opaque handle of an int_bitop_buffer<uint64_t>;
 * its release() method is called with the given stream.
 *
 * NOTE(review): the buffer object itself is created with `new` in
 * scratch_cuda_integer_radix_bitop_kb and is not deleted here -- confirm
 * whether release() or the caller is responsible for freeing it.
 */
void cleanup_cuda_integer_bitop(cuda_stream_t *stream, int8_t **mem_ptr_void) {

  auto buffer = reinterpret_cast<int_bitop_buffer<uint64_t> *>(*mem_ptr_void);
  buffer->release(stream);
}
|
||||
52
backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
Normal file
52
backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
Normal file
@@ -0,0 +1,52 @@
|
||||
#ifndef CUDA_INTEGER_BITWISE_OPS_CUH
|
||||
#define CUDA_INTEGER_BITWISE_OPS_CUH
|
||||
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "integer.cuh"
|
||||
#include "integer.h"
|
||||
#include "pbs/bootstrap_low_latency.cuh"
|
||||
#include "pbs/bootstrap_multibit.cuh"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <omp.h>
|
||||
|
||||
/*
 * Two-operand bitwise operation on radix ciphertexts.
 *
 * Applies the bivariate lookup table stored in mem_ptr->lut to the
 * num_radix_blocks blocks of lwe_array_1 and lwe_array_2 and writes the
 * result to lwe_array_out. Which bitwise operation is computed is determined
 * by that lookup table.
 */
template <typename Torus>
__host__ void
host_integer_radix_bitop_kb(cuda_stream_t *stream, Torus *lwe_array_out,
                            Torus *lwe_array_1, Torus *lwe_array_2,
                            int_bitop_buffer<Torus> *mem_ptr, void *bsk,
                            Torus *ksk, uint32_t num_radix_blocks) {

  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      stream, lwe_array_out, lwe_array_1, lwe_array_2, bsk, ksk,
      num_radix_blocks, mem_ptr->lut);
}
|
||||
|
||||
/*
 * Bitwise NOT on a radix ciphertext.
 *
 * Applies the univariate lookup table stored in mem_ptr->lut to the
 * num_radix_blocks blocks of lwe_array_in and writes the result to
 * lwe_array_out.
 */
template <typename Torus>
__host__ void
host_integer_radix_bitnot_kb(cuda_stream_t *stream, Torus *lwe_array_out,
                             Torus *lwe_array_in,
                             int_bitop_buffer<Torus> *mem_ptr, void *bsk,
                             Torus *ksk, uint32_t num_radix_blocks) {

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      stream, lwe_array_out, lwe_array_in, bsk, ksk, num_radix_blocks,
      mem_ptr->lut);
}
|
||||
|
||||
/*
 * Creates the scratch buffer for a radix bitwise operation.
 *
 * Selects the GPU of `stream`, heap-allocates an int_bitop_buffer<Torus> for
 * operation `op` and stores its address in *mem_ptr.
 */
template <typename Torus>
__host__ void scratch_cuda_integer_radix_bitop_kb(
    cuda_stream_t *stream, int_bitop_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
    bool allocate_gpu_memory) {

  cudaSetDevice(stream->gpu_index);

  auto buffer = new int_bitop_buffer<Torus>(stream, op, params,
                                            num_radix_blocks,
                                            allocate_gpu_memory);
  *mem_ptr = buffer;
}
|
||||
|
||||
#endif
|
||||
45
backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
Normal file
45
backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
Normal file
@@ -0,0 +1,45 @@
|
||||
#include "integer/cmux.cuh"
|
||||
|
||||
/*
 * 64-bit entry point that creates the scratch buffer used by the radix cmux.
 *
 * Packs the individual crypto parameters into an int_radix_params, builds the
 * predicate function (returns 1 when its input equals 1, 0 otherwise) and
 * forwards to scratch_cuda_integer_radix_cmux_kb, which stores the newly
 * created buffer through mem_ptr (exposed to callers as an opaque int8_t
 * pointer).
 */
void scratch_cuda_integer_radix_cmux_kb_64(
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {

  // the opaque handle really designates a typed buffer pointer
  auto typed_mem_ptr = reinterpret_cast<int_cmux_buffer<uint64_t> **>(mem_ptr);

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus);

  // kept as a std::function so that the Torus type of the callee is deduced
  std::function<uint64_t(uint64_t)> is_one = [](uint64_t x) -> uint64_t {
    return x == 1;
  };

  scratch_cuda_integer_radix_cmux_kb(stream, typed_mem_ptr, is_one,
                                     lwe_ciphertext_count, radix_params,
                                     allocate_gpu_memory);
}
|
||||
|
||||
/*
 * 64-bit entry point of the radix cmux (conditional selection between
 * lwe_array_true and lwe_array_false driven by lwe_condition).
 *
 * Restores the typed views of the untyped arguments and forwards to
 * host_integer_radix_cmux_kb<uint64_t>. mem_ptr is the opaque buffer handle
 * (presumably the one produced by scratch_cuda_integer_radix_cmux_kb_64 --
 * confirm with callers).
 */
void cuda_cmux_integer_radix_ciphertext_kb_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_condition,
    void *lwe_array_true, void *lwe_array_false, int8_t *mem_ptr, void *bsk,
    void *ksk, uint32_t lwe_ciphertext_count) {

  auto out = static_cast<uint64_t *>(lwe_array_out);
  auto condition = static_cast<uint64_t *>(lwe_condition);
  auto if_true = static_cast<uint64_t *>(lwe_array_true);
  auto if_false = static_cast<uint64_t *>(lwe_array_false);
  auto buffer = reinterpret_cast<int_cmux_buffer<uint64_t> *>(mem_ptr);
  auto typed_ksk = static_cast<uint64_t *>(ksk);

  host_integer_radix_cmux_kb<uint64_t>(stream, out, condition, if_true,
                                       if_false, buffer, bsk, typed_ksk,
                                       lwe_ciphertext_count);
}
|
||||
|
||||
/*
 * Releases the resources held by a cmux scratch buffer.
 *
 * mem_ptr_void points to the opaque handle of an int_cmux_buffer<uint64_t>;
 * its release() method is called with the given stream.
 *
 * NOTE(review): the buffer object itself is created with `new` in
 * scratch_cuda_integer_radix_cmux_kb and is not deleted here -- confirm
 * whether release() or the caller is responsible for freeing it.
 */
void cleanup_cuda_integer_radix_cmux(cuda_stream_t *stream,
                                     int8_t **mem_ptr_void) {

  auto buffer = reinterpret_cast<int_cmux_buffer<uint64_t> *>(*mem_ptr_void);
  buffer->release(stream);
}
|
||||
102
backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
Normal file
102
backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
Normal file
@@ -0,0 +1,102 @@
|
||||
#ifndef CUDA_INTEGER_CMUX_CUH
|
||||
#define CUDA_INTEGER_CMUX_CUH
|
||||
|
||||
#include "integer.cuh"
|
||||
#include <omp.h>
|
||||
|
||||
/*
 * Combines every block of lwe_array_input with lwe_condition and applies the
 * `predicate` lookup table to the result, writing num_radix_blocks blocks to
 * lwe_array_out.
 *
 * Steps:
 *  1. each input block is packed together with the condition block into
 *     mem_ptr->tmp by device_pack_bivariate_blocks (one launch per block, on
 *     stream->stream);
 *  2. the univariate lookup table `predicate` is applied to the packed
 *     blocks.
 * Presumably this yields the input where the predicate holds and an
 * encryption of zero otherwise -- confirm against the lookup tables built in
 * int_cmux_buffer.
 *
 * A block holds params.big_lwe_dimension + 1 Torus elements.
 * Errors from cudaSetDevice and from the kernel launches are reported through
 * check_cuda_error.
 */
template <typename Torus>
__host__ void zero_out_if(cuda_stream_t *stream, Torus *lwe_array_out,
                          Torus *lwe_array_input, Torus *lwe_condition,
                          int_zero_out_if_buffer<Torus> *mem_ptr,
                          int_radix_lut<Torus> *predicate, void *bsk,
                          Torus *ksk, uint32_t num_radix_blocks) {
  // a failure to select the device would make every later call target the
  // wrong GPU, so it is checked instead of being ignored
  check_cuda_error(cudaSetDevice(stream->gpu_index));
  auto params = mem_ptr->params;

  // number of Torus elements per block; also the number of entries each
  // packing kernel launch has to cover
  int big_lwe_size = params.big_lwe_dimension + 1;

  int num_blocks = 0, num_threads = 0;
  getNumBlocksAndThreads(big_lwe_size, 512, num_blocks, num_threads);

  // We can't use integer_radix_apply_bivariate_lookup_table_kb since the
  // second operand is fixed
  auto tmp_lwe_array_input = mem_ptr->tmp;
  for (uint32_t i = 0; i < num_radix_blocks; i++) {
    // offsets are computed in size_t so they cannot overflow an int
    const size_t block_offset = static_cast<size_t>(i) * big_lwe_size;
    auto lwe_array_out_block = tmp_lwe_array_input + block_offset;
    auto lwe_array_input_block = lwe_array_input + block_offset;

    device_pack_bivariate_blocks<<<num_blocks, num_threads, 0,
                                   stream->stream>>>(
        lwe_array_out_block, lwe_array_input_block, lwe_condition,
        predicate->lwe_indexes, params.big_lwe_dimension,
        params.message_modulus, 1);
    check_cuda_error(cudaGetLastError());
  }

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      stream, lwe_array_out, tmp_lwe_array_input, bsk, ksk, num_radix_blocks,
      predicate);
}
|
||||
|
||||
/*
 * Radix cmux: writes to lwe_array_out the blocks of lwe_array_true or of
 * lwe_array_false, depending on lwe_condition.
 *
 * Steps:
 *  1. the main stream is synchronized, since the two branches run on the
 *     local streams owned by the zero_if_true / zero_if_false buffers;
 *  2. two OpenMP sections call zero_out_if, one per branch: the "true"
 *     operand with inverted_predicate_lut, the "false" operand with
 *     predicate_lut;
 *  3. both local streams are synchronized;
 *  4. the two intermediate ciphertexts are added in place into tmp_true_ct
 *     on the main stream (one of them is expected to be zero);
 *  5. message_extract_lut is applied to the sum to produce lwe_array_out.
 */
template <typename Torus>
__host__ void
host_integer_radix_cmux_kb(cuda_stream_t *stream, Torus *lwe_array_out,
                           Torus *lwe_condition, Torus *lwe_array_true,
                           Torus *lwe_array_false,
                           int_cmux_buffer<Torus> *mem_ptr, void *bsk,
                           Torus *ksk, uint32_t num_radix_blocks) {

  // Since our CPU threads will be working on different streams we shall
  // make sure the work in the main stream is completed
  stream->synchronize();

  auto true_buffer = mem_ptr->zero_if_true_buffer;
  auto false_buffer = mem_ptr->zero_if_false_buffer;
  auto true_stream = true_buffer->local_stream;
  auto false_stream = false_buffer->local_stream;

#pragma omp parallel sections
  {
    // Both sections may be executed in parallel
#pragma omp section
    {
      zero_out_if(true_stream, mem_ptr->tmp_true_ct, lwe_array_true,
                  lwe_condition, true_buffer, mem_ptr->inverted_predicate_lut,
                  bsk, ksk, num_radix_blocks);
    }
#pragma omp section
    {
      zero_out_if(false_stream, mem_ptr->tmp_false_ct, lwe_array_false,
                  lwe_condition, false_buffer, mem_ptr->predicate_lut, bsk,
                  ksk, num_radix_blocks);
    }
  }
  cuda_synchronize_stream(true_stream);
  cuda_synchronize_stream(false_stream);

  // If the condition was true, true_ct will have kept its value and false_ct
  // will be 0. If the condition was false, true_ct will be 0 and false_ct
  // will have kept its value. The sum is accumulated in place in tmp_true_ct.
  auto selected_cts = mem_ptr->tmp_true_ct;
  host_addition(stream, selected_cts, mem_ptr->tmp_true_ct,
                mem_ptr->tmp_false_ct, mem_ptr->params.big_lwe_dimension,
                num_radix_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      stream, lwe_array_out, selected_cts, bsk, ksk, num_radix_blocks,
      mem_ptr->message_extract_lut);
}
|
||||
|
||||
/*
 * Creates the scratch buffer for the radix cmux.
 *
 * Selects the GPU of `stream`, heap-allocates an int_cmux_buffer<Torus> built
 * from predicate_lut_f and stores its address in *mem_ptr.
 */
template <typename Torus>
__host__ void scratch_cuda_integer_radix_cmux_kb(
    cuda_stream_t *stream, int_cmux_buffer<Torus> **mem_ptr,
    std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
    int_radix_params params, bool allocate_gpu_memory) {

  cudaSetDevice(stream->gpu_index);

  auto buffer = new int_cmux_buffer<Torus>(stream, predicate_lut_f, params,
                                           num_radix_blocks,
                                           allocate_gpu_memory);
  *mem_ptr = buffer;
}
|
||||
#endif
|
||||
83
backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
Normal file
83
backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
Normal file
@@ -0,0 +1,83 @@
|
||||
#include "integer/comparison.cuh"
|
||||
|
||||
/*
 * 64-bit entry point that creates the scratch buffer used by the radix
 * comparisons.
 *
 * Packs the individual crypto parameters into an int_radix_params and
 * dispatches on op_type:
 *   - EQ, NE                    -> scratch_cuda_integer_radix_equality_check_kb
 *   - GT, GE, LT, LE, MAX, MIN  -> scratch_cuda_integer_radix_difference_check_kb
 * Any other value leaves *mem_ptr untouched.
 */
void scratch_cuda_integer_radix_comparison_kb_64(
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, COMPARISON_TYPE op_type,
    bool allocate_gpu_memory) {

  // the opaque handle really designates a typed buffer pointer
  auto typed_mem_ptr =
      reinterpret_cast<int_comparison_buffer<uint64_t> **>(mem_ptr);

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus);

  switch (op_type) {
  // equality family
  case EQ:
  case NE:
    scratch_cuda_integer_radix_equality_check_kb<uint64_t>(
        stream, typed_mem_ptr, lwe_ciphertext_count, radix_params, op_type,
        allocate_gpu_memory);
    break;
  // ordering family, including max / min
  case GT:
  case GE:
  case LT:
  case LE:
  case MAX:
  case MIN:
    scratch_cuda_integer_radix_difference_check_kb<uint64_t>(
        stream, typed_mem_ptr, lwe_ciphertext_count, radix_params, op_type,
        allocate_gpu_memory);
    break;
  }
}
|
||||
|
||||
/*
 * 64-bit entry point running a radix comparison.
 *
 * `mem_ptr` is the int_comparison_buffer<uint64_t> produced by the matching
 * scratch call; the operation to run is read from `buffer->op`:
 *  - EQ / NE            -> equality check
 *  - GT / GE / LT / LE  -> difference check, reduced with the buffer's
 *                          operator_f
 *  - MAX / MIN          -> max/min selection
 * Anything else panics.
 */
void cuda_comparison_integer_radix_ciphertext_kb_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
    void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
    uint32_t lwe_ciphertext_count) {

  auto *buffer = (int_comparison_buffer<uint64_t> *)mem_ptr;

  // Typed views over the type-erased arguments.
  auto *out = static_cast<uint64_t *>(lwe_array_out);
  auto *lhs = static_cast<uint64_t *>(lwe_array_1);
  auto *rhs = static_cast<uint64_t *>(lwe_array_2);
  auto *typed_ksk = static_cast<uint64_t *>(ksk);

  switch (buffer->op) {
  case EQ:
  case NE:
    host_integer_radix_equality_check_kb<uint64_t>(
        stream, out, lhs, rhs, buffer, bsk, typed_ksk, lwe_ciphertext_count);
    break;
  case GT:
  case GE:
  case LT:
  case LE:
    host_integer_radix_difference_check_kb<uint64_t>(
        stream, out, lhs, rhs, buffer, buffer->diff_buffer->operator_f, bsk,
        typed_ksk, lwe_ciphertext_count);
    break;
  case MAX:
  case MIN:
    host_integer_radix_maxmin_kb<uint64_t>(stream, out, lhs, rhs, buffer, bsk,
                                           typed_ksk, lwe_ciphertext_count);
    break;
  default:
    PANIC("Cuda error: integer operation not supported")
  }
}
|
||||
|
||||
/*
 * Releases the resources held by a comparison scratch buffer.
 *
 * `*mem_ptr_void` is interpreted as an int_comparison_buffer<uint64_t> and
 * its release() is invoked on `stream`.
 * NOTE(review): the buffer object itself is not deleted here — confirm
 * whether release() or the caller is responsible for that.
 */
void cleanup_cuda_integer_comparison(cuda_stream_t *stream,
                                     int8_t **mem_ptr_void) {

  auto *comparison_buffer = (int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
  comparison_buffer->release(stream);
}
|
||||
537
backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
Normal file
537
backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
Normal file
@@ -0,0 +1,537 @@
|
||||
#ifndef CUDA_INTEGER_COMPARISON_OPS_CUH
|
||||
#define CUDA_INTEGER_COMPARISON_OPS_CUH
|
||||
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "integer.cuh"
|
||||
#include "integer.h"
|
||||
#include "integer/cmux.cuh"
|
||||
#include "integer/negation.cuh"
|
||||
#include "integer/scalar_addition.cuh"
|
||||
#include "pbs/bootstrap_low_latency.cuh"
|
||||
#include "pbs/bootstrap_multibit.cuh"
|
||||
#include "types/complex/operations.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
|
||||
// Sums `num_blocks` LWE ciphertexts coefficient-wise.
//
// Needs at least lwe_dimension + 1 threads in total: each thread owns one
// coefficient index and walks over all the input blocks sequentially.
// todo: This kernel MUST be refactored to a binary reduction
template <typename Torus>
__global__ void device_accumulate_all_blocks(Torus *output, Torus *input_block,
                                             uint32_t lwe_dimension,
                                             uint32_t num_blocks) {
  const uint32_t lwe_size = lwe_dimension + 1;
  int coeff = threadIdx.x + blockIdx.x * blockDim.x;

  // Threads past the end of a ciphertext have nothing to do
  if (coeff >= lwe_size)
    return;

  // Same coefficient in consecutive blocks is lwe_size elements apart
  auto column = &input_block[coeff];
  Torus acc = column[0];
  for (int b = 1; b < num_blocks; b++) {
    acc += column[b * lwe_size];
  }

  output[coeff] = acc;
}
|
||||
|
||||
/*
 * Host launcher for device_accumulate_all_blocks.
 *
 * Adds the `num_radix_blocks` LWE ciphertexts stored contiguously in `input`
 * and writes the single resulting ciphertext to `output`. The launch is
 * enqueued on `stream` with one thread per LWE coefficient (lwe_dimension + 1
 * entries, launch shape chosen by getNumBlocksAndThreads), and the launch
 * status is checked right after.
 */
template <typename Torus>
__host__ void accumulate_all_blocks(cuda_stream_t *stream, Torus *output,
                                    Torus *input, uint32_t lwe_dimension,
                                    uint32_t num_radix_blocks) {

  cudaSetDevice(stream->gpu_index);

  int lwe_size = (lwe_dimension + 1);
  int grid_size = 0;
  int block_size = 0;
  getNumBlocksAndThreads(lwe_size, 512, grid_size, block_size);

  // Add all blocks and store in sum
  device_accumulate_all_blocks<<<grid_size, block_size, 0, stream->stream>>>(
      output, input, lwe_dimension, num_radix_blocks);
  check_cuda_error(cudaGetLastError());
}
|
||||
|
||||
/* This takes an array of lwe ciphertexts, where each is an encryption of
 * either 0 or 1.
 *
 * It writes in the first block of lwe_array_out a single lwe ciphertext
 * encrypting 1 if all input blocks are 1, otherwise the block encrypts 0.
 *
 * The reduction works in passes. In each pass the blocks are grouped in
 * chunks of `chunk_length` blocks (at most max_value = message_modulus *
 * carry_modulus - 1), every chunk is summed into one block and a LUT maps
 * the sum back to a boolean:
 *  - when the buffer's op is NE, the is_non_zero LUT is used;
 *  - otherwise a LUT testing `sum == chunk_length` is used; such LUTs are
 *    created on first use and cached per chunk length in is_equal_to_lut_map.
 *
 * lwe_array_out is used as working storage and must hold num_radix_blocks
 * blocks; blocks other than the first contain intermediate data on return.
 * lwe_array_in may alias lwe_array_out.
 *
 * The loop only makes progress when max_value >= 2, i.e. when
 * message_modulus * carry_modulus > 2.
 */
template <typename Torus>
__host__ void
are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
                               Torus *lwe_array_in,
                               int_comparison_buffer<Torus> *mem_ptr, void *bsk,
                               Torus *ksk, uint32_t num_radix_blocks) {

  cudaSetDevice(stream->gpu_index);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;

  auto are_all_block_true_buffer =
      mem_ptr->eq_buffer->are_all_block_true_buffer;

  uint32_t total_modulus = message_modulus * carry_modulus;
  uint32_t max_value = total_modulus - 1;

  size_t big_lwe_size = big_lwe_dimension + 1;
  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

  // Work in place on lwe_array_out. Callers may pass the same pointer for
  // input and output, in which case the copy is skipped (source and
  // destination of a device copy must not overlap).
  if (lwe_array_out != lwe_array_in)
    cuda_memcpy_async_gpu_to_gpu(lwe_array_out, lwe_array_in,
                                 num_radix_blocks * big_lwe_size_bytes, stream);

  uint32_t remaining_blocks = num_radix_blocks;
  while (remaining_blocks > 1) {
    // Split in max_value chunks
    uint32_t chunk_length = std::min(max_value, remaining_blocks);
    uint32_t num_chunks = remaining_blocks / chunk_length;
    // Blocks that do not fill a whole chunk in this pass
    uint32_t leftover_blocks = remaining_blocks - num_chunks * chunk_length;

    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
    // as in the worst case we will be adding `max_value` ones
    auto input_blocks = lwe_array_out;
    auto accumulator = are_all_block_true_buffer->tmp_block_accumulated;
    for (uint32_t i = 0; i < num_chunks; i++) {
      accumulate_all_blocks(stream, accumulator, input_blocks,
                            big_lwe_dimension, chunk_length);

      accumulator += big_lwe_size;
      remaining_blocks -= (chunk_length - 1);
      input_blocks += big_lwe_size * chunk_length;
    }
    accumulator = are_all_block_true_buffer->tmp_block_accumulated;
    auto is_equal_to_num_blocks_map =
        &are_all_block_true_buffer->is_equal_to_lut_map;

    // Selects a LUT
    int_radix_lut<Torus> *lut;
    if (are_all_block_true_buffer->op == COMPARISON_TYPE::NE) {
      // is_non_zero_lut_buffer LUT
      lut = mem_ptr->eq_buffer->is_non_zero_lut;
    } else {
      if ((*is_equal_to_num_blocks_map).find(chunk_length) !=
          (*is_equal_to_num_blocks_map).end()) {
        // The LUT is already computed
        lut = (*is_equal_to_num_blocks_map)[chunk_length];
      } else {
        // LUT needs to be computed
        auto new_lut = new int_radix_lut<Torus>(stream, params, max_value,
                                                num_radix_blocks, true);

        auto is_equal_to_num_blocks_lut_f = [max_value,
                                             chunk_length](Torus x) -> Torus {
          return (x & max_value) == chunk_length;
        };
        generate_device_accumulator<Torus>(
            stream, new_lut->lut, glwe_dimension, polynomial_size,
            message_modulus, carry_modulus, is_equal_to_num_blocks_lut_f);

        (*is_equal_to_num_blocks_map)[chunk_length] = new_lut;
        lut = new_lut;
      }
    }

    // Applies the LUT
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        stream, lwe_array_out, accumulator, bsk, ksk, num_chunks, lut);

    // The next pass reads the first `remaining_blocks` blocks of
    // lwe_array_out, so the blocks left out of this pass have to be moved
    // right behind the `num_chunks` freshly reduced ones. Source and
    // destination cannot overlap because leftover_blocks < chunk_length.
    if (leftover_blocks > 0)
      cuda_memcpy_async_gpu_to_gpu(
          lwe_array_out + num_chunks * big_lwe_size,
          lwe_array_out + (size_t)num_chunks * chunk_length * big_lwe_size,
          leftover_blocks * big_lwe_size_bytes, stream);
  }
}
|
||||
|
||||
/* This takes an array of lwe ciphertexts, where each is an encryption of
 * either 0 or 1.
 *
 * It writes in the first block of lwe_array_out a single lwe ciphertext
 * encrypting 1 if at least one input ciphertext encrypts 1, otherwise it
 * encrypts 0.
 *
 * The reduction works in passes: blocks are summed by chunks of at most
 * max_value = message_modulus * carry_modulus - 1 blocks and the is_non_zero
 * LUT maps every sum back to a boolean.
 *
 * lwe_array_out is used as working storage and must hold num_radix_blocks
 * blocks; blocks other than the first contain intermediate data on return.
 * lwe_array_in may alias lwe_array_out.
 *
 * The loop only makes progress when max_value >= 2, i.e. when
 * message_modulus * carry_modulus > 2.
 */
template <typename Torus>
__host__ void is_at_least_one_comparisons_block_true(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
    uint32_t num_radix_blocks) {

  // Same device selection as the sibling reductions in this file
  cudaSetDevice(stream->gpu_index);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;

  auto buffer = mem_ptr->eq_buffer->are_all_block_true_buffer;

  uint32_t total_modulus = message_modulus * carry_modulus;
  uint32_t max_value = total_modulus - 1;

  size_t big_lwe_size = big_lwe_dimension + 1;
  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

  // Work in place on lwe_array_out; skip the copy when input and output
  // alias (source and destination of a device copy must not overlap).
  if (lwe_array_out != lwe_array_in)
    cuda_memcpy_async_gpu_to_gpu(lwe_array_out, lwe_array_in,
                                 num_radix_blocks * big_lwe_size_bytes, stream);

  uint32_t remaining_blocks = num_radix_blocks;
  while (remaining_blocks > 1) {
    // Split in max_value chunks
    uint32_t chunk_length = std::min(max_value, remaining_blocks);
    uint32_t num_chunks = remaining_blocks / chunk_length;
    // Blocks that do not fill a whole chunk in this pass
    uint32_t leftover_blocks = remaining_blocks - num_chunks * chunk_length;

    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
    // as in the worst case we will be adding `max_value` ones
    auto input_blocks = lwe_array_out;
    auto accumulator = buffer->tmp_block_accumulated;
    for (uint32_t i = 0; i < num_chunks; i++) {
      accumulate_all_blocks(stream, accumulator, input_blocks,
                            big_lwe_dimension, chunk_length);

      accumulator += big_lwe_size;
      remaining_blocks -= (chunk_length - 1);
      input_blocks += big_lwe_size * chunk_length;
    }
    accumulator = buffer->tmp_block_accumulated;

    // Selects a LUT
    int_radix_lut<Torus> *lut = mem_ptr->eq_buffer->is_non_zero_lut;

    // Applies the LUT
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        stream, lwe_array_out, accumulator, bsk, ksk, num_chunks, lut);

    // The next pass reads the first `remaining_blocks` blocks of
    // lwe_array_out, so the blocks left out of this pass have to be moved
    // right behind the `num_chunks` freshly reduced ones. Source and
    // destination cannot overlap because leftover_blocks < chunk_length.
    if (leftover_blocks > 0)
      cuda_memcpy_async_gpu_to_gpu(
          lwe_array_out + num_chunks * big_lwe_size,
          lwe_array_out + (size_t)num_chunks * chunk_length * big_lwe_size,
          leftover_blocks * big_lwe_size_bytes, stream);
  }
}
|
||||
|
||||
// This takes an input slice of blocks.
//
// Each block can encrypt any value as long as it is < message_modulus.
//
// The blocks are summed by chunks small enough not to overflow the carry
// space, `zero_comparison` is applied to every partial sum, and the
// resulting boolean blocks are reduced to a single one with
// are_all_comparisons_block_true.
//
// E.g. for an "equal to zero" LUT, the result encrypts 1 only if all input
// blocks are zero.
//
// On return the result is in the first block of lwe_array_out; the other
// num_radix_blocks - 1 blocks are zeroed. lwe_array_out must therefore hold
// num_radix_blocks blocks.
template <typename Torus>
__host__ void host_compare_with_zero_equality(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
    int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {

  cudaSetDevice(stream->gpu_index);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;

  // The idea is that we will sum chunks of blocks until carries are full
  // then we compare the sum with 0.
  //
  // If all blocks were 0, the sum will be zero
  // If at least one block was not zero, the sum won't be zero
  uint32_t total_modulus = message_modulus * carry_modulus;
  uint32_t message_max = message_modulus - 1;

  uint32_t num_elements_to_fill_carry = (total_modulus - 1) / message_max;

  size_t big_lwe_size = big_lwe_dimension + 1;
  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

  int num_sum_blocks = 0;
  // Accumulator
  auto sum = lwe_array_out;

  if (num_radix_blocks == 1) {
    // Just copy
    cuda_memcpy_async_gpu_to_gpu(sum, lwe_array_in, big_lwe_size_bytes, stream);
    num_sum_blocks = 1;
  } else {
    uint32_t remainder_blocks = num_radix_blocks;
    auto sum_i = sum;
    auto chunk = lwe_array_in;
    // Consume the input in disjoint chunks. The counter must shrink by
    // exactly what the read pointer advances, otherwise the last chunk
    // would read past the end of lwe_array_in.
    while (remainder_blocks > 0) {
      uint32_t chunk_size =
          std::min(remainder_blocks, num_elements_to_fill_carry);

      accumulate_all_blocks(stream, sum_i, chunk, big_lwe_dimension,
                            chunk_size);

      num_sum_blocks++;
      remainder_blocks -= chunk_size;

      // Update operands
      chunk += chunk_size * big_lwe_size;
      sum_i += big_lwe_size;
    }
  }

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      stream, sum, sum, bsk, ksk, num_sum_blocks, zero_comparison);
  are_all_comparisons_block_true(stream, lwe_array_out, sum, mem_ptr, bsk, ksk,
                                 num_sum_blocks);

  // The result will be in the first block. Everything else is garbage.
  if (num_radix_blocks > 1)
    cuda_memset_async(lwe_array_out + big_lwe_size, 0,
                      big_lwe_size_bytes * (num_radix_blocks - 1), stream);
}
|
||||
|
||||
/*
 * Equality-type comparison of two radix ciphertexts.
 *
 * Step 1: the buffer's bivariate operator LUT is applied block by block to
 *         (lwe_array_1, lwe_array_2), giving one boolean block per pair in
 *         mem_ptr->tmp_block_comparisons.
 * Step 2: the boolean blocks are reduced to a single block with
 *         are_all_comparisons_block_true.
 * Step 3: every output block except the first is zeroed.
 */
template <typename Torus>
__host__ void host_integer_radix_equality_check_kb(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
    Torus *lwe_array_2, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
    Torus *ksk, uint32_t num_radix_blocks) {

  auto eq_buffer = mem_ptr->eq_buffer;
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;

  // Block-wise comparison through the operator LUT
  auto block_results = mem_ptr->tmp_block_comparisons;
  integer_radix_apply_bivariate_lookup_table_kb(
      stream, block_results, lwe_array_1, lwe_array_2, bsk, ksk,
      num_radix_blocks, eq_buffer->operator_lut);

  // Every block of block_results encrypts 0 or 1; collapse them into one
  // block that encrypts 1 only when all of them are 1.
  are_all_comparisons_block_true(stream, lwe_array_out, block_results, mem_ptr,
                                 bsk, ksk, num_radix_blocks);

  // Only the first block carries the answer: clear the tail
  size_t lwe_size = big_lwe_dimension + 1;
  size_t lwe_size_bytes = lwe_size * sizeof(Torus);
  cuda_memset_async(lwe_array_out + lwe_size, 0,
                    lwe_size_bytes * (num_radix_blocks - 1), stream);
}
|
||||
|
||||
/*
 * Allocates the scratch structure for an equality-type comparison.
 *
 * Selects the device attached to `stream`, then builds a new
 * int_comparison_buffer for operation `op` and returns it through `mem_ptr`.
 */
template <typename Torus>
__host__ void scratch_cuda_integer_radix_equality_check_kb(
    cuda_stream_t *stream, int_comparison_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
    bool allocate_gpu_memory) {

  cudaSetDevice(stream->gpu_index);

  auto *comparison_buffer = new int_comparison_buffer<Torus>(
      stream, op, params, num_radix_blocks, allocate_gpu_memory);
  *mem_ptr = comparison_buffer;
}
|
||||
|
||||
/*
 * Block-wise three-way comparison of two radix ciphertexts.
 *
 * For every block, lwe_array_out receives (after the three steps below) a
 * value in {0, 1, 2}, as described in the "Add one" comment.
 *
 * Why it works: when rhs > lhs the subtraction overflows and sets the bit of
 * padding, so the PBS returns the negated LUT output (modulo the message
 * space).
 *
 * Example:
 *   lhs: 1, rhs: 3, message modulus: 4, carry modulus 4
 *   lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
 *   The overflow leaves the bit of padding at 1 instead of 0. For an input
 *   of 14 the LUT would normally give 1, but with the padding bit set we get
 *   -1 modulo the message space: (-1) % (4 * 4) = 15 = 1|1111.
 *   Adding one then yields 0 = 0|0000.
 */
template <typename Torus>
__host__ void
compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
                        Torus *lwe_array_left, Torus *lwe_array_right,
                        int_comparison_buffer<Torus> *mem_ptr, void *bsk,
                        Torus *ksk, uint32_t num_radix_blocks) {

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;

  // Step 1 - subtract.
  // This has to be the true lwe subtraction, not the shortint one.
  host_subtraction(stream, lwe_array_out, lwe_array_left, lwe_array_right,
                   big_lwe_dimension, num_radix_blocks);

  // Step 2 - compare the difference to 0 through the is_non_zero LUT
  integer_radix_apply_univariate_lookup_table_kb(
      stream, lwe_array_out, lwe_array_out, bsk, ksk, num_radix_blocks,
      mem_ptr->eq_buffer->is_non_zero_lut);

  // Step 3 - add one.
  // At this point a block holds (-1) % (message modulus * carry modulus),
  // 0 or 1, so after the addition the values are 0, 1, 2.
  host_integer_radix_add_scalar_one_inplace(stream, lwe_array_out,
                                            big_lwe_dimension, num_radix_blocks,
                                            message_modulus, carry_modulus);
}
|
||||
|
||||
// Reduces a vec containing shortint blocks that encrypts a sign
// (inferior, equal, superior) to one single shortint block containing the
// final sign.
//
// The blocks are repeatedly packed two by two and mapped through the inner
// leaf LUT until at most two remain. The last LUT is regenerated on every
// call: it combines the block selector (when two blocks remain) with
// `sign_handler_f`, and its output is written to the first block of
// lwe_array_out.
template <typename Torus>
__host__ void
tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
                    Torus *lwe_block_comparisons,
                    int_tree_sign_reduction_buffer<Torus> *tree_buffer,
                    std::function<Torus(Torus)> sign_handler_f, void *bsk,
                    Torus *ksk, uint32_t num_radix_blocks) {

  cudaSetDevice(stream->gpu_index);
  auto params = tree_buffer->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;

  size_t lwe_size = big_lwe_dimension + 1;
  size_t lwe_size_bytes = lwe_size * sizeof(Torus);

  // x holds the current level of the tree, y the packed pairs
  auto x = tree_buffer->tmp_x;
  auto y = tree_buffer->tmp_y;
  if (x != lwe_block_comparisons)
    cuda_memcpy_async_gpu_to_gpu(x, lwe_block_comparisons,
                                 lwe_size_bytes * num_radix_blocks, stream);

  uint32_t partial_block_count = num_radix_blocks;

  auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
  while (partial_block_count > 2) {
    pack_blocks(stream, y, x, big_lwe_dimension, partial_block_count, 4);

    uint32_t num_pairs = partial_block_count >> 1;
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        stream, x, y, bsk, ksk, num_pairs, inner_tree_leaf);

    bool has_unpaired_block = (partial_block_count % 2) != 0;
    partial_block_count = num_pairs;
    if (has_unpaired_block) {
      // The block without a partner is carried over unchanged: it is taken
      // from position num_pairs of y and appended after the reduced pairs.
      cuda_memcpy_async_gpu_to_gpu(x + num_pairs * lwe_size,
                                   y + num_pairs * lwe_size, lwe_size_bytes,
                                   stream);
      partial_block_count++;
    }
  }

  auto last_lut = tree_buffer->tree_last_leaf_lut;
  auto block_selector_f = tree_buffer->block_selector_f;
  std::function<Torus(Torus)> f;

  if (partial_block_count == 2) {
    pack_blocks(stream, y, x, big_lwe_dimension, partial_block_count, 4);

    // Split the packed value into its two 2-bit signs, select the final one
    // and hand it to the sign handler.
    f = [block_selector_f, sign_handler_f](Torus packed) -> Torus {
      int msb = (packed >> 2) & 3;
      int lsb = packed & 3;

      int final_sign = block_selector_f(msb, lsb);
      return sign_handler_f(final_sign);
    };
  } else {
    // partial_block_count == 1
    y = x;
    f = sign_handler_f;
  }
  generate_device_accumulator<Torus>(stream, last_lut->lut, glwe_dimension,
                                     polynomial_size, message_modulus,
                                     carry_modulus, f);

  // Last leaf
  integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out, y, bsk,
                                                 ksk, 1, last_lut);
}
|
||||
|
||||
/*
 * Order-type comparison of two radix ciphertexts.
 *
 * 1. When carry_modulus == message_modulus the blocks of both operands are
 *    packed two by two (halving the block count) and cleaned through the
 *    cleaning LUT.
 * 2. compare_radix_blocks_kb produces one sign block per (packed) block.
 * 3. tree_sign_reduction collapses the signs into one block, post-processed
 *    by `reduction_lut_f`.
 * 4. Every output block but the first is zeroed.
 *
 * NOTE(review): the block count is halved with an integer division, so with
 * an odd total_num_radix_blocks the last block would not take part in the
 * comparison — confirm callers always pass an even count when packing
 * applies.
 */
template <typename Torus>
__host__ void host_integer_radix_difference_check_kb(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_left,
    Torus *lwe_array_right, int_comparison_buffer<Torus> *mem_ptr,
    std::function<Torus(Torus)> reduction_lut_f, void *bsk, Torus *ksk,
    uint32_t total_num_radix_blocks) {

  auto diff_buffer = mem_ptr->diff_buffer;

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;

  // Operands and block count actually compared; replaced by the packed
  // versions below when packing applies.
  Torus *lhs = lwe_array_left;
  Torus *rhs = lwe_array_right;
  uint32_t num_radix_blocks = total_num_radix_blocks;

  bool packing_is_possible = (carry_modulus == message_modulus);
  if (packing_is_possible) {
    // Pack inputs
    lhs = diff_buffer->tmp_packed_left;
    rhs = diff_buffer->tmp_packed_right;
    pack_blocks(stream, lhs, lwe_array_left, big_lwe_dimension,
                num_radix_blocks, message_modulus);
    pack_blocks(stream, rhs, lwe_array_right, big_lwe_dimension,
                num_radix_blocks, message_modulus);
    // From this point we have half number of blocks
    num_radix_blocks /= 2;

    // Clean noise
    auto cleaning_lut = mem_ptr->cleaning_lut;
    integer_radix_apply_univariate_lookup_table_kb(
        stream, lhs, lhs, bsk, ksk, num_radix_blocks, cleaning_lut);
    integer_radix_apply_univariate_lookup_table_kb(
        stream, rhs, rhs, bsk, ksk, num_radix_blocks, cleaning_lut);
  }

  // Each block of `signs` will be assigned
  // - 0 if lhs < rhs
  // - 1 if lhs == rhs
  // - 2 if lhs > rhs
  auto signs = mem_ptr->tmp_block_comparisons;
  compare_radix_blocks_kb(stream, signs, lhs, rhs, mem_ptr, bsk, ksk,
                          num_radix_blocks);

  // Collapse the per-block signs (inferior, equal, superior) into a single
  // radix block holding the final sign
  tree_sign_reduction(stream, lwe_array_out, signs,
                      mem_ptr->diff_buffer->tree_buffer, reduction_lut_f, bsk,
                      ksk, num_radix_blocks);

  // The result will be in the first block. Everything else is garbage.
  size_t lwe_size = big_lwe_dimension + 1;
  size_t lwe_size_bytes = lwe_size * sizeof(Torus);
  cuda_memset_async(lwe_array_out + lwe_size, 0,
                    (total_num_radix_blocks - 1) * lwe_size_bytes, stream);
}
|
||||
|
||||
/*
 * Allocates the scratch structure for an order-type comparison
 * (GT, GE, LT, LE, MAX, MIN).
 *
 * Builds a new int_comparison_buffer for operation `op` and returns it
 * through `mem_ptr`. The device attached to `stream` is selected first, as
 * the equality-check and cmux scratch functions do, so the allocation does
 * not depend on whichever device happens to be current.
 */
template <typename Torus>
__host__ void scratch_cuda_integer_radix_difference_check_kb(
    cuda_stream_t *stream, int_comparison_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
    bool allocate_gpu_memory) {

  cudaSetDevice(stream->gpu_index);
  *mem_ptr = new int_comparison_buffer<Torus>(
      stream, op, params, num_radix_blocks, allocate_gpu_memory);
}
|
||||
|
||||
/*
 * Max / min of two radix ciphertexts.
 *
 * The sign of the comparison is first computed into
 * mem_ptr->tmp_lwe_array_out (difference check reduced with the buffer's
 * cleaning_lut_f), then used as the condition of a cmux between
 * lwe_array_left and lwe_array_right, whose result goes to lwe_array_out.
 */
template <typename Torus>
__host__ void
host_integer_radix_maxmin_kb(cuda_stream_t *stream, Torus *lwe_array_out,
                             Torus *lwe_array_left, Torus *lwe_array_right,
                             int_comparison_buffer<Torus> *mem_ptr, void *bsk,
                             Torus *ksk, uint32_t total_num_radix_blocks) {

  auto sign = mem_ptr->tmp_lwe_array_out;

  // Compute the sign
  host_integer_radix_difference_check_kb(
      stream, sign, lwe_array_left, lwe_array_right, mem_ptr,
      mem_ptr->cleaning_lut_f, bsk, ksk, total_num_radix_blocks);

  // Selector
  host_integer_radix_cmux_kb(stream, lwe_array_out, sign, lwe_array_left,
                             lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk,
                             total_num_radix_blocks);
}
|
||||
|
||||
#endif
|
||||
140
backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
Normal file
140
backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
Normal file
@@ -0,0 +1,140 @@
|
||||
#include "integer/integer.cuh"
|
||||
#include <linear_algebra.h>
|
||||
|
||||
/*
 * 64-bit entry point of the in-place full propagation.
 *
 * Dispatches to host_full_propagate_inplace instantiated with the
 * AmortizedDegree matching `polynomial_size`. Supported sizes are the powers
 * of two from 256 to 16384; any other value panics.
 * `mem_ptr` is interpreted as an int_fullprop_buffer<uint64_t>.
 */
void cuda_full_propagation_64_inplace(
    cuda_stream_t *stream, void *input_blocks, int8_t *mem_ptr, void *ksk,
    void *bsk, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t ks_base_log, uint32_t ks_level,
    uint32_t pbs_base_log, uint32_t pbs_level, uint32_t grouping_factor,
    uint32_t num_blocks) {

  // Typed views over the type-erased arguments, shared by every case
  auto *blocks = static_cast<uint64_t *>(input_blocks);
  auto *buffer = (int_fullprop_buffer<uint64_t> *)mem_ptr;
  auto *typed_ksk = static_cast<uint64_t *>(ksk);

  switch (polynomial_size) {
  case 256:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<256>>(
        stream, blocks, buffer, typed_ksk, bsk, lwe_dimension, glwe_dimension,
        polynomial_size, ks_base_log, ks_level, pbs_base_log, pbs_level,
        grouping_factor, num_blocks);
    break;
  case 512:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<512>>(
        stream, blocks, buffer, typed_ksk, bsk, lwe_dimension, glwe_dimension,
        polynomial_size, ks_base_log, ks_level, pbs_base_log, pbs_level,
        grouping_factor, num_blocks);
    break;
  case 1024:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<1024>>(
        stream, blocks, buffer, typed_ksk, bsk, lwe_dimension, glwe_dimension,
        polynomial_size, ks_base_log, ks_level, pbs_base_log, pbs_level,
        grouping_factor, num_blocks);
    break;
  case 2048:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<2048>>(
        stream, blocks, buffer, typed_ksk, bsk, lwe_dimension, glwe_dimension,
        polynomial_size, ks_base_log, ks_level, pbs_base_log, pbs_level,
        grouping_factor, num_blocks);
    break;
  case 4096:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<4096>>(
        stream, blocks, buffer, typed_ksk, bsk, lwe_dimension, glwe_dimension,
        polynomial_size, ks_base_log, ks_level, pbs_base_log, pbs_level,
        grouping_factor, num_blocks);
    break;
  case 8192:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<8192>>(
        stream, blocks, buffer, typed_ksk, bsk, lwe_dimension, glwe_dimension,
        polynomial_size, ks_base_log, ks_level, pbs_base_log, pbs_level,
        grouping_factor, num_blocks);
    break;
  case 16384:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<16384>>(
        stream, blocks, buffer, typed_ksk, bsk, lwe_dimension, glwe_dimension,
        polynomial_size, ks_base_log, ks_level, pbs_base_log, pbs_level,
        grouping_factor, num_blocks);
    break;
  default:
    PANIC("Cuda error (full propagation inplace): unsupported polynomial size. "
          "Supported N's are powers of two"
          " in the interval [256..16384].")
  }
}
|
||||
|
||||
/*
 * 64-bit entry point allocating the scratch buffer of the full propagation.
 *
 * Thin wrapper: forwards every argument to
 * scratch_cuda_full_propagation<uint64_t>, viewing `mem_ptr` as a pointer to
 * an int_fullprop_buffer<uint64_t> pointer.
 */
void scratch_cuda_full_propagation_64(
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory) {

  auto **typed_mem_ptr = (int_fullprop_buffer<uint64_t> **)mem_ptr;

  scratch_cuda_full_propagation<uint64_t>(
      stream, typed_mem_ptr, lwe_dimension, glwe_dimension, polynomial_size,
      level_count, grouping_factor, input_lwe_ciphertext_count, message_modulus,
      carry_modulus, pbs_type, allocate_gpu_memory);
}
|
||||
|
||||
/*
 * Frees the device buffers owned by a full-propagation scratch structure.
 *
 * Drops the LUT buffer, the LUT indexes and the two temporary LWE vectors on
 * `stream`, then releases the PBS buffer according to the stored pbs_type
 * (LOW_LAT or MULTI_BIT). Any other pbs_type panics.
 * NOTE(review): the int_fullprop_buffer object itself is not freed here —
 * confirm who owns it.
 */
void cleanup_cuda_full_propagation(cuda_stream_t *stream,
                                   int8_t **mem_ptr_void) {

  auto *fullprop_buffer = (int_fullprop_buffer<uint64_t> *)(*mem_ptr_void);

  // Lookup-table storage
  cuda_drop_async(fullprop_buffer->lut_buffer, stream);
  cuda_drop_async(fullprop_buffer->lut_indexes, stream);

  // Temporary ciphertext storage
  cuda_drop_async(fullprop_buffer->tmp_small_lwe_vector, stream);
  cuda_drop_async(fullprop_buffer->tmp_big_lwe_vector, stream);

  // PBS scratch: its concrete type depends on the PBS variant
  switch (fullprop_buffer->pbs_type) {
  case LOW_LAT: {
    auto low_lat_buffer =
        (pbs_buffer<uint64_t, LOW_LAT> *)(fullprop_buffer->pbs_buffer);
    low_lat_buffer->release(stream);
  } break;
  case MULTI_BIT: {
    auto multi_bit_buffer =
        (pbs_buffer<uint64_t, MULTI_BIT> *)(fullprop_buffer->pbs_buffer);
    multi_bit_buffer->release(stream);
  } break;
  default:
    PANIC("Cuda error (PBS): unsupported implementation variant.")
  }
}
|
||||
|
||||
/*
 * 64-bit entry point allocating the scratch buffer of the single-carry
 * propagation.
 *
 * Packs the cryptographic parameters into an int_radix_params and forwards
 * to scratch_cuda_propagate_single_carry_low_latency_kb_inplace, viewing
 * `mem_ptr` as a pointer to an int_sc_prop_memory<uint64_t> pointer.
 */
void scratch_cuda_propagate_single_carry_low_latency_kb_64_inplace(
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus);

  auto **typed_mem_ptr = (int_sc_prop_memory<uint64_t> **)mem_ptr;
  scratch_cuda_propagate_single_carry_low_latency_kb_inplace(
      stream, typed_mem_ptr, num_blocks, params, allocate_gpu_memory);
}
|
||||
|
||||
/*
 * 64-bit entry point of the in-place single-carry propagation.
 *
 * Casts the type-erased arguments and forwards to
 * host_propagate_single_carry_low_latency<uint64_t>. `mem_ptr` is the
 * int_sc_prop_memory<uint64_t> produced by the matching scratch call.
 */
void cuda_propagate_single_carry_low_latency_kb_64_inplace(
    cuda_stream_t *stream, void *lwe_array, int8_t *mem_ptr, void *bsk,
    void *ksk, uint32_t num_blocks) {

  auto *blocks = static_cast<uint64_t *>(lwe_array);
  auto *sc_prop_mem = (int_sc_prop_memory<uint64_t> *)mem_ptr;
  auto *typed_ksk = static_cast<uint64_t *>(ksk);

  host_propagate_single_carry_low_latency<uint64_t>(
      stream, blocks, sc_prop_mem, bsk, typed_ksk, num_blocks);
}
|
||||
|
||||
/*
 * Releases the resources held by a single-carry propagation scratch buffer.
 *
 * `*mem_ptr_void` is interpreted as an int_sc_prop_memory<uint64_t> and its
 * release() is invoked on `stream`.
 */
void cleanup_cuda_propagate_single_carry_low_latency(cuda_stream_t *stream,
                                                     int8_t **mem_ptr_void) {
  auto *sc_prop_mem = (int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
  sc_prop_mem->release(stream);
}
|
||||
592
backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
Normal file
592
backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
Normal file
@@ -0,0 +1,592 @@
|
||||
#ifndef CUDA_INTEGER_CUH
|
||||
#define CUDA_INTEGER_CUH
|
||||
|
||||
#include "bootstrap.h"
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "integer.h"
|
||||
#include "integer/scalar_addition.cuh"
|
||||
#include "linear_algebra.h"
|
||||
#include "linearalgebra/addition.cuh"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <functional>
|
||||
|
||||
// Copies each radix block of `src` into `dst`, moved `value` block positions
// towards higher indices (wrapping modulo blocks_count).
// The grid is one dimensional: blockIdx.x selects the source radix block and
// the threads of that CUDA block stride over its lwe_size coefficients.
template <typename Torus>
__global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
                                          uint32_t value, uint32_t blocks_count,
                                          uint32_t lwe_size) {
  value %= blocks_count;

  size_t from_block = blockIdx.x;
  size_t to_block = (from_block + value) % blocks_count;

  Torus *in = &src[from_block * lwe_size];
  Torus *out = &dst[to_block * lwe_size];

  size_t step = blockDim.x;
  size_t coeff = threadIdx.x;
  while (coeff < lwe_size) {
    out[coeff] = in[coeff];
    coeff += step;
  }
}
|
||||
|
||||
// Copies each radix block of `src` into `dst`, moved `value` block positions
// towards lower indices (wrapping around at block 0).
// The grid is one dimensional: blockIdx.x selects the source radix block and
// the threads of that CUDA block stride over its lwe_size coefficients.
template <typename Torus>
__global__ void radix_blocks_rotate_left(Torus *dst, Torus *src, uint32_t value,
                                         uint32_t blocks_count,
                                         uint32_t lwe_size) {
  value %= blocks_count;

  size_t from_block = blockIdx.x;
  // Unsigned subtraction may wrap; adding blocks_count wraps it back.
  size_t to_block = from_block - value;
  if (from_block < value)
    to_block += blocks_count;

  Torus *in = &src[from_block * lwe_size];
  Torus *out = &dst[to_block * lwe_size];

  size_t step = blockDim.x;
  size_t coeff = threadIdx.x;
  while (coeff < lwe_size) {
    out[coeff] = in[coeff];
    coeff += step;
  }
}
|
||||
|
||||
// Packs two radix ciphertexts block-wise into one:
//   out[pos] = lwe_array_1[pos] * message_modulus + lwe_array_2[pos]
// One thread handles one coefficient; the launch must provide at least
// num_blocks * (lwe_dimension + 1) threads in a one dimensional grid.
// lwe_indexes[b] gives the position of the b-th block inside the arrays.
template <typename Torus>
__global__ void
device_pack_bivariate_blocks(Torus *lwe_array_out, Torus *lwe_array_1,
                             Torus *lwe_array_2, Torus *lwe_indexes,
                             uint32_t lwe_dimension, uint32_t message_modulus,
                             uint32_t num_blocks) {
  // All index arithmetic is done in size_t: the block index comes from a
  // Torus-typed table and must not be truncated to a 32-bit signed int.
  size_t lwe_size = (size_t)lwe_dimension + 1;
  size_t total_coefficients = (size_t)num_blocks * lwe_size;
  size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

  if (tid < total_coefficients) {
    size_t block_id = tid / lwe_size;
    size_t coeff_id = tid % lwe_size;

    size_t pos = (size_t)lwe_indexes[block_id] * lwe_size + coeff_id;
    lwe_array_out[pos] = lwe_array_1[pos] * message_modulus + lwe_array_2[pos];
  }
}
|
||||
|
||||
// Host launcher for device_pack_bivariate_blocks: one thread per coefficient
// of the num_radix_blocks blocks, at most 512 threads requested per block.
// The left operand (lwe_array_1) is the one scaled by message_modulus.
template <typename Torus>
__host__ void pack_bivariate_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
                                    Torus *lwe_array_1, Torus *lwe_array_2,
                                    Torus *lwe_indexes, uint32_t lwe_dimension,
                                    uint32_t message_modulus,
                                    uint32_t num_radix_blocks) {

  cudaSetDevice(stream->gpu_index);

  int total_coefficients = num_radix_blocks * (lwe_dimension + 1);
  int grid_size = 0;
  int block_size = 0;
  getNumBlocksAndThreads(total_coefficients, 512, grid_size, block_size);

  device_pack_bivariate_blocks<<<grid_size, block_size, 0, stream->stream>>>(
      lwe_array_out, lwe_array_1, lwe_array_2, lwe_indexes, lwe_dimension,
      message_modulus, num_radix_blocks);
  check_cuda_error(cudaGetLastError());
}
|
||||
|
||||
// Applies the lookup table stored in `lut` to num_radix_blocks input blocks,
// in keyswitch -> PBS order: the keyswitch result goes to
// lut->tmp_lwe_after_ks, the PBS result to lwe_array_out. All cryptographic
// parameters are taken from lut->params.
template <typename Torus>
__host__ void integer_radix_apply_univariate_lookup_table_kb(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in, void *bsk,
    Torus *ksk, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut) {
  cudaSetDevice(stream->gpu_index);

  auto cfg = lut->params;
  auto block_indexes = lut->lwe_indexes;
  auto after_ks = lut->tmp_lwe_after_ks;

  // Keyswitch: big LWE -> small LWE
  cuda_keyswitch_lwe_ciphertext_vector(
      stream, after_ks, block_indexes, lwe_array_in, block_indexes, ksk,
      cfg.big_lwe_dimension, cfg.small_lwe_dimension, cfg.ks_base_log,
      cfg.ks_level, num_radix_blocks);

  // PBS: evaluates the LUT on the keyswitched blocks
  execute_pbs<Torus>(stream, lwe_array_out, block_indexes, lut->lut,
                     lut->lut_indexes, after_ks, block_indexes, bsk,
                     lut->buffer, cfg.glwe_dimension, cfg.small_lwe_dimension,
                     cfg.polynomial_size, cfg.pbs_base_log, cfg.pbs_level,
                     cfg.grouping_factor, num_radix_blocks, 1, 0,
                     cuda_get_max_shared_memory(stream->gpu_index),
                     cfg.pbs_type);
}
|
||||
|
||||
// Applies a bivariate lookup table: the two inputs are first packed block-wise
// into lut->tmp_lwe_before_ks (lwe_array_1 scaled by the message modulus plus
// lwe_array_2), then the packed blocks go through the univariate
// keyswitch -> PBS path with the same `lut`.
template <typename Torus>
__host__ void integer_radix_apply_bivariate_lookup_table_kb(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
    Torus *lwe_array_2, void *bsk, Torus *ksk, uint32_t num_radix_blocks,
    int_radix_lut<Torus> *lut) {
  cudaSetDevice(stream->gpu_index);

  auto cfg = lut->params;
  auto packed_blocks = lut->tmp_lwe_before_ks;

  // Step 1: pack both operands into a single block each
  pack_bivariate_blocks(stream, packed_blocks, lwe_array_1, lwe_array_2,
                        lut->lwe_indexes, cfg.big_lwe_dimension,
                        cfg.message_modulus, num_radix_blocks);
  check_cuda_error(cudaGetLastError());

  // Step 2: evaluate the LUT on the packed blocks
  integer_radix_apply_univariate_lookup_table_kb(
      stream, lwe_array_out, packed_blocks, bsk, ksk, num_radix_blocks, lut);
}
|
||||
|
||||
// Rotates the slice in-place such that the first `mid` elements of the slice
// move to the end while the last `array_length - mid` elements move to the
// front. After calling rotate_left, the element previously at index `mid`
// becomes the first element of the slice.
//
// `mid` is reduced modulo `array_length` first, so a negative `mid` rotates
// to the right by |mid| positions. An empty slice is left untouched.
template <typename Torus>
void rotate_left(Torus *buffer, int mid, uint32_t array_length) {
  // Nothing to rotate, and `mid % 0` would be undefined behaviour.
  if (array_length == 0)
    return;

  // Reduce in signed 64-bit arithmetic: mixing `int` with `uint32_t` would
  // convert a negative `mid` to a large unsigned value before the modulo.
  int64_t offset =
      static_cast<int64_t>(mid) % static_cast<int64_t>(array_length);
  if (offset < 0)
    offset += array_length;

  std::rotate(buffer, buffer + offset, buffer + array_length);
}
|
||||
|
||||
// Fills the host accumulator `acc` with the lookup table of `f`.
//
// `acc` must hold (glwe_dimension + 1) * polynomial_size elements: the first
// glwe_dimension polynomials (the mask) are set to zero and the last one (the
// body) receives the table.
//
// The body is split into message_modulus * carry_modulus boxes of
// polynomial_size / (message_modulus * carry_modulus) coefficients each; every
// coefficient of box i holds f(i) * delta, with
// delta = 2^63 / (message_modulus * carry_modulus). The first half box is then
// negated and the body is rotated left by half a box.
template <typename Torus>
void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
                           uint32_t polynomial_size, uint32_t message_modulus,
                           uint32_t carry_modulus,
                           std::function<Torus(Torus)> f) {

  uint32_t modulus_sup = message_modulus * carry_modulus;
  uint32_t box_size = polynomial_size / modulus_sup;
  // Explicit 64-bit constant, as in create_trivial_radix: `1ul` is only 32
  // bits wide on some platforms, which would make the shift undefined.
  Torus delta = ((uint64_t)1 << 63) / modulus_sup;

  // Zero the mask AND the body, so that body coefficients not covered by a
  // box (polynomial_size not a multiple of modulus_sup) are well defined.
  memset(acc, 0,
         (size_t)(glwe_dimension + 1) * polynomial_size * sizeof(Torus));

  auto body = &acc[glwe_dimension * polynomial_size];

  // Box i is filled with the encoded value of f(i)
  for (uint32_t i = 0; i < modulus_sup; i++) {
    // f(i) is the same for the whole box: evaluate it once
    Torus encoded = f(i) * delta;
    uint32_t index = i * box_size;
    for (uint32_t j = index; j < index + box_size; j++) {
      body[j] = encoded;
    }
  }

  int half_box_size = box_size / 2;

  // Negate the first half_box_size coefficients
  for (int i = 0; i < half_box_size; i++) {
    body[i] = -body[i];
  }

  rotate_left(body, half_box_size, polynomial_size);
}
|
||||
|
||||
// Builds the lookup table of a two-input function on the host.
// A packed input is split into lhs = (input / message_modulus) and
// rhs = (input % message_modulus), both reduced modulo message_modulus, and
// f(lhs, rhs) is tabulated through generate_lookup_table.
template <typename Torus>
void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension,
                                     uint32_t polynomial_size,
                                     uint32_t message_modulus,
                                     uint32_t carry_modulus,
                                     std::function<Torus(Torus, Torus)> f) {

  const Torus lhs_scale = message_modulus;

  auto unpack_and_apply = [lhs_scale, message_modulus,
                           f](Torus packed) -> Torus {
    Torus left = (packed / lhs_scale) % message_modulus;
    Torus right = (packed % lhs_scale) % message_modulus;
    return f(left, right);
  };

  generate_lookup_table<Torus>(acc, glwe_dimension, polynomial_size,
                               message_modulus, carry_modulus,
                               unpack_and_apply);
}
|
||||
|
||||
/*
 *  generate bivariate accumulator for device pointer
 *    stream - cuda stream on which the copy and the host free are enqueued
 *    acc_bivariate - device pointer for bivariate accumulator, must hold
 *                    (glwe_dimension + 1) * polynomial_size elements
 *    glwe_dimension, polynomial_size, message_modulus, carry_modulus -
 *                    parameters forwarded to generate_lookup_table_bivariate
 *    f - wrapping function with two Torus inputs
 */
template <typename Torus>
void generate_device_accumulator_bivariate(
    cuda_stream_t *stream, Torus *acc_bivariate, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
    std::function<Torus(Torus, Torus)> f) {

  size_t lut_bytes =
      (size_t)(glwe_dimension + 1) * polynomial_size * sizeof(Torus);

  // host lut
  Torus *h_lut = (Torus *)malloc(lut_bytes);
  if (h_lut == nullptr) {
    // Filling a null buffer below would crash without any diagnostic
    PANIC("Cuda error (generate bivariate accumulator): host allocation "
          "failed.");
  }

  // fill bivariate accumulator
  generate_lookup_table_bivariate<Torus>(h_lut, glwe_dimension, polynomial_size,
                                         message_modulus, carry_modulus, f);

  // copy host lut to device
  cuda_memcpy_async_to_gpu(acc_bivariate, h_lut, lut_bytes, stream);

  // Release the host buffer when possible: the free is registered as a
  // callback on the same stream as the copy.
  cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
}
|
||||
|
||||
/*
 *  generate univariate accumulator for device pointer
 *    stream - cuda stream on which the copy and the host free are enqueued
 *    acc - device pointer for accumulator, must hold
 *          (glwe_dimension + 1) * polynomial_size elements
 *    glwe_dimension, polynomial_size, message_modulus, carry_modulus -
 *          parameters forwarded to generate_lookup_table
 *    f - evaluating function with one Torus input
 */
template <typename Torus>
void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
                                 uint32_t glwe_dimension,
                                 uint32_t polynomial_size,
                                 uint32_t message_modulus,
                                 uint32_t carry_modulus,
                                 std::function<Torus(Torus)> f) {

  size_t lut_bytes =
      (size_t)(glwe_dimension + 1) * polynomial_size * sizeof(Torus);

  // host lut
  Torus *h_lut = (Torus *)malloc(lut_bytes);
  if (h_lut == nullptr) {
    // Filling a null buffer below would crash without any diagnostic
    PANIC("Cuda error (generate accumulator): host allocation failed.");
  }

  // fill accumulator
  generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
                               message_modulus, carry_modulus, f);

  // copy host lut to device
  cuda_memcpy_async_to_gpu(acc, h_lut, lut_bytes, stream);

  // Release the host buffer when possible: the free is registered as a
  // callback on the same stream as the copy.
  cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
}
|
||||
|
||||
// Creates the memory object used by the single-carry propagation and hands
// ownership to the caller through `mem_ptr`.
template <typename Torus>
void scratch_cuda_propagate_single_carry_low_latency_kb_inplace(
    cuda_stream_t *stream, int_sc_prop_memory<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params,
    bool allocate_gpu_memory) {

  auto *prop_mem = new int_sc_prop_memory<Torus>(
      stream, params, num_radix_blocks, allocate_gpu_memory);
  *mem_ptr = prop_mem;
}
|
||||
|
||||
// Propagates the carries of the radix ciphertext `lwe_array` in place.
//
// Steps:
//  1. apply mem->luts_array to every block, giving one "generates or
//     propagates" state per block;
//  2. combine these states with a Hillis & Steele prefix scan that uses the
//     bivariate LUT mem->luts_carry_propagation_sum (ceil(log2(num_blocks))
//     rounds);
//  3. rotate the states right by one block into mem->step_output and clear
//     block 0, so that block i holds the state coming from block i - 1;
//  4. add the result to `lwe_array` and clean each block with
//     mem->message_acc.
//
// All work is enqueued on `stream`; `mem` provides the scratch buffers and the
// parameters. An empty radix (num_blocks == 0) is a no-op.
template <typename Torus>
void host_propagate_single_carry_low_latency(cuda_stream_t *stream,
                                             Torus *lwe_array,
                                             int_sc_prop_memory<Torus> *mem,
                                             void *bsk, Torus *ksk,
                                             uint32_t num_blocks) {
  // Nothing to propagate; also avoids log2(0) and a zero-sized grid below.
  if (num_blocks == 0)
    return;

  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto big_lwe_size = glwe_dimension * polynomial_size + 1;
  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

  auto generates_or_propagates = mem->generates_or_propagates;
  auto step_output = mem->step_output;

  auto luts_array = mem->luts_array;
  auto luts_carry_propagation_sum = mem->luts_carry_propagation_sum;
  auto message_acc = mem->message_acc;

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      stream, generates_or_propagates, lwe_array, bsk, ksk, num_blocks,
      luts_array);

  // compute prefix sum with hillis&steele

  int num_steps = ceil(log2((double)num_blocks));
  int space = 1;
  cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
                               big_lwe_size_bytes * num_blocks, stream);

  for (int step = 0; step < num_steps; step++) {
    // Blocks [space, num_blocks) are combined with blocks
    // [0, num_blocks - space) of the previous round.
    auto cur_blocks = &step_output[space * big_lwe_size];
    auto prev_blocks = generates_or_propagates;
    int cur_total_blocks = num_blocks - space;

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        stream, cur_blocks, cur_blocks, prev_blocks, bsk, ksk, cur_total_blocks,
        luts_carry_propagation_sum);

    cuda_memcpy_async_gpu_to_gpu(&generates_or_propagates[space * big_lwe_size],
                                 cur_blocks,
                                 big_lwe_size_bytes * cur_total_blocks, stream);
    space *= 2;
  }

  // One CUDA block per radix block, 256 threads striding over coefficients
  radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
      step_output, generates_or_propagates, 1, num_blocks, big_lwe_size);
  // Launch-configuration errors are only visible through cudaGetLastError
  check_cuda_error(cudaGetLastError());
  // Block 0 received the wrapped-around last block: clear it
  cuda_memset_async(step_output, 0, big_lwe_size_bytes, stream);

  host_addition(stream, lwe_array, lwe_array, step_output,
                glwe_dimension * polynomial_size, num_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      stream, lwe_array, lwe_array, bsk, ksk, num_blocks, message_acc);
}
|
||||
|
||||
/*
 * Sequential full propagation, done in place on input_blocks.
 *
 * For each block, in order:
 *  - keyswitch the block into mem_ptr->tmp_small_lwe_vector and duplicate the
 *    result in the second slot of that vector;
 *  - run one PBS over the two copies with the two LUTs of mem_ptr->lut_buffer
 *    (selected through mem_ptr->lut_indexes), writing two big LWEs into
 *    mem_ptr->tmp_big_lwe_vector;
 *  - copy the first output back over the current block;
 *  - add the second output to the next block, if there is one.
 *
 * tmp_small_lwe_vector must hold 2 * (lwe_dimension + 1) elements and
 * tmp_big_lwe_vector 2 * (glwe_dimension * polynomial_size + 1) elements.
 */
template <typename Torus, typename STorus, class params>
void host_full_propagate_inplace(cuda_stream_t *stream, Torus *input_blocks,
                                 int_fullprop_buffer<Torus> *mem_ptr,
                                 Torus *ksk, void *bsk, uint32_t lwe_dimension,
                                 uint32_t glwe_dimension,
                                 uint32_t polynomial_size, uint32_t ks_base_log,
                                 uint32_t ks_level, uint32_t pbs_base_log,
                                 uint32_t pbs_level, uint32_t grouping_factor,
                                 uint32_t num_blocks) {

  int small_lwe_size = (lwe_dimension + 1);
  int big_lwe_size = (glwe_dimension * polynomial_size + 1);

  for (int i = 0; i < num_blocks; i++) {
    Torus *block = &input_blocks[i * big_lwe_size];

    // Keyswitch the current block (a single ciphertext)
    cuda_keyswitch_lwe_ciphertext_vector<Torus>(
        stream, mem_ptr->tmp_small_lwe_vector, mem_ptr->lwe_indexes, block,
        mem_ptr->lwe_indexes, ksk, polynomial_size * glwe_dimension,
        lwe_dimension, ks_base_log, ks_level, 1);

    // Duplicate the keyswitched block into the second slot
    cuda_memcpy_async_gpu_to_gpu(&mem_ptr->tmp_small_lwe_vector[small_lwe_size],
                                 mem_ptr->tmp_small_lwe_vector,
                                 small_lwe_size * sizeof(Torus), stream);

    // Bootstrap both copies, each with its own LUT
    execute_pbs<Torus>(
        stream, mem_ptr->tmp_big_lwe_vector, mem_ptr->lwe_indexes,
        mem_ptr->lut_buffer, mem_ptr->lut_indexes,
        mem_ptr->tmp_small_lwe_vector, mem_ptr->lwe_indexes, bsk,
        mem_ptr->pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
        pbs_base_log, pbs_level, grouping_factor, 2, 2, 0,
        cuda_get_max_shared_memory(stream->gpu_index), mem_ptr->pbs_type);

    // First PBS output replaces the current block
    cuda_memcpy_async_gpu_to_gpu(block, mem_ptr->tmp_big_lwe_vector,
                                 big_lwe_size * sizeof(Torus), stream);

    // Second PBS output is accumulated into the following block
    if (i < num_blocks - 1) {
      Torus *following = &input_blocks[(i + 1) * big_lwe_size];
      host_addition(stream, following, following,
                    &mem_ptr->tmp_big_lwe_vector[big_lwe_size],
                    glwe_dimension * polynomial_size, 1);
    }
  }
}
|
||||
|
||||
// Allocates and initialises the buffers used by host_full_propagate_inplace
// and returns them through `*mem_ptr` (a new int_fullprop_buffer<Torus>):
//  - the PBS scratch buffer;
//  - two LUTs stored back to back: x % message_modulus ("message") followed by
//    x / message_modulus ("carry");
//  - lut_indexes = {0, 1} and lwe_indexes = {0, 1, 2, ...};
//  - temporary vectors holding two small and two big LWE ciphertexts.
//
// When allocate_gpu_memory is false no device memory is allocated by this
// function and the corresponding pointers are stored as nullptr.
template <typename Torus>
void scratch_cuda_full_propagation(
    cuda_stream_t *stream, int_fullprop_buffer<Torus> **mem_ptr,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t pbs_level, uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory) {

  // host_full_propagate_inplace always bootstraps 2 ciphertexts at once (the
  // count it passes to execute_pbs), so the PBS scratch and the index table
  // must cover at least 2 entries even for a single-block radix.
  // NOTE(review): relies on that argument being the ciphertext count, as in
  // integer_radix_apply_univariate_lookup_table_kb - confirm.
  uint32_t num_pbs_inputs = num_radix_blocks < 2 ? 2 : num_radix_blocks;

  int8_t *pbs_buffer = nullptr;
  execute_scratch_pbs<Torus>(stream, &pbs_buffer, glwe_dimension, lwe_dimension,
                             polynomial_size, pbs_level, grouping_factor,
                             num_pbs_inputs,
                             cuda_get_max_shared_memory(stream->gpu_index),
                             pbs_type, allocate_gpu_memory);

  // Initialised so that the struct never receives indeterminate pointers
  // when allocate_gpu_memory is false.
  Torus *lut_buffer = nullptr;
  Torus *lut_indexes = nullptr;
  Torus *lwe_indexes = nullptr;
  Torus *small_lwe_vector = nullptr;
  Torus *big_lwe_vector = nullptr;

  if (allocate_gpu_memory) {
    // LUTs: two full accumulators, (glwe_dimension + 1) polynomials each
    size_t lut_size = (size_t)(glwe_dimension + 1) * polynomial_size;
    lut_buffer =
        (Torus *)cuda_malloc_async(2 * lut_size * sizeof(Torus), stream);

    auto lut_f_message = [message_modulus](Torus x) -> Torus {
      return x % message_modulus;
    };
    auto lut_f_carry = [message_modulus](Torus x) -> Torus {
      return x / message_modulus;
    };

    Torus *lut_buffer_message = lut_buffer;
    Torus *lut_buffer_carry = lut_buffer + lut_size;

    generate_device_accumulator<Torus>(
        stream, lut_buffer_message, glwe_dimension, polynomial_size,
        message_modulus, carry_modulus, lut_f_message);

    generate_device_accumulator<Torus>(stream, lut_buffer_carry, glwe_dimension,
                                       polynomial_size, message_modulus,
                                       carry_modulus, lut_f_carry);

    // LUT indexes: first ciphertext uses LUT 0, second uses LUT 1
    lut_indexes = (Torus *)cuda_malloc_async(2 * sizeof(Torus), stream);
    Torus h_lut_indexes[2] = {0, 1};
    cuda_memcpy_async_to_gpu(lut_indexes, h_lut_indexes, 2 * sizeof(Torus),
                             stream);

    // LWE indexes: identity mapping
    size_t lwe_indexes_size = (size_t)num_pbs_inputs * sizeof(Torus);
    lwe_indexes = (Torus *)cuda_malloc_async(lwe_indexes_size, stream);
    Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
    for (uint32_t i = 0; i < num_pbs_inputs; i++)
      h_lwe_indexes[i] = i;
    cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
                             stream);
    cuda_stream_add_callback(stream, host_free_on_stream_callback,
                             h_lwe_indexes);

    // Temporary arrays: room for two ciphertexts each
    size_t small_vector_size = 2 * (size_t)(lwe_dimension + 1) * sizeof(Torus);
    size_t big_vector_size =
        2 * (size_t)(glwe_dimension * polynomial_size + 1) * sizeof(Torus);

    small_lwe_vector = (Torus *)cuda_malloc_async(small_vector_size, stream);
    big_lwe_vector = (Torus *)cuda_malloc_async(big_vector_size, stream);
  }

  *mem_ptr = new int_fullprop_buffer<Torus>;

  (*mem_ptr)->pbs_type = pbs_type;
  (*mem_ptr)->pbs_buffer = pbs_buffer;

  (*mem_ptr)->lut_buffer = lut_buffer;
  (*mem_ptr)->lut_indexes = lut_indexes;
  (*mem_ptr)->lwe_indexes = lwe_indexes;

  (*mem_ptr)->tmp_small_lwe_vector = small_lwe_vector;
  (*mem_ptr)->tmp_big_lwe_vector = big_lwe_vector;
}
|
||||
|
||||
// Packs consecutive pairs of input blocks into one output block each:
//   out[b] = in[2b] + factor * in[2b + 1]
// and, when num_radix_blocks is odd, copies the last input block unchanged.
// One thread per coefficient: the launch must provide at least
// (lwe_dimension + 1) threads; each thread loops over all block pairs.
template <typename Torus>
__global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
                                   uint32_t lwe_dimension,
                                   uint32_t num_radix_blocks, uint32_t factor) {
  const uint32_t lwe_size = lwe_dimension + 1;
  const uint32_t coeff = threadIdx.x + blockIdx.x * blockDim.x;

  // Threads beyond the last coefficient have nothing to do
  if (coeff >= lwe_size)
    return;

  const uint32_t num_pairs = num_radix_blocks / 2;
  for (uint32_t pair = 0; pair < num_pairs; pair++) {
    Torus *low = lwe_array_in + (2 * pair) * lwe_size;
    Torus *high = low + lwe_size;
    Torus *packed = lwe_array_out + pair * lwe_size;

    packed[coeff] = low[coeff] + factor * high[coeff];
  }

  if (num_radix_blocks % 2 != 0) {
    // The last block has no partner: copy it as is
    Torus *unpaired = lwe_array_in + (num_radix_blocks - 1) * lwe_size;
    Torus *tail = lwe_array_out + num_pairs * lwe_size;

    tail[coeff] = unpaired[coeff];
  }
}
|
||||
|
||||
// Packs the low ciphertext in the message parts of the high ciphertext
// and moves the high ciphertext into the carry part.
//
// This requires the block parameters to have enough room for two ciphertexts,
// so at least as many carry modulus as the message modulus
//
// Expects the carry buffer to be empty.
//
// lwe_array_out and lwe_array_in must be different buffers (panics
// otherwise). The kernel is enqueued on `stream` with one thread per LWE
// coefficient, and launch errors are reported through check_cuda_error.
template <typename Torus>
__host__ void pack_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
                          Torus *lwe_array_in, uint32_t lwe_dimension,
                          uint32_t num_radix_blocks, uint32_t factor) {
  if (lwe_array_out == lwe_array_in)
    PANIC("Cuda error in pack blocks: input and output pointers must be "
          "different.");

  cudaSetDevice(stream->gpu_index);

  int num_blocks = 0, num_threads = 0;
  int num_entries = (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  device_pack_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
      lwe_array_out, lwe_array_in, lwe_dimension, num_radix_blocks, factor);
  // A bad launch configuration is only reported through cudaGetLastError;
  // the other launchers in this file check it as well.
  check_cuda_error(cudaGetLastError());
}
|
||||
|
||||
// Writes the body of num_blocks LWE ciphertexts: the body of block b (the
// element at offset lwe_dimension inside the block) becomes
// scalar_input[b] * delta. Mask coefficients are not touched.
// One thread per block; threads with an index >= num_blocks do nothing.
template <typename Torus>
__global__ void
device_create_trivial_radix(Torus *lwe_array, Torus *scalar_input,
                            int32_t num_blocks, uint32_t lwe_dimension,
                            uint64_t delta) {
  int block = blockIdx.x * blockDim.x + threadIdx.x;
  if (block >= num_blocks)
    return;

  Torus *body = lwe_array + block * (lwe_dimension + 1) + lwe_dimension;
  Torus plaintext = scalar_input[block];
  *body = plaintext * delta;
}
|
||||
|
||||
// Builds a trivial radix ciphertext of num_radix_blocks blocks in
// lwe_array_out: the whole array is zeroed, then the bodies of the first
// num_scalar_blocks blocks receive scalar_array[b] * delta with
// delta = 2^63 / (message_modulus * carry_modulus).
// All operations are enqueued on `stream`.
template <typename Torus>
__host__ void
create_trivial_radix(cuda_stream_t *stream, Torus *lwe_array_out,
                     Torus *scalar_array, uint32_t lwe_dimension,
                     uint32_t num_radix_blocks, uint32_t num_scalar_blocks,
                     uint64_t message_modulus, uint64_t carry_modulus) {

  cudaSetDevice(stream->gpu_index);

  size_t total_coefficients = (lwe_dimension + 1) * num_radix_blocks;
  cuda_memset_async(lwe_array_out, 0, total_coefficients * sizeof(Torus),
                    stream);

  if (num_scalar_blocks != 0) {
    // One thread per scalar block, in a 1-dimensional grid
    int scalar_count = num_scalar_blocks;
    int grid_size = 0;
    int block_size = 0;
    getNumBlocksAndThreads(scalar_count, 512, grid_size, block_size);
    dim3 grid(grid_size, 1, 1);
    dim3 thds(block_size, 1, 1);

    // Scaling factor that moves a message to its encoded position.
    // Could be simplified if message_modulus and carry_modulus are always
    // powers of 2.
    uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

    device_create_trivial_radix<<<grid, thds, 0, stream->stream>>>(
        lwe_array_out, scalar_array, num_scalar_blocks, lwe_dimension, delta);
    check_cuda_error(cudaGetLastError());
  }
}
|
||||
|
||||
#endif // CUDA_INTEGER_CUH
|
||||
109
backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
Normal file
109
backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
Normal file
@@ -0,0 +1,109 @@
|
||||
#include "integer/multiplication.cuh"
|
||||
|
||||
/*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the integer radix multiplication in keyswitch->bootstrap order.
 *
 * The created int_mul_memory<uint64_t> is returned through 'mem_ptr'.
 * Only polynomial_size = 2048 is supported; any other value panics.
 * 'max_shared_memory' is currently not used by this function.
 */
void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t message_modulus,
    uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
    uint32_t ks_base_log, uint32_t ks_level, uint32_t grouping_factor,
    uint32_t num_radix_blocks, PBS_TYPE pbs_type, uint32_t max_shared_memory,
    bool allocate_gpu_memory) {

  // The big LWE dimension is glwe_dimension * polynomial_size; passing
  // polynomial_size alone is only correct when glwe_dimension == 1.
  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
                          ks_level, ks_base_log, pbs_level, pbs_base_log,
                          grouping_factor, message_modulus, carry_modulus);

  switch (polynomial_size) {
  case 2048:
    scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
        stream, (int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
        allocate_gpu_memory);
    break;
  default:
    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
          "Only N = 2048 is supported")
  }
}
|
||||
|
||||
/*
 * Multiplies two 64 bit radix lwe ciphertexts encrypting integer values,
 * using the keyswitch -> bootstrap pattern. One call handles a single pair of
 * radix ciphertexts; 'stream' can be used for parallelization.
 *  - 'stream' is the Cuda stream (and GPU) the work is enqueued on
 *  - 'radix_lwe_out' is 64 bit radix big lwe ciphertext, product of
 *  multiplication
 *  - 'radix_lwe_left' left radix big lwe ciphertext
 *  - 'radix_lwe_right' right radix big lwe ciphertext
 *  - 'bsk' bootstrapping key in fourier domain
 *  - 'ksk' keyswitching key
 *  - 'mem_ptr' memory object created by the matching scratch function
 *  - 'polynomial_size' polynomial size, only 2048 is supported
 *  - 'num_blocks' is the number of big lwe ciphertext blocks inside radix
 *  ciphertext
 * The remaining parameters ('message_modulus', 'carry_modulus',
 * 'glwe_dimension', 'lwe_dimension', 'pbs_base_log', 'pbs_level',
 * 'ks_base_log', 'ks_level', 'grouping_factor', 'pbs_type',
 * 'max_shared_memory') are not read by this function.
 */
void cuda_integer_mult_radix_ciphertext_kb_64(
    cuda_stream_t *stream, void *radix_lwe_out, void *radix_lwe_left,
    void *radix_lwe_right, void *bsk, void *ksk, int8_t *mem_ptr,
    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
    uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
    uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
    uint32_t max_shared_memory) {

  if (polynomial_size == 2048) {
    auto *product = static_cast<uint64_t *>(radix_lwe_out);
    auto *lhs = static_cast<uint64_t *>(radix_lwe_left);
    auto *rhs = static_cast<uint64_t *>(radix_lwe_right);
    auto *keyswitch_key = static_cast<uint64_t *>(ksk);
    auto *mul_mem = (int_mul_memory<uint64_t> *)mem_ptr;

    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<2048>>(
        stream, product, lhs, rhs, bsk, keyswitch_key, mul_mem, num_blocks);
  } else {
    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
          "Only N = 2048 is supported")
  }
}
|
||||
|
||||
// Releases the resources held by the multiplication memory object that
// `*mem_ptr_void` points to, by calling its release() method on `stream`.
void cleanup_cuda_integer_mult(cuda_stream_t *stream, int8_t **mem_ptr_void) {
  auto *mul_mem = (int_mul_memory<uint64_t> *)(*mem_ptr_void);
  mul_mem->release(stream);
}
|
||||
|
||||
// In-place variant of the small scalar multiplication: the same buffer is
// passed as both output and input of the out-of-place entry point.
void cuda_small_scalar_multiplication_integer_radix_ciphertext_64_inplace(
    cuda_stream_t *stream, void *lwe_array, uint64_t scalar,
    uint32_t lwe_dimension, uint32_t lwe_ciphertext_count) {

  void *in_out = lwe_array;
  cuda_small_scalar_multiplication_integer_radix_ciphertext_64(
      stream, in_out, in_out, scalar, lwe_dimension, lwe_ciphertext_count);
}
|
||||
|
||||
// 64-bit entry point for the small scalar multiplication: casts the opaque
// ciphertext pointers to uint64_t and forwards to the host implementation.
void cuda_small_scalar_multiplication_integer_radix_ciphertext_64(
    cuda_stream_t *stream, void *output_lwe_array, void *input_lwe_array,
    uint64_t scalar, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count) {

  auto *output = static_cast<uint64_t *>(output_lwe_array);
  auto *input = static_cast<uint64_t *>(input_lwe_array);

  host_integer_small_scalar_mult_radix(stream, output, input, scalar,
                                       lwe_dimension, lwe_ciphertext_count);
}
|
||||
634
backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
Normal file
634
backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
Normal file
@@ -0,0 +1,634 @@
|
||||
#ifndef CUDA_INTEGER_MULT_CUH
|
||||
#define CUDA_INTEGER_MULT_CUH
|
||||
|
||||
#ifdef __CDT_PARSER__
|
||||
#undef __CUDA_RUNTIME_H__
|
||||
#include <cuda_runtime.h>
|
||||
#endif
|
||||
|
||||
#include "bootstrap.h"
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "integer.h"
|
||||
#include "integer/integer.cuh"
|
||||
#include "linear_algebra.h"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <omp.h>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// Fills the lsb/msb work buffers used by the multiplication.
// Each CUDA block (blockIdx.x) handles one output block: it derives a
// (radix_id, local_block_id) pair from its index, then copies block
// local_block_id of radix_lwe_left into lsb_ciphertext (and msb_ciphertext
// when process_msb holds) and block radix_id of radix_lwe_right into lsb_rhs
// (and msb_rhs when process_msb holds).
// Every block is params::degree + 1 elements long. Each thread copies
// params::opt coefficients spaced params::degree / params::opt apart, and
// thread 0 additionally copies the last element (index params::degree).
// NOTE(review): this presumably expects blockDim.x == params::degree /
// params::opt - confirm against the launch site.
template <typename Torus, class params>
__global__ void
all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
                    Torus *msb_ciphertext, Torus *radix_lwe_right,
                    Torus *lsb_rhs, Torus *msb_rhs, int num_blocks) {

  size_t block_id = blockIdx.x;

  // Index decomposition, kept exactly as in the original formulation
  double D = sqrt((2 * num_blocks + 1) * (2 * num_blocks + 1) - 8 * block_id);
  size_t radix_id = int((2 * num_blocks + 1 - D) / 2.);
  size_t local_block_id =
      block_id - (2 * num_blocks - radix_id + 1) / 2. * radix_id;
  bool process_msb = (local_block_id < (num_blocks - radix_id - 1));

  // Source blocks
  Torus *lhs_in = &radix_lwe_left[local_block_id * (params::degree + 1)];
  Torus *rhs_in = &radix_lwe_right[radix_id * (params::degree + 1)];

  // Destination blocks; the msb ones only exist when process_msb is set
  Torus *lsb_out = &lsb_ciphertext[block_id * (params::degree + 1)];
  Torus *lsb_rhs_out = &lsb_rhs[block_id * (params::degree + 1)];
  Torus *msb_out = nullptr;
  Torus *msb_rhs_out = nullptr;
  if (process_msb) {
    msb_out = &msb_ciphertext[(block_id - radix_id) * (params::degree + 1)];
    msb_rhs_out = &msb_rhs[(block_id - radix_id) * (params::degree + 1)];
  }

  size_t coeff = threadIdx.x;
  for (int round = 0; round < params::opt; round++) {
    Torus lhs_value = lhs_in[coeff];
    if (process_msb)
      msb_out[coeff] = lhs_value;
    lsb_out[coeff] = lhs_value;

    Torus rhs_value = rhs_in[coeff];
    if (process_msb)
      msb_rhs_out[coeff] = rhs_value;
    lsb_rhs_out[coeff] = rhs_value;

    coeff += params::degree / params::opt;
  }

  // The last element of each block is handled by a single thread
  if (threadIdx.x == 0) {
    Torus lhs_value = lhs_in[params::degree];
    if (process_msb)
      msb_out[params::degree] = lhs_value;
    lsb_out[params::degree] = lhs_value;

    Torus rhs_value = rhs_in[params::degree];
    if (process_msb)
      msb_rhs_out[params::degree] = rhs_value;
    lsb_rhs_out[params::degree] = rhs_value;
  }
}
|
||||
|
||||
// Gathers selected units of `src` into a contiguous run in `dst`.
// For each row i < map_size, units S[i] .. F[i] of that row (F[i] is included
// only when is_message is true) are copied; row i starts at unit
// i * num_blocks in `src`. A unit is unit_size elements.
// `total_copied` is the running number of units already stored in `dst`: it
// gives the write position and is advanced by each copy.
template <typename Torus>
void compress_device_array_with_map(cuda_stream_t *stream, Torus *src,
                                    Torus *dst, int *S, int *F, int num_blocks,
                                    uint32_t map_size, uint32_t unit_size,
                                    int &total_copied, bool is_message) {
  cudaSetDevice(stream->gpu_index);

  int row = 0;
  while (row < map_size) {
    int first_unit = row * num_blocks + S[row];
    int unit_count = F[row] - S[row] + is_message;

    Torus *write_pos = &dst[total_copied * unit_size];
    Torus *read_pos = &src[first_unit * unit_size];

    cuda_memcpy_async_gpu_to_gpu(write_pos, read_pos,
                                 unit_size * unit_count * sizeof(Torus),
                                 stream);

    total_copied += unit_count;
    row++;
  }
}
|
||||
|
||||
// Expands packed runs of units from `src` back into full radixes in `dst`.
//
// Each map entry i produces one radix of `num_blocks` units in `dst`, at radix
// index `total_radix_copied`. F[i] - S[i] + is_message units are read from
// `src` starting at unit `total_copied`.
//  - is_message == true : the run is written at unit S[i] of the radix.
//  - is_message == false: the first (num_blocks - run length) units of the
//    radix are zeroed and the run is written right after them.
// Both counters are advanced; all operations are asynchronous on `stream`.
template <typename Torus>
void extract_message_carry_to_full_radix(cuda_stream_t *stream, Torus *src,
                                         Torus *dst, int *S, int *F,
                                         uint32_t map_size, uint32_t unit_size,
                                         int &total_copied,
                                         int &total_radix_copied,
                                         int num_blocks, bool is_message) {
  cudaSetDevice(stream->gpu_index);

  size_t radix_size = unit_size * num_blocks;

  for (int entry = 0; entry < map_size; ++entry) {
    Torus *radix_out = dst + total_radix_copied * radix_size;

    int first_unit = S[entry];
    int unit_count = F[entry] - first_unit + is_message;

    if (!is_message) {
      // Zero the leading units and place the run immediately after them
      int leading_zero_units = num_blocks - unit_count;
      cuda_memset_async(radix_out, 0,
                        leading_zero_units * unit_size * sizeof(Torus), stream);
      first_unit = leading_zero_units;
    }

    Torus *out = radix_out + first_unit * unit_size;
    Torus *in = src + total_copied * unit_size;

    size_t bytes = unit_size * unit_count * sizeof(Torus);
    cuda_memcpy_async_gpu_to_gpu(out, in, bytes, stream);

    total_copied += unit_count;
    ++total_radix_copied;
  }
}
|
||||
|
||||
// Sums the `chunk_size` radixes of one chunk, block by block.
//
// Launch layout as used by the indexing below:
//  - blockIdx.x selects the chunk (input) and the output radix,
//  - blockIdx.y selects the block inside the radix,
//  - each thread handles params::opt coefficients spaced by
//    params::degree / params::opt; thread 0 additionally handles the last
//    coefficient (index params::degree).
// Dynamic shared memory must hold at least params::degree + 1 Torus elements;
// it is used as the accumulator before the result is written back.
template <typename Torus, class params>
__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
                                uint32_t chunk_size, uint32_t num_blocks) {

  extern __shared__ Torus result[];

  size_t chunk_elem_size = chunk_size * num_blocks * (params::degree + 1);
  size_t radix_elem_size = num_blocks * (params::degree + 1);
  size_t block_stride = blockIdx.y * (params::degree + 1);

  Torus *src_chunk = input_blocks + blockIdx.x * chunk_elem_size;
  Torus *dst_block = result_blocks + blockIdx.x * radix_elem_size + block_stride;

  const bool handles_last_coeff = (threadIdx.x == 0);

  // Seed the accumulator with the first radix of the chunk
  Torus *first_block = src_chunk + block_stride;
  size_t pos = threadIdx.x;
  for (int step = 0; step < params::opt; ++step) {
    result[pos] = first_block[pos];
    pos += params::degree / params::opt;
  }
  if (handles_last_coeff) {
    result[params::degree] = first_block[params::degree];
  }

  // Add the matching block of every remaining radix in the chunk
  for (int radix = 1; radix < chunk_size; ++radix) {
    Torus *cur_block = src_chunk + radix * radix_elem_size + block_stride;
    pos = threadIdx.x;
    for (int step = 0; step < params::opt; ++step) {
      result[pos] += cur_block[pos];
      pos += params::degree / params::opt;
    }
    if (handles_last_coeff) {
      result[params::degree] += cur_block[params::degree];
    }
  }

  // Flush the accumulator to global memory
  pos = threadIdx.x;
  for (int step = 0; step < params::opt; ++step) {
    dst_block[pos] = result[pos];
    pos += params::degree / params::opt;
  }
  if (handles_last_coeff) {
    dst_block[params::degree] = result[params::degree];
  }
}
|
||||
|
||||
// Scatters the packed lsb / msb blocks into two sets of num_blocks full
// radixes stored back to back in `result_blocks` (lsb radixes first, msb
// radixes starting num_blocks * num_blocks blocks later).
//
// One CUDA block per output block: blockIdx.x = radix_id * num_blocks +
// block_id. An lsb output block is copied from the packed input when
// radix_id <= block_id, an msb output block when radix_id + 1 <= block_id;
// every other output block is filled with zeros. Each thread writes
// params::opt coefficients spaced by params::degree / params::opt and thread 0
// also writes coefficient params::degree.
// `lsb_count` and `msb_count` are not read by this kernel.
template <typename Torus, class params>
__global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
                                        Torus *msb_blocks,
                                        uint32_t glwe_dimension,
                                        uint32_t lsb_count, uint32_t msb_count,
                                        uint32_t num_blocks) {
  size_t big_lwe_dimension = glwe_dimension * params::degree + 1;
  size_t big_lwe_id = blockIdx.x;
  size_t radix_id = big_lwe_id / num_blocks;
  size_t block_id = big_lwe_id % num_blocks;

  const bool process_lsb = (radix_id <= block_id);
  const bool process_msb = (radix_id + 1 <= block_id);

  Torus *res_lsb = &result_blocks[big_lwe_id * big_lwe_dimension];
  Torus *res_msb =
      &result_blocks[num_blocks * num_blocks * big_lwe_dimension +
                     big_lwe_id * big_lwe_dimension];

  // Locate the packed source block, only when there is one to read
  Torus *src_lsb = nullptr;
  if (process_lsb) {
    Torus *lsb_radix = &lsb_blocks[(2 * num_blocks - radix_id + 1) *
                                   radix_id / 2 * (params::degree + 1)];
    src_lsb = &lsb_radix[(block_id - radix_id) * (params::degree + 1)];
  }

  Torus *src_msb = nullptr;
  if (process_msb) {
    Torus *msb_radix = &msb_blocks[(2 * num_blocks - radix_id - 1) *
                                   radix_id / 2 * (params::degree + 1)];
    src_msb = &msb_radix[(block_id - radix_id - 1) * (params::degree + 1)];
  }

  size_t pos = threadIdx.x;
  for (int step = 0; step < params::opt; ++step) {
    Torus lsb_value = 0;
    Torus msb_value = 0;
    if (process_lsb)
      lsb_value = src_lsb[pos];
    if (process_msb)
      msb_value = src_msb[pos];
    res_lsb[pos] = lsb_value;
    res_msb[pos] = msb_value;
    pos += params::degree / params::opt;
  }

  if (threadIdx.x == 0) {
    Torus lsb_value = 0;
    Torus msb_value = 0;
    if (process_lsb)
      lsb_value = src_lsb[params::degree];
    if (process_msb)
      msb_value = src_msb[params::degree];
    res_lsb[params::degree] = lsb_value;
    res_msb[params::degree] = msb_value;
  }
}
|
||||
|
||||
// Host entry point for the radix ciphertext multiplication.
//
// Pipeline, as enqueued on `stream`:
//  1. all_shifted_lhs_rhs builds the shifted copies of the left operand and
//     the matching right-hand blocks,
//  2. a bivariate lookup table is applied to every (lhs, rhs) block pair,
//  3. fill_radix_from_lsb_msb expands the packed results into 2 * num_blocks
//     full radixes,
//  4. the radixes are summed chunk by chunk (tree_add_chunks), extracting
//     message and carry after each round, until at most `chunk_size` radixes
//     remain,
//  5. the remaining radixes are summed into radix_lwe_out, message and carry
//     are extracted, added, and a single carry propagation is run.
//
// stream          - stream (and GPU) on which everything is enqueued
// radix_lwe_out   - output radix, num_blocks big LWE ciphertexts
// radix_lwe_left  - left operand, num_blocks big LWE ciphertexts
// radix_lwe_right - right operand, num_blocks big LWE ciphertexts
// bsk, ksk        - bootstrapping / keyswitching keys forwarded to the helpers
// mem_ptr         - pre-allocated scratch buffers, LUTs and parameters
// num_blocks      - number of blocks per radix
//
// NOTE(review): the three host-side scratch arrays below are variable-length
// arrays (a compiler extension in C++) living on the stack.
template <typename Torus, typename STorus, class params>
__host__ void host_integer_mult_radix_kb(
    cuda_stream_t *stream, uint64_t *radix_lwe_out, uint64_t *radix_lwe_left,
    uint64_t *radix_lwe_right, void *bsk, uint64_t *ksk,
    int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {

  cudaSetDevice(stream->gpu_index);
  auto glwe_dimension = mem_ptr->params.glwe_dimension;
  auto polynomial_size = mem_ptr->params.polynomial_size;
  auto lwe_dimension = mem_ptr->params.small_lwe_dimension;
  auto message_modulus = mem_ptr->params.message_modulus;
  auto carry_modulus = mem_ptr->params.carry_modulus;

  int big_lwe_dimension = glwe_dimension * polynomial_size;
  int big_lwe_size = big_lwe_dimension + 1;

  // 'vector_result_lsb' contains blocks from all possible right shifts of
  // radix_lwe_left, only nonzero blocks are kept
  int lsb_vector_block_count = num_blocks * (num_blocks + 1) / 2;

  // 'vector_result_msb' contains blocks from all possible shifts of
  // radix_lwe_left except the last blocks of each shift. Only nonzero blocks
  // are kept
  int msb_vector_block_count = num_blocks * (num_blocks - 1) / 2;

  // total number of blocks msb and lsb
  int total_block_count = lsb_vector_block_count + msb_vector_block_count;

  // buffer to keep all lsb and msb shifts
  // for lsb all nonzero blocks of each right shifts are kept
  // for 0 shift num_blocks blocks
  // for 1 shift num_blocks - 1 blocks
  // for num_blocks - 1 shift 1 block
  // (num_blocks + 1) * num_blocks / 2 blocks
  // for msb we don't keep track for last blocks so
  // for 0 shift num_blocks - 1 blocks
  // for 1 shift num_blocks - 2 blocks
  // for num_blocks - 1 shift 0 blocks
  // (num_blocks - 1) * num_blocks / 2 blocks
  // in total num_blocks^2 blocks
  // in each block there is a big polynomial with
  // glwe_dimension * polynomial_size + 1 coefficients
  auto vector_result_sb = mem_ptr->vector_result_sb;

  // buffer to keep lsb_vector + msb_vector
  // addition will happen in full terms so there will be
  // num_blocks terms and each term will have num_blocks block
  // num_blocks^2 blocks in total
  // and each blocks has big lwe ciphertext with
  // glwe_dimension * polynomial_size + 1 coefficients
  auto block_mul_res = mem_ptr->block_mul_res;

  // buffer to keep keyswitch result of num_blocks^2 ciphertext
  // in total it has num_blocks^2 small lwe ciphertexts with
  // lwe_dimension +1 coefficients
  auto small_lwe_vector = mem_ptr->small_lwe_vector;

  // it contains two lut, first for lsb extraction,
  // second for msb extraction, with total length =
  // 2 * (glwe_dimension + 1) * polynomial_size
  auto luts_array = mem_ptr->luts_array;

  // accumulator to extract message
  // with length (glwe_dimension + 1) * polynomial_size
  auto luts_message = mem_ptr->luts_message;

  // accumulator to extract carry
  // with length (glwe_dimension + 1) * polynomial_size
  auto luts_carry = mem_ptr->luts_carry;

  // to be used as default indexing
  auto lwe_indexes = luts_array->lwe_indexes;

  auto vector_result_lsb = &vector_result_sb[0];
  auto vector_result_msb =
      &vector_result_sb[lsb_vector_block_count *
                        (polynomial_size * glwe_dimension + 1)];

  auto vector_lsb_rhs = &block_mul_res[0];
  auto vector_msb_rhs = &block_mul_res[lsb_vector_block_count *
                                       (polynomial_size * glwe_dimension + 1)];

  dim3 grid(lsb_vector_block_count, 1, 1);
  // Every kernel launched below walks a block with a stride of
  // params::degree / params::opt, so this is the block size they all need.
  dim3 thds(params::degree / params::opt, 1, 1);

  all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, stream->stream>>>(
      radix_lwe_left, vector_result_lsb, vector_result_msb, radix_lwe_right,
      vector_lsb_rhs, vector_msb_rhs, num_blocks);
  check_cuda_error(cudaGetLastError());

  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      stream, block_mul_res, block_mul_res, vector_result_sb, bsk, ksk,
      total_block_count, luts_array);

  vector_result_lsb = &block_mul_res[0];
  vector_result_msb = &block_mul_res[lsb_vector_block_count *
                                     (polynomial_size * glwe_dimension + 1)];

  fill_radix_from_lsb_msb<Torus, params>
      <<<num_blocks * num_blocks, params::degree / params::opt, 0,
         stream->stream>>>(vector_result_sb, vector_result_lsb,
                           vector_result_msb, glwe_dimension,
                           lsb_vector_block_count, msb_vector_block_count,
                           num_blocks);
  check_cuda_error(cudaGetLastError());

  auto new_blocks = block_mul_res;
  auto old_blocks = vector_result_sb;

  // amount of current radixes after block_mul
  size_t r = 2 * num_blocks;

  size_t total_modulus = message_modulus * carry_modulus;
  size_t message_max = message_modulus - 1;
  size_t chunk_size = (total_modulus - 1) / message_max;
  size_t ch_amount = r / chunk_size;

  int terms_degree[r * num_blocks];
  int f_b[ch_amount];
  int l_b[ch_amount];

  for (int i = 0; i < num_blocks * num_blocks; i++) {
    size_t r_id = i / num_blocks;
    size_t b_id = i % num_blocks;
    terms_degree[i] = (b_id >= r_id) ? 3 : 0;
  }
  auto terms_degree_msb = &terms_degree[num_blocks * num_blocks];
  for (int i = 0; i < num_blocks * num_blocks; i++) {
    size_t r_id = i / num_blocks;
    size_t b_id = i % num_blocks;
    terms_degree_msb[i] = (b_id > r_id) ? 2 : 0;
  }

  auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
  while (r > chunk_size) {
    int cur_total_blocks = r * num_blocks;
    ch_amount = r / chunk_size;
    dim3 add_grid(ch_amount, num_blocks, 1);
    size_t sm_size = big_lwe_size * sizeof(Torus);
    cuda_memset_async(new_blocks, 0,
                      ch_amount * num_blocks * big_lwe_size * sizeof(Torus),
                      stream);

    // Block size must match the kernel's params::degree / params::opt stride
    tree_add_chunks<Torus, params><<<add_grid, thds, sm_size, stream->stream>>>(
        new_blocks, old_blocks, chunk_size, num_blocks);
    check_cuda_error(cudaGetLastError());

    for (int c_id = 0; c_id < ch_amount; c_id++) {
      auto cur_chunk = &terms_degree[c_id * chunk_size * num_blocks];
      int mx = 0;
      int mn = num_blocks;
      for (int r_id = 1; r_id < chunk_size; r_id++) {
        auto cur_radix = &cur_chunk[r_id * num_blocks];
        for (int i = 0; i < num_blocks; i++) {
          if (cur_radix[i]) {
            mn = min(mn, i);
            mx = max(mx, i);
          }
        }
      }
      f_b[c_id] = mn;
      l_b[c_id] = mx;
    }

    int total_copied = 0;
    int message_count = 0;
    int carry_count = 0;
    compress_device_array_with_map<Torus>(stream, new_blocks, old_blocks, f_b,
                                          l_b, num_blocks, ch_amount,
                                          big_lwe_size, total_copied, true);

    message_count = total_copied;
    compress_device_array_with_map<Torus>(stream, new_blocks, old_blocks, f_b,
                                          l_b, num_blocks, ch_amount,
                                          big_lwe_size, total_copied, false);
    carry_count = total_copied - message_count;

    auto message_blocks_vector = old_blocks;
    auto carry_blocks_vector =
        &old_blocks[message_count * (glwe_dimension * polynomial_size + 1)];

    cuda_keyswitch_lwe_ciphertext_vector(
        stream, small_lwe_vector, lwe_indexes, old_blocks, lwe_indexes, ksk,
        polynomial_size * glwe_dimension, lwe_dimension,
        mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_copied);

    execute_pbs<Torus>(stream, message_blocks_vector, lwe_indexes,
                       luts_message->lut, luts_message->lut_indexes,
                       small_lwe_vector, lwe_indexes, bsk, luts_message->buffer,
                       glwe_dimension, lwe_dimension, polynomial_size,
                       mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
                       mem_ptr->params.grouping_factor, message_count, 1, 0,
                       max_shared_memory, mem_ptr->params.pbs_type);

    execute_pbs<Torus>(stream, carry_blocks_vector, lwe_indexes,
                       luts_carry->lut, luts_carry->lut_indexes,
                       &small_lwe_vector[message_count * (lwe_dimension + 1)],
                       lwe_indexes, bsk, luts_carry->buffer, glwe_dimension,
                       lwe_dimension, polynomial_size,
                       mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
                       mem_ptr->params.grouping_factor, carry_count, 1, 0,
                       max_shared_memory, mem_ptr->params.pbs_type);

    int rem_blocks = r % chunk_size * num_blocks;
    int new_blocks_created = 2 * ch_amount * num_blocks;
    // Byte count kept in size_t so large buffers are not truncated to int
    size_t copy_size = rem_blocks * big_lwe_size * sizeof(Torus);

    auto cur_dst = &new_blocks[new_blocks_created * big_lwe_size];
    auto cur_src = &old_blocks[(cur_total_blocks - rem_blocks) * big_lwe_size];
    cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, stream);

    total_copied = 0;
    int total_radix_copied = 0;
    extract_message_carry_to_full_radix<Torus>(
        stream, old_blocks, new_blocks, f_b, l_b, ch_amount, big_lwe_size,
        total_copied, total_radix_copied, num_blocks, true);
    extract_message_carry_to_full_radix<Torus>(
        stream, old_blocks, new_blocks, f_b, l_b, ch_amount, big_lwe_size,
        total_copied, total_radix_copied, num_blocks, false);

    std::swap(new_blocks, old_blocks);
    r = (new_blocks_created + rem_blocks) / num_blocks;
  }

  dim3 add_grid(1, num_blocks, 1);
  size_t sm_size = big_lwe_size * sizeof(Torus);
  cuda_memset_async(radix_lwe_out, 0, num_blocks * big_lwe_size * sizeof(Torus),
                    stream);
  tree_add_chunks<Torus, params><<<add_grid, thds, sm_size, stream->stream>>>(
      radix_lwe_out, old_blocks, r, num_blocks);
  check_cuda_error(cudaGetLastError());

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      stream, vector_result_sb, radix_lwe_out, bsk, ksk, num_blocks,
      luts_message);
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      stream, &block_mul_res[big_lwe_size], radix_lwe_out, bsk, ksk, num_blocks,
      luts_carry);

  // The first carry block is zero: carries are shifted by one block
  cuda_memset_async(block_mul_res, 0, big_lwe_size * sizeof(Torus), stream);

  host_addition(stream, radix_lwe_out, vector_result_sb, block_mul_res,
                big_lwe_dimension, num_blocks);

  host_propagate_single_carry_low_latency<Torus>(
      stream, radix_lwe_out, mem_ptr->scp_mem, bsk, ksk, num_blocks);
}
|
||||
|
||||
// Allocates the scratch structure used by the radix multiplication and
// returns it through `mem_ptr`. The object is created with `new`; this
// function does not release it.
template <typename Torus>
__host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
    cuda_stream_t *stream, int_mul_memory<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params,
    bool allocate_gpu_memory) {
  cudaSetDevice(stream->gpu_index);

  auto *mul_memory = new int_mul_memory<Torus>(stream, params, num_radix_blocks,
                                               allocate_gpu_memory);
  *mem_ptr = mul_memory;
}
|
||||
|
||||
// Function to apply a lookup table (keyswitch followed by multi-bit PBS) to a
// batch of big LWE blocks, splitting the batch across several GPUs.
// It has two modes:
// lsb_msb_mode == true - extracts lsb and msb
// lsb_msb_mode == false - extracts message and carry
//
// The batch holds lsb_message_blocks_count + msb_carry_blocks_count blocks of
// glwe_dimension * polynomial_size + 1 coefficients. It is divided evenly
// between min(mem_ptr->p2p_gpu_count, total blocks) GPUs; the last GPU also
// takes the remainder. Each GPU copies its slice in, runs keyswitch + PBS on
// its own stream, and copies the result back to `output_ciphertexts`, which
// lives on GPU 0. Nothing is done when there are no blocks to process.
template <typename Torus, typename STorus, class params>
void apply_lookup_table(Torus *input_ciphertexts, Torus *output_ciphertexts,
                        int_mul_memory<Torus> *mem_ptr, uint32_t glwe_dimension,
                        uint32_t lwe_dimension, uint32_t polynomial_size,
                        uint32_t pbs_base_log, uint32_t pbs_level,
                        uint32_t ks_base_log, uint32_t ks_level,
                        uint32_t grouping_factor,
                        uint32_t lsb_message_blocks_count,
                        uint32_t msb_carry_blocks_count,
                        uint32_t max_shared_memory, bool lsb_msb_mode) {

  int total_blocks_count = lsb_message_blocks_count + msb_carry_blocks_count;
  int gpu_n = mem_ptr->p2p_gpu_count;
  if (total_blocks_count < gpu_n)
    gpu_n = total_blocks_count;
  // Without blocks (or GPUs) there is nothing to do, and gpu_n would be an
  // invalid divisor and an invalid OpenMP thread count.
  if (gpu_n <= 0)
    return;

  // Blocks per GPU, and the leftover handled by the last GPU. Both are
  // read-only inside the parallel loop so that iterations do not race.
  const int base_blocks_count = total_blocks_count / gpu_n;
  const int extra_blocks_count = total_blocks_count % gpu_n;
  int big_lwe_size = glwe_dimension * polynomial_size + 1;
  // int small_lwe_size = lwe_dimension + 1;

#pragma omp parallel for num_threads(gpu_n)
  for (int i = 0; i < gpu_n; i++) {
    cudaSetDevice(i);
    auto this_stream = mem_ptr->streams[i];
    // Index where input and output blocks start for current gpu
    int big_lwe_start_index = i * base_blocks_count * big_lwe_size;

    // Last gpu might have extra blocks to process if total blocks number is not
    // divisible by gpu_n
    int gpu_blocks_count = base_blocks_count;
    if (i == gpu_n - 1) {
      gpu_blocks_count += extra_blocks_count;
    }

    int can_access_peer = 0;
    cudaDeviceCanAccessPeer(&can_access_peer, i, 0);
    if (i == 0) {
      check_cuda_error(
          cudaMemcpyAsync(mem_ptr->pbs_output_multi_gpu[i],
                          &input_ciphertexts[big_lwe_start_index],
                          gpu_blocks_count * big_lwe_size * sizeof(Torus),
                          cudaMemcpyDeviceToDevice, *this_stream));
    } else if (can_access_peer) {
      check_cuda_error(cudaMemcpyPeerAsync(
          mem_ptr->pbs_output_multi_gpu[i], i,
          &input_ciphertexts[big_lwe_start_index], 0,
          gpu_blocks_count * big_lwe_size * sizeof(Torus), *this_stream));
    } else {
      // Uses host memory as middle ground
      cuda_memcpy_async_to_cpu(mem_ptr->device_to_device_buffer[i],
                               &input_ciphertexts[big_lwe_start_index],
                               gpu_blocks_count * big_lwe_size * sizeof(Torus),
                               this_stream, i);
      cuda_memcpy_async_to_gpu(
          mem_ptr->pbs_output_multi_gpu[i], mem_ptr->device_to_device_buffer[i],
          gpu_blocks_count * big_lwe_size * sizeof(Torus), this_stream, i);
    }

    // when lsb and msb have to be extracted
    //  for first lsb_count blocks we need lsb_acc
    //  for last msb_count blocks we need msb_acc
    // when message and carry have to be extracted
    //  for first message_count blocks we need message_acc
    //  for last carry_count blocks we need carry_acc
    // NOTE(review): big_lwe_start_index is an element offset while
    // lsb_message_blocks_count is a block count - confirm this comparison is
    // the intended split.
    Torus *cur_lut_indexes;
    if (lsb_msb_mode) {
      cur_lut_indexes = (big_lwe_start_index < lsb_message_blocks_count)
                            ? mem_ptr->lut_indexes_lsb_multi_gpu[i]
                            : mem_ptr->lut_indexes_msb_multi_gpu[i];

    } else {
      cur_lut_indexes = (big_lwe_start_index < lsb_message_blocks_count)
                            ? mem_ptr->lut_indexes_message_multi_gpu[i]
                            : mem_ptr->lut_indexes_carry_multi_gpu[i];
    }

    // execute keyswitch on a current gpu with corresponding input and output
    // blocks pbs_output_multi_gpu[i] is an input for keyswitch and
    // pbs_input_multi_gpu[i] is an output for keyswitch
    cuda_keyswitch_lwe_ciphertext_vector(
        this_stream, i, mem_ptr->pbs_input_multi_gpu[i],
        mem_ptr->pbs_output_multi_gpu[i], mem_ptr->ksk_multi_gpu[i],
        polynomial_size * glwe_dimension, lwe_dimension, ks_base_log, ks_level,
        gpu_blocks_count);

    // execute pbs on a current gpu with corresponding input and output
    cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
        this_stream, i, mem_ptr->pbs_output_multi_gpu[i],
        mem_ptr->lut_multi_gpu[i], cur_lut_indexes,
        mem_ptr->pbs_input_multi_gpu[i], mem_ptr->bsk_multi_gpu[i],
        mem_ptr->pbs_buffer_multi_gpu[i], lwe_dimension, glwe_dimension,
        polynomial_size, grouping_factor, pbs_base_log, pbs_level,
        grouping_factor, gpu_blocks_count, 2, 0, max_shared_memory);

    // lookup table is applied and now data from current gpu have to be copied
    // back to gpu_0 in 'output_ciphertexts' buffer
    if (i == 0) {
      check_cuda_error(
          cudaMemcpyAsync(&output_ciphertexts[big_lwe_start_index],
                          mem_ptr->pbs_output_multi_gpu[i],
                          gpu_blocks_count * big_lwe_size * sizeof(Torus),
                          cudaMemcpyDeviceToDevice, *this_stream));
    } else if (can_access_peer) {
      check_cuda_error(cudaMemcpyPeerAsync(
          &output_ciphertexts[big_lwe_start_index], 0,
          mem_ptr->pbs_output_multi_gpu[i], i,
          gpu_blocks_count * big_lwe_size * sizeof(Torus), *this_stream));
    } else {
      // Uses host memory as middle ground
      cuda_memcpy_async_to_cpu(
          mem_ptr->device_to_device_buffer[i], mem_ptr->pbs_output_multi_gpu[i],
          gpu_blocks_count * big_lwe_size * sizeof(Torus), this_stream, i);
      cuda_memcpy_async_to_gpu(&output_ciphertexts[big_lwe_start_index],
                               mem_ptr->device_to_device_buffer[i],
                               gpu_blocks_count * big_lwe_size * sizeof(Torus),
                               this_stream, i);
    }
  }
}
|
||||
|
||||
// Multiplies every coefficient of `num_blocks` LWE ciphertexts (each of
// lwe_dimension + 1 coefficients) by the cleartext `scalar`.
// One thread per coefficient on a 1D grid; threads past the end do nothing.
template <typename T>
__global__ void device_small_scalar_radix_multiplication(T *output_lwe_array,
                                                         T *input_lwe_array,
                                                         T scalar,
                                                         uint32_t lwe_dimension,
                                                         uint32_t num_blocks) {

  int lwe_size = lwe_dimension + 1;
  int coeff = blockIdx.x * blockDim.x + threadIdx.x;
  if (coeff >= num_blocks * lwe_size)
    return;

  // Here we take advantage of the wrapping behaviour of uint
  output_lwe_array[coeff] = input_lwe_array[coeff] * scalar;
}
|
||||
|
||||
// Launches device_small_scalar_radix_multiplication on `stream` over all
// input_lwe_ciphertext_count * (input_lwe_dimension + 1) coefficients and
// checks the launch for errors.
template <typename T>
__host__ void host_integer_small_scalar_mult_radix(
    cuda_stream_t *stream, T *output_lwe_array, T *input_lwe_array, T scalar,
    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {

  cudaSetDevice(stream->gpu_index);

  // One thread per coefficient: mask elements plus the body of each ciphertext
  int coeffs_per_ciphertext = input_lwe_dimension + 1;
  int total_coeffs = input_lwe_ciphertext_count * coeffs_per_ciphertext;

  // 1D launch configuration with at most 512 threads per block
  int grid_size = 0;
  int block_size = 0;
  getNumBlocksAndThreads(total_coeffs, 512, grid_size, block_size);
  dim3 grid(grid_size, 1, 1);
  dim3 thds(block_size, 1, 1);

  device_small_scalar_radix_multiplication<<<grid, thds, 0, stream->stream>>>(
      output_lwe_array, input_lwe_array, scalar, input_lwe_dimension,
      input_lwe_ciphertext_count);
  check_cuda_error(cudaGetLastError());
}
|
||||
#endif
|
||||
12
backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
Normal file
12
backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
Normal file
@@ -0,0 +1,12 @@
|
||||
#include "integer/negation.cuh"
|
||||
|
||||
// 64-bit entry point for the in-place negation of a radix ciphertext:
// `lwe_array` is used both as input and as output of
// host_integer_radix_negation.
void cuda_negate_integer_radix_ciphertext_64_inplace(
    cuda_stream_t *stream, void *lwe_array, uint32_t lwe_dimension,
    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus) {

  uint64_t *blocks = static_cast<uint64_t *>(lwe_array);
  host_integer_radix_negation(stream, blocks, blocks, lwe_dimension,
                              lwe_ciphertext_count, message_modulus,
                              carry_modulus);
}
|
||||
79
backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
Normal file
79
backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
Normal file
@@ -0,0 +1,79 @@
|
||||
#ifndef CUDA_INTEGER_NEGATE_CUH
|
||||
#define CUDA_INTEGER_NEGATE_CUH
|
||||
|
||||
#ifdef __CDT_PARSER__
|
||||
#undef __CUDA_RUNTIME_H__
|
||||
#include <cuda_runtime.h>
|
||||
#endif
|
||||
|
||||
#include "device.h"
|
||||
#include "integer.h"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
|
||||
// Negates a radix ciphertext made of `num_blocks` LWE ciphertexts of
// lwe_dimension + 1 coefficients each.
//
// One thread per coefficient position of a single ciphertext (1D grid); each
// thread then walks the same position through all `num_blocks` ciphertexts.
// Mask coefficients are negated. Body coefficients become z * delta - body for
// the first block and z * delta - (body + (z / message_modulus) * delta) for
// the following blocks, which compensates for the z added to the previous
// block. Every thread reads and writes only its own indices, so `output` may
// alias `input` and no block-level synchronization is required.
// `carry_modulus` is not read by this kernel.
template <typename Torus>
__global__ void
device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
                              uint64_t lwe_dimension, uint64_t message_modulus,
                              uint64_t carry_modulus, uint64_t delta) {
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < lwe_dimension + 1) {
    bool is_body = (tid == lwe_dimension);

    // z = ceil( degree / 2^p ) * 2^p
    uint64_t z = (2 * message_modulus - 1) / message_modulus;
    z *= message_modulus;

    // Subtract z/B to the next ciphertext to compensate for the addition of z
    // (loop-invariant, so computed once)
    uint64_t zb = z / message_modulus;
    uint64_t encoded_zb = zb * delta;

    // (0,Delta*z) - ct
    output[tid] = (is_body ? z * delta - input[tid] : -input[tid]);

    for (int radix_block_id = 1; radix_block_id < num_blocks;
         radix_block_id++) {
      // Same coefficient position in the next ciphertext
      tid += (lwe_dimension + 1);

      // (0,Delta*z) - ct
      output[tid] =
          (is_body ? z * delta - (input[tid] + encoded_zb) : -input[tid]);
    }
  }
}
|
||||
|
||||
// Launches device_integer_radix_negation on `stream` with one thread per
// coefficient of a single LWE ciphertext (lwe_dimension + 1 threads in total)
// and checks the launch for errors.
//
// output / input             - radix ciphertexts of
//                              input_lwe_ciphertext_count blocks; the kernel
//                              only touches matching indices of both
// lwe_dimension              - number of mask elements per ciphertext
// input_lwe_ciphertext_count - number of blocks in the radix
// message_modulus,
// carry_modulus              - used to derive the encoding shift `delta`
template <typename Torus>
__host__ void host_integer_radix_negation(cuda_stream_t *stream, Torus *output,
                                          Torus *input, uint32_t lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count,
                                          uint64_t message_modulus,
                                          uint64_t carry_modulus) {
  cudaSetDevice(stream->gpu_index);

  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = lwe_dimension + 1;
  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = lwe_size;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  // Value of the shift we multiply our messages by
  // If message_modulus and carry_modulus are always powers of 2 we can simplify
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

  // The kernel declares no shared memory, so none is requested at launch
  device_integer_radix_negation<<<grid, thds, 0, stream->stream>>>(
      output, input, input_lwe_ciphertext_count, lwe_dimension, message_modulus,
      carry_modulus, delta);
  check_cuda_error(cudaGetLastError());
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,12 @@
|
||||
#include "integer/scalar_addition.cuh"
|
||||
|
||||
// 64-bit entry point for the in-place addition of per-block scalars to a
// radix ciphertext: casts the untyped pointers and forwards to
// host_integer_radix_scalar_addition_inplace.
void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
    cuda_stream_t *stream, void *lwe_array, void *scalar_input,
    uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus) {

  uint64_t *blocks = static_cast<uint64_t *>(lwe_array);
  uint64_t *scalars = static_cast<uint64_t *>(scalar_input);

  host_integer_radix_scalar_addition_inplace(
      stream, blocks, scalars, lwe_dimension, lwe_ciphertext_count,
      message_modulus, carry_modulus);
}
|
||||
130
backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
Normal file
130
backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
Normal file
@@ -0,0 +1,130 @@
|
||||
#ifndef CUDA_INTEGER_ADD_CUH
|
||||
#define CUDA_INTEGER_ADD_CUH
|
||||
|
||||
#ifdef __CDT_PARSER__
|
||||
#undef __CUDA_RUNTIME_H__
|
||||
#include <cuda_runtime.h>
|
||||
#endif
|
||||
|
||||
#include "device.h"
|
||||
#include "integer.h"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <stdio.h>
|
||||
|
||||
// Adds scalar_input[b] * delta to the body (last coefficient) of block b, for
// every block b < num_blocks. One thread per block on a 1D grid; the mask
// coefficients are left untouched.
template <typename Torus>
__global__ void device_integer_radix_scalar_addition_inplace(
    Torus *lwe_array, Torus *scalar_input, int32_t num_blocks,
    uint32_t lwe_dimension, uint64_t delta) {

  int block = blockIdx.x * blockDim.x + threadIdx.x;
  if (block >= num_blocks)
    return;

  // The body sits right after the lwe_dimension mask elements of the block
  Torus *body = lwe_array + block * (lwe_dimension + 1) + lwe_dimension;
  Torus encoded_scalar = scalar_input[block] * delta;
  *body += encoded_scalar;
}
|
||||
|
||||
// Launches device_integer_radix_scalar_addition_inplace on `stream` with one
// thread per ciphertext block and checks the launch for errors.
// `delta` is the encoding shift derived from message_modulus * carry_modulus.
template <typename Torus>
__host__ void host_integer_radix_scalar_addition_inplace(
    cuda_stream_t *stream, Torus *lwe_array, Torus *scalar_input,
    uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus) {
  cudaSetDevice(stream->gpu_index);

  // Value of the shift we multiply our messages by
  // If message_modulus and carry_modulus are always powers of 2 we can simplify
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

  // 1D launch configuration: one thread per block of the radix
  int grid_size = 0;
  int block_size = 0;
  int block_count = input_lwe_ciphertext_count;
  getNumBlocksAndThreads(block_count, 512, grid_size, block_size);
  dim3 grid(grid_size, 1, 1);
  dim3 thds(block_size, 1, 1);

  device_integer_radix_scalar_addition_inplace<<<grid, thds, 0,
                                                 stream->stream>>>(
      lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
      delta);
  check_cuda_error(cudaGetLastError());
}
|
||||
|
||||
// Adds `delta` to the body (last coefficient) of each of the `num_blocks`
// blocks. One thread per block on a 1D grid.
template <typename Torus>
__global__ void device_integer_radix_add_scalar_one_inplace(
    Torus *lwe_array, int32_t num_blocks, uint32_t lwe_dimension,
    uint64_t delta) {

  int block = blockIdx.x * blockDim.x + threadIdx.x;
  if (block >= num_blocks)
    return;

  // The body sits right after the lwe_dimension mask elements of the block
  Torus *body = lwe_array + block * (lwe_dimension + 1) + lwe_dimension;
  *body += delta;
}
|
||||
|
||||
// Launches device_integer_radix_add_scalar_one_inplace on `stream` with one
// thread per ciphertext block and checks the launch for errors.
// `delta` is the encoding shift derived from message_modulus * carry_modulus.
template <typename Torus>
__host__ void host_integer_radix_add_scalar_one_inplace(
    cuda_stream_t *stream, Torus *lwe_array, uint32_t lwe_dimension,
    uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus) {
  cudaSetDevice(stream->gpu_index);

  // Value of the shift we multiply our messages by
  // If message_modulus and carry_modulus are always powers of 2 we can simplify
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

  // 1D launch configuration: one thread per block of the radix
  int grid_size = 0;
  int block_size = 0;
  int block_count = input_lwe_ciphertext_count;
  getNumBlocksAndThreads(block_count, 512, grid_size, block_size);
  dim3 grid(grid_size, 1, 1);
  dim3 thds(block_size, 1, 1);

  device_integer_radix_add_scalar_one_inplace<<<grid, thds, 0,
                                                stream->stream>>>(
      lwe_array, input_lwe_ciphertext_count, lwe_dimension, delta);
  check_cuda_error(cudaGetLastError());
}
|
||||
|
||||
// Subtracts scalar_input[b] * delta from the body (last coefficient) of block
// b, for every block b < num_blocks. One thread per block on a 1D grid; the
// mask coefficients are left untouched.
template <typename Torus>
__global__ void device_integer_radix_scalar_subtraction_inplace(
    Torus *lwe_array, Torus *scalar_input, int32_t num_blocks,
    uint32_t lwe_dimension, uint64_t delta) {

  int block = blockIdx.x * blockDim.x + threadIdx.x;
  if (block >= num_blocks)
    return;

  // The body sits right after the lwe_dimension mask elements of the block
  Torus *body = lwe_array + block * (lwe_dimension + 1) + lwe_dimension;
  Torus encoded_scalar = scalar_input[block] * delta;
  *body -= encoded_scalar;
}
|
||||
|
||||
// Launches device_integer_radix_scalar_subtraction_inplace on `stream` so that
// `scalar_input[i] * delta` is subtracted from the body of the i-th of the
// `input_lwe_ciphertext_count` ciphertexts in `lwe_array`.
//
// - stream: provides the GPU index to select and the CUDA stream to launch on
// - lwe_array: ciphertexts updated in place by the kernel
// - scalar_input: one scalar per ciphertext, read by the kernel
// - lwe_dimension: each ciphertext spans lwe_dimension + 1 Torus elements
// - message_modulus, carry_modulus: used only to derive `delta`
//
// Both `lwe_array` and `scalar_input` are dereferenced in device code, so they
// must be device-accessible. The launch is asynchronous with respect to the
// host; only launch errors are checked here.
template <typename Torus>
__host__ void host_integer_radix_scalar_subtraction_inplace(
    cuda_stream_t *stream, Torus *lwe_array, Torus *scalar_input,
    uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus) {
  // Report a failed device selection instead of silently launching on
  // whatever device happens to be current.
  check_cuda_error(cudaSetDevice(stream->gpu_index));

  // Nothing to update: skip the launch so that an empty grid is never
  // submitted (a zero-sized grid is an invalid launch configuration).
  if (input_lwe_ciphertext_count == 0)
    return;

  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  // Value of the shift we multiply our messages by
  // If message_modulus and carry_modulus are always powers of 2 we can simplify
  // this
  // The product is computed in 64 bits so it cannot wrap around before the
  // division.
  uint64_t delta =
      ((uint64_t)1 << 63) / ((uint64_t)message_modulus * carry_modulus);

  device_integer_radix_scalar_subtraction_inplace<<<grid, thds, 0,
                                                    stream->stream>>>(
      lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
      delta);
  check_cuda_error(cudaGetLastError());
}
|
||||
#endif
|
||||
14
backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
Normal file
14
backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
Normal file
@@ -0,0 +1,14 @@
|
||||
#include "integer/scalar_bitops.cuh"
|
||||
|
||||
// Type-erased entry point for the 64-bit scalar bitwise operation on radix
// ciphertexts: reinterprets the untyped buffers as uint64_t data and forwards
// everything to host_integer_radix_scalar_bitop_kb<uint64_t>.
//
// `mem_ptr` is reinterpreted as an int_bitop_buffer<uint64_t>; `bsk` is
// forwarded untouched as a void pointer.
void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_input,
    void *clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr, void *bsk,
    void *ksk, uint32_t lwe_ciphertext_count, BITOP_TYPE op) {

  using Torus = uint64_t;

  // Recover the typed views of the untyped arguments.
  auto *typed_out = static_cast<Torus *>(lwe_array_out);
  auto *typed_in = static_cast<Torus *>(lwe_array_input);
  auto *typed_clear = static_cast<Torus *>(clear_blocks);
  auto *typed_ksk = static_cast<Torus *>(ksk);
  auto *bitop_mem = reinterpret_cast<int_bitop_buffer<Torus> *>(mem_ptr);

  host_integer_radix_scalar_bitop_kb<Torus>(
      stream, typed_out, typed_in, typed_clear, num_clear_blocks, bitop_mem,
      bsk, typed_ksk, lwe_ciphertext_count, op);
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user